path: root/third_party/aom/av1/encoder
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
commit     26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree       f435a8308119effd964b339f76abb83a57c29483  /third_party/aom/av1/encoder
parent     Initial commit. (diff)
download   firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
           firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1. upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/aom/av1/encoder')
-rw-r--r--  third_party/aom/av1/encoder/allintra_vis.c  1055
-rw-r--r--  third_party/aom/av1/encoder/allintra_vis.h  46
-rw-r--r--  third_party/aom/av1/encoder/aq_complexity.c  175
-rw-r--r--  third_party/aom/av1/encoder/aq_complexity.h  37
-rw-r--r--  third_party/aom/av1/encoder/aq_cyclicrefresh.c  657
-rw-r--r--  third_party/aom/av1/encoder/aq_cyclicrefresh.h  332
-rw-r--r--  third_party/aom/av1/encoder/aq_variance.c  220
-rw-r--r--  third_party/aom/av1/encoder/aq_variance.h  35
-rw-r--r--  third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c  61
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_error_neon.c  95
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_error_sve.c  109
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c  3090
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c  146
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c  115
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c  360
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/cnn_neon.c  1144
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c  646
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c  2619
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c  1207
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c  49
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c  562
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c  73
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/ml_neon.c  339
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/pickrst_neon.c  1217
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/pickrst_neon.h  188
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/quantize_neon.c  928
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/rdopt_neon.c  459
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c  288
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/shift_neon.h  49
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c  548
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c  299
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/txfm_neon.h  26
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c  131
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d.c  1885
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d.h  49
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h  19
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm2d.c  423
-rw-r--r--  third_party/aom/av1/encoder/av1_ml_partition_models.h  179
-rw-r--r--  third_party/aom/av1/encoder/av1_noise_estimate.c  296
-rw-r--r--  third_party/aom/av1/encoder/av1_noise_estimate.h  50
-rw-r--r--  third_party/aom/av1/encoder/av1_quantize.c  917
-rw-r--r--  third_party/aom/av1/encoder/av1_quantize.h  224
-rw-r--r--  third_party/aom/av1/encoder/av1_temporal_denoiser.c  805
-rw-r--r--  third_party/aom/av1/encoder/av1_temporal_denoiser.h  134
-rw-r--r--  third_party/aom/av1/encoder/bitstream.c  4248
-rw-r--r--  third_party/aom/av1/encoder/bitstream.h  137
-rw-r--r--  third_party/aom/av1/encoder/block.h  1515
-rw-r--r--  third_party/aom/av1/encoder/blockiness.c  140
-rw-r--r--  third_party/aom/av1/encoder/cnn.c  1189
-rw-r--r--  third_party/aom/av1/encoder/cnn.h  191
-rw-r--r--  third_party/aom/av1/encoder/compound_type.c  1678
-rw-r--r--  third_party/aom/av1/encoder/compound_type.h  52
-rw-r--r--  third_party/aom/av1/encoder/context_tree.c  311
-rw-r--r--  third_party/aom/av1/encoder/context_tree.h  142
-rw-r--r--  third_party/aom/av1/encoder/cost.c  46
-rw-r--r--  third_party/aom/av1/encoder/cost.h  51
-rw-r--r--  third_party/aom/av1/encoder/deltaq4_model.c  7776
-rw-r--r--  third_party/aom/av1/encoder/dwt.c  146
-rw-r--r--  third_party/aom/av1/encoder/dwt.h  27
-rw-r--r--  third_party/aom/av1/encoder/enc_enums.h  268
-rw-r--r--  third_party/aom/av1/encoder/encode_strategy.c  1767
-rw-r--r--  third_party/aom/av1/encoder/encode_strategy.h  138
-rw-r--r--  third_party/aom/av1/encoder/encodeframe.c  2408
-rw-r--r--  third_party/aom/av1/encoder/encodeframe.h  55
-rw-r--r--  third_party/aom/av1/encoder/encodeframe_utils.c  1775
-rw-r--r--  third_party/aom/av1/encoder/encodeframe_utils.h  595
-rw-r--r--  third_party/aom/av1/encoder/encodemb.c  866
-rw-r--r--  third_party/aom/av1/encoder/encodemb.h  180
-rw-r--r--  third_party/aom/av1/encoder/encodemv.c  345
-rw-r--r--  third_party/aom/av1/encoder/encodemv.h  110
-rw-r--r--  third_party/aom/av1/encoder/encoder.c  5409
-rw-r--r--  third_party/aom/av1/encoder/encoder.h  4512
-rw-r--r--  third_party/aom/av1/encoder/encoder_alloc.h  531
-rw-r--r--  third_party/aom/av1/encoder/encoder_utils.c  1503
-rw-r--r--  third_party/aom/av1/encoder/encoder_utils.h  1141
-rw-r--r--  third_party/aom/av1/encoder/encodetxb.c  886
-rw-r--r--  third_party/aom/av1/encoder/encodetxb.h  276
-rw-r--r--  third_party/aom/av1/encoder/ethread.c  3469
-rw-r--r--  third_party/aom/av1/encoder/ethread.h  133
-rw-r--r--  third_party/aom/av1/encoder/extend.c  163
-rw-r--r--  third_party/aom/av1/encoder/extend.h  29
-rw-r--r--  third_party/aom/av1/encoder/external_partition.c  98
-rw-r--r--  third_party/aom/av1/encoder/external_partition.h  58
-rw-r--r--  third_party/aom/av1/encoder/firstpass.c  1600
-rw-r--r--  third_party/aom/av1/encoder/firstpass.h  603
-rw-r--r--  third_party/aom/av1/encoder/global_motion.c  575
-rw-r--r--  third_party/aom/av1/encoder/global_motion.h  157
-rw-r--r--  third_party/aom/av1/encoder/global_motion_facade.c  450
-rw-r--r--  third_party/aom/av1/encoder/global_motion_facade.h  58
-rw-r--r--  third_party/aom/av1/encoder/gop_structure.c  867
-rw-r--r--  third_party/aom/av1/encoder/gop_structure.h  95
-rw-r--r--  third_party/aom/av1/encoder/grain_test_vectors.h  781
-rw-r--r--  third_party/aom/av1/encoder/hash.c  126
-rw-r--r--  third_party/aom/av1/encoder/hash.h  53
-rw-r--r--  third_party/aom/av1/encoder/hash_motion.c  503
-rw-r--r--  third_party/aom/av1/encoder/hash_motion.h  103
-rw-r--r--  third_party/aom/av1/encoder/hybrid_fwd_txfm.c  370
-rw-r--r--  third_party/aom/av1/encoder/hybrid_fwd_txfm.h  40
-rw-r--r--  third_party/aom/av1/encoder/interp_search.c  801
-rw-r--r--  third_party/aom/av1/encoder/interp_search.h  205
-rw-r--r--  third_party/aom/av1/encoder/intra_mode_search.c  1739
-rw-r--r--  third_party/aom/av1/encoder/intra_mode_search.h  329
-rw-r--r--  third_party/aom/av1/encoder/intra_mode_search_utils.h  690
-rw-r--r--  third_party/aom/av1/encoder/k_means_template.h  151
-rw-r--r--  third_party/aom/av1/encoder/level.c  1397
-rw-r--r--  third_party/aom/av1/encoder/level.h  221
-rw-r--r--  third_party/aom/av1/encoder/lookahead.c  222
-rw-r--r--  third_party/aom/av1/encoder/lookahead.h  138
-rw-r--r--  third_party/aom/av1/encoder/mcomp.c  3998
-rw-r--r--  third_party/aom/av1/encoder/mcomp.h  398
-rw-r--r--  third_party/aom/av1/encoder/mcomp_structs.h  109
-rw-r--r--  third_party/aom/av1/encoder/misc_model_weights.h  696
-rw-r--r--  third_party/aom/av1/encoder/ml.c  171
-rw-r--r--  third_party/aom/av1/encoder/ml.h  85
-rw-r--r--  third_party/aom/av1/encoder/mode_prune_model_weights.h  185
-rw-r--r--  third_party/aom/av1/encoder/model_rd.h  270
-rw-r--r--  third_party/aom/av1/encoder/motion_search_facade.c  1071
-rw-r--r--  third_party/aom/av1/encoder/motion_search_facade.h  145
-rw-r--r--  third_party/aom/av1/encoder/mv_prec.c  429
-rw-r--r--  third_party/aom/av1/encoder/mv_prec.h  52
-rw-r--r--  third_party/aom/av1/encoder/nonrd_opt.c  933
-rw-r--r--  third_party/aom/av1/encoder/nonrd_opt.h  575
-rw-r--r--  third_party/aom/av1/encoder/nonrd_pickmode.c  3537
-rw-r--r--  third_party/aom/av1/encoder/optical_flow.c  1113
-rw-r--r--  third_party/aom/av1/encoder/optical_flow.h  76
-rw-r--r--  third_party/aom/av1/encoder/palette.c  975
-rw-r--r--  third_party/aom/av1/encoder/palette.h  215
-rw-r--r--  third_party/aom/av1/encoder/partition_cnn_weights.h  2139
-rw-r--r--  third_party/aom/av1/encoder/partition_model_weights.h  5646
-rw-r--r--  third_party/aom/av1/encoder/partition_search.c  6263
-rw-r--r--  third_party/aom/av1/encoder/partition_search.h  81
-rw-r--r--  third_party/aom/av1/encoder/partition_strategy.c  2573
-rw-r--r--  third_party/aom/av1/encoder/partition_strategy.h  265
-rw-r--r--  third_party/aom/av1/encoder/pass2_strategy.c  4488
-rw-r--r--  third_party/aom/av1/encoder/pass2_strategy.h  149
-rw-r--r--  third_party/aom/av1/encoder/pickcdef.c  958
-rw-r--r--  third_party/aom/av1/encoder/pickcdef.h  261
-rw-r--r--  third_party/aom/av1/encoder/picklpf.c  339
-rw-r--r--  third_party/aom/av1/encoder/picklpf.h  165
-rw-r--r--  third_party/aom/av1/encoder/pickrst.c  2217
-rw-r--r--  third_party/aom/av1/encoder/pickrst.h  126
-rw-r--r--  third_party/aom/av1/encoder/pustats.h  198
-rw-r--r--  third_party/aom/av1/encoder/random.h  85
-rw-r--r--  third_party/aom/av1/encoder/ratectrl.c  3587
-rw-r--r--  third_party/aom/av1/encoder/ratectrl.h  864
-rw-r--r--  third_party/aom/av1/encoder/rc_utils.h  469
-rw-r--r--  third_party/aom/av1/encoder/rd.c  1580
-rw-r--r--  third_party/aom/av1/encoder/rd.h  390
-rw-r--r--  third_party/aom/av1/encoder/rdopt.c  6598
-rw-r--r--  third_party/aom/av1/encoder/rdopt.h  327
-rw-r--r--  third_party/aom/av1/encoder/rdopt_data_defs.h  294
-rw-r--r--  third_party/aom/av1/encoder/rdopt_utils.h  797
-rw-r--r--  third_party/aom/av1/encoder/reconinter_enc.c  701
-rw-r--r--  third_party/aom/av1/encoder/reconinter_enc.h  94
-rw-r--r--  third_party/aom/av1/encoder/saliency_map.c  1414
-rw-r--r--  third_party/aom/av1/encoder/saliency_map.h  28
-rw-r--r--  third_party/aom/av1/encoder/segmentation.c  54
-rw-r--r--  third_party/aom/av1/encoder/segmentation.h  38
-rw-r--r--  third_party/aom/av1/encoder/sorting_network.h  140
-rw-r--r--  third_party/aom/av1/encoder/sparse_linear_solver.c  472
-rw-r--r--  third_party/aom/av1/encoder/sparse_linear_solver.h  67
-rw-r--r--  third_party/aom/av1/encoder/speed_features.c  2715
-rw-r--r--  third_party/aom/av1/encoder/speed_features.h  2025
-rw-r--r--  third_party/aom/av1/encoder/superres_scale.c  423
-rw-r--r--  third_party/aom/av1/encoder/superres_scale.h  28
-rw-r--r--  third_party/aom/av1/encoder/svc_layercontext.c  701
-rw-r--r--  third_party/aom/av1/encoder/svc_layercontext.h  325
-rw-r--r--  third_party/aom/av1/encoder/temporal_filter.c  1520
-rw-r--r--  third_party/aom/av1/encoder/temporal_filter.h  458
-rw-r--r--  third_party/aom/av1/encoder/thirdpass.c  877
-rw-r--r--  third_party/aom/av1/encoder/thirdpass.h  197
-rw-r--r--  third_party/aom/av1/encoder/tokenize.c  396
-rw-r--r--  third_party/aom/av1/encoder/tokenize.h  159
-rw-r--r--  third_party/aom/av1/encoder/tpl_model.c  2511
-rw-r--r--  third_party/aom/av1/encoder/tpl_model.h  794
-rw-r--r--  third_party/aom/av1/encoder/tune_butteraugli.c  313
-rw-r--r--  third_party/aom/av1/encoder/tune_butteraugli.h  45
-rw-r--r--  third_party/aom/av1/encoder/tune_vmaf.c  1112
-rw-r--r--  third_party/aom/av1/encoder/tune_vmaf.h  63
-rw-r--r--  third_party/aom/av1/encoder/tx_prune_model_weights.h  3422
-rw-r--r--  third_party/aom/av1/encoder/tx_search.c  3830
-rw-r--r--  third_party/aom/av1/encoder/tx_search.h  226
-rw-r--r--  third_party/aom/av1/encoder/txb_rdopt.c  659
-rw-r--r--  third_party/aom/av1/encoder/txb_rdopt.h  160
-rw-r--r--  third_party/aom/av1/encoder/txb_rdopt_utils.h  236
-rw-r--r--  third_party/aom/av1/encoder/var_based_part.c  1914
-rw-r--r--  third_party/aom/av1/encoder/var_based_part.h  104
-rw-r--r--  third_party/aom/av1/encoder/wedge_utils.c  125
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c  1409
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c  3010
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c  336
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h  96
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c  2673
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h  253
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c  137
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c  195
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_k_means_avx2.c  132
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_k_means_sse2.c  124
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_quantize_avx2.c  414
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_quantize_sse2.c  289
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm  204
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm  222
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c  328
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h  144
-rw-r--r--  third_party/aom/av1/encoder/x86/cnn_avx2.c  532
-rw-r--r--  third_party/aom/av1/encoder/x86/dct_sse2.asm  82
-rw-r--r--  third_party/aom/av1/encoder/x86/encodetxb_avx2.c  122
-rw-r--r--  third_party/aom/av1/encoder/x86/encodetxb_sse2.c  505
-rw-r--r--  third_party/aom/av1/encoder/x86/encodetxb_sse4.c  84
-rw-r--r--  third_party/aom/av1/encoder/x86/error_intrin_avx2.c  210
-rw-r--r--  third_party/aom/av1/encoder/x86/error_intrin_sse2.c  75
-rw-r--r--  third_party/aom/av1/encoder/x86/error_sse2.asm  88
-rw-r--r--  third_party/aom/av1/encoder/x86/hash_sse42.c  53
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c  64
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c  74
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c  3132
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c  2629
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c  466
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c  341
-rw-r--r--  third_party/aom/av1/encoder/x86/ml_avx2.c  240
-rw-r--r--  third_party/aom/av1/encoder/x86/ml_sse3.c  336
-rw-r--r--  third_party/aom/av1/encoder/x86/ml_sse3.h  29
-rw-r--r--  third_party/aom/av1/encoder/x86/pickrst_avx2.c  2348
-rw-r--r--  third_party/aom/av1/encoder/x86/pickrst_sse4.c  1483
-rw-r--r--  third_party/aom/av1/encoder/x86/rdopt_avx2.c  254
-rw-r--r--  third_party/aom/av1/encoder/x86/rdopt_sse4.c  272
-rw-r--r--  third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c  347
-rw-r--r--  third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c  67
-rw-r--r--  third_party/aom/av1/encoder/x86/temporal_filter_avx2.c  647
-rw-r--r--  third_party/aom/av1/encoder/x86/temporal_filter_sse2.c  320
-rw-r--r--  third_party/aom/av1/encoder/x86/wedge_utils_avx2.c  215
-rw-r--r--  third_party/aom/av1/encoder/x86/wedge_utils_sse2.c  254
232 files changed, 188397 insertions, 0 deletions
diff --git a/third_party/aom/av1/encoder/allintra_vis.c b/third_party/aom/av1/encoder/allintra_vis.c
new file mode 100644
index 0000000000..8dcef5fc85
--- /dev/null
+++ b/third_party/aom/av1/encoder/allintra_vis.c
@@ -0,0 +1,1055 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#if CONFIG_TFLITE
+#include "tensorflow/lite/c/c_api.h"
+#include "av1/encoder/deltaq4_model.c"
+#endif
+
+#include "av1/common/common_data.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#define MB_WIENER_PRED_BLOCK_SIZE BLOCK_128X128
+#define MB_WIENER_PRED_BUF_STRIDE 128
+
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td) {
+ const int is_high_bitdepth = is_cur_buf_hbd(&td->mb.e_mbd);
+ assert(MB_WIENER_PRED_BLOCK_SIZE < BLOCK_SIZES_ALL);
+ const int buf_width = block_size_wide[MB_WIENER_PRED_BLOCK_SIZE];
+ const int buf_height = block_size_high[MB_WIENER_PRED_BLOCK_SIZE];
+ assert(buf_width == MB_WIENER_PRED_BUF_STRIDE);
+ const size_t buf_size =
+ (buf_width * buf_height * sizeof(*td->wiener_tmp_pred_buf))
+ << is_high_bitdepth;
+ CHECK_MEM_ERROR(cm, td->wiener_tmp_pred_buf, aom_memalign(32, buf_size));
+}
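
Note on sizing: the shift by is_high_bitdepth doubles the byte count when samples are 16-bit. With a 128x128 prediction block and sizeof(*wiener_tmp_pred_buf) == 1, the allocation works out to 128 * 128 = 16384 bytes for 8-bit input and 32768 bytes for 10/12-bit input, returned 32-byte aligned by aom_memalign.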
+
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td) {
+ aom_free(td->wiener_tmp_pred_buf);
+ td->wiener_tmp_pred_buf = NULL;
+}
+
+void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+
+ // This block size is also used to determine number of workers in
+ // multi-threading. If it is changed, one needs to change it accordingly in
+ // "compute_num_ai_workers()".
+ cpi->weber_bsize = BLOCK_8X8;
+
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ if (cpi->mb_weber_stats && cpi->prep_rate_estimates &&
+ cpi->ext_rate_distribution)
+ return;
+ } else {
+ if (cpi->mb_weber_stats) return;
+ }
+
+ CHECK_MEM_ERROR(cm, cpi->mb_weber_stats,
+ aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+ sizeof(*cpi->mb_weber_stats)));
+
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ CHECK_MEM_ERROR(
+ cm, cpi->prep_rate_estimates,
+ aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+ sizeof(*cpi->prep_rate_estimates)));
+
+ CHECK_MEM_ERROR(
+ cm, cpi->ext_rate_distribution,
+ aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+ sizeof(*cpi->ext_rate_distribution)));
+ }
+}
+
+static int64_t get_satd(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ const int mi_step = mi_size_wide[cpi->weber_bsize];
+ int mb_stride = cpi->frame_info.mi_cols;
+ int mb_count = 0;
+ int64_t satd = 0;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+
+ satd += cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]
+ .satd;
+ ++mb_count;
+ }
+ }
+
+ if (mb_count) satd = (int)(satd / mb_count);
+ satd = AOMMAX(1, satd);
+
+ return (int)satd;
+}
+
+static int64_t get_sse(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ const int mi_step = mi_size_wide[cpi->weber_bsize];
+ int mb_stride = cpi->frame_info.mi_cols;
+ int mb_count = 0;
+ int64_t distortion = 0;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+
+ distortion +=
+ cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]
+ .distortion;
+ ++mb_count;
+ }
+ }
+
+ if (mb_count) distortion = (int)(distortion / mb_count);
+ distortion = AOMMAX(1, distortion);
+
+ return (int)distortion;
+}
+
+static double get_max_scale(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+ const int mi_step = mi_size_wide[cpi->weber_bsize];
+ int mb_stride = cpi->frame_info.mi_cols;
+ double min_max_scale = 10.0;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+ WeberStats *weber_stats =
+ &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)];
+ if (weber_stats->max_scale < 1.0) continue;
+ if (weber_stats->max_scale < min_max_scale)
+ min_max_scale = weber_stats->max_scale;
+ }
+ }
+ return min_max_scale;
+}
+
+static int get_window_wiener_var(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ const int mi_step = mi_size_wide[cpi->weber_bsize];
+ int sb_wiener_var = 0;
+ int mb_stride = cpi->frame_info.mi_cols;
+ int mb_count = 0;
+ double base_num = 1;
+ double base_den = 1;
+ double base_reg = 1;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+
+ WeberStats *weber_stats =
+ &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)];
+
+ base_num += ((double)weber_stats->distortion) *
+ sqrt((double)weber_stats->src_variance) *
+ weber_stats->rec_pix_max;
+
+ base_den += fabs(
+ weber_stats->rec_pix_max * sqrt((double)weber_stats->src_variance) -
+ weber_stats->src_pix_max * sqrt((double)weber_stats->rec_variance));
+
+ base_reg += sqrt((double)weber_stats->distortion) *
+ sqrt((double)weber_stats->src_pix_max) * 0.1;
+ ++mb_count;
+ }
+ }
+
+ sb_wiener_var =
+ (int)(((base_num + base_reg) / (base_den + base_reg)) / mb_count);
+ sb_wiener_var = AOMMAX(1, sb_wiener_var);
+
+ return (int)sb_wiener_var;
+}
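
Spelled out, the loop above computes a regularized Weber-style ratio. With D_b the per-block distortion, sigma^2_{src,b} and sigma^2_{rec,b} the source and reconstruction variances, m_{src,b} and m_{rec,b} the pixel maxima, and N = mb_count:

\[
\mathrm{var}_{win} = \frac{1}{N} \cdot
  \frac{1 + r + \sum_b D_b \sqrt{\sigma^2_{src,b}} \, m_{rec,b}}
       {1 + r + \sum_b \bigl| m_{rec,b} \sqrt{\sigma^2_{src,b}} - m_{src,b} \sqrt{\sigma^2_{rec,b}} \bigr|},
\qquad r = 1 + 0.1 \sum_b \sqrt{D_b} \sqrt{m_{src,b}},
\]

where the shared regularizer r keeps the ratio near 1 when the window statistics are weak.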
+
+static int get_var_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ int sb_wiener_var = get_window_wiener_var(cpi, bsize, mi_row, mi_col);
+
+ if (mi_row >= (mi_high / 2)) {
+ sb_wiener_var =
+ AOMMIN(sb_wiener_var,
+ get_window_wiener_var(cpi, bsize, mi_row - mi_high / 2, mi_col));
+ }
+ if (mi_row <= (cm->mi_params.mi_rows - mi_high - (mi_high / 2))) {
+ sb_wiener_var =
+ AOMMIN(sb_wiener_var,
+ get_window_wiener_var(cpi, bsize, mi_row + mi_high / 2, mi_col));
+ }
+ if (mi_col >= (mi_wide / 2)) {
+ sb_wiener_var =
+ AOMMIN(sb_wiener_var,
+ get_window_wiener_var(cpi, bsize, mi_row, mi_col - mi_wide / 2));
+ }
+ if (mi_col <= (cm->mi_params.mi_cols - mi_wide - (mi_wide / 2))) {
+ sb_wiener_var =
+ AOMMIN(sb_wiener_var,
+ get_window_wiener_var(cpi, bsize, mi_row, mi_col + mi_wide / 2));
+ }
+
+ return sb_wiener_var;
+}
+
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+
+ assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+ int rate_cost = 1;
+
+ for (int idx = 0; idx < eob; ++idx) {
+ int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+ rate_cost += (int)(log1p(abs_level) / log(2.0)) + 1 + (abs_level > 0);
+ }
+
+ return (rate_cost << AV1_PROB_COST_SHIFT);
+}
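
The estimator charges roughly log2(1 + |level|) bits per scanned coefficient, plus one flat bit and one sign bit for nonzero levels, then converts to the encoder's fixed-point cost units (1 bit == 1 << AV1_PROB_COST_SHIFT). A self-contained sketch of the same model; the function and array names here are illustrative, not libaom API:

#include <math.h>
#include <stdlib.h>

#define PROB_COST_SHIFT 9 /* same scale as AV1_PROB_COST_SHIFT */

/* levels[] holds quantized coefficients already in scan order. */
static int estimate_block_bits(const int *levels, int eob) {
  int bits = 1; /* baseline, mirrors rate_cost = 1 above */
  for (int i = 0; i < eob; ++i) {
    const int mag = abs(levels[i]);
    /* ~log2(1 + |level|) magnitude bits, +1 flat, +1 sign if nonzero. */
    bits += (int)(log1p(mag) / log(2.0)) + 1 + (mag > 0);
  }
  return bits << PROB_COST_SHIFT;
}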
+
+void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, const int mi_row,
+ int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ double *sum_rec_distortion,
+ double *sum_est_rate, uint8_t *pred_buffer) {
+ AV1_COMMON *const cm = &cpi->common;
+ uint8_t *buffer = cpi->source->y_buffer;
+ int buf_stride = cpi->source->y_stride;
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const int block_size = tx_size_wide[tx_size];
+ const int coeff_count = block_size * block_size;
+ const int mb_step = mi_size_wide[bsize];
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ const MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt;
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+ const int mi_cols = cm->mi_params.mi_cols;
+ const int mt_thread_id = mi_row / mb_step;
+ // TODO(chengchen): test different unit step size
+ const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE];
+ const int mt_unit_cols = (mi_cols + (mt_unit_step >> 1)) / mt_unit_step;
+ int mt_unit_col = 0;
+ const int is_high_bitdepth = is_cur_buf_hbd(xd);
+
+ uint8_t *dst_buffer = pred_buffer;
+ const int dst_buffer_stride = MB_WIENER_PRED_BUF_STRIDE;
+
+ if (is_high_bitdepth) {
+ uint16_t *pred_buffer_16 = (uint16_t *)pred_buffer;
+ dst_buffer = CONVERT_TO_BYTEPTR(pred_buffer_16);
+ }
+
+ for (int mi_col = 0; mi_col < mi_cols; mi_col += mb_step) {
+ if (mi_col % mt_unit_step == 0) {
+ intra_mt->intra_sync_read_ptr(intra_row_mt_sync, mt_thread_id,
+ mt_unit_col);
+#if CONFIG_MULTITHREAD
+ const int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers);
+ if (num_workers > 1) {
+ const AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ const bool exit = enc_row_mt->mb_wiener_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Stop further processing in case any worker has encountered an error.
+ if (exit) break;
+ }
+#endif
+ }
+
+ PREDICTION_MODE best_mode = DC_PRED;
+ int best_intra_cost = INT_MAX;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+ set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+ AOMMIN(mi_row + mi_height, cm->mi_params.mi_rows),
+ AOMMIN(mi_col + mi_width, cm->mi_params.mi_cols));
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+ av1_num_planes(cm));
+ xd->mi[0]->bsize = bsize;
+ xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+ // Set above and left mbmi to NULL as they are not available in the
+ // preprocessing stage.
+    // They are used to determine intra edge filter types in intra prediction.
+ if (xd->up_available) {
+ xd->above_mbmi = NULL;
+ }
+ if (xd->left_available) {
+ xd->left_mbmi = NULL;
+ }
+ uint8_t *mb_buffer =
+ buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE;
+ for (PREDICTION_MODE mode = INTRA_MODE_START; mode < INTRA_MODE_END;
+ ++mode) {
+      // TODO(chengchen): Here we use the source instead of the reconstructed
+      // frame as the intra predictor so that the single-threaded and
+      // multithreaded versions match. Ideally we want the reconstructed frame.
+ av1_predict_intra_block(
+ xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter,
+ block_size, block_size, tx_size, mode, 0, 0, FILTER_INTRA_MODES,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
+ av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+ av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+ int intra_cost = aom_satd(coeff, coeff_count);
+ if (intra_cost < best_intra_cost) {
+ best_intra_cost = intra_cost;
+ best_mode = mode;
+ }
+ }
+
+ av1_predict_intra_block(
+ xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter,
+ block_size, block_size, tx_size, best_mode, 0, 0, FILTER_INTRA_MODES,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
+ av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+ av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+
+ const struct macroblock_plane *const p = &x->plane[0];
+ uint16_t eob;
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+ QUANT_PARAM quant_param;
+ int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+ scan_order, &quant_param);
+ } else {
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+ scan_order, &quant_param);
+ }
+#else
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, scan_order,
+ &quant_param);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ const int rate_cost = rate_estimator(qcoeff, eob, tx_size);
+ cpi->prep_rate_estimates[(mi_row / mb_step) * cpi->frame_info.mi_cols +
+ (mi_col / mb_step)] = rate_cost;
+ }
+
+ av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer,
+ dst_buffer_stride, eob, 0);
+ WeberStats *weber_stats =
+ &cpi->mb_weber_stats[(mi_row / mb_step) * cpi->frame_info.mi_cols +
+ (mi_col / mb_step)];
+
+ weber_stats->rec_pix_max = 1;
+ weber_stats->rec_variance = 0;
+ weber_stats->src_pix_max = 1;
+ weber_stats->src_variance = 0;
+ weber_stats->distortion = 0;
+
+ int64_t src_mean = 0;
+ int64_t rec_mean = 0;
+ int64_t dist_mean = 0;
+
+ for (int pix_row = 0; pix_row < block_size; ++pix_row) {
+ for (int pix_col = 0; pix_col < block_size; ++pix_col) {
+ int src_pix, rec_pix;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(mb_buffer);
+ uint16_t *rec = CONVERT_TO_SHORTPTR(dst_buffer);
+ src_pix = src[pix_row * buf_stride + pix_col];
+ rec_pix = rec[pix_row * dst_buffer_stride + pix_col];
+ } else {
+ src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+ rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+ }
+#else
+ src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+ rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+#endif
+ src_mean += src_pix;
+ rec_mean += rec_pix;
+ dist_mean += src_pix - rec_pix;
+ weber_stats->src_variance += src_pix * src_pix;
+ weber_stats->rec_variance += rec_pix * rec_pix;
+ weber_stats->src_pix_max = AOMMAX(weber_stats->src_pix_max, src_pix);
+ weber_stats->rec_pix_max = AOMMAX(weber_stats->rec_pix_max, rec_pix);
+ weber_stats->distortion += (src_pix - rec_pix) * (src_pix - rec_pix);
+ }
+ }
+
+ if (cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) {
+ *sum_rec_distortion += weber_stats->distortion;
+ int est_block_rate = 0;
+ int64_t est_block_dist = 0;
+ model_rd_sse_fn[MODELRD_LEGACY](cpi, x, bsize, 0, weber_stats->distortion,
+ pix_num, &est_block_rate,
+ &est_block_dist);
+ *sum_est_rate += est_block_rate;
+ }
+
+ weber_stats->src_variance -= (src_mean * src_mean) / pix_num;
+ weber_stats->rec_variance -= (rec_mean * rec_mean) / pix_num;
+ weber_stats->distortion -= (dist_mean * dist_mean) / pix_num;
+ weber_stats->satd = best_intra_cost;
+
+ qcoeff[0] = 0;
+ int max_scale = 0;
+ for (int idx = 1; idx < coeff_count; ++idx) {
+ const int abs_qcoeff = abs(qcoeff[idx]);
+ max_scale = AOMMAX(max_scale, abs_qcoeff);
+ }
+ weber_stats->max_scale = max_scale;
+
+ if ((mi_col + mb_step) % mt_unit_step == 0 ||
+ (mi_col + mb_step) >= mi_cols) {
+ intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id,
+ mt_unit_col, mt_unit_cols);
+ ++mt_unit_col;
+ }
+ }
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ xd->mi = NULL;
+}
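
The *_variance and distortion fields are finalized near the end of the loop with the one-pass identity N * Var(x) = sum(x^2) - (sum(x))^2 / N, applied to the source pixels, the reconstructed pixels, and their difference. A minimal self-contained sketch of that bookkeeping (the helper name is illustrative, not a libaom function):

#include <stdint.h>

// Returns N * variance of an n x n block: sum(x^2) - (sum(x))^2 / N.
static int64_t block_sum_sq_dev(const uint8_t *p, int stride, int n) {
  int64_t sum = 0, sum_sq = 0;
  for (int r = 0; r < n; ++r) {
    for (int c = 0; c < n; ++c) {
      const int v = p[r * stride + c];
      sum += v;
      sum_sq += (int64_t)v * v;
    }
  }
  return sum_sq - (sum * sum) / ((int64_t)n * n);
}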
+
+static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion,
+ double *sum_est_rate) {
+ MACROBLOCK *x = &cpi->td.mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+ for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
+ av1_calc_mb_wiener_var_row(cpi, x, xd, mi_row, src_diff, coeff, qcoeff,
+ dqcoeff, sum_rec_distortion, sum_est_rate,
+ cpi->td.wiener_tmp_pred_buf);
+ }
+}
+
+static int64_t estimate_wiener_var_norm(AV1_COMP *const cpi,
+ const BLOCK_SIZE norm_block_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int64_t norm_factor = 1;
+ assert(norm_block_size >= BLOCK_16X16 && norm_block_size <= BLOCK_128X128);
+ const int norm_step = mi_size_wide[norm_block_size];
+ double sb_wiener_log = 0;
+ double sb_count = 0;
+ for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) {
+ for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += norm_step) {
+ const int sb_wiener_var =
+ get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col);
+ const int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col);
+ const int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col);
+ const double scaled_satd = (double)satd / sqrt((double)sse);
+ sb_wiener_log += scaled_satd * log(sb_wiener_var);
+ sb_count += scaled_satd;
+ }
+ }
+ if (sb_count > 0) norm_factor = (int64_t)(exp(sb_wiener_log / sb_count));
+ norm_factor = AOMMAX(1, norm_factor);
+
+ return norm_factor;
+}
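
The normalizer is a SATD-weighted geometric mean of the per-superblock Wiener variances:

\[
\mathrm{norm} = \exp\!\left( \frac{\sum_i w_i \log v_i}{\sum_i w_i} \right),
\qquad w_i = \frac{\mathrm{satd}_i}{\sqrt{\mathrm{sse}_i}},
\]

so superblocks whose SATD is large relative to their reconstruction error dominate the frame-level estimate.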
+
+static void automatic_intra_tools_off(AV1_COMP *cpi,
+ const double sum_rec_distortion,
+ const double sum_est_rate) {
+ if (!cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) return;
+
+ // Thresholds
+ const int high_quality_qindex = 128;
+ const double high_quality_bpp = 2.0;
+ const double high_quality_dist_per_pix = 4.0;
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int qindex = cm->quant_params.base_qindex;
+ const double dist_per_pix =
+ (double)sum_rec_distortion / (cm->width * cm->height);
+  // The estimated bpp is not accurate; divide by an empirical constant of 100.
+ const double estimate_bpp = sum_est_rate / (cm->width * cm->height * 100);
+
+ if (qindex < high_quality_qindex && estimate_bpp > high_quality_bpp &&
+ dist_per_pix < high_quality_dist_per_pix) {
+ cpi->oxcf.intra_mode_cfg.enable_smooth_intra = 0;
+ cpi->oxcf.intra_mode_cfg.enable_paeth_intra = 0;
+ cpi->oxcf.intra_mode_cfg.enable_cfl_intra = 0;
+ cpi->oxcf.intra_mode_cfg.enable_diagonal_intra = 0;
+ }
+}
+
+static void ext_rate_guided_quantization(AV1_COMP *cpi) {
+ // Calculation uses 8x8.
+ const int mb_step = mi_size_wide[cpi->weber_bsize];
+ // Accumulate to 16x16, step size is in the unit of mi.
+ const int block_step = 4;
+
+ const char *filename = cpi->oxcf.rate_distribution_info;
+ FILE *pfile = fopen(filename, "r");
+ if (pfile == NULL) {
+ assert(pfile != NULL);
+ return;
+ }
+
+ double ext_rate_sum = 0.0;
+ for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) {
+ for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) {
+ float val;
+ const int fields_converted = fscanf(pfile, "%f", &val);
+ if (fields_converted != 1) {
+ assert(fields_converted == 1);
+ fclose(pfile);
+ return;
+ }
+ ext_rate_sum += val;
+ cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols +
+ (col / mb_step)] = val;
+ }
+ }
+ fclose(pfile);
+
+ int uniform_rate_sum = 0;
+ for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) {
+ for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) {
+ int rate_sum = 0;
+ for (int r = 0; r < block_step; r += mb_step) {
+ for (int c = 0; c < block_step; c += mb_step) {
+ const int mi_row = row + r;
+ const int mi_col = col + c;
+ rate_sum += cpi->prep_rate_estimates[(mi_row / mb_step) *
+ cpi->frame_info.mi_cols +
+ (mi_col / mb_step)];
+ }
+ }
+ uniform_rate_sum += rate_sum;
+ }
+ }
+
+ const double scale = uniform_rate_sum / ext_rate_sum;
+ cpi->ext_rate_scale = scale;
+}
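
The final scale aligns the two rate maps: multiplying every external value by scale = (sum of prep_rate_estimates) / (sum of ext_rate_distribution) makes both maps sum to the same total, so the quantizer adjustment in get_rate_guided_quantizer() below reacts only to how the external model redistributes rate across blocks, not to its absolute units.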
+
+void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ if (aom_realloc_frame_buffer(
+ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ av1_alloc_mb_wiener_var_pred_buf(&cpi->common, &cpi->td);
+ cpi->norm_wiener_variance = 0;
+
+ MACROBLOCK *x = &cpi->td.mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+  // xd->mi needs to be set up since it is used in av1_frame_init_quantizer.
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;
+ cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level;
+ av1_frame_init_quantizer(cpi);
+
+ double sum_rec_distortion = 0.0;
+ double sum_est_rate = 0.0;
+
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers);
+ AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt;
+ intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read_dummy;
+ intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write_dummy;
+ // Calculate differential contrast for each block for the entire image.
+ // TODO(chengchen): properly accumulate the distortion and rate in
+ // av1_calc_mb_wiener_var_mt(). Until then, call calc_mb_wiener_var() if
+ // auto_intra_tools_off is true.
+ if (num_workers > 1 && !cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) {
+ intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read;
+ intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write;
+ av1_calc_mb_wiener_var_mt(cpi, num_workers, &sum_rec_distortion,
+ &sum_est_rate);
+ } else {
+ calc_mb_wiener_var(cpi, &sum_rec_distortion, &sum_est_rate);
+ }
+
+ // Determine whether to turn off several intra coding tools.
+ automatic_intra_tools_off(cpi, sum_rec_distortion, sum_est_rate);
+
+ // Read external rate distribution and use it to guide delta quantization
+ if (cpi->oxcf.enable_rate_guide_deltaq) ext_rate_guided_quantization(cpi);
+
+ const BLOCK_SIZE norm_block_size = cm->seq_params->sb_size;
+ cpi->norm_wiener_variance = estimate_wiener_var_norm(cpi, norm_block_size);
+ const int norm_step = mi_size_wide[norm_block_size];
+
+ double sb_wiener_log = 0;
+ double sb_count = 0;
+ for (int its_cnt = 0; its_cnt < 2; ++its_cnt) {
+ sb_wiener_log = 0;
+ sb_count = 0;
+ for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) {
+ for (int mi_col = 0; mi_col < cm->mi_params.mi_cols;
+ mi_col += norm_step) {
+ int sb_wiener_var =
+ get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col);
+
+ double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
+ double min_max_scale = AOMMAX(
+ 1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col));
+
+ beta = AOMMIN(beta, 4);
+ beta = AOMMAX(beta, 0.25);
+
+ if (beta < 1 / min_max_scale) continue;
+
+ sb_wiener_var = (int)(cpi->norm_wiener_variance / beta);
+
+ int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col);
+ int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col);
+ double scaled_satd = (double)satd / sqrt((double)sse);
+ sb_wiener_log += scaled_satd * log(sb_wiener_var);
+ sb_count += scaled_satd;
+ }
+ }
+
+ if (sb_count > 0)
+ cpi->norm_wiener_variance = (int64_t)(exp(sb_wiener_log / sb_count));
+ cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance);
+ }
+
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ xd->mi = NULL;
+ aom_free_frame_buffer(&cm->cur_frame->buf);
+ av1_dealloc_mb_wiener_var_pred_buf(&cpi->td);
+}
+
+static int get_rate_guided_quantizer(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ // Calculation uses 8x8.
+ const int mb_step = mi_size_wide[cpi->weber_bsize];
+ // Accumulate to 16x16
+ const int block_step = mi_size_wide[BLOCK_16X16];
+ double sb_rate_hific = 0.0;
+ double sb_rate_uniform = 0.0;
+  for (int row = mi_row; row < mi_row + mi_size_high[bsize];
+       row += block_step) {
+    for (int col = mi_col; col < mi_col + mi_size_wide[bsize];
+         col += block_step) {
+ sb_rate_hific +=
+ cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols +
+ (col / mb_step)];
+
+ for (int r = 0; r < block_step; r += mb_step) {
+ for (int c = 0; c < block_step; c += mb_step) {
+ const int this_row = row + r;
+ const int this_col = col + c;
+ sb_rate_uniform +=
+ cpi->prep_rate_estimates[(this_row / mb_step) *
+ cpi->frame_info.mi_cols +
+ (this_col / mb_step)];
+ }
+ }
+ }
+ }
+ sb_rate_hific *= cpi->ext_rate_scale;
+
+ const double weight = 1.0;
+ const double rate_diff =
+ weight * (sb_rate_hific - sb_rate_uniform) / sb_rate_uniform;
+ double scale = pow(2, rate_diff);
+
+ scale = scale * scale;
+ double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col));
+ scale = 1.0 / AOMMIN(1.0 / scale, min_max_scale);
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int base_qindex = cm->quant_params.base_qindex;
+ int offset =
+ av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, scale);
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ const int max_offset = delta_q_info->delta_q_res * 10;
+ offset = AOMMIN(offset, max_offset - 1);
+ offset = AOMMAX(offset, -max_offset + 1);
+ int qindex = cm->quant_params.base_qindex + offset;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+ if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1);
+
+ return qindex;
+}
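
A worked example with illustrative numbers: if the external map assigns a region 20% more rate than the uniform estimate, rate_diff = 0.2, scale = 2^0.2 ≈ 1.15 and scale^2 ≈ 1.32; assuming av1_get_deltaq_offset() keeps its usual convention that a scale above 1 lowers q, the block receives a negative qindex offset (finer quantization), clamped here to within ±(10 * delta_q_res - 1) of the base q.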
+
+int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ return get_rate_guided_quantizer(cpi, bsize, mi_row, mi_col);
+ }
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int base_qindex = cm->quant_params.base_qindex;
+ int sb_wiener_var = get_var_perceptual_ai(cpi, bsize, mi_row, mi_col);
+ int offset = 0;
+ double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
+ double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col));
+ beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
+
+  // Cap beta such that the delta q value is not too far away from the base q.
+ beta = AOMMIN(beta, 4);
+ beta = AOMMAX(beta, 0.25);
+ offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ offset = AOMMIN(offset, delta_q_info->delta_q_res * 20 - 1);
+ offset = AOMMAX(offset, -delta_q_info->delta_q_res * 20 + 1);
+ int qindex = cm->quant_params.base_qindex + offset;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+ if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1);
+
+ return qindex;
+}
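
Here beta = norm_wiener_variance / sb_wiener_var, so superblocks with Wiener variance below the frame norm (beta > 1) are quantized more finely and busier ones more coarsely. For example, with norm_wiener_variance = 4000 and sb_wiener_var = 1000, beta caps at 4.0 and, assuming the same beta > 1 implies lower q convention of av1_get_deltaq_offset(), the block gets a negative offset, subject to the ±(20 * delta_q_res - 1) clamp.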
+
+void av1_init_mb_ur_var_buffer(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+
+ if (cpi->mb_delta_q) return;
+
+ CHECK_MEM_ERROR(cm, cpi->mb_delta_q,
+ aom_calloc(cpi->frame_info.mb_rows * cpi->frame_info.mb_cols,
+ sizeof(*cpi->mb_delta_q)));
+}
+
+#if CONFIG_TFLITE
+static int model_predict(BLOCK_SIZE block_size, int num_cols, int num_rows,
+ int bit_depth, uint8_t *y_buffer, int y_stride,
+ float *predicts0, float *predicts1) {
+ // Create the model and interpreter options.
+ TfLiteModel *model =
+ TfLiteModelCreate(av1_deltaq4_model_file, av1_deltaq4_model_fsize);
+ if (model == NULL) return 1;
+
+ TfLiteInterpreterOptions *options = TfLiteInterpreterOptionsCreate();
+ TfLiteInterpreterOptionsSetNumThreads(options, 2);
+ if (options == NULL) {
+ TfLiteModelDelete(model);
+ return 1;
+ }
+
+ // Create the interpreter.
+ TfLiteInterpreter *interpreter = TfLiteInterpreterCreate(model, options);
+ if (interpreter == NULL) {
+ TfLiteInterpreterOptionsDelete(options);
+ TfLiteModelDelete(model);
+ return 1;
+ }
+
+ // Allocate tensors and populate the input tensor data.
+ TfLiteInterpreterAllocateTensors(interpreter);
+ TfLiteTensor *input_tensor = TfLiteInterpreterGetInputTensor(interpreter, 0);
+ if (input_tensor == NULL) {
+ TfLiteInterpreterDelete(interpreter);
+ TfLiteInterpreterOptionsDelete(options);
+ TfLiteModelDelete(model);
+ return 1;
+ }
+
+ size_t input_size = TfLiteTensorByteSize(input_tensor);
+ float *input_data = aom_calloc(input_size, 1);
+ if (input_data == NULL) {
+ TfLiteInterpreterDelete(interpreter);
+ TfLiteInterpreterOptionsDelete(options);
+ TfLiteModelDelete(model);
+ return 1;
+ }
+
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int row_offset = (row * num_mi_h) << 2;
+ const int col_offset = (col * num_mi_w) << 2;
+
+ uint8_t *buf = y_buffer + row_offset * y_stride + col_offset;
+ int r = row_offset, pos = 0;
+ const float base = (float)((1 << bit_depth) - 1);
+ while (r < row_offset + (num_mi_h << 2)) {
+ for (int c = 0; c < (num_mi_w << 2); ++c) {
+ input_data[pos++] = bit_depth > 8
+ ? (float)*CONVERT_TO_SHORTPTR(buf + c) / base
+ : (float)*(buf + c) / base;
+ }
+ buf += y_stride;
+ ++r;
+ }
+ TfLiteTensorCopyFromBuffer(input_tensor, input_data, input_size);
+
+ // Execute inference.
+      if (TfLiteInterpreterInvoke(interpreter) != kTfLiteOk) {
+        TfLiteInterpreterDelete(interpreter);
+        TfLiteInterpreterOptionsDelete(options);
+        TfLiteModelDelete(model);
+        aom_free(input_data);  // Release the input buffer on this error path.
+        return 1;
+      }
+
+ // Extract the output tensor data.
+ const TfLiteTensor *output_tensor =
+ TfLiteInterpreterGetOutputTensor(interpreter, 0);
+      if (output_tensor == NULL) {
+        TfLiteInterpreterDelete(interpreter);
+        TfLiteInterpreterOptionsDelete(options);
+        TfLiteModelDelete(model);
+        aom_free(input_data);  // Release the input buffer on this error path.
+        return 1;
+      }
+
+ size_t output_size = TfLiteTensorByteSize(output_tensor);
+ float output_data[2];
+
+ TfLiteTensorCopyToBuffer(output_tensor, output_data, output_size);
+ predicts0[row * num_cols + col] = output_data[0];
+ predicts1[row * num_cols + col] = output_data[1];
+ }
+ }
+
+ // Dispose of the model and interpreter objects.
+ TfLiteInterpreterDelete(interpreter);
+ TfLiteInterpreterOptionsDelete(options);
+ TfLiteModelDelete(model);
+ aom_free(input_data);
+ return 0;
+}
+
+void av1_set_mb_ur_variance(AV1_COMP *cpi) {
+ const AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ uint8_t *y_buffer = cpi->source->y_buffer;
+ const int y_stride = cpi->source->y_stride;
+ const int block_size = cpi->common.seq_params->sb_size;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+
+ // TODO(sdeng): fit a better model_1; disable it at this time.
+ float *mb_delta_q0, *mb_delta_q1, delta_q_avg0 = 0.0f;
+ CHECK_MEM_ERROR(cm, mb_delta_q0,
+ aom_calloc(num_rows * num_cols, sizeof(float)));
+ CHECK_MEM_ERROR(cm, mb_delta_q1,
+ aom_calloc(num_rows * num_cols, sizeof(float)));
+
+ if (model_predict(block_size, num_cols, num_rows, bit_depth, y_buffer,
+ y_stride, mb_delta_q0, mb_delta_q1)) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Failed to call TFlite functions.");
+ }
+
+ // Loop through each SB block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ delta_q_avg0 += mb_delta_q0[index];
+ }
+ }
+
+ delta_q_avg0 /= (float)(num_rows * num_cols);
+
+ float scaling_factor;
+ const float cq_level = (float)cpi->oxcf.rc_cfg.cq_level / (float)MAXQ;
+ if (cq_level < delta_q_avg0) {
+ scaling_factor = cq_level / delta_q_avg0;
+ } else {
+ scaling_factor = 1.0f - (cq_level - delta_q_avg0) / (1.0f - delta_q_avg0);
+ }
+
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ cpi->mb_delta_q[index] =
+ RINT((float)cpi->oxcf.q_cfg.deltaq_strength / 100.0f * (float)MAXQ *
+ scaling_factor * (mb_delta_q0[index] - delta_q_avg0));
+ }
+ }
+
+ aom_free(mb_delta_q0);
+ aom_free(mb_delta_q1);
+}
+#else // !CONFIG_TFLITE
+void av1_set_mb_ur_variance(AV1_COMP *cpi) {
+ const AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ uint8_t *y_buffer = cpi->source->y_buffer;
+ const int y_stride = cpi->source->y_stride;
+ const int block_size = cpi->common.seq_params->sb_size;
+
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+
+ int *mb_delta_q[2];
+ CHECK_MEM_ERROR(cm, mb_delta_q[0],
+ aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[0])));
+ CHECK_MEM_ERROR(cm, mb_delta_q[1],
+ aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[1])));
+
+  // Approximates the model change between the current version (Sept 2021) and
+  // the baseline (July 2021).
+ const double model_change[] = { 3.0, 3.0 };
+ // The following parameters are fitted from user labeled data.
+ const double a[] = { -24.50 * 4.0, -17.20 * 4.0 };
+ const double b[] = { 0.004898, 0.003093 };
+ const double c[] = { (29.932 + model_change[0]) * 4.0,
+ (42.100 + model_change[1]) * 4.0 };
+ int delta_q_avg[2] = { 0, 0 };
+ // Loop through each SB block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ double var = 0.0, num_of_var = 0.0;
+ const int index = row * num_cols + col;
+
+ // Loop through each 8x8 block.
+ for (int mi_row = row * num_mi_h;
+ mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+ mi_row += 2) {
+ for (int mi_col = col * num_mi_w;
+ mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+ mi_col += 2) {
+ struct buf_2d buf;
+ const int row_offset_y = mi_row << 2;
+ const int col_offset_y = mi_col << 2;
+
+ buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+ buf.stride = y_stride;
+
+ unsigned int block_variance;
+ block_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &buf, BLOCK_8X8, AOM_PLANE_Y);
+
+ block_variance = AOMMAX(block_variance, 1);
+ var += log((double)block_variance);
+ num_of_var += 1.0;
+ }
+ }
+ var = exp(var / num_of_var);
+ mb_delta_q[0][index] = RINT(a[0] * exp(-b[0] * var) + c[0]);
+ mb_delta_q[1][index] = RINT(a[1] * exp(-b[1] * var) + c[1]);
+ delta_q_avg[0] += mb_delta_q[0][index];
+ delta_q_avg[1] += mb_delta_q[1][index];
+ }
+ }
+
+ delta_q_avg[0] = RINT((double)delta_q_avg[0] / (num_rows * num_cols));
+ delta_q_avg[1] = RINT((double)delta_q_avg[1] / (num_rows * num_cols));
+
+ int model_idx;
+ double scaling_factor;
+ const int cq_level = cpi->oxcf.rc_cfg.cq_level;
+ if (cq_level < delta_q_avg[0]) {
+ model_idx = 0;
+ scaling_factor = (double)cq_level / delta_q_avg[0];
+ } else if (cq_level < delta_q_avg[1]) {
+ model_idx = 2;
+ scaling_factor =
+ (double)(cq_level - delta_q_avg[0]) / (delta_q_avg[1] - delta_q_avg[0]);
+ } else {
+ model_idx = 1;
+ scaling_factor = (double)(MAXQ - cq_level) / (MAXQ - delta_q_avg[1]);
+ }
+
+ const double new_delta_q_avg =
+ delta_q_avg[0] + scaling_factor * (delta_q_avg[1] - delta_q_avg[0]);
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ if (model_idx == 2) {
+ const double delta_q =
+ mb_delta_q[0][index] +
+ scaling_factor * (mb_delta_q[1][index] - mb_delta_q[0][index]);
+ cpi->mb_delta_q[index] = RINT((double)cpi->oxcf.q_cfg.deltaq_strength /
+ 100.0 * (delta_q - new_delta_q_avg));
+ } else {
+ cpi->mb_delta_q[index] = RINT(
+ (double)cpi->oxcf.q_cfg.deltaq_strength / 100.0 * scaling_factor *
+ (mb_delta_q[model_idx][index] - delta_q_avg[model_idx]));
+ }
+ }
+ }
+
+ aom_free(mb_delta_q[0]);
+ aom_free(mb_delta_q[1]);
+}
+#endif
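
To get a feel for the fitted curves, the standalone snippet below (illustrative values only; var stands for the geometric-mean 8x8 variance computed above) evaluates delta_q = a * exp(-b * var) + c for both models:

#include <math.h>
#include <stdio.h>

int main(void) {
  const double a[2] = { -24.50 * 4.0, -17.20 * 4.0 };
  const double b[2] = { 0.004898, 0.003093 };
  const double c[2] = { (29.932 + 3.0) * 4.0, (42.100 + 3.0) * 4.0 };
  const double var = 1000.0; /* hypothetical geometric-mean variance */
  for (int m = 0; m < 2; ++m)
    printf("model %d: delta_q = %.2f\n", m, a[m] * exp(-b[m] * var) + c[m]);
  return 0;
}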
+
+int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col) {
+ const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ AV1_COMMON *const cm = &cpi->common;
+ const int base_qindex = cm->quant_params.base_qindex;
+ if (base_qindex == MINQ || base_qindex == MAXQ) return base_qindex;
+
+ const int num_mi_w = mi_size_wide[bsize];
+ const int num_mi_h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+ const int index = (mi_row / num_mi_h) * num_cols + (mi_col / num_mi_w);
+ const int delta_q = cpi->mb_delta_q[index];
+
+ int qindex = base_qindex + delta_q;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ + 1);
+
+ return qindex;
+}
diff --git a/third_party/aom/av1/encoder/allintra_vis.h b/third_party/aom/av1/encoder/allintra_vis.h
new file mode 100644
index 0000000000..0d34ce0841
--- /dev/null
+++ b/third_party/aom/av1/encoder/allintra_vis.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ALLINTRA_VIS_H_
+#define AOM_AV1_ENCODER_ALLINTRA_VIS_H_
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+
+#define MB_WIENER_MT_UNIT_SIZE BLOCK_64X64
+
+void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi);
+
+void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, const int mi_row,
+ int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ double *sum_rec_distortion,
+ double *sum_est_rate, uint8_t *pred_buffer);
+
+void av1_set_mb_wiener_variance(AV1_COMP *cpi);
+
+int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col);
+
+// User rating based mode
+void av1_init_mb_ur_var_buffer(AV1_COMP *cpi);
+
+void av1_set_mb_ur_variance(AV1_COMP *cpi);
+
+int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col);
+
+#endif // AOM_AV1_ENCODER_ALLINTRA_VIS_H_
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
new file mode 100644
index 0000000000..4cf6bd572d
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+#define AQ_C_STRENGTHS 3
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 1.75, 1.25, 1.05, 1.00, 0.90 },
+ { 2.00, 1.50, 1.15, 1.00, 0.85 },
+ { 2.50, 1.75, 1.25, 1.00, 0.80 }
+};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 0.15, 0.30, 0.55, 2.00, 100.0 },
+ { 0.20, 0.40, 0.65, 2.00, 100.0 },
+ { 0.25, 0.50, 0.75, 2.00, 100.0 }
+};
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { -4.0, -3.0, -2.0, 100.00, 100.0 },
+ { -3.5, -2.5, -1.5, 100.00, 100.0 },
+ { -3.0, -2.0, -1.0, 100.00, 100.0 }
+};
+
+static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
+  // Approximate base quantizer (truncated to int)
+ const int base_quant = av1_ac_quant_QTX(q_index, 0, bit_depth) / 4;
+ return (base_quant > 10) + (base_quant > 25);
+}
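
In other words, the truncated base quantizer is bucketed by two comparisons: base_quant <= 10 selects strength 0, 11..25 selects strength 1, and anything above 25 selects strength 2, indexing the rows of the three tables above.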
+
+static bool is_frame_aq_enabled(const AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+
+ return frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ refresh_frame->alt_ref_frame ||
+ (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+// Segmentation only makes sense if the target bits per SB are above a
+// Below this the overheads will usually outweigh any benefit.
+static bool is_sb_aq_enabled(const AV1_COMP *const cpi) {
+ return cpi->rc.sb64_target_rate >= 256;
+}
+
+void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int base_qindex = cm->quant_params.base_qindex;
+ struct segmentation *const seg = &cm->seg;
+ const int resolution_change =
+ cm->prev_frame && (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height);
+
+ // Make SURE use of floating point in this function is safe.
+
+ if (resolution_change) {
+ memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ av1_clearall_segfeatures(seg);
+ av1_disable_segmentation(seg);
+ return;
+ }
+
+ if (is_frame_aq_enabled(cpi)) {
+ int segment;
+ const int aq_strength =
+ get_aq_c_strength(base_qindex, cm->seq_params->bit_depth);
+
+ // Clear down the segment map.
+ memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG,
+ cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+
+ av1_clearall_segfeatures(seg);
+
+ if (!is_sb_aq_enabled(cpi)) {
+ av1_disable_segmentation(seg);
+ return;
+ }
+
+ av1_enable_segmentation(seg);
+
+ // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+ // Use some of the segments for in frame Q adjustment.
+ for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+ int qindex_delta;
+
+ if (segment == DEFAULT_AQ2_SEG) continue;
+
+ qindex_delta = av1_compute_qdelta_by_rate(
+ cpi, cm->current_frame.frame_type, base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment]);
+
+ // For AQ complexity mode, we don't allow Q0 in a segment if the base
+ // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -base_qindex + 1;
+ }
+ if ((base_qindex + qindex_delta) > 0) {
+ av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+ av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+ }
+ }
+ }
+}
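+// Illustrative outcome (assumed strength 1): segments 0..2 are given rate
+// factors 2.00, 1.50 and 1.15 (typically negative q deltas, i.e. more bits),
+// segment 3 (DEFAULT_AQ2_SEG) keeps the baseline Q, and segment 4 gets
+// factor 0.85 (typically a positive q delta, i.e. fewer bits).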
+
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the block's
+// projected bits to a target average, and on its spatial complexity.
+void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate) {
+ if ((!is_frame_aq_enabled(cpi)) || (!is_sb_aq_enabled(cpi))) return;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ const int mi_offset = mi_row * cm->mi_params.mi_cols + mi_col;
+ const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_size_wide[bs]);
+ const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bs]);
+ int i;
+ unsigned char segment;
+
+ // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
+ // It is converted to bits << AV1_PROB_COST_SHIFT units.
+ const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis)
+ << AV1_PROB_COST_SHIFT;
+ const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size;
+ const int target_rate = (int)(num / denom);
+ double logvar;
+ double low_var_thresh;
+ const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex,
+ cm->seq_params->bit_depth);
+
+ low_var_thresh =
+ (is_stat_consumption_stage_twopass(cpi))
+ ? AOMMAX(exp(cpi->twopass_frame.mb_av_energy), MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
+
+ av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs);
+ logvar = av1_log_block_var(cpi, mb, bs);
+
+ segment = AQ_C_SEGMENTS - 1; // Just in case no break out below.
+ for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+ // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q.
+ if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+ (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+ segment = i;
+ break;
+ }
+ }
+
+ // Fill in the entries in the segment map corresponding to this SB64.
+ const int mi_stride = cm->mi_params.mi_cols;
+ set_segment_id(cpi->enc_seg.map, mi_offset, xmis, ymis, mi_stride, segment);
+}
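+// Illustrative walk-through (assumed numbers, strength 0): a block whose
+// projected_rate is 0.25 * target_rate and whose logvar is more than 3.0
+// below low_var_thresh fails the segment 0 test (0.25 >= 0.15) but passes
+// segment 1 (0.25 < 0.30 and logvar < low_var_thresh - 3.0), so segment 1
+// is selected.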
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
new file mode 100644
index 0000000000..3421d74c93
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+struct macroblock;
+
+// Select a segment for the current Block.
+void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs, int mi_row, int mi_col,
+ int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
new file mode 100644
index 0000000000..f48ff11e51
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/common/pred_common.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+ CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
+ if (cr == NULL) return NULL;
+
+ cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+ cr->counter_encode_maxq_scene_change = 0;
+ cr->percent_refresh_adjustment = 5;
+ cr->rate_ratio_qdelta_adjustment = 0.25;
+ if (cr->map == NULL) {
+ av1_cyclic_refresh_free(cr);
+ return NULL;
+ }
+ return cr;
+}
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+ if (cr != NULL) {
+ aom_free(cr->map);
+ aom_free(cr);
+ }
+}
+
+// Check if this coding block, of size bsize, should be considered for refresh
+// (lower-qp coding). The decision can be based on various factors, such as
+// the size of the coding block (blocks below the minimum size are rejected),
+// the coding mode, and the rate/distortion.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+ const MB_MODE_INFO *mbmi, int64_t rate,
+ int64_t dist, BLOCK_SIZE bsize,
+ int noise_level) {
+ MV mv = mbmi->mv[0].as_mv;
+ int is_compound = has_second_ref(mbmi);
+ // Reject the block for lower-qp coding for non-compound mode if
+ // projected distortion is above the threshold, and any of the following
+ // is true:
+ // 1) mode uses large mv
+ // 2) mode is an intra-mode
+ // Otherwise accept for refresh.
+ if (!is_compound && dist > cr->thresh_dist_sb &&
+ (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+ mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+ !is_inter_block(mbmi)))
+ return CR_SEGMENT_ID_BASE;
+ else if ((is_compound && noise_level < kMedium) ||
+ (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
+ is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
+ cr->rate_boost_fac > 10))
+ // More aggressive delta-q for bigger blocks with zero motion.
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute delta-q for the segment.
+static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int deltaq = av1_compute_qdelta_by_rate(
+ cpi, cpi->common.current_frame.frame_type, q, rate_factor);
+ if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
+ deltaq = -cr->max_qdelta_perc * q / 100;
+ }
+ return deltaq;
+}
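+// Illustrative example (assumed numbers): with q = 100, max_qdelta_perc = 60
+// and a computed delta of -80, the clamp above limits the delta to -60.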
+
+int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int base_qindex = cm->quant_params.base_qindex;
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int mbs = cm->mi_params.MBs;
+ const int num4x4bl = mbs << 4;
+ // Weight for non-base segments: use the actual number of blocks refreshed
+ // in the previous/just encoded frame. Note the block count here is in 4x4
+ // units.
+ double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl;
+ double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl;
+ if (cpi->rc.rtc_external_ratectrl) {
+ weight_segment1 = (double)(cr->percent_refresh * cm->mi_params.mi_rows *
+ cm->mi_params.mi_cols / 100) /
+ num4x4bl;
+ weight_segment2 = 0;
+ }
+ // Take segment weighted average for estimated bits.
+ const int estimated_bits =
+ (int)((1.0 - weight_segment1 - weight_segment2) *
+ av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) +
+ weight_segment1 *
+ av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[1],
+ correction_factor) +
+ weight_segment2 *
+ av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[2],
+ correction_factor));
+ return estimated_bits;
+}
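+// Illustrative reading of the weighted average above (assumed fractions): if
+// 10% of the 4x4 blocks carry the segment 1 delta and 5% the segment 2
+// delta, the estimate is 0.85 * bits(base_q) + 0.10 * bits(base_q + d1) +
+// 0.05 * bits(base_q + d2).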
+
+int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int bits_per_mb;
+ int num4x4bl = cm->mi_params.MBs << 4;
+ // Weight for the segment prior to encoding: take the average of the target
+ // number for the frame to be encoded and the actual number from the
+ // previous frame.
+ double weight_segment =
+ (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks +
+ cr->actual_num_seg2_blocks) >>
+ 1) /
+ num4x4bl;
+ if (cpi->rc.rtc_external_ratectrl) {
+ weight_segment = (double)((cr->target_num_seg_blocks +
+ cr->percent_refresh * cm->mi_params.mi_rows *
+ cm->mi_params.mi_cols / 100) >>
+ 1) /
+ num4x4bl;
+ }
+ // Compute delta-q corresponding to qindex i.
+ int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ const int accurate_estimate = cpi->sf.hl_sf.accurate_bit_estimate;
+ // Take segment weighted average for bits per mb.
+ bits_per_mb =
+ (int)((1.0 - weight_segment) *
+ av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i,
+ correction_factor, accurate_estimate) +
+ weight_segment * av1_rc_bits_per_mb(
+ cpi, cm->current_frame.frame_type, i + deltaq,
+ correction_factor, accurate_estimate));
+ return bits_per_mb;
+}
+
+void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run) {
+ int cdf_num;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int prev_segment_id = mbmi->segment_id;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+
+ assert(cm->seg.enabled);
+
+ if (!cr->skip_over4x4) {
+ mbmi->segment_id =
+ av1_get_spatial_seg_pred(cm, xd, &cdf_num, cr->skip_over4x4);
+ if (prev_segment_id != mbmi->segment_id) {
+ const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+ const int mi_stride = cm->mi_params.mi_cols;
+ const uint8_t segment_id = mbmi->segment_id;
+ for (int mi_y = 0; mi_y < ymis; mi_y++) {
+ const int map_offset = block_index + mi_y * mi_stride;
+ memset(&cr->map[map_offset], 0, xmis);
+ memset(&cpi->enc_seg.map[map_offset], segment_id, xmis);
+ memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis);
+ }
+ }
+ }
+ if (!dry_run) {
+ if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1)
+ x->actual_num_seg1_blocks -= xmis * ymis;
+ else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2)
+ x->actual_num_seg2_blocks -= xmis * ymis;
+ }
+}
+
+void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip,
+ RUN_TYPE dry_run) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+ int noise_level = 0;
+ if (cpi->noise_estimate.enabled) noise_level = cpi->noise_estimate.level;
+ const int refresh_this_block =
+ candidate_refresh_aq(cr, mbmi, rate, dist, bsize, noise_level);
+ int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;
+ // Default is to not update the refresh map.
+ int new_map_value = cr->map[block_index];
+
+ // If this block is labeled for refresh, check if we should reset the
+ // segment_id.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ mbmi->segment_id = refresh_this_block;
+ // Reset segment_id if the block will be skipped.
+ if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE;
+ }
+ const uint8_t segment_id = mbmi->segment_id;
+
+ // Update the cyclic refresh map, to be used for setting segmentation map
+ // for the next frame. If the block will be refreshed this frame, mark it
+ // as clean. The magnitude of the negative value influences how long it
+ // will be before we consider it for refresh again.
+ if (cyclic_refresh_segment_id_boosted(segment_id)) {
+ new_map_value = -cr->time_for_refresh;
+ } else if (refresh_this_block) {
+ // Else, if it is accepted as a candidate for refresh and is currently
+ // marked as not-a-candidate (1), mark it as a candidate for cleanup
+ // at a future time (0); otherwise don't update it.
+ if (cr->map[block_index] == 1) new_map_value = 0;
+ } else {
+ // Leave it marked as block that is not candidate for refresh.
+ new_map_value = 1;
+ }
+
+ // Update entries in the cyclic refresh map with new_map_value, and
+ // copy mbmi->segment_id into global segmentation map.
+ const int mi_stride = cm->mi_params.mi_cols;
+ for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+ const int map_offset = block_index + mi_y * mi_stride;
+ memset(&cr->map[map_offset], new_map_value, xmis);
+ memset(&cpi->enc_seg.map[map_offset], segment_id, xmis);
+ memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis);
+ }
+
+ // Accumulate cyclic refresh update counters.
+ if (!dry_run) {
+ if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST1)
+ x->actual_num_seg1_blocks += xmis * ymis;
+ else if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST2)
+ x->actual_num_seg2_blocks += xmis * ymis;
+ }
+}
+
+// Initializes counters used for cyclic refresh.
+void av1_init_cyclic_refresh_counters(MACROBLOCK *const x) {
+ x->actual_num_seg1_blocks = 0;
+ x->actual_num_seg2_blocks = 0;
+}
+
+// Accumulate cyclic refresh counters.
+void av1_accumulate_cyclic_refresh_counters(
+ CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x) {
+ cyclic_refresh->actual_num_seg1_blocks += x->actual_num_seg1_blocks;
+ cyclic_refresh->actual_num_seg2_blocks += x->actual_num_seg2_blocks;
+}
+
+void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // Set minimum gf_interval for GF update to a multiple of the refresh period,
+ // with some max limit. Depending on past encoding stats, GF flag may be
+ // reset and update may not occur until next baseline_gf_interval.
+ const int gf_length_mult[2] = { 8, 4 };
+ if (cr->percent_refresh > 0)
+ p_rc->baseline_gf_interval =
+ AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] *
+ (100 / cr->percent_refresh),
+ MAX_GF_INTERVAL_RT);
+ else
+ p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT;
+ if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40)
+ p_rc->baseline_gf_interval = 16;
+}
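+// Illustrative example (assumed settings): with percent_refresh = 10 and
+// gf_length_lvl = 0, the interval is AOMMIN(8 * (100 / 10),
+// MAX_GF_INTERVAL_RT) = AOMMIN(80, MAX_GF_INTERVAL_RT).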
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->enc_seg.map;
+ int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+ int xmis, ymis, x, y;
+ uint64_t sb_sad = 0;
+ uint64_t thresh_sad_low = 0;
+ uint64_t thresh_sad = INT64_MAX;
+ const int mi_rows = mi_params->mi_rows, mi_cols = mi_params->mi_cols;
+ const int mi_stride = mi_cols;
+ memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols);
+ sb_cols = (mi_cols + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size;
+ sb_rows = (mi_rows + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size;
+ sbs_in_frame = sb_cols * sb_rows;
+ // Number of target blocks to get the q delta (segment 1).
+ block_count = cr->percent_refresh * mi_rows * mi_cols / 100;
+ // Set the segmentation map: cycle through the superblocks, starting at
+ // cr->sb_index, and stopping when either block_count blocks have been found
+ // to be refreshed, or we have passed through the whole frame.
+ if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0;
+ assert(cr->sb_index < sbs_in_frame);
+ i = cr->sb_index;
+ cr->last_sb_index = cr->sb_index;
+ cr->target_num_seg_blocks = 0;
+ do {
+ int sum_map = 0;
+ // Get the mi_row/mi_col corresponding to superblock index i.
+ int sb_row_index = (i / sb_cols);
+ int sb_col_index = i - sb_row_index * sb_cols;
+ int mi_row = sb_row_index * cm->seq_params->mib_size;
+ int mi_col = sb_col_index * cm->seq_params->mib_size;
+ assert(mi_row >= 0 && mi_row < mi_rows);
+ assert(mi_col >= 0 && mi_col < mi_cols);
+ bl_index = mi_row * mi_stride + mi_col;
+ // Loop through all MI blocks in superblock and update map.
+ xmis = AOMMIN(mi_cols - mi_col, cm->seq_params->mib_size);
+ ymis = AOMMIN(mi_rows - mi_row, cm->seq_params->mib_size);
+ if (cr->use_block_sad_scene_det && cpi->rc.frames_since_key > 30 &&
+ cr->counter_encode_maxq_scene_change > 30 &&
+ cpi->src_sad_blk_64x64 != NULL &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ sb_sad = cpi->src_sad_blk_64x64[sb_col_index + sb_cols * sb_row_index];
+ int scale = (cm->width * cm->height < 640 * 360) ? 6 : 8;
+ int scale_low = 2;
+ thresh_sad = (scale * 64 * 64);
+ thresh_sad_low = (scale_low * 64 * 64);
+ // For temporal layers: the base temporal layer (temporal_layer_id = 0)
+ // has larger frame separation (2 or 4 frames apart), so use larger sad
+ // thresholds to compensate for larger frame sad. The larger thresholds
+ // also increase the amount of refresh, which is needed for the base
+ // temporal layer.
+ if (cpi->svc.number_temporal_layers > 1 &&
+ cpi->svc.temporal_layer_id == 0) {
+ thresh_sad <<= 4;
+ thresh_sad_low <<= 2;
+ }
+ }
+ // cr_map only needed at 8x8 blocks.
+ for (y = 0; y < ymis; y += 2) {
+ for (x = 0; x < xmis; x += 2) {
+ const int bl_index2 = bl_index + y * mi_stride + x;
+ // If the block is a candidate for cleanup, then mark it
+ // for possible boost/refresh (segment 1). The segment id may get
+ // reset to 0 later if the block gets coded as anything other than
+ // low motion. If the block sad (sb_sad) is very low, label it for
+ // refresh anyway.
+ if (cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) {
+ sum_map += 4;
+ } else if (cr->map[bl_index2] < 0) {
+ cr->map[bl_index2]++;
+ }
+ }
+ }
+ // Enforce constant segment over superblock.
+ // If segment is at least half of superblock, set to 1.
+ // Enforce that block sad (sb_sad) is not too high.
+ if (sum_map >= (xmis * ymis) >> 1 && sb_sad < thresh_sad) {
+ set_segment_id(seg_map, bl_index, xmis, ymis, mi_stride,
+ CR_SEGMENT_ID_BOOST1);
+ cr->target_num_seg_blocks += xmis * ymis;
+ }
+ i++;
+ if (i == sbs_in_frame) {
+ i = 0;
+ }
+ } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+ cr->sb_index = i;
+ if (cr->target_num_seg_blocks == 0) {
+ // Disable segmentation, seg_map is already set to 0 above.
+ av1_disable_segmentation(&cm->seg);
+ }
+}
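+// Roughly speaking (illustrative, ignoring the sum_map and sad gating), with
+// percent_refresh = 10 each pass of the loop above labels about 10% of the
+// frame, so a full refresh cycle takes on the order of
+// 100 / percent_refresh = 10 frames.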
+
+static int is_scene_change_detected(AV1_COMP *const cpi) {
+ return cpi->rc.high_source_sad;
+}
+
+// Set cyclic refresh parameters.
+void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
+ // TODO(marpan): Parameters need to be tuned.
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ SVC *const svc = &cpi->svc;
+ const int qp_thresh = AOMMAX(16, rc->best_quality + 4);
+ const int qp_max_thresh = 118 * MAXQ >> 7;
+ const int scene_change_detected = is_scene_change_detected(cpi);
+ const int is_screen_content =
+ (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+
+ // A scene change or key frame marks the start of a cyclic refresh cycle.
+ const int frames_since_scene_change =
+ (cpi->ppi->use_svc || !is_screen_content)
+ ? cpi->rc.frames_since_key
+ : AOMMIN(cpi->rc.frames_since_key,
+ cr->counter_encode_maxq_scene_change);
+
+ // Cases to reset the cyclic refresh adjustment parameters.
+ if (frame_is_intra_only(cm) || scene_change_detected ||
+ cpi->ppi->rtc_ref.bias_recovery_frame) {
+ // Reset adaptive elements for intra only frames and scene changes.
+ cr->percent_refresh_adjustment = 5;
+ cr->rate_ratio_qdelta_adjustment = 0.25;
+ }
+
+ // Although this segment feature for RTC is only used for
+ // blocks >= 8X8, for more efficient coding of the seg map
+ // cur_frame->seg_map needs to be set at 4x4 along with the
+ // function av1_cyclic_reset_segment_skip(). Skipping over
+ // 4x4 therefore has a small bdrate loss (~0.2%), so
+ // we use it only for speed > 9 for now.
+ // Also, if loop-filter deltas are applied via a segment, then
+ // we need to set cr->skip_over4x4 = 1.
+ cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0;
+
+ // Should we enable cyclic refresh on this frame?
+ cr->apply_cyclic_refresh = 1;
+ if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
+ scene_change_detected || svc->temporal_layer_id > 0 ||
+ svc->prev_number_spatial_layers != svc->number_spatial_layers ||
+ p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+ (svc->number_spatial_layers > 1 &&
+ svc->layer_context[svc->temporal_layer_id].is_key_frame) ||
+ (frames_since_scene_change > 20 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
+ (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 30 &&
+ frames_since_scene_change > 40) ||
+ cpi->ppi->rtc_ref.bias_recovery_frame) {
+ cr->apply_cyclic_refresh = 0;
+ return;
+ }
+
+ // Increase the amount of refresh if the number of temporal layers is > 2.
+ if (svc->number_temporal_layers > 2)
+ cr->percent_refresh = 15;
+ else
+ cr->percent_refresh = 10 + cr->percent_refresh_adjustment;
+
+ cr->max_qdelta_perc = 60;
+ cr->time_for_refresh = 0;
+ cr->use_block_sad_scene_det =
+ (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ cm->seq_params->sb_size == BLOCK_64X64)
+ ? 1
+ : 0;
+ cr->motion_thresh = 32;
+ cr->rate_boost_fac =
+ (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) ? 10 : 15;
+
+ // Use a larger delta-qp (increase rate_ratio_qdelta) for the first few
+ // refresh cycles after a key frame (svc) or scene change (non-svc).
+ // For non-svc screen content, after a scene change gradually reduce
+ // this boost and suppress it further if either of the previous two
+ // frames overshot.
+ if (cr->percent_refresh > 0) {
+ if (cpi->ppi->use_svc || !is_screen_content) {
+ if (frames_since_scene_change <
+ ((4 * svc->number_temporal_layers) * (100 / cr->percent_refresh))) {
+ cr->rate_ratio_qdelta = 3.0 + cr->rate_ratio_qdelta_adjustment;
+ } else {
+ cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment;
+ }
+ } else {
+ double distance_from_sc_factor =
+ AOMMIN(0.75, (int)(frames_since_scene_change / 10) * 0.1);
+ cr->rate_ratio_qdelta =
+ 3.0 + cr->rate_ratio_qdelta_adjustment - distance_from_sc_factor;
+ if ((frames_since_scene_change < 10) &&
+ ((cpi->rc.rc_1_frame < 0) || (cpi->rc.rc_2_frame < 0))) {
+ cr->rate_ratio_qdelta -= 0.25;
+ }
+ }
+ } else {
+ cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment;
+ }
+ // Adjust some parameters for low resolutions.
+ if (cm->width * cm->height <= 352 * 288) {
+ if (cpi->svc.number_temporal_layers > 1) {
+ cr->motion_thresh = 32;
+ cr->rate_boost_fac = 13;
+ } else {
+ if (rc->avg_frame_bandwidth < 3000) {
+ cr->motion_thresh = 16;
+ cr->rate_boost_fac = 13;
+ } else {
+ cr->max_qdelta_perc = 50;
+ cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.0);
+ }
+ }
+ }
+ if (cpi->oxcf.rc_cfg.mode == AOM_VBR) {
+ // To be adjusted for VBR mode, e.g., based on gf period and boost.
+ // For now use smaller qp-delta (than CBR), no second boosted seg, and
+ // turn-off (no refresh) on golden refresh (since it's already boosted).
+ cr->percent_refresh = 10;
+ cr->rate_ratio_qdelta = 1.5;
+ cr->rate_boost_fac = 10;
+ if (cpi->refresh_frame.golden_frame) {
+ cr->percent_refresh = 0;
+ cr->rate_ratio_qdelta = 1.0;
+ }
+ }
+ if (rc->rtc_external_ratectrl) {
+ cr->actual_num_seg1_blocks = cr->percent_refresh * cm->mi_params.mi_rows *
+ cm->mi_params.mi_cols / 100;
+ cr->actual_num_seg2_blocks = 0;
+ }
+}
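+// Illustrative timing (assumed settings): for non-screen content with three
+// temporal layers (percent_refresh = 15), the boosted rate ratio
+// 3.0 + adjustment applies while frames_since_scene_change <
+// 4 * 3 * (100 / 15) = 72, after which it drops to 2.25 + adjustment.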
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ struct segmentation *const seg = &cm->seg;
+ const int scene_change_detected = is_scene_change_detected(cpi);
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ // Set resolution_change flag: for svc only set it when the
+ // number of spatial layers has not changed.
+ const int resolution_change =
+ cm->prev_frame &&
+ (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height) &&
+ cpi->svc.prev_number_spatial_layers == cpi->svc.number_spatial_layers;
+
+ if (resolution_change) av1_cyclic_refresh_reset_resize(cpi);
+ if (!cr->apply_cyclic_refresh) {
+ // Set segmentation map to 0 and disable.
+ unsigned char *const seg_map = cpi->enc_seg.map;
+ memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ av1_disable_segmentation(&cm->seg);
+ if (frame_is_intra_only(cm) || scene_change_detected ||
+ cpi->ppi->rtc_ref.bias_recovery_frame) {
+ cr->sb_index = 0;
+ cr->last_sb_index = 0;
+ cr->counter_encode_maxq_scene_change = 0;
+ cr->actual_num_seg1_blocks = 0;
+ cr->actual_num_seg2_blocks = 0;
+ }
+ return;
+ } else {
+ cr->counter_encode_maxq_scene_change++;
+ const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
+ cm->seq_params->bit_depth);
+ // Set rate threshold to some multiple (set to 2 for now) of the target
+ // rate (target is given by sb64_target_rate and scaled by 256).
+ cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+ // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+ // q will not exceed 457, so (q * q) is within 32bit; see:
+ // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[].
+ cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+ // For low resolutions or lower speeds, the rate/dist thresholds need to be
+ // tuned/updated.
+ if (cpi->oxcf.speed <= 7 || (cm->width * cm->height < 640 * 360)) {
+ cr->thresh_dist_sb = 0;
+ cr->thresh_rate_sb = INT64_MAX;
+ }
+ // Set up segmentation.
+ // Clear down the segment map.
+ av1_enable_segmentation(&cm->seg);
+ av1_clearall_segfeatures(seg);
+
+ // Note: setting temporal_update has no effect, as the seg-map coding method
+ // (temporal or spatial) is determined in
+ // av1_choose_segmap_coding_method(),
+ // based on the coding cost of each method. In error_resilient mode the
+ // last_frame_seg_map is set to 0, so if temporal coding is used, it is
+ // relative to an all-zero previous map.
+ // seg->temporal_update = 0;
+
+ // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+ // Use segment BOOST1 for in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+ // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+ // Set the q delta for segment BOOST1.
+ const CommonQuantParams *const quant_params = &cm->quant_params;
+ int qindex_delta =
+ compute_deltaq(cpi, quant_params->base_qindex, cr->rate_ratio_qdelta);
+ cr->qindex_delta[1] = qindex_delta;
+
+ // Compute rd-mult for segment BOOST1.
+ const int qindex2 = clamp(
+ quant_params->base_qindex + quant_params->y_dc_delta_q + qindex_delta,
+ 0, MAXQ);
+ cr->rdmult = av1_compute_rd_mult(
+ qindex2, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Set a more aggressive (higher) q delta for segment BOOST2.
+ qindex_delta = compute_deltaq(
+ cpi, quant_params->base_qindex,
+ AOMMIN(CR_MAX_RATE_TARGET_RATIO,
+ 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+ cr->qindex_delta[2] = qindex_delta;
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Update the segmentation and refresh map.
+ cyclic_refresh_update_map(cpi);
+ }
+}
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+ return cr->rdmult;
+}
+
+void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ cr->sb_index = 0;
+ cr->last_sb_index = 0;
+ cpi->refresh_frame.golden_frame = true;
+ cr->apply_cyclic_refresh = 0;
+ cr->counter_encode_maxq_scene_change = 0;
+ cr->percent_refresh_adjustment = 5;
+ cr->rate_ratio_qdelta_adjustment = 0.25;
+}
+
+int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int qindex = cpi->common.quant_params.base_qindex;
+ if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 &&
+ cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh &&
+ cpi->rc.frame_source_sad < 1000 &&
+ qindex < 7 * (cpi->rc.worst_quality >> 3))
+ return 1;
+ // More aggressive skip.
+ else if (cpi->sf.rt_sf.skip_lf_screen > 1 && !cpi->rc.high_source_sad &&
+ cpi->rc.frame_source_sad < 50000 && qindex < cpi->rc.worst_quality)
+ return 1;
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
new file mode 100644
index 0000000000..10974f018b
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+/*!
+ * \brief The structure of CYCLIC_REFRESH.
+ * \ingroup cyclic_refresh
+ */
+struct CYCLIC_REFRESH {
+ /*!
+ * Percentage of blocks per frame that are targeted as candidates
+ * for cyclic refresh.
+ */
+ int percent_refresh;
+
+ /*!
+ * Active adjustment delta for cyclic refresh for rate control.
+ */
+ int percent_refresh_adjustment;
+
+ /*!
+ * Maximum q-delta as percentage of base q.
+ */
+ int max_qdelta_perc;
+ /*!
+ * Superblock starting index for cycling through the frame.
+ */
+ int sb_index;
+ /*!
+ * Superblock refresh index from the last encoded frame.
+ */
+ int last_sb_index;
+ /*!
+ * Controls how long a block will need to wait to be refreshed again, in
+ * excess of the cycle time, i.e., in the case of all zero motion, a block
+ * will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+ */
+ int time_for_refresh;
+ /*!
+ * Target number of (4x4) blocks that are set for delta-q.
+ */
+ int target_num_seg_blocks;
+ /*!
+ * Actual number of (4x4) blocks that were applied delta-q,
+ * for segment 1.
+ */
+ int actual_num_seg1_blocks;
+ /*!
+ * Actual number of (4x4) blocks that were applied delta-q,
+ * for segment 2.
+ */
+ int actual_num_seg2_blocks;
+ /*!
+ * RD mult. parameters for segment 1.
+ */
+ int rdmult;
+ /*!
+ * Cyclic refresh map.
+ */
+ int8_t *map;
+ /*!
+ * Threshold applied to the projected rate of the coding block,
+ * when deciding whether the block should be refreshed.
+ */
+ int64_t thresh_rate_sb;
+ /*!
+ * Threshold applied to the projected distortion of the coding block,
+ * when deciding whether the block should be refreshed.
+ */
+ int64_t thresh_dist_sb;
+ /*!
+ * Threshold applied to the motion vector (in units of 1/8 pel) of the
+ * coding block, when deciding whether the block should be refreshed.
+ */
+ int16_t motion_thresh;
+ /*!
+ * Rate target ratio to set q delta.
+ */
+ double rate_ratio_qdelta;
+
+ /*!
+ * Active adjustment of qdelta rate ratio for enhanced rate control
+ */
+ double rate_ratio_qdelta_adjustment;
+
+ /*!
+ * Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+ */
+ int rate_boost_fac;
+
+ /*!\cond */
+ int qindex_delta[3];
+ int apply_cyclic_refresh;
+ int skip_over4x4;
+ int counter_encode_maxq_scene_change;
+ int use_block_sad_scene_det;
+ /*!\endcond */
+};
+
+struct AV1_COMP;
+
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+/*!\brief Estimate the bits, incorporating the delta-q from the segments.
+ *
+ * For the just encoded frame, estimate the bits, incorporating the delta-q
+ * from non-base segment(s). Note this function is called in the postencode
+ * (called from rc_update_rate_correction_factors()).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] correction_factor rate correction factor
+ *
+ * \return Return the estimated bits at given q.
+ */
+int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi,
+ double correction_factor);
+
+/*!\brief Estimate the bits per mb, for given q = i and delta-q.
+ *
+ * Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+ * a corresponding delta-q (for segment 1). This function is called in the
+ * rc_regulate_q() to set the base qp index. Note: the segment map is set to
+ * either 0/CR_SEGMENT_ID_BASE (no refresh) or to 1/CR_SEGMENT_ID_BOOST1
+ * (refresh) for each superblock, prior to encoding.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] i q index
+ * \param[in] correction_factor rate correction factor
+ *
+ * \return Return the estimated bits for q = i and delta-q (segment 1).
+ */
+int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
+ double correction_factor);
+
+/*!\brief Update segment_id for blocks that are skipped.
+ *
+ * After encoding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id based on skip_txfm,
+ * and update the cyclic_refresh map and segmentation counters.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] x Pointer to MACROBLOCK structure
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE
+ * \param[in] bsize Block size
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
+ *
+ * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cpi->enc_seg.map.
+ */
+
+void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, RUN_TYPE dry_run);
+
+/*!\brief Update segment_id for block based on mode selected.
+ *
+ * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id (based on mode/motion/skip selected
+ * for that block) and update the cyclic_refresh map and segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] x Pointer to MACROBLOCK structure
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE
+ * \param[in] bsize Block size
+ * \param[in] rate Projected block rate from pickmode
+ * \param[in] dist Projected block dist from pickmode
+ * \param[in]   skip       Skip flag set from pickmode
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
+ *
+ * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cpi->enc_seg.map.
+ */
+void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip,
+ RUN_TYPE dry_run);
+
+/*!\brief Initialize counters used for cyclic refresh.
+ *
+ * Initializes cyclic refresh counters actual_num_seg1_blocks and
+ * actual_num_seg2_blocks.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] x Pointer to MACROBLOCK structure
+ *
+ * \remark Update the \c x->actual_num_seg1_blocks and the
+ * \c x->actual_num_seg2_blocks.
+ */
+void av1_init_cyclic_refresh_counters(MACROBLOCK *const x);
+
+/*!\brief Accumulate cyclic refresh counters.
+ *
+ * Accumulates the cyclic refresh counters actual_num_seg1_blocks and
+ * actual_num_seg2_blocks from the MACROBLOCK structure into the
+ * CYCLIC_REFRESH structure.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cyclic_refresh Pointer to CYCLIC_REFRESH structure
+ * \param[in] x Pointer to MACROBLOCK structure
+ *
+ * \remark Update the \c cyclic_refresh->actual_num_seg1_blocks and the
+ * \c cyclic_refresh->actual_num_seg2_blocks.
+ */
+void av1_accumulate_cyclic_refresh_counters(
+ CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x);
+
+/*!\brief Set golden frame update interval based on cyclic refresh.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Returns the interval in \c cpi->ppi->p_rc.baseline_gf_interval.
+ */
+void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
+
+/*!\brief Set the global/frame level parameters for cyclic refresh.
+ *
+ * First call to the cyclic refresh, before encoding the frame.
+ * Sets the flag on whether cyclic refresh should be applied, sets
+ * the amount/percent of refresh, and the amount of boost applied to
+ * the two segments (set by rate_ratio_qdelta and rate_boost_fac).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Updates the \c cpi->cyclic_refresh with the settings.
+ */
+void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
+
+/*!\brief Setup the cyclic background refresh.
+ *
+ * Set the delta q for the segment(s), and set the segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Updates the \c cpi->cyclic_refresh with the cyclic refresh
+ * parameters and the \c cm->seg with the segmentation data.
+ */
+void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_disable_lf_cdef(struct AV1_COMP *const cpi);
+
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+ return segment_id == CR_SEGMENT_ID_BOOST1 ||
+ segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+ if (segment_id == CR_SEGMENT_ID_BOOST1)
+ return CR_SEGMENT_ID_BOOST1;
+ else if (segment_id == CR_SEGMENT_ID_BOOST2)
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BASE;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
new file mode 100644
index 0000000000..086928a118
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/dwt.h"
+
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+ 0.9, .8, .7, .6 };
+
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
+ 0.75, 1.0, 1.0, 1.0 };
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy) \
+ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+DECLARE_ALIGNED(16, static const uint16_t,
+ av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
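+// Illustrative mapping: energy levels -4..1 index segment_id[] as
+// { 0, 1, 1, 2, 3, 4 }, e.g. SEGMENT_ID(-4) == 0 and SEGMENT_ID(1) == 4.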
+
+void av1_vaq_frame_setup(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const int base_qindex = cm->quant_params.base_qindex;
+ struct segmentation *seg = &cm->seg;
+ int i;
+
+ int resolution_change =
+ cm->prev_frame && (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height);
+ int avg_energy = (int)(cpi->twopass_frame.mb_av_energy - 2);
+ double avg_ratio;
+ if (avg_energy > 7) avg_energy = 7;
+ if (avg_energy < 0) avg_energy = 0;
+ avg_ratio = rate_ratio[avg_energy];
+
+ if (resolution_change) {
+ memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ av1_clearall_segfeatures(seg);
+ av1_disable_segmentation(seg);
+ return;
+ }
+ if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ refresh_frame->alt_ref_frame ||
+ (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ cpi->vaq_refresh = 1;
+
+ av1_enable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ // Set up avg segment id to be 1.0 and adjust the other segments around
+ // it.
+ int qindex_delta =
+ av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type,
+ base_qindex, rate_ratio[i] / avg_ratio);
+
+ // We don't allow qindex 0 in a segment if the base value is not 0.
+ // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -base_qindex + 1;
+ }
+
+ av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+ av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ }
+ }
+}
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ // This function returns a score for the block's local variance, calculated
+ // as the sum of the log of the (4x4) variances of each subblock of the
+ // current block (x, bs), scaled by 32 / the number of pixels in the block.
+ // This is used for segmentation to avoid situations in which a large
+ // block with a gentle gradient gets marked as high variance even though
+ // each subblock has a low variance. This allows us to assign the same
+ // segment number to the same sorts of area regardless of how the
+ // partitioning goes.
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ double var = 0;
+ unsigned int sse;
+ int i, j;
+
+ int right_overflow =
+ (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+ int bottom_overflow =
+ (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+ const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+ const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+ for (i = 0; i < bh; i += 4) {
+ for (j = 0; j < bw; j += 4) {
+ if (is_cur_buf_hbd(xd)) {
+ var += log1p(cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
+ 16.0);
+ } else {
+ var += log1p(cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
+ 16.0);
+ }
+ }
+ }
+ // Use the average of the 4x4 log variances. The range for 8-bit input is
+ // 0 - 9.704121561.
+ var /= (bw / 4 * bh / 4);
+ if (var > 7) var = 7;
+
+ return (int)(var);
+}
+
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+ int mi_row, int mi_col) {
+ // This function returns the average value of the luma block.
+ unsigned int sum, avg, num_pix;
+ int r, c;
+ const int pic_w = cpi->common.width;
+ const int pic_h = cpi->common.height;
+ const int bw = MI_SIZE * mi_size_wide[bs];
+ const int bh = MI_SIZE * mi_size_high[bs];
+ const uint16_t *x16 = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+ sum = 0;
+ num_pix = 0;
+ avg = 0;
+ int row = mi_row << MI_SIZE_LOG2;
+ int col = mi_col << MI_SIZE_LOG2;
+ for (r = row; (r < (row + bh)) && (r < pic_h); r++) {
+ for (c = col; (c < (col + bw)) && (c < pic_w); c++) {
+ sum += *(x16 + r * x->plane[0].src.stride + c);
+ num_pix++;
+ }
+ }
+ if (num_pix != 0) {
+ avg = sum / num_pix;
+ }
+ return avg;
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+
+static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int stride = x->plane[0].src.stride;
+ uint8_t *buf = x->plane[0].src.buf;
+ const int num_8x8_cols = block_size_wide[bs] / 8;
+ const int num_8x8_rows = block_size_high[bs] / 8;
+ const int hbd = is_cur_buf_hbd(xd);
+
+ int64_t var = av1_haar_ac_sad_mxn_uint8_input(buf, stride, hbd, num_8x8_rows,
+ num_8x8_cols);
+
+ return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
+}
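+// The (var * 256) >> num_pels_log2_lookup[bs] step above normalizes the Haar
+// AC SAD to a per-pixel value in 1/256 units; e.g. for a 16x16 block
+// (256 pixels, log2 = 8) the result is simply var (illustrative).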
+
+static double log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+ unsigned int haar_sad = haar_ac_energy(x, bs);
+ return log1p(haar_sad);
+}
+
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
+ double energy, energy_midpoint;
+ energy_midpoint = (is_stat_consumption_stage_twopass(cpi))
+ ? cpi->twopass_frame.frame_avg_haar_energy
+ : DEFAULT_E_MIDPOINT;
+ energy = log_block_wavelet_energy(x, bs) - energy_midpoint;
+ return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
+}
+
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+ int block_var_level) {
+ int rate_level;
+ const AV1_COMMON *const cm = &cpi->common;
+
+ if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
+ ENERGY_IN_BOUNDS(block_var_level);
+ rate_level = SEGMENT_ID(block_var_level);
+ } else {
+ rate_level = block_var_level;
+ }
+ const int base_qindex = cm->quant_params.base_qindex;
+ int qindex_delta =
+ av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, base_qindex,
+ deltaq_rate_ratio[rate_level]);
+
+ if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -base_qindex + 1;
+ }
+ return base_qindex + qindex_delta;
+}
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
new file mode 100644
index 0000000000..aa0535ad72
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_
+#define AOM_AV1_ENCODER_AQ_VARIANCE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_vaq_frame_setup(AV1_COMP *cpi);
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+ int mi_row, int mi_col);
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+ int block_var_level);
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c b/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c
new file mode 100644
index 0000000000..91fc1e00a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+#else
+#include <arm_acle.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config/aom_config.h"
+
+#define CRC_LOOP(op, crc, type, buf, len) \
+ while ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
+
+#define CRC_SINGLE(op, crc, type, buf, len) \
+ if ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
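+
+// Taken together, these helpers consume the buffer greedily: 8 bytes at a
+// time, then at most one 4-byte, one 2-byte and one 1-byte tail step. For an
+// (illustrative) 15-byte buffer that is one of each step: 8 + 4 + 2 + 1.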
+
+/* Return 32-bit CRC for the input buffer.
+ * Polynomial is 0x1EDC6F41.
+ */
+
+uint32_t av1_get_crc32c_value_arm_crc32(void *crc_calculator, uint8_t *p,
+ size_t len) {
+ (void)crc_calculator;
+ const uint8_t *buf = p;
+ uint32_t crc = 0xFFFFFFFF;
+
+#if !AOM_ARCH_AARCH64
+ // Align input to an 8-byte boundary (only necessary for 32-bit builds).
+ while (len && ((uintptr_t)buf & 7)) {
+ crc = __crc32cb(crc, *buf++);
+ len--;
+ }
+#endif
+
+ CRC_LOOP(__crc32cd, crc, uint64_t, buf, len)
+ CRC_SINGLE(__crc32cw, crc, uint32_t, buf, len)
+ CRC_SINGLE(__crc32ch, crc, uint16_t, buf, len)
+ CRC_SINGLE(__crc32cb, crc, uint8_t, buf, len)
+
+ return ~crc;
+}
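+
+// Usage sketch (illustrative; the crc_calculator argument is unused by this
+// implementation and may be NULL):
+//   uint32_t crc = av1_get_crc32c_value_arm_crc32(NULL, buf, buf_len);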
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c
new file mode 100644
index 0000000000..26d06b46fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+ int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+ const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+ // By operating on unsigned integers we can accumulate up to four squared
+ // differences in a 32-bit element before having to widen to 64 bits.
+ uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+ err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+ err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+ err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+ err_u64 = vpadalq_u32(err_u64, err);
+
+ // We can't do the same here as we're operating on signed integers, so we
+ // can only accumulate 2 squares.
+ int32x4_t ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0));
+ ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0));
+ ssz_s64 = vpadalq_s32(ssz_s64, ssz0);
+
+ int32x4_t ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1));
+ ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1));
+ ssz_s64 = vpadalq_s32(ssz_s64, ssz1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ *ssz = horizontal_add_s64x2(ssz_s64);
+ return (int64_t)horizontal_add_u64x2(err_u64);
+}
+
+int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+ int block_size) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = vld1q_s16(coeff);
+ const int16x8_t c1 = vld1q_s16(coeff + 8);
+ const int16x8_t d0 = vld1q_s16(dqcoeff);
+ const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
+
+ const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+ const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+ // By operating on unsigned integers we can accumulate up to four squared
+ // differences in a 32-bit element before having to widen to 64 bits.
+ uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+ err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+ err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+ err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+ err_u64 = vpadalq_u32(err_u64, err);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ return (int64_t)horizontal_add_u64x2(err_u64);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
new file mode 100644
index 0000000000..63aad0b785
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+
+int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+ int64x2_t sqcoeff[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const int16x8_t diff0 = vsubq_s16(c0, d0);
+ const int16x8_t diff1 = vsubq_s16(c1, d1);
+
+ error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+ error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+ sqcoeff[0] = aom_sdotq_s16(sqcoeff[0], c0, c0);
+ sqcoeff[1] = aom_sdotq_s16(sqcoeff[1], c1, c1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ *ssz = vaddvq_s64(vaddq_s64(sqcoeff[0], sqcoeff[1]));
+ return vaddvq_s64(vaddq_s64(error[0], error[1]));
+}
+
+int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff,
+ int block_size) {
+ if (block_size % 32 == 0) {
+ int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
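+    // Four independent accumulators are used so that consecutive dot products
+    // do not all serialize on a single register; they are reduced to one sum
+    // after the loop.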
+ do {
+ const int16x8_t c0 = vld1q_s16(coeff);
+ const int16x8_t c1 = vld1q_s16(coeff + 8);
+ const int16x8_t c2 = vld1q_s16(coeff + 16);
+ const int16x8_t c3 = vld1q_s16(coeff + 24);
+ const int16x8_t d0 = vld1q_s16(dqcoeff);
+ const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
+ const int16x8_t d2 = vld1q_s16(dqcoeff + 16);
+ const int16x8_t d3 = vld1q_s16(dqcoeff + 24);
+
+ const int16x8_t diff0 = vsubq_s16(c0, d0);
+ const int16x8_t diff1 = vsubq_s16(c1, d1);
+ const int16x8_t diff2 = vsubq_s16(c2, d2);
+ const int16x8_t diff3 = vsubq_s16(c3, d3);
+
+ error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+ error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+ error[2] = aom_sdotq_s16(error[2], diff2, diff2);
+ error[3] = aom_sdotq_s16(error[3], diff3, diff3);
+
+ coeff += 32;
+ dqcoeff += 32;
+ block_size -= 32;
+ } while (block_size != 0);
+
+ error[0] = vaddq_s64(error[0], error[1]);
+ error[2] = vaddq_s64(error[2], error[3]);
+ error[0] = vaddq_s64(error[0], error[2]);
+ return vaddvq_s64(error[0]);
+ }
+ assert(block_size == 16);
+
+ int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ const int16x8_t c0 = vld1q_s16(coeff);
+ const int16x8_t c1 = vld1q_s16(coeff + 8);
+ const int16x8_t d0 = vld1q_s16(dqcoeff);
+ const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
+
+ const int16x8_t diff0 = vsubq_s16(c0, d0);
+ const int16x8_t diff1 = vsubq_s16(c1, d1);
+
+ error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+ error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ return vaddvq_s64(vaddq_s64(error[0], error[1]));
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
new file mode 100644
index 0000000000..5148ee74a9
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -0,0 +1,3090 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
+
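+// All butterfly weights in this file are Q2.13 fixed-point values, i.e. they
+// carry at most 13 fractional bits.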
+#define TXFM_COS_BIT_MAX 13
+
+// A note on butterfly helper naming:
+//
+// butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon
+// e.g. butterfly_s32_s32_x4_0231_neon
+// | | | ^ Weights are applied as indices 0, 2, 3, 1
+// | | | (see more detail below)
+// | | ^ (int32)x4 input/output parameters
+// | ^ 32-bit accumulators internally
+// ^ 32-bit input/output parameters
+//
+// Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to
+// avoid needing separate negation instructions. This is represented in the
+// helper naming by referring to the lane index in the loaded tuple that each
+// multiply is performed with:
+//
+// in0 in1
+// /----------
+// out0 | w0 w1 ==> out0 = in0 * w0 + in1 * w1
+// out1 | w2 w3 ==> out1 = in0 * w2 + in1 * w3
+//
+// So for indices 0231 from the earlier example, we end up with:
+//
+// in0 in1
+// /------------------
+// out0 | (lane 0) (lane 2) ==> out0 = in0 * w0 + in1 * -w0
+// out1 | (lane 3) (lane 1) ==> out1 = in0 * (w0-1) + in1 * (1-w0)
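+//
+// As a purely illustrative example of the Q2.13 encoding (where 1.0 is
+// represented as 1 << 13 = 8192): a weight w0 = cos(pi/4) ~= 0.7071 would be
+// stored as round(0.7071 * 8192) = 5793, and 1 - w0 as 8192 - 5793 = 2399.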
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+ o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+ o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+ o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+ o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
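+// Implemented as a macro rather than a function taking lane indices, since
+// the lane arguments to the NEON lane-indexed multiplies must be
+// compile-time constants.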
+#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
+ out0, out1) \
+ do { \
+ int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0); \
+ u0 = vmlal_lane_s16(u0, in1, wvec, lane1); \
+ int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2); \
+ v0 = vmlal_lane_s16(v0, in1, wvec, lane3); \
+ *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \
+ *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \
+ } while (0)
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
+
+#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
+ out0, out1) \
+ do { \
+ int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0); \
+ u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1); \
+ int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0); \
+ u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1); \
+ int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2); \
+ v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3); \
+ int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2); \
+ v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3); \
+ const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \
+ const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX); \
+ const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \
+ const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX); \
+ *out0 = vcombine_s16(c0, c1); \
+ *out1 = vcombine_s16(d0, d1); \
+ } while (0)
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out,
+ int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out,
+ int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8(
+ int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2,
+ const int stride, const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + stride * i, in1[i]);
+ vst1q_s32(out + stride * i + 4, in2[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in,
+ const int stride,
+ int16x4_t *const out,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = vld1_s16(in);
+ in += stride;
+ }
+}
+
+static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
+ int16x8_t *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = vld1q_s16(in + i * stride);
+ }
+}
+
+static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride, vmovl_s16(in[i]));
+ }
+}
+
+static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i])));
+ vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i])));
+ }
+}
+
+// A note on naming:
+// round_shift_[sqrt2]_s16_s32_4x1_neon(...)
+// | | | ^ 1 => a single vector
+// | | | n => an array of vectors
+// | | | ^ input/output vector element count
+// | | ^ output type
+// | ^ input type
+// ^ multiplicand and shift identifier
+
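+// NewSqrt2 is sqrt(2) in fixed point, round(sqrt(2) * 2^NewSqrt2Bits) = 5793
+// with NewSqrt2Bits == 12, so the helpers below compute round(a * sqrt(2))
+// (or round(a * 2 * sqrt(2))) via a rounding right shift.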
+static AOM_FORCE_INLINE int16x4_t
+round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+ return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int16x8_t
+round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+ return vcombine_s16(round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
+ round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
+}
+
+static AOM_FORCE_INLINE int16x4_t
+round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+ return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int16x8_t
+round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+ return vcombine_s16(round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
+ round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
+}
+
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) {
+ return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) {
+ return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits);
+}
+
+#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn) \
+ static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) { \
+ for (int i = 0; i < size; ++i) { \
+ out[i] = fn(in[i]); \
+ } \
+ }
+
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t,
+ int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t,
+ int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t,
+ int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t,
+ int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
+ int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)
+
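+// The "rect" stores below additionally scale each value by sqrt(2) on the
+// way out, the usual AV1 normalization for rectangular (2:1 aspect ratio)
+// transform sizes.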
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i]));
+ }
+}
+
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride + 0,
+ round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i])));
+ vst1q_s32(out + i * stride + 4,
+ round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i])));
+ }
+}
+
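+// The 4-point ADST is computed directly from the four Q2.13 sine constants
+// returned by sinpi_arr_q13 (scaled sin(k * pi / 9) values) rather than via
+// the butterfly helpers above.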
+static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ int32x4_t u[6], v[6];
+ const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+ const int16x4_t u01 = vqadd_s16(input[0], input[1]);
+
+ v[5] = vmull_lane_s16(input[2], sinpi, 2);
+ v[0] = vmull_lane_s16(input[1], sinpi, 1);
+ v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0);
+ v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3);
+ v[2] = vmull_lane_s16(u01, sinpi, 2);
+ v[3] = vmull_lane_s16(input[0], sinpi, 3);
+ v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0);
+ v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1);
+
+ u[0] = vaddq_s32(v[0], v[1]);
+ u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2);
+ u[2] = vsubq_s32(v[3], v[4]);
+ u[3] = vsubq_s32(u[2], u[0]);
+ u[3] = vmlaq_n_s32(u[3], v[5], 3);
+
+ output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
+ output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
+ output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
+ output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 1-2
+ int16x4_t x2[8];
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
+
+ // stage 3
+ int16x4_t x3[8];
+ x3[0] = vqadd_s16(input[0], x2[2]);
+ x3[1] = vqsub_s16(x2[3], input[7]);
+ x3[2] = vqsub_s16(input[0], x2[2]);
+ x3[3] = vqadd_s16(input[7], x2[3]);
+ x3[4] = vqsub_s16(x2[6], input[1]);
+ x3[5] = vqadd_s16(input[6], x2[7]);
+ x3[6] = vqadd_s16(input[1], x2[6]);
+ x3[7] = vqsub_s16(input[6], x2[7]);
+
+ // stage 4
+ int16x4_t x4[8];
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]);
+
+ // stage 5
+ int16x4_t x5[8];
+ x5[0] = vqadd_s16(x3[0], x4[4]);
+ x5[1] = vqadd_s16(x3[1], x4[5]);
+ x5[2] = vqadd_s16(x3[2], x4[6]);
+ x5[3] = vqsub_s16(x4[7], x3[3]);
+ x5[4] = vqsub_s16(x3[0], x4[4]);
+ x5[5] = vqsub_s16(x3[1], x4[5]);
+ x5[6] = vqsub_s16(x3[2], x4[6]);
+ x5[7] = vqadd_s16(x3[3], x4[7]);
+
+ // stage 6-7
+ butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
+ butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
+ butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
+}
+
+static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ int32x4_t u_lo[4], u_hi[4];
+ const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+ const int16x8_t u01 = vqaddq_s16(input[0], input[1]);
+
+ u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1);
+ u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1);
+
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0);
+
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3);
+
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2);
+
+ u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2);
+ u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2);
+
+ u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3);
+ u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3);
+
+ u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0);
+ u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0);
+
+ u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1);
+ u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1);
+
+ u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2);
+ u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2);
+
+ u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2);
+ u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2);
+
+ u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
+ u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);
+
+ const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3);
+ u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2);
+ u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2);
+
+ output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX));
+ output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX));
+ output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX));
+ output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX));
+}
+
+static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+ const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]);
+
+ int16x4_t in12a = vadd_s16(input[1], input[2]);
+ int16x4_t in12s = vsub_s16(input[1], input[2]);
+ int16x4_t in03a = vadd_s16(input[0], input[3]);
+ int16x4_t in03s = vsub_s16(input[0], input[3]);
+
+ int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]);
+ int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]);
+
+ int32x4_t u[4];
+ u[0] = vaddq_s32(u0ad1, u0ad2);
+ u[1] = vsubq_s32(u0ad2, u0ad1);
+ u[2] = vmull_lane_s16(in12s, cospi16, 1);
+ u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0);
+ u[3] = vmull_lane_s16(in03s, cospi16, 1);
+ u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0);
+
+ output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
+ output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
+ output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
+ output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
+}
+
+// Butterfly pre-processing:
+// e.g. n=4:
+// out[0] = in[0] + in[3]
+// out[1] = in[1] + in[2]
+// out[2] = in[1] - in[2]
+// out[3] = in[0] - in[3]
+
+static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input,
+ int16x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vqadd_s16(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
+}
+
+static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input,
+ int16x8_t *output,
+ int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vqaddq_s16(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
+}
+
+static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input,
+ int32x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vqaddq_s32(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
+}
+
+// Butterfly post-processing:
+// e.g. n=8:
+// out[0] = in0[0] + in1[3];
+// out[1] = in0[1] + in1[2];
+// out[2] = in0[1] - in1[2];
+// out[3] = in0[0] - in1[3];
+// out[4] = in0[7] - in1[4];
+// out[5] = in0[6] - in1[5];
+// out[6] = in0[6] + in1[5];
+// out[7] = in0[7] + in1[4];
+
+static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0,
+ const int16x4_t *in1,
+ int16x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
+
+static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0,
+ const int16x8_t *in1,
+ int16x8_t *output,
+ int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
+
+static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0,
+ const int32x4_t *in1,
+ int32x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
+
+static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+
+ // stage 1
+ int16x8_t x1[4];
+ butterfly_dct_pre_s16_x8(input, x1, 4);
+
+ // stage 2
+ int16x8_t x2[4];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]);
+
+ // stage 3
+ output[0] = x2[0];
+ output[1] = x2[2];
+ output[2] = x2[1];
+ output[3] = x2[3];
+}
+
+static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+
+ // stage 1
+ int16x4_t x1[8];
+ butterfly_dct_pre_s16_x4(input, x1, 8);
+
+ // stage 2
+ int16x4_t x2[8];
+ butterfly_dct_pre_s16_x4(x1, x2, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);
+
+ // stage 3
+ int16x4_t x3[8];
+ butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
+ butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4);
+
+ // stage 4-5
+ butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+
+ // stage 1
+ int16x8_t x1[8];
+ butterfly_dct_pre_s16_x8(input, x1, 8);
+
+ // stage 2
+ int16x8_t x2[8];
+ butterfly_dct_pre_s16_x8(x1, x2, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);
+
+ // stage 3
+ int16x8_t x3[8];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
+ butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4);
+
+ // stage 4-5
+ butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 1
+ int16x4_t x1[16];
+ butterfly_dct_pre_s16_x4(input, x1, 16);
+
+ // stage 2
+ int16x4_t x2[16];
+ butterfly_dct_pre_s16_x4(x1, x2, 8);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);
+
+ // stage 3
+ int16x4_t x3[16];
+ butterfly_dct_pre_s16_x4(x2, x3, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
+ butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8);
+
+ // stage 4
+ int16x4_t x4[16];
+ butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4],
+ &output[12]);
+ butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
+ butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);
+
+ // stage 5
+ int16x4_t x5[16];
+ butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10],
+ &output[6]);
+ butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4);
+ butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4);
+
+ // stage 6-7
+ butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1],
+ &output[15]);
+ butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9],
+ &output[7]);
+ butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5],
+ &output[11]);
+ butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13],
+ &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 1
+ int16x8_t x1[16];
+ butterfly_dct_pre_s16_x8(input, x1, 16);
+
+ // stage 2
+ int16x8_t x2[16];
+ butterfly_dct_pre_s16_x8(x1, x2, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);
+
+ // stage 3
+ int16x8_t x3[16];
+ butterfly_dct_pre_s16_x8(x2, x3, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
+ butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8);
+
+ // stage 4
+ int16x8_t x4[16];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4],
+ &output[12]);
+ butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);
+
+ // stage 5
+ int16x8_t x5[16];
+ butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10],
+ &output[6]);
+ butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4);
+ butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4);
+
+ // stage 6-7
+ butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1],
+ &output[15]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5],
+ &output[11]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13],
+ &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ // stage 1
+ int16x8_t x1[32];
+ butterfly_dct_pre_s16_x8(input, x1, 32);
+
+ // stage 2
+ int16x8_t x2[32];
+ butterfly_dct_pre_s16_x8(x1, x2, 16);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]);
+
+ // stage 3
+ int16x8_t x3[32];
+ butterfly_dct_pre_s16_x8(x2, x3, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]);
+ butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16);
+
+ // stage 4
+ int16x8_t x4[32];
+ butterfly_dct_pre_s16_x8(x3, x4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]);
+ butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]);
+
+ // stage 5
+ int16x8_t x5[32];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0],
+ &output[16]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8],
+ &output[24]);
+ butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]);
+ butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8);
+ butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8);
+
+ // stage 6
+ int16x8_t x6[32];
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20],
+ &output[12]);
+ butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4);
+ butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]);
+
+ // stage 7
+ int16x8_t x7[32];
+ butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2],
+ &output[30]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18],
+ &output[14]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10],
+ &output[22]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26],
+ &output[6]);
+ butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4);
+ butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4);
+ butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4);
+ butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4);
+
+ butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1],
+ &output[31]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17],
+ &output[15]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9],
+ &output[23]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5],
+ &output[27]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21],
+ &output[11]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13],
+ &output[19]);
+ butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29],
+ &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+ const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+ const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+ const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+ const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+ const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+ const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+ const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+ const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+ const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+ const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+ const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+ const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+ const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+ const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+ const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+ const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+ const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+ const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+ const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+ const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+ const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+ const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+ const int16x4_t cospi31 = vget_high_s16(cospi29_31);
+
+ // stage 1
+ int16x8_t x1[64];
+ butterfly_dct_pre_s16_x8(input, x1, 64);
+
+ // stage 2
+ int16x8_t x2[64];
+ butterfly_dct_pre_s16_x8(x1, x2, 32);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
+
+ // stage 3
+ int16x8_t x3[64];
+ butterfly_dct_pre_s16_x8(x2, x3, 16);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32);
+
+ // stage 4
+ int16x8_t x4[64];
+ butterfly_dct_pre_s16_x8(x3, x4, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
+ butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
+
+ // stage 5
+ int16x8_t x5[64];
+ butterfly_dct_pre_s16_x8(x4, x5, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
+ butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
+ butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16);
+
+ // stage 6
+ int16x8_t x6[64];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
+ butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
+ butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
+
+ // stage 7
+ int16x8_t x7[64];
+ butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
+ butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
+ butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8);
+
+ // stage 8
+ int16x8_t x8[64];
+ butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
+ butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
+ butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
+ butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
+ butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
+ butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
+
+ // stage 9
+ int16x8_t x9[64];
+ butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
+ butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
+ butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4);
+
+ // stage 10
+ butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1],
+ &output[63]);
+ butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33],
+ &output[31]);
+ butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17],
+ &output[47]);
+ butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49],
+ &output[15]);
+ butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9],
+ &output[55]);
+ butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41],
+ &output[23]);
+ butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25],
+ &output[39]);
+ butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5],
+ &output[59]);
+ butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37],
+ &output[27]);
+ butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21],
+ &output[43]);
+ butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53],
+ &output[11]);
+ butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13],
+ &output[51]);
+ butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45],
+ &output[19]);
+ butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29],
+ &output[35]);
+ butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61],
+ &output[3]);
+
+ // stage 11
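+  // The even-indexed outputs are reordered copies of earlier stage values;
+  // the index pattern matches a 6-bit bit-reversal (e.g. output[4] = x8[8]:
+  // 4 = 0b000100 and 8 = 0b001000).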
+ output[0] = x6[0];
+ output[2] = x9[16];
+ output[4] = x8[8];
+ output[6] = x9[24];
+ output[8] = x7[4];
+ output[10] = x9[20];
+ output[12] = x8[12];
+ output[14] = x9[28];
+ output[16] = x6[2];
+ output[18] = x9[18];
+ output[20] = x8[10];
+ output[22] = x9[26];
+ output[24] = x7[6];
+ output[26] = x9[22];
+ output[28] = x8[14];
+ output[30] = x9[30];
+ output[32] = x6[1];
+ output[34] = x9[17];
+ output[36] = x8[9];
+ output[38] = x9[25];
+ output[40] = x7[5];
+ output[42] = x9[21];
+ output[44] = x8[13];
+ output[46] = x9[29];
+  output[48] = x6[3];
+  output[50] = x9[19];
+  output[52] = x8[11];
+ output[54] = x9[27];
+ output[56] = x7[7];
+ output[58] = x9[23];
+ output[60] = x8[15];
+ output[62] = x9[31];
+}
+
+static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 2
+ int16x8_t x2[8];
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
+
+ // stage 3
+ int16x8_t x3[8];
+ x3[0] = vqaddq_s16(input[0], x2[2]);
+ x3[1] = vqsubq_s16(x2[3], input[7]);
+ x3[2] = vqsubq_s16(input[0], x2[2]);
+ x3[3] = vqaddq_s16(input[7], x2[3]);
+ x3[4] = vqsubq_s16(x2[6], input[1]);
+ x3[5] = vqaddq_s16(input[6], x2[7]);
+ x3[6] = vqaddq_s16(input[1], x2[6]);
+ x3[7] = vqsubq_s16(input[6], x2[7]);
+
+ // stage 4
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+
+ // stage 5
+ int16x8_t x5[8];
+ x5[0] = vqaddq_s16(x3[0], x3[4]);
+ x5[1] = vqaddq_s16(x3[1], x3[5]);
+ x5[2] = vqaddq_s16(x3[2], x3[6]);
+ x5[3] = vqsubq_s16(x3[7], x3[3]);
+ x5[4] = vqsubq_s16(x3[0], x3[4]);
+ x5[5] = vqsubq_s16(x3[1], x3[5]);
+ x5[6] = vqsubq_s16(x3[2], x3[6]);
+ x5[7] = vqaddq_s16(x3[3], x3[7]);
+
+ // stage 6
+ butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
+}
+
+static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ // stage 2
+ int16x4_t x2[8];
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
+
+ // stage 3
+ int16x4_t x3[16];
+ x3[0] = vqadd_s16(input[0], x2[0]);
+ x3[1] = vqsub_s16(x2[1], input[15]);
+ x3[2] = vqsub_s16(input[0], x2[0]);
+ x3[3] = vqadd_s16(input[15], x2[1]);
+ x3[4] = vqsub_s16(x2[2], input[3]);
+ x3[5] = vqadd_s16(input[12], x2[3]);
+ x3[6] = vqadd_s16(input[3], x2[2]);
+ x3[7] = vqsub_s16(input[12], x2[3]);
+ x3[8] = vqsub_s16(x2[4], input[1]);
+ x3[9] = vqadd_s16(input[14], x2[5]);
+ x3[10] = vqadd_s16(input[1], x2[4]);
+ x3[11] = vqsub_s16(input[14], x2[5]);
+ x3[12] = vqadd_s16(input[2], x2[6]);
+ x3[13] = vqsub_s16(x2[7], input[13]);
+ x3[14] = vqsub_s16(input[2], x2[6]);
+ x3[15] = vqadd_s16(input[13], x2[7]);
+
+ // stage 4
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
+ butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
+
+ // stage 5
+ int16x4_t x5[16];
+ x5[0] = vqadd_s16(x3[0], x3[4]);
+ x5[1] = vqadd_s16(x3[1], x3[5]);
+ x5[2] = vqadd_s16(x3[2], x3[6]);
+ x5[3] = vqsub_s16(x3[7], x3[3]);
+ x5[4] = vqsub_s16(x3[0], x3[4]);
+ x5[5] = vqsub_s16(x3[1], x3[5]);
+ x5[6] = vqsub_s16(x3[2], x3[6]);
+ x5[7] = vqadd_s16(x3[3], x3[7]);
+ x5[8] = vqadd_s16(x3[8], x3[12]);
+ x5[9] = vqadd_s16(x3[9], x3[13]);
+ x5[10] = vqsub_s16(x3[14], x3[10]);
+ x5[11] = vqadd_s16(x3[11], x3[15]);
+ x5[12] = vqsub_s16(x3[8], x3[12]);
+ x5[13] = vqsub_s16(x3[9], x3[13]);
+ x5[14] = vqadd_s16(x3[10], x3[14]);
+ x5[15] = vqsub_s16(x3[11], x3[15]);
+
+ // stage 6
+ butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
+ butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
+
+ // stage 7
+ int16x4_t x7[16];
+ x7[0] = vqadd_s16(x5[0], x5[8]);
+ x7[1] = vqadd_s16(x5[1], x5[9]);
+ x7[2] = vqadd_s16(x5[2], x5[10]);
+ x7[3] = vqadd_s16(x5[3], x5[11]);
+ x7[4] = vqadd_s16(x5[4], x5[12]);
+ x7[5] = vqadd_s16(x5[5], x5[13]);
+ x7[6] = vqadd_s16(x5[6], x5[14]);
+ x7[7] = vqsub_s16(x5[15], x5[7]);
+ x7[8] = vqsub_s16(x5[0], x5[8]);
+ x7[9] = vqsub_s16(x5[1], x5[9]);
+ x7[10] = vqsub_s16(x5[2], x5[10]);
+ x7[11] = vqsub_s16(x5[3], x5[11]);
+ x7[12] = vqsub_s16(x5[4], x5[12]);
+ x7[13] = vqsub_s16(x5[5], x5[13]);
+ x7[14] = vqsub_s16(x5[6], x5[14]);
+ x7[15] = vqadd_s16(x5[7], x5[15]);
+
+ // stage 8
+ butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
+ butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13],
+ &output[2]);
+ butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11],
+ &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
+ butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
+ butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5],
+ &output[10]);
+ butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3],
+ &output[12]);
+ butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14],
+ &output[1]);
+}
+
+static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ // stage 2
+ int16x8_t x2[8];
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
+
+ // stage 3
+ int16x8_t x3[16];
+ x3[0] = vqaddq_s16(input[0], x2[0]);
+ x3[1] = vqsubq_s16(x2[1], input[15]);
+ x3[2] = vqsubq_s16(input[0], x2[0]);
+ x3[3] = vqaddq_s16(input[15], x2[1]);
+ x3[4] = vqsubq_s16(x2[2], input[3]);
+ x3[5] = vqaddq_s16(input[12], x2[3]);
+ x3[6] = vqaddq_s16(input[3], x2[2]);
+ x3[7] = vqsubq_s16(input[12], x2[3]);
+ x3[8] = vqsubq_s16(x2[4], input[1]);
+ x3[9] = vqaddq_s16(input[14], x2[5]);
+ x3[10] = vqaddq_s16(input[1], x2[4]);
+ x3[11] = vqsubq_s16(input[14], x2[5]);
+ x3[12] = vqaddq_s16(input[2], x2[6]);
+ x3[13] = vqsubq_s16(x2[7], input[13]);
+ x3[14] = vqsubq_s16(input[2], x2[6]);
+ x3[15] = vqaddq_s16(input[13], x2[7]);
+
+ // stage 4
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
+ butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
+
+ // stage 5
+ int16x8_t x5[16];
+ x5[0] = vqaddq_s16(x3[0], x3[4]);
+ x5[1] = vqaddq_s16(x3[1], x3[5]);
+ x5[2] = vqaddq_s16(x3[2], x3[6]);
+ x5[3] = vqsubq_s16(x3[7], x3[3]);
+ x5[4] = vqsubq_s16(x3[0], x3[4]);
+ x5[5] = vqsubq_s16(x3[1], x3[5]);
+ x5[6] = vqsubq_s16(x3[2], x3[6]);
+ x5[7] = vqaddq_s16(x3[3], x3[7]);
+ x5[8] = vqaddq_s16(x3[8], x3[12]);
+ x5[9] = vqaddq_s16(x3[9], x3[13]);
+ x5[10] = vqsubq_s16(x3[14], x3[10]);
+ x5[11] = vqaddq_s16(x3[11], x3[15]);
+ x5[12] = vqsubq_s16(x3[8], x3[12]);
+ x5[13] = vqsubq_s16(x3[9], x3[13]);
+ x5[14] = vqaddq_s16(x3[10], x3[14]);
+ x5[15] = vqsubq_s16(x3[11], x3[15]);
+
+ // stage 6
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
+ butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
+
+ // stage 7
+ int16x8_t x7[16];
+ x7[0] = vqaddq_s16(x5[0], x5[8]);
+ x7[1] = vqaddq_s16(x5[1], x5[9]);
+ x7[2] = vqaddq_s16(x5[2], x5[10]);
+ x7[3] = vqaddq_s16(x5[3], x5[11]);
+ x7[4] = vqaddq_s16(x5[4], x5[12]);
+ x7[5] = vqaddq_s16(x5[5], x5[13]);
+ x7[6] = vqaddq_s16(x5[6], x5[14]);
+ x7[7] = vqsubq_s16(x5[15], x5[7]);
+ x7[8] = vqsubq_s16(x5[0], x5[8]);
+ x7[9] = vqsubq_s16(x5[1], x5[9]);
+ x7[10] = vqsubq_s16(x5[2], x5[10]);
+ x7[11] = vqsubq_s16(x5[3], x5[11]);
+ x7[12] = vqsubq_s16(x5[4], x5[12]);
+ x7[13] = vqsubq_s16(x5[5], x5[13]);
+ x7[14] = vqsubq_s16(x5[6], x5[14]);
+ x7[15] = vqaddq_s16(x5[7], x5[15]);
+
+ // stage 8
+ butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13],
+ &output[2]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11],
+ &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5],
+ &output[10]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3],
+ &output[12]);
+ butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14],
+ &output[1]);
+}
+
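+// Identity (IDTX) 1D kernels. AV1 scales the identity transform by sqrt(2),
+// 2, 2*sqrt(2) or 4 depending on the transform length, so each wrapper just
+// applies the matching rounded shift; cos_bit is unused.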
+static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input,
+ int16x4_t *const output,
+ const int cos_bit) {
+ (void)cos_bit;
+ round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4);
+}
+
+static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input,
+ int16x8_t *const output,
+ const int cos_bit) {
+ (void)cos_bit;
+ round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4);
+}
+
+static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ (void)cos_bit;
+ shift_left_1_s16_x4(input, output, 8);
+}
+
+static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ (void)cos_bit;
+ shift_left_1_s16_x8(input, output, 8);
+}
+
+static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input,
+ int16x4_t *output,
+ int cos_bit) {
+ (void)cos_bit;
+ round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16);
+}
+
+static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input,
+ int16x8_t *output,
+ int cos_bit) {
+ (void)cos_bit;
+ round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16);
+}
+
+static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input,
+ int16x8_t *output,
+ int cos_bit) {
+ (void)cos_bit;
+ shift_left_2_s16_x8(input, output, 32);
+}
+
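+// Generates the column-pass wrappers: load a block of residuals, apply the
+// initial <<2 scaling shift, then run the named 1D transform. `tw` is the
+// vector width in lanes (4 or 8) and `n` the transform length.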
+#define TRANSFORM_COL(name, tw, n) \
+ static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \
+ int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ load_buffer_s16_x##tw(input, stride, buf0, n); \
+ shift_left_2_s16_x##tw(buf0, buf0, n); \
+ name##_neon(buf0, output, cos_bit); \
+ }
+
+TRANSFORM_COL(fadst4x4, 4, 4)
+TRANSFORM_COL(fadst4x8, 4, 8)
+TRANSFORM_COL(fadst4x16, 4, 16)
+TRANSFORM_COL(fadst8x4, 8, 4)
+TRANSFORM_COL(fadst8x8, 8, 8)
+TRANSFORM_COL(fadst8x16, 8, 16)
+TRANSFORM_COL(fdct4x4, 4, 4)
+TRANSFORM_COL(fdct4x8, 4, 8)
+TRANSFORM_COL(fdct4x16, 4, 16)
+TRANSFORM_COL(fdct8x4, 8, 4)
+TRANSFORM_COL(fdct8x8, 8, 8)
+TRANSFORM_COL(fdct8x16, 8, 16)
+TRANSFORM_COL(fdct8x32, 8, 32)
+TRANSFORM_COL(fidentity4x4, 4, 4)
+TRANSFORM_COL(fidentity4x8, 4, 8)
+TRANSFORM_COL(fidentity4x16, 4, 16)
+TRANSFORM_COL(fidentity8x4, 8, 4)
+TRANSFORM_COL(fidentity8x8, 8, 8)
+TRANSFORM_COL(fidentity8x16, 8, 16)
+TRANSFORM_COL(fidentity8x32, 8, 32)
+
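+// Generates the row-pass wrappers: run the named 1D transform on already
+// transposed columns and store the widened 32-bit coefficients. The _rect_
+// variants below additionally apply the 1/sqrt(2) scaling used for
+// rectangular blocks whose dimensions differ by a factor of two.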
+#define TRANSFORM_ROW(name, tw, n) \
+ static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \
+ int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ name##_neon(input, buf0, cos_bit); \
+ store_buffer_s16_x##tw(buf0, output, stride, n); \
+ }
+
+#define TRANSFORM_ROW_RECT(name, tw, n) \
+ static void name##_row_rect_neon(const int16x##tw##_t *input, \
+ int32_t *output, int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ name##_neon(input, buf0, cos_bit); \
+ store_rect_buffer_s16_x##tw(buf0, output, stride, n); \
+ }
+
+TRANSFORM_ROW(fadst4x4, 4, 4)
+TRANSFORM_ROW(fadst4x16, 4, 16)
+TRANSFORM_ROW(fadst8x4, 8, 4)
+TRANSFORM_ROW(fadst8x8, 8, 8)
+TRANSFORM_ROW(fadst8x16, 8, 16)
+TRANSFORM_ROW(fdct4x4, 4, 4)
+TRANSFORM_ROW(fdct4x16, 4, 16)
+TRANSFORM_ROW(fdct8x4, 8, 4)
+TRANSFORM_ROW(fdct8x8, 8, 8)
+TRANSFORM_ROW(fdct8x16, 8, 16)
+TRANSFORM_ROW(fdct8x32, 8, 32)
+TRANSFORM_ROW(fidentity4x4, 4, 4)
+TRANSFORM_ROW(fidentity4x16, 4, 16)
+TRANSFORM_ROW(fidentity8x4, 8, 4)
+TRANSFORM_ROW(fidentity8x8, 8, 8)
+TRANSFORM_ROW(fidentity8x16, 8, 16)
+TRANSFORM_ROW(fidentity8x32, 8, 32)
+
+TRANSFORM_ROW_RECT(fadst4x8, 4, 8)
+TRANSFORM_ROW_RECT(fadst8x4, 8, 4)
+TRANSFORM_ROW_RECT(fadst8x8, 8, 8)
+TRANSFORM_ROW_RECT(fadst8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct4x8, 4, 8)
+TRANSFORM_ROW_RECT(fdct8x4, 8, 4)
+TRANSFORM_ROW_RECT(fdct8x8, 8, 8)
+TRANSFORM_ROW_RECT(fdct8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct8x32, 8, 32)
+TRANSFORM_ROW_RECT(fidentity4x8, 4, 8)
+TRANSFORM_ROW_RECT(fidentity8x4, 8, 4)
+TRANSFORM_ROW_RECT(fidentity8x8, 8, 8)
+TRANSFORM_ROW_RECT(fidentity8x16, 8, 16)
+TRANSFORM_ROW_RECT(fidentity8x32, 8, 32)
+
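+// Function pointer types for the per-TX_TYPE dispatch tables below, split by
+// vector width (4 or 8 lanes) and by pass: plain 1D kernel, column pass
+// (load + transform) and row pass (transform + store).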
+typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input,
+ int16x4_t *output, int cos_bit);
+typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input,
+ int16x8_t *output, int cos_bit);
+
+typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input,
+ int16x4_t *output, int stride,
+ int cos_bit);
+typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input,
+ int16x8_t *output, int stride,
+ int cos_bit);
+
+typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input,
+ int32_t *output, int stride,
+ int cos_bit);
+typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input,
+ int32_t *output, int stride,
+ int cos_bit);
+
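+// Per-TX_TYPE kernel tables. FLIPADST entries reuse the ADST kernels; the
+// vertical and horizontal flips are performed by the 2D wrappers via
+// ud_adjust_input_and_stride() and flip_buf_*_neon(). NULL entries fall back
+// to the C implementation.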
+static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_col_neon, // DCT_DCT
+ fadst4x8_col_neon, // ADST_DCT
+ fdct4x8_col_neon, // DCT_ADST
+ fadst4x8_col_neon, // ADST_ADST
+ fadst4x8_col_neon, // FLIPADST_DCT
+ fdct4x8_col_neon, // DCT_FLIPADST
+ fadst4x8_col_neon, // FLIPADST_FLIPADST
+ fadst4x8_col_neon, // ADST_FLIPADST
+ fadst4x8_col_neon, // FLIPADST_ADST
+ fidentity4x8_col_neon, // IDTX
+ fdct4x8_col_neon, // V_DCT
+ fidentity4x8_col_neon, // H_DCT
+ fadst4x8_col_neon, // V_ADST
+ fidentity4x8_col_neon, // H_ADST
+ fadst4x8_col_neon, // V_FLIPADST
+ fidentity4x8_col_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_row_neon, // DCT_DCT
+ fdct8x4_row_neon, // ADST_DCT
+ fadst8x4_row_neon, // DCT_ADST
+ fadst8x4_row_neon, // ADST_ADST
+ fdct8x4_row_neon, // FLIPADST_DCT
+ fadst8x4_row_neon, // DCT_FLIPADST
+ fadst8x4_row_neon, // FLIPADST_FLIPADST
+ fadst8x4_row_neon, // ADST_FLIPADST
+ fadst8x4_row_neon, // FLIPADST_ADST
+ fidentity8x4_row_neon, // IDTX
+ fidentity8x4_row_neon, // V_DCT
+ fdct8x4_row_neon, // H_DCT
+ fidentity8x4_row_neon, // V_ADST
+ fadst8x4_row_neon, // H_ADST
+ fidentity8x4_row_neon, // V_FLIPADST
+ fadst8x4_row_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_row_rect_neon, // DCT_DCT
+ fdct8x4_row_rect_neon, // ADST_DCT
+ fadst8x4_row_rect_neon, // DCT_ADST
+ fadst8x4_row_rect_neon, // ADST_ADST
+ fdct8x4_row_rect_neon, // FLIPADST_DCT
+ fadst8x4_row_rect_neon, // DCT_FLIPADST
+ fadst8x4_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x4_row_rect_neon, // ADST_FLIPADST
+ fadst8x4_row_rect_neon, // FLIPADST_ADST
+ fidentity8x4_row_rect_neon, // IDTX
+ fidentity8x4_row_rect_neon, // V_DCT
+ fdct8x4_row_rect_neon, // H_DCT
+ fidentity8x4_row_rect_neon, // V_ADST
+ fadst8x4_row_rect_neon, // H_ADST
+ fidentity8x4_row_rect_neon, // V_FLIPADST
+ fadst8x4_row_rect_neon // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_col_neon, // DCT_DCT
+ fadst8x4_col_neon, // ADST_DCT
+ fdct8x4_col_neon, // DCT_ADST
+ fadst8x4_col_neon, // ADST_ADST
+ fadst8x4_col_neon, // FLIPADST_DCT
+ fdct8x4_col_neon, // DCT_FLIPADST
+ fadst8x4_col_neon, // FLIPADST_FLIPADST
+ fadst8x4_col_neon, // ADST_FLIPADST
+ fadst8x4_col_neon, // FLIPADST_ADST
+ fidentity8x4_col_neon, // IDTX
+ fdct8x4_col_neon, // V_DCT
+ fidentity8x4_col_neon, // H_DCT
+ fadst8x4_col_neon, // V_ADST
+ fidentity8x4_col_neon, // H_ADST
+ fadst8x4_col_neon, // V_FLIPADST
+ fidentity8x4_col_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_row_rect_neon, // DCT_DCT
+ fdct4x8_row_rect_neon, // ADST_DCT
+ fadst4x8_row_rect_neon, // DCT_ADST
+ fadst4x8_row_rect_neon, // ADST_ADST
+ fdct4x8_row_rect_neon, // FLIPADST_DCT
+ fadst4x8_row_rect_neon, // DCT_FLIPADST
+ fadst4x8_row_rect_neon, // FLIPADST_FLIPADST
+ fadst4x8_row_rect_neon, // ADST_FLIPADST
+ fadst4x8_row_rect_neon, // FLIPADST_ADST
+ fidentity4x8_row_rect_neon, // IDTX
+ fidentity4x8_row_rect_neon, // V_DCT
+ fdct4x8_row_rect_neon, // H_DCT
+ fidentity4x8_row_rect_neon, // V_ADST
+ fadst4x8_row_rect_neon, // H_ADST
+ fidentity4x8_row_rect_neon, // V_FLIPADST
+ fadst4x8_row_rect_neon // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_col_neon, // DCT_DCT
+ fadst8x8_col_neon, // ADST_DCT
+ fdct8x8_col_neon, // DCT_ADST
+ fadst8x8_col_neon, // ADST_ADST
+ fadst8x8_col_neon, // FLIPADST_DCT
+ fdct8x8_col_neon, // DCT_FLIPADST
+ fadst8x8_col_neon, // FLIPADST_FLIPADST
+ fadst8x8_col_neon, // ADST_FLIPADST
+ fadst8x8_col_neon, // FLIPADST_ADST
+ fidentity8x8_col_neon, // IDTX
+ fdct8x8_col_neon, // V_DCT
+ fidentity8x8_col_neon, // H_DCT
+ fadst8x8_col_neon, // V_ADST
+ fidentity8x8_col_neon, // H_ADST
+ fadst8x8_col_neon, // V_FLIPADST
+  fidentity8x8_col_neon   // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_row_neon, // DCT_DCT
+ fdct8x8_row_neon, // ADST_DCT
+ fadst8x8_row_neon, // DCT_ADST
+ fadst8x8_row_neon, // ADST_ADST
+ fdct8x8_row_neon, // FLIPADST_DCT
+ fadst8x8_row_neon, // DCT_FLIPADST
+ fadst8x8_row_neon, // FLIPADST_FLIPADST
+ fadst8x8_row_neon, // ADST_FLIPADST
+ fadst8x8_row_neon, // FLIPADST_ADST
+ fidentity8x8_row_neon, // IDTX
+ fidentity8x8_row_neon, // V_DCT
+ fdct8x8_row_neon, // H_DCT
+ fidentity8x8_row_neon, // V_ADST
+ fadst8x8_row_neon, // H_ADST
+ fidentity8x8_row_neon, // V_FLIPADST
+ fadst8x8_row_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_row_rect_neon, // DCT_DCT
+ fdct8x8_row_rect_neon, // ADST_DCT
+ fadst8x8_row_rect_neon, // DCT_ADST
+ fadst8x8_row_rect_neon, // ADST_ADST
+ fdct8x8_row_rect_neon, // FLIPADST_DCT
+ fadst8x8_row_rect_neon, // DCT_FLIPADST
+ fadst8x8_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x8_row_rect_neon, // ADST_FLIPADST
+ fadst8x8_row_rect_neon, // FLIPADST_ADST
+ fidentity8x8_row_rect_neon, // IDTX
+ fidentity8x8_row_rect_neon, // V_DCT
+ fdct8x8_row_rect_neon, // H_DCT
+ fidentity8x8_row_rect_neon, // V_ADST
+ fadst8x8_row_rect_neon, // H_ADST
+ fidentity8x8_row_rect_neon, // V_FLIPADST
+ fadst8x8_row_rect_neon // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
+ fdct4x16_col_neon, // DCT_DCT
+ fadst4x16_col_neon, // ADST_DCT
+ fdct4x16_col_neon, // DCT_ADST
+ fadst4x16_col_neon, // ADST_ADST
+ fadst4x16_col_neon, // FLIPADST_DCT
+ fdct4x16_col_neon, // DCT_FLIPADST
+ fadst4x16_col_neon, // FLIPADST_FLIPADST
+ fadst4x16_col_neon, // ADST_FLIPADST
+ fadst4x16_col_neon, // FLIPADST_ADST
+ fidentity4x16_col_neon, // IDTX
+ fdct4x16_col_neon, // V_DCT
+ fidentity4x16_col_neon, // H_DCT
+ fadst4x16_col_neon, // V_ADST
+ fidentity4x16_col_neon, // H_ADST
+ fadst4x16_col_neon, // V_FLIPADST
+ fidentity4x16_col_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
+ fdct4x16_row_neon, // DCT_DCT
+ fdct4x16_row_neon, // ADST_DCT
+ fadst4x16_row_neon, // DCT_ADST
+ fadst4x16_row_neon, // ADST_ADST
+ fdct4x16_row_neon, // FLIPADST_DCT
+ fadst4x16_row_neon, // DCT_FLIPADST
+ fadst4x16_row_neon, // FLIPADST_FLIPADST
+ fadst4x16_row_neon, // ADST_FLIPADST
+ fadst4x16_row_neon, // FLIPADST_ADST
+ fidentity4x16_row_neon, // IDTX
+ fidentity4x16_row_neon, // V_DCT
+ fdct4x16_row_neon, // H_DCT
+ fidentity4x16_row_neon, // V_ADST
+ fadst4x16_row_neon, // H_ADST
+ fidentity4x16_row_neon, // V_FLIPADST
+ fadst4x16_row_neon // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_col_neon, // DCT_DCT
+ fadst8x16_col_neon, // ADST_DCT
+ fdct8x16_col_neon, // DCT_ADST
+ fadst8x16_col_neon, // ADST_ADST
+ fadst8x16_col_neon, // FLIPADST_DCT
+ fdct8x16_col_neon, // DCT_FLIPADST
+ fadst8x16_col_neon, // FLIPADST_FLIPADST
+ fadst8x16_col_neon, // ADST_FLIPADST
+ fadst8x16_col_neon, // FLIPADST_ADST
+ fidentity8x16_col_neon, // IDTX
+ fdct8x16_col_neon, // V_DCT
+ fidentity8x16_col_neon, // H_DCT
+ fadst8x16_col_neon, // V_ADST
+ fidentity8x16_col_neon, // H_ADST
+ fadst8x16_col_neon, // V_FLIPADST
+ fidentity8x16_col_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_row_neon, // DCT_DCT
+ fdct8x16_row_neon, // ADST_DCT
+ fadst8x16_row_neon, // DCT_ADST
+ fadst8x16_row_neon, // ADST_ADST
+ fdct8x16_row_neon, // FLIPADST_DCT
+ fadst8x16_row_neon, // DCT_FLIPADST
+ fadst8x16_row_neon, // FLIPADST_FLIPADST
+ fadst8x16_row_neon, // ADST_FLIPADST
+ fadst8x16_row_neon, // FLIPADST_ADST
+ fidentity8x16_row_neon, // IDTX
+ fidentity8x16_row_neon, // V_DCT
+ fdct8x16_row_neon, // H_DCT
+ fidentity8x16_row_neon, // V_ADST
+ fadst8x16_row_neon, // H_ADST
+ fidentity8x16_row_neon, // V_FLIPADST
+ fadst8x16_row_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_row_rect_neon, // DCT_DCT
+ fdct8x16_row_rect_neon, // ADST_DCT
+ fadst8x16_row_rect_neon, // DCT_ADST
+ fadst8x16_row_rect_neon, // ADST_ADST
+ fdct8x16_row_rect_neon, // FLIPADST_DCT
+ fadst8x16_row_rect_neon, // DCT_FLIPADST
+ fadst8x16_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x16_row_rect_neon, // ADST_FLIPADST
+ fadst8x16_row_rect_neon, // FLIPADST_ADST
+ fidentity8x16_row_rect_neon, // IDTX
+ fidentity8x16_row_rect_neon, // V_DCT
+ fdct8x16_row_rect_neon, // H_DCT
+ fidentity8x16_row_rect_neon, // V_ADST
+ fadst8x16_row_rect_neon, // H_ADST
+ fidentity8x16_row_rect_neon, // V_FLIPADST
+ fadst8x16_row_rect_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_row_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_row_neon, // IDTX
+ fidentity8x32_row_neon, // V_DCT
+ fdct8x32_row_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_row_rect_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_row_rect_neon, // IDTX
+ fidentity8x32_row_rect_neon, // V_DCT
+ fdct8x32_row_rect_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_col_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_col_neon, // IDTX
+ fdct8x32_col_neon, // V_DCT
+ fidentity8x32_col_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
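+// 2D forward transforms: a column pass over the input block, a transpose,
+// then a row pass that writes out 32-bit coefficients. The 4x4 case is fully
+// unrolled per TX_TYPE rather than table-driven.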
+static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+ int16x4_t buf0[4], buf1[4];
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case ADST_DCT:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case DCT_ADST:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case ADST_ADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case FLIPADST_DCT:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case DCT_FLIPADST:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ case FLIPADST_FLIPADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ case ADST_FLIPADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ case FLIPADST_ADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case IDTX:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case V_DCT:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case H_DCT:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case V_ADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case H_ADST:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case V_FLIPADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case H_FLIPADST:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ }
+}
+
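+// The 2:1 rectangular sizes route their row pass through the _row_rect_
+// kernels so that the rectangular scaling is folded into the final store.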
+static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x4_t buf0[8];
+ int16x8_t buf1[8];
+ const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_1_round_s16_x4(buf0, buf0, 8);
+ transpose_arrays_s16_4x8(buf0, buf1);
+
+ if (lr_flip) {
+ int16x8_t buf2[8];
+ flip_buf_8_neon(buf1, buf2, 4);
+ row_txfm(buf2, output, 8, 13);
+ } else {
+ row_txfm(buf1, output, 8, 13);
+ }
+}
+
+static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x4_t buf0[16];
+ int16x8_t buf1[16];
+ const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_1_round_s16_x4(buf0, buf0, 16);
+ transpose_arrays_s16_4x8(buf0, buf1);
+ transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ if (lr_flip) {
+ int16x8_t buf2[16];
+ flip_buf_8_neon(buf1 + 8 * i, buf2, 4);
+ row_txfm(buf2, output + 8 * i, 16, 12);
+ } else {
+ int16x8_t *buf = buf1 + 8 * i;
+ row_txfm(buf, output + 8 * i, 16, 12);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[8];
+ int16x4_t buf1[8];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+ const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 4);
+ transpose_arrays_s16_8x4(buf0, buf1);
+
+ if (lr_flip) {
+ int16x4_t buf2[8];
+ flip_buf_4_neon(buf1, buf2, 8);
+ row_txfm(buf2, output, 4, 13);
+ } else {
+ row_txfm(buf1, output, 4, 13);
+ }
+}
+
+static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+ int16x8_t buf0[8], buf1[8];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case ADST_DCT:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case DCT_ADST:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case ADST_ADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case FLIPADST_DCT:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case DCT_FLIPADST:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ case FLIPADST_FLIPADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ case ADST_FLIPADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ case FLIPADST_ADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case IDTX:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case V_DCT:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case H_DCT:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case V_ADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case H_ADST:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case V_FLIPADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case H_FLIPADST:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ }
+}
+
+static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[16], buf1[16];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 16);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
+ row_txfm(buf0, output + 8 * i, 16, 13);
+ } else {
+ int16x8_t *buf = buf1 + 8 * i;
+ row_txfm(buf, output + 8 * i, 16, 13);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[32], buf1[32];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+ col_txfm(input, buf0, stride, 12);
+ shift_right_2_round_s16_x8(buf0, buf0, 32);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
+ transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16);
+ transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24);
+
+ for (int i = 0; i < 4; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
+ row_txfm(buf0, output + 8 * i, 32, 12);
+ } else {
+ int16x8_t *buf = buf1 + 8 * i;
+ row_txfm(buf, output + 8 * i, 32, 12);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[16];
+ int16x4_t buf1[16];
+ int16x4_t buf2[16];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+ const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+ for (int i = 0; i < 2; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 4);
+ transpose_arrays_s16_8x4(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ flip_buf_4_neon(buf1, buf2, 16);
+ row_txfm(buf2, output, 4, 13);
+ } else {
+ row_txfm(buf1, output, 4, 13);
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[16], buf1[16];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+ for (int i = 0; i < 2; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ flip_buf_8_neon(buf1, buf0, 16);
+ row_txfm(buf0, output, 8, 13);
+ } else {
+ row_txfm(buf1, output, 8, 13);
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[16], buf1[32];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ for (int i = 0; i < 2; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 16);
+ transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
+ row_txfm(buf0, output + 8 * i, 16, 12);
+ } else {
+ int16x8_t *buf = buf1 + 16 * i;
+ row_txfm(buf, output + 8 * i, 16, 12);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[32], buf1[64];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
+
+ if (col_txfm == NULL || row_txfm == NULL) {
+ av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+ for (int i = 0; i < 2; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 12);
+ shift_right_4_round_s16_x8(buf0, buf0, 32);
+ transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 16 + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
+ row_txfm(buf0, output + 8 * i, 32, 13);
+ } else {
+ int16x8_t *buf = buf1 + 16 * i;
+ row_txfm(buf, output + 8 * i, 32, 13);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[32], buf1[32];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+ for (int i = 0; i < 4; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
+ }
+
+ if (lr_flip) {
+ flip_buf_8_neon(buf1, buf0, 32);
+ row_txfm(buf0, output, 8, 12);
+ } else {
+ row_txfm(buf1, output, 8, 12);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[32], buf1[64];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];
+
+ if (col_txfm == NULL || row_txfm == NULL) {
+ av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ for (int i = 0; i < 4; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_4_round_s16_x8(buf0, buf0, 16);
+ transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
+ row_txfm(buf0, output + 8 * i, 16, 13);
+ } else {
+ int16x8_t *buf = buf1 + 32 * i;
+ row_txfm(buf, output + 8 * i, 16, 13);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[32], buf1[128];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm == NULL || row_txfm == NULL) {
+ av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+ for (int i = 0; i < 4; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 12);
+ shift_right_4_round_s16_x8(buf0, buf0, 32);
+ transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
+ row_txfm(buf0, output + 8 * i, 32, 12);
+ } else {
+ int16x8_t *buf = buf1 + 32 * i;
+ row_txfm(buf, output + 8 * i, 32, 12);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ int16x8_t buf0[64], buf1[128];
+ const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon;
+ const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon;
+
+ for (int i = 0; i < 8; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 16);
+ shift_left_2_s16_x8(buf0, buf0, 16);
+ col_txfm(buf0, buf0, 13);
+ shift_right_4_round_s16_x8(buf0, buf0, 16);
+ for (int j = 0; j < 2; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < 2; i++) {
+ int16x8_t *buf = buf1 + 64 * i;
+ row_txfm(buf, buf, 12);
+ store_buffer_s16_x8(buf, output + 8 * i, 16, 32);
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ int16x8_t buf0[64], buf1[128];
+ const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+ const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon;
+
+ for (int i = 0; i < 2; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+ col_txfm(buf0, buf0, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 64);
+ for (int j = 0; j < 8; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < 4; i++) {
+ int16x8_t *buf = buf1 + 16 * i;
+ row_txfm(buf, buf, 12);
+ store_buffer_s16_x8(buf, output + 8 * i, 32, 16);
+ }
+}
+
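+// 32-point DCT operating on 32-bit lanes, for second passes where 16-bit
+// intermediates would risk overflow: the 64-pixel wrappers below widen each
+// int16x8_t row into a pair of int32x4_t halves before the final pass.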
+static void fdct32_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ int32x4_t buf0[32];
+ int32x4_t buf1[32];
+
+ // stage 1
+ butterfly_dct_pre_s32_x4(input, buf1, 32);
+
+ // stage 2
+ butterfly_dct_pre_s32_x4(buf1, buf0, 16);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27],
+ &buf0[20]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25],
+ &buf0[22]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24],
+ &buf0[23]);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ butterfly_dct_pre_s32_x4(buf0, buf1, 8);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13],
+ &buf1[10]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12],
+ &buf1[11]);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16);
+
+ // stage 4
+ butterfly_dct_pre_s32_x4(buf1, buf0, 4);
+ buf0[4] = buf1[4];
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]);
+ buf0[7] = buf1[7];
+ butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29],
+ &buf0[18]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28],
+ &buf0[19]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27],
+ &buf0[20]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]);
+ butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4);
+ buf1[8] = buf0[8];
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14],
+ &buf1[9]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13],
+ &buf1[10]);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8);
+ butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8);
+
+ // stage 6
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]);
+ butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4);
+ butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4);
+ buf0[16] = buf1[16];
+ butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30],
+ &buf0[17]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29],
+ &buf0[18]);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25],
+ &buf0[22]);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8],
+ &buf1[15]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9],
+ &buf1[14]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10],
+ &buf1[13]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11],
+ &buf1[12]);
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4);
+ butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4);
+ butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4);
+ butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 28, 4);
+
+ // stage 8
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16],
+ &buf0[31]);
+ butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17],
+ &buf0[30]);
+ butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18],
+ &buf0[29]);
+ butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19],
+ &buf0[28]);
+ butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20],
+ &buf0[27]);
+ butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21],
+ &buf0[26]);
+ butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22],
+ &buf0[25]);
+ butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23],
+ &buf0[24]);
+
+ // stage 9
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
+}
+
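+// 64-point DCT on 32-bit lanes. Only the 32 low-frequency outputs are
+// produced, since AV1 discards the upper half of 64-point forward
+// transforms.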
+static void fdct64_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+ const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+ const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+ const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+ const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+ const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+ const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+ const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+ const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+ const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+ const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+ const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+ const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+ const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+ const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+ const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+ const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+ const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+ const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+ const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+ const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+ const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+ const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+ const int16x4_t cospi31 = vget_high_s16(cospi29_31);
+
+ // stage 1
+ int32x4_t x1[64];
+ butterfly_dct_pre_s32_x4(input, x1, 64);
+
+ // stage 2
+ int32x4_t x2[64];
+ butterfly_dct_pre_s32_x4(x1, x2, 32);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
+
+ // stage 3
+ int32x4_t x3[64];
+ butterfly_dct_pre_s32_x4(x2, x3, 16);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
+ butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32);
+
+ // stage 4
+ int32x4_t x4[64];
+ butterfly_dct_pre_s32_x4(x3, x4, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
+ butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
+
+ // stage 5
+ int32x4_t x5[64];
+ butterfly_dct_pre_s32_x4(x4, x5, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
+ butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
+ butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16);
+
+ // stage 6
+ int32x4_t x6[64];
+ butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
+ butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
+ butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
+
+ // stage 7
+ int32x4_t x7[64];
+ butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
+ butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
+ butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8);
+
+ // stage 8
+ int32x4_t x8[64];
+ butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
+ butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
+ butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
+ butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
+ butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
+ butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
+
+ // stage 9
+ int32x4_t x9[64];
+ butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
+ butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
+ butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
+ butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
+ butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
+ butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
+ butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
+ butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
+ butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4);
+
+ // stage 10
+ int32x4_t x10[64];
+ butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]);
+ butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]);
+ butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]);
+ butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]);
+ butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]);
+ butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]);
+ butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]);
+ butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]);
+ butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]);
+ butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]);
+ butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]);
+ butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]);
+ butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]);
+ butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]);
+ butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]);
+ butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]);
+
+ // stage 11, only store into the low 32 output indices.
+ output[0] = x6[0];
+ output[1] = x10[32];
+ output[2] = x9[16];
+ output[3] = x10[48];
+ output[4] = x8[8];
+ output[5] = x10[40];
+ output[6] = x9[24];
+ output[7] = x10[56];
+ output[8] = x7[4];
+ output[9] = x10[36];
+ output[10] = x9[20];
+ output[11] = x10[52];
+ output[12] = x8[12];
+ output[13] = x10[44];
+ output[14] = x9[28];
+ output[15] = x10[60];
+ output[16] = x6[2];
+ output[17] = x10[34];
+ output[18] = x9[18];
+ output[19] = x10[50];
+ output[20] = x8[10];
+ output[21] = x10[42];
+ output[22] = x9[26];
+ output[23] = x10[58];
+ output[24] = x7[6];
+ output[25] = x10[38];
+ output[26] = x9[22];
+ output[27] = x10[54];
+ output[28] = x8[14];
+ output[29] = x10[46];
+ output[30] = x9[30];
+ output[31] = x10[62];
+}
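+
+// A rough scalar model of the rotation the butterfly helpers above perform
+// (illustrative sketch only; `bit` stands for the stage cos_bit and
+// round_shift() for the usual add-half-then-shift rounding):
+//
+//   out0 = round_shift(in0 * c0 + in1 * c1, bit);
+//   out1 = round_shift(in0 * c1 - in1 * c0, bit);
+//
+// with c0/c1 drawn from the cospi table; the _0112/_1003/_1223/_0332
+// suffixes appear to encode the operand ordering and sign pattern each
+// variant applies to the two products.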
+
+static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ int16x8_t buf0[64], buf1[512];
+ const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+
+ for (int i = 0; i < 8; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+ col_txfm(buf0, buf0, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 64);
+ for (int j = 0; j < 4; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
+ }
+ }
+ for (int i = 0; i < 4; i++) {
+ int32x4_t bufA[64];
+ int32x4_t bufB[64];
+ int16x8_t *buf = buf1 + 64 * i;
+ for (int j = 0; j < 64; ++j) {
+ bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+ bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+ }
+ fdct64_neon(bufA, bufA, 10);
+ fdct64_neon(bufB, bufB, 10);
+ shift_right_2_round_s32_x4(bufA, bufA, 32);
+ shift_right_2_round_s32_x4(bufB, bufB, 32);
+ store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
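+
+// The 64x64 path above is the standard separable decomposition, roughly
+// (illustrative pseudo-C, ignoring the 8-lane tiling):
+//
+//   cols = fdct64(each input column);    // 16-bit, cos_bit 13
+//   cols = round_shift(cols, 2);
+//   rows = fdct64(each transposed row);  // widened to 32-bit, cos_bit 10
+//   out  = round_shift(rows, 2);
+//
+// Only the low-frequency 32x32 corner is stored, since AV1 zeroes out the
+// upper coefficients of 64-point transforms.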
+
+static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[64], buf1[256];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+
+ for (int i = 0; i < 8; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 12);
+ shift_right_4_round_s16_x8(buf0, buf0, 32);
+ for (int j = 0; j < 4; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
+ }
+ }
+ assert(tx_type == DCT_DCT);
+ for (int i = 0; i < 4; i++) {
+ int32x4_t bufA[64];
+ int32x4_t bufB[64];
+ int16x8_t *buf = buf1 + 64 * i;
+ for (int j = 0; j < 64; ++j) {
+ bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+ bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+ }
+ fdct64_neon(bufA, bufA, 11);
+ fdct64_neon(bufB, bufB, 11);
+ shift_right_2_round_s32_x4(bufA, bufA, 32);
+ shift_right_2_round_s32_x4(bufB, bufB, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
+ store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ int16x8_t buf0[64], buf1[256];
+ const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+
+ for (int i = 0; i < 4; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+ col_txfm(buf0, buf0, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 64);
+ for (int j = 0; j < 4; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < 4; i++) {
+ int32x4_t bufA[32];
+ int32x4_t bufB[32];
+ int16x8_t *buf = buf1 + 32 * i;
+ for (int j = 0; j < 32; ++j) {
+ bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+ bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+ }
+ fdct32_neon(bufA, bufA, 11);
+ fdct32_neon(bufB, bufB, 11);
+ shift_right_2_round_s32_x4(bufA, bufA, 32);
+ shift_right_2_round_s32_x4(bufB, bufB, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
+ store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
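+
+// Note: the 64x32 and 32x64 paths end with round_shift_sqrt2_s32_s32_4xn_neon
+// because AV1 applies a sqrt(2)-based normalization to 2:1 rectangular
+// transforms; the extra rescale keeps the output on the same scale as the
+// C reference implementation.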
+
+static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
+ lowbd_fwd_txfm2d_4x4_neon, // 4x4 transform
+ lowbd_fwd_txfm2d_8x8_neon, // 8x8 transform
+ lowbd_fwd_txfm2d_16x16_neon, // 16x16 transform
+ lowbd_fwd_txfm2d_32x32_neon, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_neon, // 64x64 transform
+ lowbd_fwd_txfm2d_4x8_neon, // 4x8 transform
+ lowbd_fwd_txfm2d_8x4_neon, // 8x4 transform
+ lowbd_fwd_txfm2d_8x16_neon, // 8x16 transform
+ lowbd_fwd_txfm2d_16x8_neon, // 16x8 transform
+ lowbd_fwd_txfm2d_16x32_neon, // 16x32 transform
+ lowbd_fwd_txfm2d_32x16_neon, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_neon, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_neon, // 64x32 transform
+ lowbd_fwd_txfm2d_4x16_neon, // 4x16 transform
+ lowbd_fwd_txfm2d_16x4_neon, // 16x4 transform
+ lowbd_fwd_txfm2d_8x32_neon, // 8x32 transform
+ lowbd_fwd_txfm2d_32x8_neon, // 32x8 transform
+ lowbd_fwd_txfm2d_16x64_neon, // 16x64 transform
+ lowbd_fwd_txfm2d_64x16_neon, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size];
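+  // Lossless coding uses the 4x4 Walsh-Hadamard transform rather than the
+  // DCT/ADST family handled by the table above, so defer to the C code.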
+ if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ } else {
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+ }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
new file mode 100644
index 0000000000..11d3def16b
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/encoder/av1_quantize.h"
+
+static INLINE uint16x4_t quantize_4(const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ int32x4_t v_quant_s32,
+ int32x4_t v_dequant_s32,
+ int32x4_t v_round_s32, int log_scale) {
+ const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+ const int32x4_t v_coeff_sign =
+ vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+ const int32x4_t v_log_scale = vdupq_n_s32(log_scale);
+ const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+ // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01])
+ const int32x4_t v_abs_coeff_scaled =
+ vshlq_s32(v_abs_coeff, vdupq_n_s32(1 + log_scale));
+ const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
+  // const int64_t tmp = v_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32),
+ vreinterpretq_s32_u32(v_mask));
+ // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+ const int32x4_t v_abs_qcoeff =
+ vqdmulhq_s32(vshlq_s32(v_tmp, v_log_scale), v_quant_s32);
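+  // vqdmulhq_s32 computes (a * b * 2) >> 32; with quant pre-shifted left by
+  // 15 and tmp shifted left by log_scale, this evaluates exactly
+  // (tmp * quant) >> (16 - log_scale).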
+ // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_qcoeff =
+ vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ // vshlq_s32 will shift right if shift value is negative.
+ const int32x4_t v_abs_dqcoeff =
+ vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale));
+ // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_dqcoeff =
+ vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+ // Used to find eob.
+ const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0));
+ return vmovn_u32(nz_qcoeff_mask);
+}
+
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+ int16x8_t v_eobmax,
+ uint16x8_t v_mask) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
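+  // Using iscan + 1 means a nonzero coefficient at scan position i yields
+  // eob = i + 1, while an all-zero block yields eob = 0 (v_eobmax starts at
+  // -1 and only zeros are merged in).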
+ const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+ return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if AOM_ARCH_AARCH64
+ return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
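+  // Armv7 lacks a cross-lane max, so fold the vector in half repeatedly:
+  // 8 -> 4 lanes with vmax_s16, then 64-bit shifts bring the upper lanes
+  // down for two further pairwise max steps until lane 0 holds the result.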
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+void av1_highbd_quantize_fp_neon(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ (void)scan;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_zero = vdup_n_s16(0);
+ const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
+ const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
+ const int16x4_t v_round_log_scale =
+ vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+ const int16x4_t v_round =
+ vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
+ int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+ int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+
+ // DC and first 3 AC
+ v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+ v_dequant_s32, v_round_s32, log_scale);
+
+  // Overwrite the DC constants with the AC constants.
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+ // 4 more AC
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, log_scale);
+
+ // Find the max lane eob for the first 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+ count -= 8;
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+ v_dequant_s32, v_round_s32, log_scale);
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, log_scale);
+ // Find the max lane eob for 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+ count -= 8;
+ } while (count);
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c
new file mode 100644
index 0000000000..d13cc65ae0
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static int32x4_t k_means_multiply_add_neon(const int16x8_t a) {
+ const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a));
+ const int32x4_t h = vmull_s16(vget_high_s16(a), vget_high_s16(a));
+#if AOM_ARCH_AARCH64
+ return vpaddq_s32(l, h);
+#else
+ const int32x2_t dl = vpadd_s32(vget_low_s32(l), vget_high_s32(l));
+ const int32x2_t dh = vpadd_s32(vget_low_s32(h), vget_high_s32(h));
+ return vcombine_s32(dl, dh);
+#endif
+}
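+
+// For dim-2 palette data the input lanes hold interleaved (x, y) pairs, so
+// squaring each lane and pairwise-adding adjacent results collapses the
+// eight int16 lanes into four per-point squared distances dx*dx + dy*dy.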
+
+void av1_calc_indices_dim1_neon(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ int64x2_t sum = vdupq_n_s64(0);
+ int16x8_t cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ cents[j] = vdupq_n_s16(centroids[j]);
+ }
+
+ for (int i = 0; i < n; i += 8) {
+ const int16x8_t in = vld1q_s16(data);
+ uint16x8_t ind = vdupq_n_u16(0);
+ // Compute the distance to the first centroid.
+ int16x8_t dist_min = vabdq_s16(in, cents[0]);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ const int16x8_t dist = vabdq_s16(in, cents[j]);
+ // Compare to the minimal one.
+ const uint16x8_t cmp = vcgtq_s16(dist_min, dist);
+ dist_min = vminq_s16(dist_min, dist);
+ const uint16x8_t ind1 = vdupq_n_u16(j);
+ ind = vbslq_u16(cmp, ind1, ind);
+ }
+ if (total_dist) {
+ // Square, convert to 32 bit and add together.
+ const int32x4_t l =
+ vmull_s16(vget_low_s16(dist_min), vget_low_s16(dist_min));
+ const int32x4_t sum32_tmp =
+ vmlal_s16(l, vget_high_s16(dist_min), vget_high_s16(dist_min));
+ // Pairwise sum, convert to 64 bit and add to sum.
+ sum = vpadalq_s32(sum, sum32_tmp);
+ }
+ vst1_u8(indices, vmovn_u16(ind));
+ indices += 8;
+ data += 8;
+ }
+ if (total_dist) {
+ *total_dist = horizontal_add_s64x2(sum);
+ }
+}
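+
+// Scalar reference for the assignment loop above (illustrative sketch,
+// omitting the optional total_dist accumulation):
+//
+//   for (int i = 0; i < n; ++i) {
+//     int best = 0;
+//     for (int j = 1; j < k; ++j)
+//       if (abs(data[i] - centroids[j]) < abs(data[i] - centroids[best]))
+//         best = j;
+//     indices[i] = best;
+//   }
+//
+// The strict vcgtq_s16 comparison means ties keep the lower centroid index,
+// matching the strict '<' above.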
+
+void av1_calc_indices_dim2_neon(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ int64x2_t sum = vdupq_n_s64(0);
+ uint32x4_t ind[2];
+ int16x8_t cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+ const int16_t cxcy[8] = { cx, cy, cx, cy, cx, cy, cx, cy };
+ cents[j] = vld1q_s16(cxcy);
+ }
+
+ for (int i = 0; i < n; i += 8) {
+ for (int l = 0; l < 2; ++l) {
+ const int16x8_t in = vld1q_s16(data);
+ ind[l] = vdupq_n_u32(0);
+ // Compute the distance to the first centroid.
+ int16x8_t d1 = vsubq_s16(in, cents[0]);
+ int32x4_t dist_min = k_means_multiply_add_neon(d1);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ d1 = vsubq_s16(in, cents[j]);
+ const int32x4_t dist = k_means_multiply_add_neon(d1);
+ // Compare to the minimal one.
+ const uint32x4_t cmp = vcgtq_s32(dist_min, dist);
+ dist_min = vminq_s32(dist_min, dist);
+ const uint32x4_t ind1 = vdupq_n_u32(j);
+ ind[l] = vbslq_u32(cmp, ind1, ind[l]);
+ }
+ if (total_dist) {
+ // Pairwise sum, convert to 64 bit and add to sum.
+ sum = vpadalq_s32(sum, dist_min);
+ }
+ data += 8;
+ }
+ // Cast to 8 bit and store.
+ vst1_u8(indices,
+ vmovn_u16(vcombine_u16(vmovn_u32(ind[0]), vmovn_u32(ind[1]))));
+ indices += 8;
+ }
+ if (total_dist) {
+ *total_dist = horizontal_add_s64x2(sum);
+ }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
new file mode 100644
index 0000000000..18cd0ce4c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_s8(v_sum_diff_total);
+#else
+ const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
+ const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+ const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
+ const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210),
+ vget_low_s64(fedcba98_76543210));
+ const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
+ return sum_diff;
+#endif
+}
+
+// Denoise a 16x1 vector.
+static INLINE int8x16_t denoiser_16x1_neon(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
+ const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
+ const uint8x16_t v_delta_level_1_and_2,
+ const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+  /* Figure out which level the absolute difference puts us in. */
+ const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
+ const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
+ const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);
+
+ /* Calculate absolute adjustments for level 1, 2 and 3. */
+ const uint8x16_t v_level2_adjustment =
+ vandq_u8(v_level2_mask, v_delta_level_1_and_2);
+ const uint8x16_t v_level3_adjustment =
+ vandq_u8(v_level3_mask, v_delta_level_2_and_3);
+ const uint8x16_t v_level1and2_adjustment =
+ vaddq_u8(v_level1_adjustment, v_level2_adjustment);
+ const uint8x16_t v_level1and2and3_adjustment =
+ vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);
+
+  /* Select the adjustment magnitude: the raw absolute difference when in
+   * level 0, otherwise the combined adjustment for levels 1, 2 and 3.
+   */
+ const uint8x16_t v_abs_adjustment =
+ vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);
+
+ /* Calculate positive and negative adjustments. Apply them to the signal
+ * and accumulate them. Adjustments are less than eight and the maximum
+ * sum of them (7 * 16) can fit in a signed char.
+ */
+ const uint8x16_t v_pos_adjustment =
+ vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment =
+ vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+ uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+  /* Sum all the accumulators to obtain the total of all pixel differences
+   * for this macroblock.
+   */
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+ vreinterpretq_s8_u8(v_neg_adjustment));
+ v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+ }
+ return v_sum_diff_total;
+}
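+
+// Per-pixel behaviour of the kernel above, in scalar form (illustrative
+// sketch; t = 4 + shift_inc is the level 1 threshold and a is the level 1
+// adjustment, 4 + shift_inc for low motion and 3 otherwise):
+//
+//   d   = mc_running_avg_y[i] - sig[i];
+//   adj = |d| <  t  ? |d|        // level 0: follow the difference exactly
+//       : |d| <  8  ? a          // level 1
+//       : |d| < 16  ? a + 1      // level 2
+//       :             a + 3;     // level 3
+//   running_avg_y[i] = saturate_u8(sig[i] + (d > 0 ? adj : -adj));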
+
+static INLINE int8x16_t denoiser_adjust_16x1_neon(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
+ uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+ // Clamp absolute difference to delta to get the adjustment.
+ const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta));
+
+ const uint8x16_t v_pos_adjustment =
+ vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment =
+ vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+ v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+ vreinterpretq_s8_u8(v_pos_adjustment));
+ v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+ }
+ return v_sum_diff_total;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int av1_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y,
+ int avg_y_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude,
+ int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+ const int b_height = block_size_high[bs] >> 1;
+
+ int8x16_t v_sum_diff_total = vdupq_n_s8(0);
+
+ for (r = 0; r < b_height; ++r) {
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+ width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ v_sum_diff_total = denoiser_16x1_neon(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+ v_level1_threshold, v_level2_threshold, v_level3_threshold,
+ v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
+ v_sum_diff_total);
+ {
+ const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+ const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
+ const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
+ vst1_u8(running_avg_y, v_running_buffer_low);
+ vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+ }
+ // Update pointers for next iteration.
+ sig += (sig_stride << 1);
+ mc_running_avg_y += (mc_avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << 1);
+ }
+
+ {
+ sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., applying no denoising),
+      // check whether we can still apply some (weaker) temporal filtering
+      // to this block, which would otherwise not be denoised at all. The
+      // simplest option is an additional adjustment to running_avg_y that
+      // brings it closer to sig. The adjustment is capped by a maximum
+      // delta and chosen such that in most cases the resulting sum_diff
+      // falls within the acceptable range given by sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vmovq_n_u8(delta);
+ running_avg_y -= avg_y_stride * (b_height << 1);
+ for (r = 0; r < b_height; ++r) {
+ v_sum_diff_total = denoiser_adjust_16x1_neon(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
+ v_sum_diff_total);
+ {
+ const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+ const uint8x8_t v_running_buffer_high =
+ vget_high_u8(v_running_buffer);
+ const uint8x8_t v_running_buffer_low =
+ vget_low_u8(v_running_buffer);
+ vst1_u8(running_avg_y, v_running_buffer_low);
+ vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+ }
+ // Update pointers for next iteration.
+ running_avg_y += (avg_y_stride << 1);
+ }
+ sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+
+ return FILTER_BLOCK;
+}
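+
+// Worked example of the fallback delta above (illustrative numbers): for an
+// 8x16 block, num_pels_log2_lookup[bs] is 7, so with abs(sum_diff) = 300 and
+// sum_diff_thresh = 200 the delta is ((300 - 200) >> 7) + 1 = 1, i.e. the
+// second pass nudges running_avg_y back toward sig by at most one per pixel.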
+
+// Denoise 16x16 to 128x128 blocks.
+static int av1_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y,
+ int avg_y_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude) {
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+ const int b_width = block_size_wide[bs];
+ const int b_height = block_size_high[bs];
+ const int b_width_shift4 = b_width >> 4;
+
+ int8x16_t v_sum_diff_total[8][8];
+ int r, c, sum_diff = 0;
+
+ for (r = 0; r < 8; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r] = vdupq_n_s8(0);
+ }
+ }
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
+ sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
+ v_level2_threshold, v_level3_threshold, v_level1_adjustment,
+ v_delta_level_1_and_2, v_delta_level_2_and_3,
+ v_sum_diff_total[c][r >> 4]);
+
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+ }
+ }
+
+ // Update pointers for next iteration.
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ {
+ const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vdupq_n_u8(delta);
+ sig -= sig_stride * b_height;
+ mc_running_avg_y -= mc_avg_y_stride * b_height;
+ running_avg_y -= avg_y_stride * b_height;
+ sum_diff = 0;
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r >> 4] =
+ denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
+ k_delta, v_sum_diff_total[c][r >> 4]);
+
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+ }
+ }
+
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+int av1_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+  // Check block sizes in rough order of frequency so the common cases
+  // terminate the chain of comparisons early.
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+ bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 ||
+ bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+ bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+ return av1_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+ avg_stride, increase_denoising, bs,
+ motion_magnitude);
+ } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+ return av1_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+ avg_stride, increase_denoising, bs,
+ motion_magnitude, 8);
+ }
+ return COPY_BLOCK;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/cnn_neon.c b/third_party/aom/av1/encoder/arm/neon/cnn_neon.c
new file mode 100644
index 0000000000..8e686260d0
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/cnn_neon.c
@@ -0,0 +1,1144 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/partition_cnn_weights.h"
+
+// The CNN weights used in av1_cnn_convolve_no_maxpool_padding_valid are
+// declared (av1_intra_mode_cnn_partition_cnn_layer_[01234]_kernel) in
+// partition_cnn_weights.h. However, to enable linear memory access, the
+// weight tables are rearranged here.
+static const float weights_layer_1[] = {
+ 0.228403f, 0.031690f, -0.251710f, -0.046230f, 0.413294f, -0.236732f,
+ -0.038291f, 0.210766f, 0.427196f, -0.384319f, -0.439463f, 0.366015f,
+ 0.112263f, -0.144168f, -0.075017f, 0.119629f, 0.325200f, -0.678246f,
+ -0.370826f, -0.341362f, -0.503392f, 0.400884f, 0.465214f, -0.360847f,
+ 0.187100f, -0.190757f, -0.131906f, 0.121492f, -0.303556f, -0.007658f,
+ 0.380077f, -0.066394f, -0.016043f, -1.490730f, -0.120682f, 0.132062f,
+ 0.086185f, -0.042766f, -0.087069f, 0.029426f, 0.309583f, -0.029985f,
+ -0.297429f, -0.018139f, -0.688828f, 0.756607f, 0.706410f, -0.696826f,
+ -0.087793f, -0.023304f, -0.012332f, -0.018043f, -0.410268f, 0.352143f,
+ 0.391284f, -0.363178f, -0.295034f, 0.160246f, -0.149446f, 0.260145f,
+ -0.252249f, 0.190826f, 0.251206f, -0.270796f, -0.979219f, 0.884880f,
+ 0.962057f, -0.847601f, -0.011053f, 0.118765f, -0.028428f, -0.020138f,
+ 0.400274f, -0.382845f, -0.462766f, 0.390654f, 0.361223f, -0.320068f,
+ -0.372084f, 0.313196f, 0.241933f, -0.416614f, -0.008722f, -0.255078f,
+ 0.078730f, -0.381935f, -0.204577f, 0.159768f, 0.071853f, -0.126294f,
+ -0.036186f, -0.007900f, 0.380071f, -0.298882f, 0.387941f, -0.267350f,
+ -0.586802f, 0.477785f, -0.000013f, 0.197296f, -0.079154f, -0.005811f,
+ -0.044300f, -0.021192f, -0.020879f, -0.005265f, 0.082277f, -0.139132f,
+ -0.239237f, 0.440234f, -0.542342f, 0.378360f, -0.070974f, 0.272702f,
+ -0.278939f, -0.044948f, -0.134197f, -0.007172f, -0.353628f, -0.128091f,
+ 0.357458f, -0.037614f, -0.144983f, 0.220623f, -0.003394f, -0.070166f,
+ 0.200370f, -0.166037f, 0.224448f, -0.012990f, -0.098853f, 0.008613f,
+ -0.017669f, 0.070641f, 0.174530f, -0.119822f, -0.065096f, 0.118487f,
+ -0.024764f, -0.050466f, 0.066631f, -0.075896f, -0.062363f, 0.212604f,
+ -0.377322f, 0.306306f, -0.399733f, 0.238624f, 0.233571f, -0.344080f,
+ 0.462491f, -0.565210f, -0.035074f, -0.010459f, 0.084382f, 0.052294f,
+ 0.065714f, 0.013716f, 0.135036f, 0.000588f, 0.181079f, -0.566344f,
+ 0.395561f, -0.398509f, 0.450017f, -1.462710f, 1.138280f, -0.447774f,
+ 0.247936f, -0.417067f, 0.165997f, -0.458632f, -0.018527f, 0.308461f,
+ 0.541266f, 0.162257f, 0.601786f, -1.275840f, -0.373404f, -0.589747f,
+ 0.026539f, -0.219327f, 0.142972f, -0.018496f, 0.075204f, -0.775190f,
+ 0.237307f, -0.348252f, 0.117792f, -0.094332f, 0.363101f, -0.065025f,
+ 0.816662f, 0.590110f, 0.752202f, -0.308599f, 0.258337f, -0.842085f,
+ 0.695788f, -0.205615f, 0.093930f, -0.392536f, 0.463093f, -0.432456f,
+ 0.041660f, -0.827264f, 0.309128f, -0.354658f, 0.451957f, -1.406640f,
+ 0.773192f, -0.892943f, 0.134856f, -0.467808f, 0.306003f, -0.226560f,
+ 0.086865f, -0.104102f, 0.148098f, -0.082658f, 0.316655f, -1.028310f,
+ 0.741566f, -0.345326f, 0.052379f, -0.275613f, 0.191765f, -0.162391f,
+ 0.000976f, 0.093061f, 0.068649f, 0.033582f, 0.239727f, -0.647769f,
+ 0.218493f, -0.397120f, 0.268229f, -0.303424f, 0.185393f, -0.314189f,
+ 0.101728f, -0.163083f, -0.084989f, 0.136783f, -0.264346f, 0.465914f,
+ 0.220395f, -0.252968f, -0.326661f, 0.271483f, 0.374717f, -0.311570f,
+ -0.082119f, 0.020870f, 0.091975f, -0.030582f, -0.487148f, 0.198912f,
+ 0.024554f, -0.749363f, -0.102267f, 0.097787f, 0.141459f, -0.110706f,
+ 0.079467f, -0.082570f, -0.347567f, 0.341043f, -0.137871f, 0.112319f,
+ 0.064733f, -0.082869f, 0.269999f, -0.408184f, -0.183443f, 0.180608f,
+ 0.223345f, -0.357376f, -0.244593f, 0.355348f, -0.072701f, -0.034311f,
+ 0.096544f, 0.016407f, 0.417550f, -0.367772f, -0.484535f, 0.405977f,
+ 0.314243f, -0.099622f, -0.192218f, -0.012780f, 0.434551f, -0.399047f,
+ -0.531499f, 0.484513f, -0.691352f, 0.872823f, 1.207720f, -1.377490f,
+ 0.006872f, -0.041453f, 0.007845f, 0.007463f, 0.467299f, -0.476372f,
+ -0.452606f, 0.452357f, 0.447332f, -0.365632f, -0.332435f, 0.300284f,
+ -0.290504f, 0.255410f, 0.310921f, -0.293717f, -0.616299f, 0.594207f,
+ 0.461347f, -0.449439f, 0.278455f, 0.285085f, -1.201340f, -0.016463f,
+ 0.549095f, 0.610375f, -4.608530f, -1.727390f, 0.150404f, -0.012846f,
+ -0.481148f, -0.182257f, 0.918796f, 0.213872f, 1.050410f, 0.681526f,
+ -0.458777f, -0.710395f, -2.347200f, -0.277197f, 0.213294f, 0.337551f,
+ -0.177710f, -0.152136f, 0.167666f, 0.308403f, -1.248500f, -0.565367f,
+ 0.122054f, 0.087874f, -0.476556f, -0.083548f, -0.358734f, -0.073131f,
+ -0.146320f, -2.241960f, 0.697639f, 0.545581f, -1.889700f, -0.267725f,
+ 0.433045f, 0.298224f, -0.338508f, 0.250226f, 0.405675f, 0.447201f,
+ -1.184690f, -0.473447f, 0.307403f, 0.711236f, -3.191560f, -1.663980f,
+ 0.165201f, 0.101360f, -0.624451f, -0.173269f, 0.089795f, 0.227478f,
+ -0.136664f, 0.007907f, 0.131079f, 0.605374f, -2.991620f, -1.723790f,
+ 0.082428f, 0.006781f, -0.348732f, -0.019271f, -0.032040f, -0.067078f,
+ -0.437166f, -0.144472f, 0.069844f, 0.194625f, -0.162284f, -0.374656f,
+ 0.056472f, -0.236524f, -0.114241f, -0.029161f, -0.222078f, -0.053435f,
+ -0.313938f, -0.555472f, 1.037550f, 0.689968f, 0.575694f, 0.065826f,
+ -0.659979f, -0.881351f, -0.626417f, -0.953975f, -0.576106f, -0.258708f,
+ 0.263004f, -0.229847f, 0.463835f, 1.390960f, -2.614480f, -1.272910f,
+ 0.065780f, -0.058603f, 0.015612f, 0.104703f, 0.198028f, 0.262792f,
+ 0.253616f, -0.079126f, -0.587381f, -0.739021f, -0.822676f, -0.795512f,
+ 0.193644f, 0.234643f, -0.034407f, 0.421478f, -0.572610f, -0.290714f,
+ -0.257803f, -0.644835f, -0.536938f, -0.375899f, -0.651077f, -0.522576f,
+ 0.562564f, 0.834616f, 0.513893f, 0.649689f, 0.356530f, 0.400716f,
+ 0.300606f, 0.290505f, 0.584608f, 0.671574f, 0.564584f, 0.419870f,
+ 0.062061f, 0.018263f, 0.009831f, 0.084103f, -0.128281f, -0.018818f,
+ -0.187244f, 0.067210f, 0.437147f, 0.442029f, 0.444939f, 0.226661f,
+ 0.541609f, 0.444280f, 0.302795f, 0.633026f, -0.180374f, 0.265197f,
+ 0.210404f, -0.118916f, -0.294013f, -0.692627f, -0.402347f, -0.356287f,
+ 0.387578f, 0.385496f, 0.789542f, 0.690396f, -0.203542f, -0.688546f,
+ 0.045319f, -0.448747f, -0.157148f, 0.152581f, 0.022360f, 0.058358f,
+ 0.593007f, 1.131860f, 0.289006f, 1.015560f, 0.144942f, -0.411577f,
+ 0.264794f, -0.085791f, 0.156996f, 0.200340f, 0.169264f, 0.267615f,
+ -0.361015f, -0.601842f, -0.442217f, -0.781086f, 0.112938f, 0.385305f,
+ 0.482454f, 0.470268f, 1.193390f, 0.589642f, 0.127638f, -0.640946f,
+ 0.540310f, 0.741498f, 0.686937f, 0.435879f, 0.534523f, 0.693119f,
+ 0.817577f, 0.783109f, 0.021681f, -0.004973f, 0.201236f, -0.086311f,
+ 0.028628f, 0.227871f, 0.462751f, 0.126832f, -0.389997f, -0.553965f,
+ -0.343953f, -0.448517f, 0.053129f, -0.115083f, 0.018138f, -0.067131f,
+ -0.293468f, -0.220700f, 0.074348f, -0.273153f, 0.263637f, 0.122049f,
+ 0.153025f, 0.076292f, 0.142320f, 0.286734f, 0.100542f, 0.308660f,
+ -0.759591f, -0.750938f, -0.788799f, -0.853076f, -0.588019f, -0.990063f,
+ -0.692327f, -0.722904f, 0.084736f, 0.151068f, 0.159606f, 0.147715f,
+ 1.610180f, 1.950330f, 1.765670f, 2.265110f, 0.008262f, 0.185584f,
+ 0.039337f, 0.164721f, 0.479446f, 0.314083f, 0.043969f, 0.291320f,
+ 0.003400f, -0.551190f, 0.060158f, -0.147591f, 0.089117f, 0.042994f,
+ 0.042802f, 0.127392f, -0.066172f, 0.078370f, 0.051408f, 0.014004f,
+ 0.086726f, 0.133334f, -0.046733f, 0.155100f, -0.118223f, -0.100778f,
+ -0.225245f, -0.460397f, 0.892644f, 1.003770f, 0.405155f, 0.517477f,
+ 0.184585f, 0.279090f, -0.036477f, 0.198703f, 0.027139f, -0.055728f,
+ -0.022396f, -0.147319f, 2.275540f, 2.014990f, 2.296800f, 2.081730f,
+ -0.088713f, 0.105729f, -0.027871f, -0.095047f, 0.012429f, 0.014244f,
+ -0.014755f, -0.003017f, 1.332700f, 1.300040f, 1.464250f, 1.305030f,
+ 0.032568f, 0.118042f, 0.079632f, -0.089405f, 0.163905f, 0.146608f,
+ 0.026502f, 0.065307f, -0.056909f, -0.065052f, 0.069851f, -0.082958f,
+ 0.023419f, -0.026293f, 0.037616f, -0.048096f, -0.073701f, -0.208295f,
+ -0.782095f, 0.000523f, 0.374131f, 0.420946f, 0.466151f, 0.349651f,
+ -0.679275f, -0.745827f, -0.379918f, -0.900107f, 0.044070f, -0.347536f,
+ -1.224390f, 0.740113f, -0.779966f, 0.510920f, -0.968597f, -0.095630f,
+ 0.120805f, 0.676803f, -0.164827f, 0.172996f, -0.106720f, 0.197527f,
+ 0.337561f, 0.571094f, -0.279090f, -0.396697f, -0.253083f, -0.690170f,
+ -0.363291f, 0.516921f, 0.489391f, -0.920628f, 0.497572f, 0.483864f,
+ -0.125696f, -0.338123f, -0.041517f, -0.534630f, -0.388465f, -0.784554f,
+ 0.215227f, 0.055088f, 0.179638f, 0.086997f, 0.569313f, 0.572926f,
+ 0.137182f, -0.045485f, 0.118087f, 0.210383f, 0.212664f, 0.482443f,
+ 0.151921f, 0.307947f, -0.084656f, -0.386206f, 0.542277f, -0.207005f,
+ 0.073792f, -1.013240f, 0.303581f, 0.270527f, 0.265985f, 0.332702f,
+ 0.848609f, 0.686757f, 0.767212f, 0.316901f, -0.502460f, -0.567092f,
+ -0.484799f, -0.173350f, -0.426863f, 0.222375f, -0.200267f, -0.523758f,
+ 0.265180f, -0.175648f, -0.229754f, 0.148740f, 0.402515f, 0.028243f,
+ -0.366109f, 0.157232f, -0.131564f, 0.055136f, 0.211046f, -0.115542f,
+ 0.322379f, -0.137768f, -0.247832f, 0.070394f, 0.058530f, -0.295023f,
+ -0.196022f, -0.109097f, 0.261285f, -0.273585f, -0.240632f, 0.258326f,
+ -0.077364f, 0.071405f, -0.014766f, -0.008751f, -0.203622f, 0.177818f,
+ 0.116726f, -0.116735f, -0.723616f, -0.700154f, 0.145082f, -0.184949f,
+ -0.287076f, 0.150405f, 0.258075f, -0.157764f, -0.120909f, 0.105459f,
+ 0.113288f, -0.092963f, 0.328183f, -0.300115f, -0.361289f, 0.319792f,
+ -0.048875f, 0.135673f, 0.132539f, -0.162481f, 0.002109f, 0.065048f,
+ -0.135969f, 0.061558f, 1.510670f, -0.884925f, -0.827022f, 0.190311f,
+ -0.060088f, -0.033362f, 0.013354f, 0.002847f, 0.353479f, -0.462538f,
+ -0.319638f, 0.424484f, 0.199540f, -0.073843f, -0.140621f, 0.072133f,
+ -0.098662f, 0.070613f, 0.031150f, -0.021869f, -0.511253f, 0.503412f,
+ 0.565963f, -0.576146f, -1.081700f, 0.047670f, 0.266687f, 0.524804f,
+ -2.361150f, 0.147823f, 0.594717f, 0.956842f, -1.048220f, 0.127083f,
+ 0.079581f, 0.065419f, 0.176783f, 0.653953f, 0.260967f, 0.537892f,
+ -1.207580f, 0.245983f, -0.727067f, 0.071755f, -0.343025f, -0.173435f,
+ 0.215289f, 0.268578f, -1.158560f, 0.039263f, -0.132888f, 0.217132f,
+ -0.622195f, -0.071256f, 0.317333f, 0.157614f, -1.588250f, 0.316432f,
+ -0.736720f, -0.041698f, -1.959280f, 0.083451f, 0.570584f, 0.327620f,
+ -1.262200f, -0.026738f, 0.231198f, 0.326861f, -1.644200f, -0.143833f,
+ -0.079495f, 0.493026f, -2.488090f, -0.034046f, 0.165884f, 1.074260f,
+ -1.076980f, 0.248198f, -0.017987f, 0.421900f, -0.105860f, 0.076710f,
+ 0.002072f, 0.070264f, -1.734750f, 0.227145f, 0.209220f, 0.851459f,
+ -0.142369f, 0.066502f, 0.027816f, 0.044321f, -0.186591f, -0.100340f,
+ 0.115580f, 0.192252f, -0.892114f, 0.209531f, -0.308243f, 0.367968f,
+ -0.721770f, 0.220224f, -0.062744f, 0.133754f, 0.040416f, 0.190428f,
+ -0.035428f, 0.162974f, 0.116427f, 0.669393f, 0.278891f, 0.856676f,
+ 1.060390f, 0.936983f, 0.863355f, 0.990560f, -0.147111f, -0.217883f,
+ 0.355794f, -0.186530f, -0.275614f, -0.095719f, 0.167346f, 0.359078f,
+ -0.079223f, -0.581596f, -0.213134f, -0.431123f, -0.516443f, -0.388628f,
+ -0.643821f, -0.202345f, 0.426230f, 0.516923f, 0.548131f, 0.555973f,
+ 0.022286f, 0.361170f, 0.980065f, 0.648400f, -0.056813f, -0.100310f,
+ -0.439481f, -0.166454f, 0.412449f, 0.509400f, 0.316208f, 0.470293f,
+ -0.827838f, -1.078380f, -1.047040f, -1.074560f, 0.274555f, -0.316736f,
+ 0.128818f, 0.228566f, -0.520967f, -0.731674f, -0.687887f, -0.536388f,
+ -0.031187f, 0.041404f, 0.047821f, 0.064397f, 0.054230f, 0.105059f,
+ -0.178671f, 0.176847f, -0.394797f, -0.260255f, -0.333734f, -0.162345f,
+ -0.444650f, -0.928438f, -0.705840f, -0.833162f, 0.306737f, 0.429699f,
+ 0.417298f, 0.478469f, 0.420903f, 0.676871f, 0.429677f, 0.616921f,
+ -0.805199f, -0.643391f, -0.304100f, 0.797599f, -0.172157f, 0.429085f,
+ -0.750676f, 0.149227f, -0.207898f, -0.022534f, -0.341448f, -0.247976f,
+ 0.095325f, -0.561120f, 0.599694f, -0.025236f, 0.292346f, -0.312001f,
+ 0.517478f, 0.301457f, -0.106415f, 0.226263f, -0.184163f, -0.114419f,
+ -0.322702f, 0.172541f, 0.445573f, 0.157213f, 0.670704f, 0.102174f,
+ -0.234667f, -0.293311f, 0.769852f, 0.038028f, -0.036741f, -0.228060f,
+ -0.253335f, 0.424054f, -0.597980f, 0.221007f, -0.114741f, -0.411557f,
+ -0.592201f, 0.442684f, 0.115491f, -0.106896f, -0.028110f, 0.354751f,
+ -0.248375f, 0.242570f, -0.155856f, 0.280528f, -0.198742f, 0.588725f,
+ 0.371065f, 0.078197f, 0.114706f, -0.448021f, 0.065255f, 0.133741f,
+ -0.227522f, -0.047339f, -0.052849f, 0.309480f, 0.597185f, 0.209182f,
+ 0.226108f, -0.601036f, -0.431672f, -0.172601f, -0.000174f, 0.194292f,
+ -0.133937f, 0.130676f, 0.059372f, 0.091381f, 0.098751f, -0.150996f,
+ 0.170514f, -0.085494f, 0.336576f, 0.484004f, 0.033862f, 0.277473f,
+ -0.231482f, -0.328385f, -0.332739f, -0.626957f, 0.510167f, 0.575861f,
+ 0.421494f, 0.482540f, -0.636377f, -0.864661f, -0.694180f, -0.420014f,
+ -0.132781f, 0.017599f, 0.003538f, 0.486934f, 0.133878f, -0.094622f,
+ 0.016132f, 0.010117f, 0.156680f, -0.022201f, -0.014621f, 0.228445f,
+ 0.190826f, 0.171580f, 0.579923f, 0.245428f, 0.322713f, 0.480101f,
+ 0.406320f, 0.412229f, 0.002334f, -0.022349f, 0.074571f, -0.043828f,
+ 0.290453f, 0.451749f, 0.530376f, 0.271879f, 0.095144f, 0.169450f,
+ 0.049482f, 0.114605f, -0.635634f, -0.700768f, -0.558538f, -0.537625f,
+ 0.190255f, -0.308237f, -0.053703f, 0.212489f, 0.056520f, -0.040019f,
+ 0.089822f, -0.014155f, -0.376004f, -0.448752f, -0.526717f, -0.571440f,
+ 0.116482f, 0.162321f, 0.147895f, 0.280527f, 0.159037f, -0.095958f,
+ 0.007931f, -0.086630f, 0.285625f, 0.514914f, 0.208908f, 0.519251f,
+ 0.309368f, 0.379777f, 0.350565f, 0.487487f, -0.541494f, -0.421836f,
+ -0.390001f, -0.500696f, -0.905736f, -0.150439f, -0.942304f, -0.566771f,
+ 0.484233f, 0.767417f, 0.410477f, 0.670196f, 0.070210f, 0.488836f,
+ 0.372805f, 0.197631f, 0.337892f, 0.524423f, 0.777219f, -0.260955f,
+ -0.112981f, -0.060088f, -0.200250f, -0.195671f, 0.007584f, 0.252096f,
+ 0.235511f, 0.366612f, -0.304979f, -0.211068f, -0.420683f, -0.085370f,
+ 0.085762f, -0.097549f, -0.802509f, -0.468079f, -0.192787f, -0.069670f,
+ -0.235162f, -0.077772f, -0.441671f, -0.348479f, -0.431434f, -0.108256f,
+ -0.133779f, 0.017032f, 0.001964f, -0.120647f, -0.187663f, -0.194985f,
+ -0.231742f, -0.175288f, -0.162639f, 0.245110f, 0.049951f, 0.104229f,
+ -0.159634f, -0.076545f, -0.022496f, -0.036532f, -0.147028f, -0.034215f,
+ 0.028213f, -0.059669f, -0.078259f, 0.062993f, -0.124066f, -0.137362f,
+ -0.129977f, -0.010532f, -0.049090f, -0.189401f, 0.495471f, 0.615778f,
+ 0.451437f, 0.803526f, 0.523532f, 0.841339f, 0.699528f, 0.745129f,
+ 0.246264f, -0.198290f, -0.283620f, 0.189917f, -0.018306f, -0.419097f,
+ 0.280363f, -0.098085f, 0.138972f, -0.140867f, -0.117025f, 0.098585f,
+ 0.130979f, 0.268133f, -0.161731f, -0.176629f, -0.357677f, -0.126379f,
+ 0.553128f, -0.126821f, -0.001511f, -0.010081f, -0.031162f, 0.079203f,
+ -0.157731f, 0.072865f, 0.535830f, -0.529989f, -0.570075f, 0.295795f,
+ 0.595613f, -0.449278f, -0.669756f, 0.941452f, 0.356897f, -0.723720f,
+ -0.115203f, -0.134479f, 0.133048f, 0.109860f, -0.024250f, -0.049732f,
+ 0.020098f, 0.048356f, -0.048293f, 0.108754f, 0.062548f, -0.238315f,
+ 0.182700f, 0.312011f, -0.244377f, -0.118012f, 0.012276f, 0.006089f,
+ 0.098068f, -0.079280f, -0.423987f, -0.411931f, -0.027425f, 0.870280f,
+ 0.022825f, -0.024481f, -0.036320f, -0.111189f, 0.364539f, -0.244896f,
+ -0.373060f, 0.266345f, -0.141778f, 0.277549f, 0.059834f, -0.178242f,
+ -0.686222f, 0.594535f, 0.354546f, -0.272516f, 1.060730f, -1.059810f,
+ -0.948126f, 0.993267f, 0.116597f, -0.227574f, -0.436144f, -0.333309f,
+ -0.575746f, -0.828102f, 0.284561f, 0.351668f, -0.080164f, -0.762518f,
+ -0.511108f, -0.212855f, 0.293892f, -0.548664f, 0.072057f, 0.006748f,
+ 1.485110f, 0.124687f, 0.727211f, 1.557560f, -0.064383f, -0.022242f,
+ 0.002921f, -0.151505f, 0.270926f, 0.173632f, -0.640644f, 0.422410f,
+ -0.240699f, -0.361980f, -0.279864f, -0.055165f, -1.084140f, 0.231705f,
+ 0.366172f, -0.347698f, -0.097565f, -0.747227f, -0.243033f, 0.941545f,
+ -0.207460f, -0.353913f, 0.104303f, -0.403151f, 0.203177f, 0.335893f,
+ -0.229033f, 0.029096f, -0.409634f, -0.179599f, -0.442397f, 0.649114f,
+ 0.460774f, 0.170906f, -0.043857f, 0.402066f, -0.226896f, -0.199624f,
+ 0.016650f, 0.207894f, 0.056954f, 0.220329f, 0.374060f, 0.130361f,
+ -0.303960f, -0.078863f, 0.195410f, 0.729438f, 0.246818f, 0.287730f,
+ 0.484876f, 0.111488f, -0.168647f, -0.087878f, -0.070089f, -0.341329f,
+ -0.330280f, 0.259943f, -0.364205f, 0.256555f, -0.756804f, -0.086915f,
+ 0.777351f, 0.006136f, 0.110348f, 0.248743f, 0.209326f, -0.362741f,
+ -0.184416f, 0.422446f, 0.565193f, 0.310072f, -0.011212f, -0.765226f,
+ 0.039466f, 0.301288f, 0.172907f, -1.539450f, 0.606202f, 0.477469f,
+ 0.045894f, -0.222180f, -0.013192f, -0.064077f, -0.241551f, 0.192914f,
+ 0.028004f, -0.540538f, 0.437440f, 0.179087f, -0.753204f, -0.001374f,
+ 1.185930f, -0.151182f, 1.238580f, -1.389900f, 0.277954f, 0.422208f,
+ 0.041553f, -0.542284f, 0.139019f, -0.148580f, -0.130705f, 0.361830f,
+ 0.322953f, -0.092371f, 0.120180f, -0.355299f, -0.028057f, 0.128114f,
+ 0.250947f, -0.349926f, -0.684633f, 0.246175f, 0.186731f, -0.676313f,
+ 0.060535f, 0.333371f, -0.021172f, -0.421266f, -0.079650f, 0.031359f,
+ -0.303658f, -0.298286f, 0.119016f, 0.655585f, 0.200175f, -0.887182f,
+ -0.197539f, -0.318883f, -0.130250f, 0.522487f, -0.092616f, 0.405930f,
+ -0.281678f, 0.089728f, 0.081814f, -0.781745f, 0.348878f, 0.082274f,
+ -0.914136f, 1.098810f, 0.855321f, -1.078170f, -0.268018f, 0.246440f,
+ 0.238347f, -0.027228f, 0.074111f, -0.061197f, -0.063582f, 0.089462f,
+ -0.040347f, 0.117082f, 0.122772f, -0.162816f, -0.148668f, -0.342856f,
+ -0.495604f, -1.453630f, -0.045273f, -0.030463f, 0.043766f, 0.047978f,
+ 0.016910f, -0.009700f, 0.006288f, -0.042556f, 0.632896f, -0.845744f,
+ -0.516844f, 0.709439f, 0.486166f, -1.203050f, -0.978381f, 0.631876f,
+ 0.000705f, 0.123858f, -0.001187f, -0.172312f, -0.422668f, 0.241838f,
+ 0.437400f, -0.268186f, -0.513259f, 0.450209f, 0.542629f, -0.453810f,
+ -0.207119f, 0.072598f, 0.085066f, -0.018986f, -0.149512f, 0.149521f,
+ 0.182105f, -0.227200f, -0.363240f, 0.172670f, -0.502932f, 0.689256f,
+ 0.093760f, -0.090207f, -0.066803f, 0.056759f, -0.002243f, -0.050662f,
+ -0.059324f, 0.152943f, -0.701150f, 0.712540f, 0.660349f, -0.654970f,
+ 0.351772f, -0.303383f, -0.311177f, 0.247653f, 0.013035f, 0.034648f,
+ -0.137832f, 0.041197f, 0.410265f, 0.345129f, 0.653338f, 0.047050f,
+ 0.140399f, 0.018613f, -0.012431f, -0.113632f, -0.029928f, 0.051564f,
+ -0.031349f, 0.151944f, -0.160340f, 0.326798f, -0.458067f, 0.636235f,
+ 0.243184f, 0.514072f, 2.414450f, 1.421980f, -0.001474f, -0.141389f,
+ -0.104817f, -0.141882f, -0.026395f, 0.053014f, 0.143885f, -0.207774f,
+ -0.563846f, -0.242514f, -0.436574f, -0.456796f, -0.520646f, 0.282550f,
+ -0.684924f, 0.061105f, -0.315884f, -0.392624f, 0.009805f, -0.256597f,
+ -0.146732f, 0.331039f, 0.362342f, 0.270851f, 0.067679f, -0.071331f,
+ -0.222423f, 0.081286f, -0.208192f, -0.193816f, -0.008201f, -0.309340f,
+ 0.167556f, 0.106071f, 0.172254f, -0.163790f, -0.142205f, -0.043182f,
+ 0.096145f, 0.145037f, -0.066015f, -0.073194f, 0.132237f, -0.088522f,
+ -0.044292f, -0.487128f, 0.033389f, -0.573548f, 0.185449f, 0.273593f,
+ 0.147503f, 0.457049f, -0.021539f, 0.090786f, 0.009147f, 0.000899f,
+ 0.018088f, 0.115791f, -0.079165f, 0.139388f,
+};
+
+static const float weights_layer_2[] = {
+ 0.153048f, 0.112901f, 0.136781f, 0.154580f, 0.091610f, 0.045165f,
+ 0.088490f, 0.116991f, -0.463766f, -0.596567f, -0.567008f, -0.630565f,
+ 0.141874f, 0.095726f, 0.175427f, 0.145027f, -0.969824f, -1.018190f,
+ -1.073300f, -1.041130f, -0.070545f, -0.123600f, -0.114967f, -0.169453f,
+ -0.267458f, -0.147730f, -0.161419f, -0.164894f, -0.117508f, -0.204389f,
+ -0.122695f, -0.163107f, -0.003903f, -0.030470f, -0.037433f, -0.059568f,
+ 0.138243f, 0.091019f, 0.160372f, 0.141650f, -0.544565f, -0.620004f,
+ -0.504503f, -0.429979f, -0.099491f, -0.096384f, -0.155265f, -0.188536f,
+ 0.084923f, 0.038345f, 0.066706f, 0.122083f, 0.267087f, 0.184419f,
+ 0.261478f, 0.255746f, -0.245894f, -0.114980f, -0.193880f, -0.227785f,
+ 0.087536f, 0.095712f, 0.106105f, 0.099353f, -0.059473f, -0.173247f,
+ -0.202386f, -0.076010f, 0.125928f, 0.100793f, 0.119638f, 0.129623f,
+ 0.136593f, 0.102984f, 0.156550f, 0.140558f, 0.122524f, 0.051596f,
+ 0.084164f, 0.123630f, 0.072542f, 0.096063f, 0.083236f, 0.087630f,
+ 0.025900f, 0.023738f, 0.036385f, 0.053077f, -0.029501f, 0.010544f,
+ -0.010026f, -0.051268f, 0.086302f, 0.109909f, 0.101385f, 0.127513f,
+ -0.031869f, 0.005340f, -0.056267f, -0.032955f, 0.032748f, 0.023162f,
+ 0.092118f, -0.001780f, -0.123612f, -0.183433f, -0.202377f, -0.317516f,
+ 0.129052f, 0.208112f, 0.145582f, 0.175502f, 0.018476f, 0.036349f,
+ 0.072417f, 0.061194f, 0.086985f, 0.117086f, 0.072465f, 0.129068f,
+ 0.020182f, 0.052114f, 0.017878f, 0.010478f, -0.001381f, -0.034644f,
+ 0.025135f, -0.037748f, 0.004973f, 0.024778f, 0.041816f, 0.032111f,
+ 0.080268f, 0.124998f, 0.105719f, 0.177047f, -0.072114f, -0.011864f,
+ -0.076846f, -0.089840f, 0.069993f, 0.089362f, 0.088035f, 0.120621f,
+ 0.065916f, 0.100946f, -0.006784f, -0.007751f, 0.122039f, 0.126482f,
+ 0.078629f, 0.140299f, 0.074034f, 0.092464f, 0.089798f, 0.108968f,
+ 0.075729f, 0.057128f, 0.013570f, 0.021195f, 0.068901f, 0.054022f,
+ 0.029781f, 0.031404f, -0.209998f, -0.208731f, -0.198310f, -0.212454f,
+ -0.579168f, -0.490190f, -0.607567f, -0.520541f, 0.083863f, 0.056612f,
+ 0.030366f, 0.061790f, -0.004874f, -0.057203f, -0.060429f, -0.049145f,
+ 0.080086f, 0.138602f, 0.223796f, 0.133279f, -0.495954f, -0.612093f,
+ -0.545393f, -0.562310f, 0.070672f, 0.037702f, 0.139013f, 0.080192f,
+ -0.111387f, -0.048165f, 0.074359f, -0.042125f, 0.113633f, 0.106579f,
+ 0.042633f, 0.102734f, -0.068220f, 0.128423f, -0.181821f, -0.013260f,
+ -0.108563f, -0.138667f, -0.109304f, -0.131909f, -0.168667f, -0.126870f,
+ -0.132533f, -0.167096f, -0.184741f, -0.140890f, -0.125361f, -0.150632f,
+ 0.309013f, 0.364376f, 0.361102f, 0.271566f, 0.116552f, 0.091160f,
+ 0.096846f, 0.095954f, 0.046972f, 0.080489f, 0.028766f, -0.012223f,
+ 0.071379f, 0.041535f, -0.000668f, 0.033698f, -0.013493f, -0.027535f,
+ -0.025804f, -0.012267f, -0.097465f, -0.099232f, -0.208863f, -0.225201f,
+ -0.475608f, 0.077358f, -0.002872f, 0.163890f, -0.420298f, 0.072114f,
+ 0.121601f, -0.016727f, 0.573853f, -0.080196f, 0.193053f, 0.053012f,
+ -0.454179f, 0.058563f, 0.067265f, 0.141154f, 0.412541f, 0.086933f,
+ 0.030407f, -0.030413f, 0.478757f, -0.097731f, 0.277072f, -0.086393f,
+ 0.552604f, -0.334201f, 0.091765f, -0.270262f, -1.395060f, 0.271837f,
+ -0.005335f, 0.240499f, 0.175442f, -0.326329f, -0.019353f, -0.270338f,
+ -0.459273f, 0.096183f, 0.153046f, 0.135818f, 0.759028f, -0.177673f,
+ -0.099966f, 0.103363f, 0.697289f, -0.234184f, -0.048706f, -0.116099f,
+ -0.282575f, 0.025655f, -0.184759f, 0.040658f, -0.558267f, 0.214087f,
+ -0.095620f, 0.200522f, 0.278996f, 0.031959f, 0.122936f, -0.209196f,
+ -0.308217f, 0.092917f, 0.113269f, 0.136274f, -0.037046f, 0.017263f,
+ -0.194183f, 0.089133f, -0.161244f, 0.042799f, 0.030557f, 0.153545f,
+ -0.355048f, 0.070928f, -0.152852f, 0.102875f, -0.193649f, 0.007916f,
+ -0.062952f, 0.050602f, 0.073671f, 0.143045f, -5.978970f, -7.013850f,
+ 0.058713f, 0.076116f, 0.026445f, -0.056599f, -0.005966f, 0.032234f,
+ 0.006753f, -0.024528f, 0.120308f, 0.179939f, -6.624630f, -7.638680f,
+ 0.026359f, 0.020758f, 0.194274f, 0.051489f, -0.008491f, -0.028248f,
+ -0.061328f, -0.134423f, -0.103951f, -0.110877f, 0.042263f, 0.127016f,
+ 0.012473f, -0.008595f, 0.031357f, 0.087476f, -0.084022f, -0.015590f,
+ -0.313546f, 0.120072f, 0.123880f, 0.162148f, -6.596560f, -7.358830f,
+ 0.004797f, -0.003415f, 0.048455f, 0.026737f, -0.103702f, 0.034416f,
+ -0.003475f, -0.236827f, 0.005378f, 0.048413f, 0.054612f, -0.079359f,
+ 0.043707f, 0.001085f, 0.023380f, 0.007785f, 0.025938f, -0.052856f,
+ -0.033421f, 0.022643f, 0.034161f, 0.127681f, -5.019490f, -5.233580f,
+ -0.128630f, 0.087741f, -0.239834f, -0.377876f, 0.128082f, 0.142730f,
+ -0.086819f, -0.350927f, 0.089849f, 0.155776f, -6.155120f, -5.721720f,
+ 0.056110f, 0.008761f, 0.045579f, 0.016762f, -0.134076f, -0.101551f,
+ -0.096058f, -0.117146f, 0.003527f, -0.056942f, -0.005578f, 0.071287f,
+ 0.023776f, -0.028003f, -0.075390f, -0.191160f, -0.089672f, -0.104372f,
+ -0.104750f, -0.080813f, -0.249824f, -0.124479f, -0.243593f, -0.244284f,
+ -0.554911f, -0.549095f, -0.564693f, -0.475107f, -0.121771f, -0.143441f,
+ -0.171170f, -0.120920f, 0.109831f, 0.079708f, 0.327295f, 0.308907f,
+ -0.178785f, -0.428316f, -0.418882f, -0.366750f, -0.139296f, -0.129645f,
+ -0.081237f, -0.101533f, -0.006256f, -0.146756f, -0.322110f, -0.338865f,
+ -0.306085f, -0.319592f, -0.454803f, -0.363560f, -0.018557f, 0.006605f,
+ -0.131198f, -0.077708f, 0.138160f, 0.119611f, 0.271098f, 0.232168f,
+ 0.027812f, 0.035390f, -0.202503f, -0.091172f, -0.142020f, -0.159929f,
+ -0.106404f, -0.107433f, -0.381743f, -0.353222f, -0.484159f, -0.469926f,
+ -0.234659f, -0.315674f, -0.178327f, -0.213485f, -0.096207f, -0.190944f,
+ -0.118917f, -0.161288f, 0.015996f, 0.060737f, 0.051390f, 0.060876f,
+ 0.229289f, 0.282418f, 0.250945f, 0.197273f, 0.045131f, -0.008305f,
+ 0.072024f, 0.044547f, -0.050010f, 0.055504f, 0.001343f, -0.014445f,
+ 0.254909f, 0.309091f, 0.228249f, 0.274843f, 0.089778f, -0.046581f,
+ 0.072714f, 0.126814f, -0.048931f, -0.045743f, -0.151333f, -0.004490f,
+ 0.179966f, 0.058150f, -0.178622f, -0.088159f, -0.074416f, -0.005821f,
+ -0.011799f, -0.002225f, -0.069361f, -0.098937f, -0.081575f, -0.034796f,
+ 0.253792f, 0.301039f, 0.219163f, 0.256027f, 0.058007f, -0.041431f,
+ 0.040674f, 0.009019f, -0.099670f, -0.099077f, -0.039437f, 0.017946f,
+ 0.060717f, 0.045796f, 0.109664f, 0.032138f, -0.071094f, 0.023697f,
+ 0.011335f, -0.030465f, 0.068677f, 0.039345f, -0.045078f, 0.084037f,
+ 0.135517f, 0.190417f, 0.175578f, 0.155286f, -0.044505f, 0.010826f,
+ 0.006717f, -0.134715f, 0.068022f, 0.110095f, 0.079966f, 0.034481f,
+ 0.185804f, 0.188273f, 0.227283f, 0.135935f, 0.033447f, 0.031571f,
+ -0.014766f, -0.024565f, 0.021792f, 0.017675f, -0.001333f, -0.040069f,
+ -0.049384f, -0.045256f, -0.014013f, -0.000107f, -0.096928f, -0.111495f,
+ -0.051225f, -0.060449f, 0.071446f, 0.017294f, -0.004822f, 0.006932f,
+ 0.020884f, 0.089425f, 0.061097f, -0.038708f, -0.184029f, -0.089541f,
+ -0.158035f, -0.214607f, -0.377947f, -0.318586f, -0.336977f, -0.323908f,
+ 0.181612f, 0.140018f, 0.233524f, 0.193366f, -0.254507f, -0.271902f,
+ -0.197144f, -0.119539f, 0.042162f, 0.000320f, 0.014708f, -0.014228f,
+ -0.081119f, -0.089326f, 0.001763f, 0.081009f, -0.142618f, -0.160650f,
+ -0.214597f, -0.202143f, -0.053495f, -0.012819f, -0.071468f, -0.010883f,
+ 0.072570f, 0.071507f, 0.091045f, 0.083155f, -0.271237f, -0.289211f,
+ -0.272345f, -0.299411f, 0.031697f, -0.029795f, -0.030045f, -0.013604f,
+ -0.106843f, -0.045212f, -0.122459f, -0.096936f, 0.059793f, 0.006157f,
+ 0.028092f, 0.040589f, -0.014560f, -0.008975f, -0.051404f, -0.014309f,
+ -0.016883f, 0.018332f, 0.040114f, 0.050348f, 0.044921f, -0.002445f,
+ -0.112396f, 0.014395f, 0.115160f, 0.145350f, -0.166814f, -0.121449f,
+ 0.155573f, -0.099446f, -0.161661f, 0.187251f, 0.004711f, 0.024318f,
+ -0.060871f, -0.028311f, -0.098274f, 0.322030f, -0.069242f, -0.153173f,
+ -0.227428f, -0.293965f, 0.228491f, 0.111413f, -1.354720f, -0.344235f,
+ 0.866715f, 0.872344f, 0.078789f, -0.384865f, 0.162388f, 0.109018f,
+ -0.191549f, -0.002638f, 0.305053f, 0.087337f, 0.066506f, -0.055810f,
+ -0.010984f, -0.056160f, -0.114617f, -0.058478f, 0.022059f, -0.124368f,
+ -0.130989f, 0.369432f, -0.248898f, -0.003955f, -0.021578f, 0.115991f,
+ -0.114163f, -0.065232f, 0.339857f, -0.225997f, 0.006282f, -0.125395f,
+ 0.235082f, -0.347785f, 0.662321f, -0.529182f, 0.153297f, -0.001326f,
+ -0.026725f, -0.024677f, -0.088065f, -0.116127f, 0.080896f, 0.212542f,
+ 0.208421f, 0.032047f, -0.211395f, 0.074997f, 0.096659f, 0.096423f,
+ -0.078643f, 0.106556f, -0.123860f, 0.075609f, 0.066008f, -0.097275f,
+ -1.000020f, -0.780154f, -0.856922f, -0.964007f, 0.083135f, -0.018922f,
+ -0.266214f, -0.151480f, 0.051538f, 0.017802f, 0.066774f, -0.021341f,
+ -0.869494f, -0.935252f, -0.895836f, -0.853871f, -0.160490f, 0.085850f,
+ -0.029670f, -0.056675f, 0.159989f, 0.166872f, 0.129970f, 0.194377f,
+ 0.153294f, 0.199593f, 0.037692f, 0.103391f, 0.029335f, -0.085324f,
+ -0.079326f, -0.077216f, 0.501561f, 0.366168f, 0.330196f, 0.296432f,
+ -0.977282f, -0.844295f, -1.014870f, -1.098990f, -0.099858f, -0.129552f,
+ 0.090051f, -0.013378f, 0.081330f, 0.194911f, 0.286501f, 0.177363f,
+ -0.148250f, -0.111700f, -0.243081f, -0.102918f, 0.161069f, -0.012655f,
+ -0.071722f, -0.020329f, -0.077828f, -0.041716f, 0.109247f, 0.062229f,
+ -0.759722f, -0.742756f, -0.563713f, -0.631187f, 0.005911f, 0.268154f,
+ -0.263769f, 0.087149f, -0.163623f, -0.359600f, -0.464577f, -0.369352f,
+ -0.515784f, -0.475822f, -0.523485f, -0.649813f, -0.112419f, -0.029285f,
+ 0.021061f, -0.041515f, 0.149133f, -0.254428f, 0.115776f, -0.061892f,
+ 0.103675f, -0.283363f, 0.005005f, 0.022034f, -0.178454f, 0.035836f,
+ -0.113702f, -0.217823f, 0.209407f, -0.296257f, 0.187976f, -0.157370f,
+ -0.127190f, 0.251780f, 0.055633f, 0.294111f, -0.067773f, 0.467190f,
+ -0.192625f, -0.071084f, -0.445284f, 0.511090f, -0.319728f, 0.267971f,
+ 0.494929f, -0.586727f, 0.454543f, -0.520675f, -0.085900f, 0.325989f,
+ -0.131006f, -0.069501f, 0.199927f, -0.218919f, 0.170055f, -0.106538f,
+ 0.133312f, 0.127629f, -0.561625f, 0.595666f, -0.090927f, 0.363348f,
+ -0.249246f, 0.063068f, -0.016458f, -0.291045f, -0.040509f, 0.017866f,
+ 0.304871f, -0.459214f, 0.214390f, -0.238740f, -0.456541f, 0.545848f,
+ -0.218026f, 0.202475f, 0.128490f, -0.036417f, 0.173885f, -0.049385f,
+ 0.235514f, -0.132587f, -0.015066f, 0.164638f, 0.196873f, -0.125330f,
+ 0.216912f, -0.109398f, 0.121602f, -0.209374f, 0.164400f, -0.123049f,
+ 0.195520f, -0.212932f, -0.015180f, -0.005784f, 0.049726f, -5.822150f,
+ 0.124536f, 0.040689f, -0.018560f, -3.155020f, 0.014690f, 0.076202f,
+ -0.154008f, 1.070630f, -0.071606f, 0.051026f, 0.138285f, -5.836340f,
+ 0.162173f, 0.085890f, -0.186166f, 0.093221f, 0.019240f, -0.017053f,
+ -0.090144f, 0.236254f, -0.125344f, 0.056235f, -0.089813f, -0.252281f,
+ -0.127406f, -0.155088f, 0.009972f, -0.066449f, 0.044222f, 0.025943f,
+ -0.164921f, 0.165463f, -0.001132f, -0.038386f, 0.115194f, -5.757100f,
+ 0.163386f, 0.061226f, 0.024626f, 0.132750f, 0.107279f, -0.001622f,
+ -0.107860f, -0.356009f, -0.138935f, -0.145173f, -0.061198f, -0.646138f,
+ 0.034279f, 0.078187f, 0.108138f, -0.490444f, 0.074719f, 0.034984f,
+ -0.109303f, 0.741785f, -0.066939f, 0.015558f, 0.114229f, -4.001080f,
+ 0.130772f, 0.044675f, -0.165162f, -0.274810f, -0.042987f, -0.048579f,
+ 0.156603f, -1.288370f, 0.076198f, 0.035065f, 0.032043f, -5.002520f,
+ 0.086900f, -0.010886f, 0.030850f, -0.782259f, 0.056211f, -0.097759f,
+ 0.118988f, 0.106638f, 0.091419f, 0.079920f, 0.062325f, 0.097116f,
+ 0.126035f, 0.122530f, -0.278299f, -0.083314f, -0.300563f, -0.197946f,
+ 0.081664f, 0.089925f, 0.074754f, 0.074628f, 0.102338f, 0.088845f,
+ 0.105841f, 0.102381f, 0.003087f, 0.061599f, 0.098326f, 0.040119f,
+ -0.005298f, -0.028834f, 0.059938f, -0.013668f, -0.585882f, -0.631436f,
+ -0.742673f, -0.736666f, 0.025071f, 0.066851f, 0.075046f, 0.091360f,
+ 0.099045f, 0.098261f, 0.106413f, 0.099487f, -0.016742f, -0.097334f,
+ -0.086152f, -0.212444f, -0.028043f, -0.007362f, 0.003914f, -0.055864f,
+ 0.034756f, 0.081361f, 0.080183f, 0.061319f, 0.193396f, 0.173716f,
+ 0.207765f, 0.231701f, -0.074565f, -0.073257f, -0.086470f, -0.083114f,
+ 0.081489f, 0.078477f, 0.033452f, 0.058835f, -0.069665f, -0.031691f,
+ -0.111255f, -0.167754f, 0.184179f, 0.174673f, 0.160288f, 0.190893f,
+ 0.110930f, 0.103495f, 0.098408f, 0.102918f, 0.053764f, 0.089994f,
+ 0.140308f, 0.124867f, 0.074176f, 0.117460f, -0.160775f, -0.144132f,
+ -0.099373f, -0.035913f, 0.081237f, 0.062247f, -0.166421f, 0.062125f,
+ 0.276479f, 0.060955f, 0.066627f, 0.455347f, 0.219953f, 0.109912f,
+ 0.273931f, 0.233153f, 0.102236f, 0.447606f, -0.352243f, 0.499236f,
+ -0.931206f, 0.248595f, 0.254047f, 0.061542f, 0.268804f, 0.309517f,
+ -0.084414f, -0.245828f, -0.144882f, -0.296579f, -0.091628f, -0.142202f,
+ -0.541764f, -0.407470f, 0.053481f, 0.238955f, 0.150188f, -0.060598f,
+ 0.196118f, -0.215617f, -0.086238f, -0.263420f, 0.206877f, 0.241788f,
+ -0.122544f, -0.448790f, 0.286917f, 0.112063f, -0.268408f, -0.041770f,
+ 0.089161f, 0.355811f, -0.078245f, -0.148490f, -0.407301f, -1.296870f,
+ -0.633421f, 0.124253f, 0.275402f, 0.223048f, 0.077016f, 0.160766f,
+ 0.115374f, 0.061053f, -0.231872f, -0.515052f, -0.278331f, -0.235912f,
+ -0.416372f, -0.284106f, -0.055942f, 0.110698f, -0.428288f, -0.298137f,
+ -0.018101f, 0.102677f, -0.019639f, 0.013479f, 0.038549f, 0.048682f,
+ 0.128684f, 0.116416f, 0.044852f, 0.008133f, 0.061597f, 0.083582f,
+ 0.014953f, 0.063716f, -0.155318f, -0.061732f, 0.084855f, 0.129505f,
+ 0.068249f, 0.193775f, -0.088631f, -0.446398f, -0.075710f, -0.061327f,
+ 0.278715f, 0.540366f, 0.618715f, 0.538374f, -0.037843f, 0.062370f,
+ -0.033184f, 0.119901f, -0.008641f, -0.064789f, 0.087498f, 0.043486f,
+ 0.247085f, 0.419992f, 0.299935f, 0.234276f, 0.089283f, 0.070357f,
+ 0.068888f, 0.134311f, 0.109823f, 0.072431f, 0.081676f, 0.091366f,
+ -1.707980f, -2.213110f, -2.149930f, -1.556870f, 0.226598f, 0.191675f,
+ 0.192207f, 0.159566f, -0.070194f, -0.136070f, -0.015172f, -0.204272f,
+ -0.162191f, -0.043313f, -0.158007f, -0.227210f, 0.040398f, 0.043014f,
+ 0.039439f, -0.035439f, 0.245558f, 0.439691f, 0.219659f, 0.138210f,
+ -0.048129f, 0.004954f, -0.102860f, -0.185376f, 0.035548f, 0.006821f,
+ 0.079199f, 0.032901f, 0.039218f, 0.068113f, 0.023075f, -0.037582f,
+ 0.225181f, 0.164562f, 0.106718f, 0.032684f, 0.013402f, 0.018797f,
+ 0.076606f, 0.046512f, -0.070024f, 0.099921f, -0.051231f, 0.074167f,
+ 0.173313f, 0.220212f, 0.142665f, 0.069809f, -0.195130f, -0.007912f,
+ -0.006764f, -0.063687f, 0.306374f, 0.402035f, 0.273759f, 0.449469f,
+ 0.114597f, 0.210745f, 0.355326f, 0.271307f, -0.109943f, -0.171912f,
+ -0.070726f, -0.128932f, 0.138770f, 0.164971f, 0.308516f, 0.332536f,
+ 0.081537f, 0.096939f, 0.054136f, 0.052226f, 0.109489f, 0.010223f,
+ 0.168072f, -0.106279f, 0.525568f, 0.704816f, 0.588942f, 0.473398f,
+ 0.149497f, 0.120835f, 0.080049f, 0.151340f, -0.182038f, -0.191091f,
+ -0.196505f, -0.198309f, -0.801819f, -1.441620f, -1.107780f, -1.025650f,
+ 0.035750f, 0.018049f, -0.029033f, -0.067255f, 0.192049f, 0.009664f,
+ -0.043741f, 0.051557f, 0.082815f, 0.069547f, -0.073379f, 0.010584f,
+ 0.192128f, 0.208586f, 0.141904f, 0.100763f, 0.046183f, 0.044776f,
+ -0.033611f, -0.005812f, 0.012966f, 0.030301f, 0.100665f, 0.103641f,
+ -0.294776f, -0.361573f, -0.420156f, -0.388743f, 0.239287f, 0.191975f,
+ 0.089644f, 0.117591f, 0.069563f, 0.021480f, 0.100287f, 0.174159f,
+ -0.013571f, 0.090960f, 0.010232f, -0.034760f, -0.077205f, 0.060632f,
+ -0.145527f, -0.391110f, -0.143052f, -0.236448f, -0.103902f, -0.188463f,
+ 0.071311f, -0.080171f, 0.021987f, 0.041767f, -0.419487f, -0.515479f,
+ -0.205470f, -0.732132f, 0.150901f, 0.107202f, 0.156307f, 0.143672f,
+ 0.474682f, 0.178137f, 0.150063f, 0.414515f, 0.559891f, 0.697019f,
+ 0.541231f, 0.505310f, -0.478101f, -0.444267f, -0.586539f, -0.445996f,
+ -0.451873f, -0.530085f, -0.447980f, -0.364955f, 0.372435f, 0.318894f,
+ 0.351211f, 0.193961f, 0.212295f, 0.212842f, 0.220003f, 0.243743f,
+ -0.388628f, -0.789620f, -0.536618f, -0.430691f, 0.247004f, 0.266489f,
+ 0.261033f, 0.263692f, 0.050089f, 0.048958f, 0.065207f, 0.120180f,
+ -0.526230f, -0.481969f, -0.422411f, -0.272292f, 0.155593f, 0.229614f,
+ 0.139579f, 0.171805f, -0.251924f, -0.302067f, -0.126157f, -0.346650f,
+ -1.195450f, -1.281100f, -0.987911f, -1.478440f, 0.285667f, 0.284802f,
+ 0.301887f, 0.259556f, -0.194127f, -0.090440f, -0.257959f, -0.259572f,
+ -0.012273f, -0.049993f, -0.099431f, 0.012506f, 0.081526f, 0.166279f,
+ 0.042594f, 0.185121f, 0.148830f, 0.073161f, 0.201728f, 0.125747f,
+ -0.295065f, -0.187585f, -0.333066f, -0.312291f, 0.253458f, 0.321585f,
+ 0.178844f, 0.219944f, -0.763475f, -0.943374f, -0.816825f, -0.709901f,
+ -0.166132f, 0.129186f, 0.015405f, -0.065623f, -0.246006f, -0.340385f,
+ -0.118155f, -0.384905f, -0.233883f, -0.400666f, -0.228597f, -0.228428f,
+ -0.559083f, -0.377784f, -0.541458f, -0.542870f, 0.067400f, 0.122987f,
+ 0.180901f, 0.186004f, -0.482910f, -0.424823f, -0.477831f, -0.394719f,
+ 0.091558f, 0.049248f, 0.049370f, 0.160429f, 0.133641f, 0.096625f,
+ 0.104429f, 0.100782f, -0.238252f, -0.221459f, -0.196974f, -0.250393f,
+ -3.071750f, -2.418450f, -0.861410f, -1.051580f, 0.071263f, 0.118014f,
+ -0.028430f, -0.072073f, -0.074463f, 0.034168f, 0.044089f, -0.091109f,
+ -3.153840f, -2.945850f, -1.977360f, -1.498850f, -0.083429f, 0.131835f,
+ -0.063865f, -0.065785f, -0.069346f, -0.015520f, -0.119551f, 0.044881f,
+ -0.105280f, 0.127516f, 0.005255f, -0.142777f, 0.061055f, -0.117250f,
+ 0.020454f, 0.157879f, -0.213812f, -0.151783f, 0.028583f, 0.137759f,
+ -3.248250f, -3.005940f, -1.510540f, -1.475390f, 0.081874f, -0.171465f,
+ -0.135690f, -0.001989f, -0.227574f, -0.132799f, -0.359742f, -0.137197f,
+ 0.066324f, 0.039194f, -0.050857f, 0.095166f, 0.044475f, 0.011221f,
+ 0.054904f, 0.061414f, -0.039189f, 0.123751f, -0.017171f, -0.008494f,
+ -2.598220f, -2.832670f, -1.622030f, -1.201990f, 0.154313f, -0.021436f,
+ 0.042190f, 0.143947f, -0.090623f, 0.086853f, 0.143137f, 0.099821f,
+ -1.732820f, -1.429730f, -0.775125f, -0.648036f, 0.082176f, 0.079448f,
+ -0.040575f, 0.024511f, -0.064105f, -0.117122f, -0.190323f, -0.182589f,
+ -0.076430f, -0.095615f, -0.112513f, -0.101581f, 0.143037f, 0.148180f,
+ 0.430958f, 0.359225f, 0.001403f, -0.080541f, -0.295001f, -0.156706f,
+ 0.426623f, 0.475597f, 0.455210f, 0.454352f, 0.074365f, 0.099440f,
+ 0.066348f, -0.007078f, 0.008335f, -0.097116f, -0.133687f, -0.110535f,
+ 0.204145f, 0.281478f, 0.078886f, 0.112857f, -0.103620f, -0.068247f,
+ 0.191147f, 0.227593f, -0.011816f, -0.058755f, -0.149477f, -0.101828f,
+ 0.079878f, 0.304949f, 0.557555f, 0.305288f, -0.150955f, -0.118610f,
+ 0.052073f, 0.064707f, -0.121728f, -0.151132f, -0.193987f, -0.175046f,
+ 0.043655f, 0.105270f, -0.120715f, -0.040976f, 0.047776f, -0.004443f,
+ 0.149606f, 0.111240f, -0.047502f, -0.064146f, -0.151858f, -0.151872f,
+ -0.160207f, -0.113846f, -0.081585f, -0.006708f, -0.203760f, -0.068597f,
+ -0.179979f, -0.127779f, -0.062460f, -0.064513f, -0.121479f, -0.111122f,
+ -0.212384f, -0.229157f, -0.283428f, -0.184891f,
+};
+
+static const float weights_layer_3[] = {
+ -0.039388f, 0.033048f, -0.113003f, -0.011642f, 0.170478f, 0.145713f,
+ 0.040189f, -0.280129f, -0.049050f, -0.043788f, -0.157425f, 0.323829f,
+ -0.250725f, -0.166349f, 0.101650f, -0.049690f, 0.205606f, 0.281131f,
+ 0.623204f, 0.993452f, -0.015115f, -0.138995f, 0.009473f, 0.157673f,
+ -0.024687f, -0.067214f, 0.125566f, -0.317619f, 0.057002f, 0.031202f,
+ -0.018167f, 0.068542f, 0.011609f, -0.020233f, -0.000428f, -0.035956f,
+ -0.843274f, -0.800587f, -0.214917f, -0.221250f, 0.031255f, -0.077330f,
+ -0.074902f, -0.063979f, -0.055562f, 0.679495f, 0.146609f, 1.315330f,
+ -0.118399f, -0.034539f, -0.050377f, 0.172867f, -0.204607f, -0.034930f,
+ 0.176014f, 0.089747f, -0.003889f, 0.044980f, 0.002386f, -0.141723f,
+ -0.035828f, -0.204701f, 0.099813f, 0.123580f, 0.209851f, -0.110989f,
+ -0.043655f, -0.461118f, -0.139664f, 0.026855f, -0.081714f, 0.207623f,
+ 0.089942f, 0.253082f, 0.680568f, 0.811360f, -0.090528f, -0.116818f,
+ -0.432361f, -0.075588f, -0.269924f, -0.276810f, -0.289192f, -0.282570f,
+ 0.245566f, 0.267216f, 0.238622f, 0.286528f, -0.157605f, -0.200401f,
+ -0.138924f, -0.185006f, 0.215203f, 0.203316f, 0.209532f, 0.293135f,
+ 0.928046f, 0.733323f, -0.094120f, 0.036918f, -0.126643f, -0.083371f,
+ -0.147530f, -0.153195f, 0.097097f, 0.101852f, 0.109160f, 0.105129f,
+ -0.051869f, -0.064359f, -0.073469f, -0.059591f, 0.102431f, 0.109444f,
+ 0.113614f, 0.105617f, 0.383311f, 0.325783f, 0.393234f, 0.382508f,
+ 0.194720f, 0.189672f, 0.217477f, 0.177786f, 0.326461f, 0.114789f,
+ 0.317061f, 0.048291f, -0.061143f, -0.134641f, -0.067895f, -0.108446f,
+ 0.082592f, 0.029918f, -0.006580f, 0.015533f, -0.053583f, -0.055540f,
+ -0.063395f, -0.023157f, -0.064955f, -0.073981f, -0.115452f, -0.086626f,
+ -0.036616f, 0.008454f, 0.012029f, -0.008039f, -0.207395f, -0.216419f,
+ -0.205363f, -0.249099f, 0.343308f, 0.413215f, -0.009918f, -0.109978f,
+ -0.059711f, -0.045089f, -0.029130f, -0.038483f, -0.070323f, -0.099409f,
+ -0.008849f, -0.063527f, 0.175963f, 0.185335f, 0.149151f, 0.199997f,
+ -0.027516f, -0.039812f, -0.027760f, -0.047910f, -0.007337f, 0.071065f,
+ 0.086225f, 0.125539f, 0.151390f, 0.215488f, 0.203450f, 0.045380f,
+ 0.095761f, 0.107809f, 0.103918f, 0.122383f, 0.116287f, 0.135455f,
+ 0.115446f, 0.155673f, -0.044648f, -0.027455f, -0.015473f, -0.026657f,
+ 0.089852f, 0.077459f, 0.077631f, 0.082507f, -0.102761f, -0.054669f,
+ -0.132223f, -0.024768f, 0.111573f, 0.060467f, 0.107883f, 0.056621f,
+ 0.219357f, -0.161153f, 0.074379f, -0.118743f, -0.169931f, -0.153995f,
+ -0.220003f, -0.200186f, 0.032318f, -0.060687f, -0.087550f, -0.038022f,
+ 0.026633f, -0.005534f, 0.029532f, 0.027081f, 0.011926f, 0.058412f,
+ 0.010631f, 0.003068f, -0.014911f, 0.063070f, 0.065271f, 0.089550f,
+ 0.012885f, 0.005320f, -0.037494f, -0.019849f, -0.009624f, -0.059090f,
+ -0.021222f, -0.088033f, -0.055261f, -0.055113f, -0.047598f, -0.055478f,
+ -0.023648f, -0.046827f, -0.036572f, -0.057655f, 0.104194f, 0.179800f,
+ 0.175751f, 0.192851f, -0.016950f, -0.073650f, -0.028592f, -0.088219f,
+ 0.011130f, 0.061825f, 0.025643f, 0.034183f, 0.095548f, 0.001457f,
+ -0.132869f, 0.032981f, -0.140178f, -0.105343f, -0.161799f, -0.161983f,
+ 0.177746f, 0.132903f, 0.135627f, 0.152489f, -0.012532f, -0.068747f,
+ -0.085849f, -0.095434f, 0.087037f, 0.139497f, 0.111899f, 0.100189f,
+ -0.024649f, -0.092003f, 0.020783f, -0.115807f, 0.092039f, 0.093943f,
+ 0.109466f, 0.049639f, -0.133727f, 0.128430f, -0.050546f, 0.190632f,
+ 0.123733f, 0.082305f, 0.114878f, 0.122572f, 0.201618f, 0.137588f,
+ 0.065582f, 0.125161f, -0.095179f, -0.120719f, -0.127126f, -0.101961f,
+ -0.118120f, -0.104833f, -0.179632f, -0.131764f, -0.138096f, -0.147861f,
+ -0.131512f, -0.153905f, -0.201816f, -0.206641f, -0.196707f, -0.160013f,
+ -0.212605f, -0.093998f, -0.186258f, -0.076137f, -0.065340f, -0.006969f,
+ -0.071383f, -0.075005f,
+};
+
+static const float weights_layer_4[] = {
+ -0.016102f, -0.022836f, 0.624049f, 0.273485f, 0.222800f, -0.290175f,
+ -0.518415f, 0.413484f, -0.264495f, 0.498083f, -0.450145f, -0.106419f,
+ 0.095103f, -0.187451f, 0.145933f, -0.371542f, -0.088871f, 0.184017f,
+ -0.429625f, -0.110882f, 0.292781f, 0.289588f, 0.185127f, 0.326017f,
+ -0.432009f, -0.342663f, -0.312206f, 0.004004f, -1.114290f, 0.028497f,
+ -0.264944f, -0.419611f, 0.046336f, 0.138232f, -0.869528f, 0.425557f,
+ -0.954838f, -0.186830f, -0.464622f, -0.757107f, -0.432686f, -0.125978f,
+ -0.402633f, -0.172266f, -0.041749f, -0.822238f, -0.118486f, 0.238617f,
+ -0.198037f, 0.146347f, 0.405257f, 0.513303f, -0.078876f, -0.300385f,
+ -0.010293f, -0.183962f, 0.155738f, 0.186797f, -0.086814f, 0.000179f,
+ 0.123467f, 0.362523f, 0.068805f, 0.371834f, 0.038122f, -0.117867f,
+ -0.120445f, -0.422322f, -0.131402f, 0.285449f, 0.038957f, 0.008844f,
+ -0.020197f, 0.187723f, 0.190433f, 0.146532f, -0.091068f, -0.270865f,
+ -0.194231f, -0.226777f, 0.013548f, 0.248351f, 0.537685f, 0.056316f,
+ -0.171540f, -0.003865f, 0.406439f, 0.126507f, 0.192780f, 0.149335f,
+ -0.149602f, 0.255202f, -0.015426f, 0.032335f, -1.791330f, -0.894602f,
+ -0.196641f, -0.282846f, -0.391100f, -0.040969f, 0.049934f, 0.056348f,
+ -0.041426f, -0.075159f, -0.658335f, -0.827270f, -0.175029f, -0.427235f,
+ 0.311201f, 0.560413f, 0.363408f, 0.374580f, -0.433531f, -0.180580f,
+ 0.142142f, 0.194768f, -0.054118f, -0.376541f, -0.366185f, -0.308782f,
+ -0.273143f, -0.074097f, 0.009000f, -0.182198f, -0.015616f, -0.003882f,
+ -0.174340f, -0.354866f, 0.527972f, 0.348355f, 0.091381f, -0.419828f,
+ -0.530529f, 0.159899f, -0.511867f, -0.104237f, -0.286079f, -0.659039f,
+ -0.266596f, -0.256557f, -0.600437f, -0.446333f, -0.229629f, 0.024931f,
+ -0.143716f, -0.415754f, -0.003760f, -0.107195f, -0.666165f, -0.697312f,
+ -0.650255f, -0.703877f, 0.243402f, 0.426710f, 0.217210f, 0.260255f,
+ 0.027416f, 0.163147f, 0.132188f, 0.142374f, 0.558627f, 0.065717f,
+ 0.382781f, -1.192240f, 0.195492f, 0.028439f, 0.278252f, -0.491806f,
+ 0.497701f, -0.448835f, -0.245079f, -0.014336f, -0.174907f, -0.409633f,
+ 0.207548f, 0.433813f, 0.459889f, 0.431728f, 0.605050f, 0.485520f,
+ 0.218548f, 0.437307f, 0.027023f, -0.204251f, 0.012100f, 0.150677f,
+ -1.097980f, 0.086866f, -1.293130f, -0.372575f, -0.876264f, -0.021818f,
+ 0.322864f, -0.231043f, -0.271608f, 0.132782f, -0.314895f, 0.396800f,
+ 0.262788f, -0.317212f, -0.666308f, 0.830742f, 0.319409f, -0.564373f,
+ -0.178656f, 0.306993f, 0.265634f, -0.332480f, -0.491514f, -0.186745f,
+ -0.063044f, -0.009321f, 0.074944f, -0.372082f, -0.029479f, 0.081548f,
+ 0.028172f, -0.233148f, -0.337938f, -0.087695f, 0.596556f, 0.559530f,
+ 0.139332f, 0.107223f, -0.190915f, 0.137401f, -0.150625f, -0.225484f,
+ -0.191344f, -0.232535f, 0.126510f, 0.296323f, -0.547901f, -0.653080f,
+ 0.358514f, 0.726289f, -0.421725f, -0.243620f, 0.236206f, 0.390823f,
+ -0.076560f, -0.282329f, -0.012460f, -0.428484f, 0.349469f, 0.394629f,
+ 0.421537f, 0.219632f, -0.117550f, -0.087894f, 0.077155f, 0.016000f,
+ -0.289137f, -0.092937f, -0.014518f, -0.027111f, 0.210329f, -0.159678f,
+ 0.013288f, -0.039268f, 0.008112f, 0.003152f, 0.030084f, -0.039859f,
+ 0.322028f, -0.407797f, 0.447087f, -0.381562f, 0.529297f, -0.520298f,
+ 0.562865f, -0.616878f, 0.689389f, 0.754262f, 0.138475f, 0.750697f,
+ -0.760157f, -0.383740f, 0.074219f, 0.556257f, 0.087827f, -0.511826f,
+ -0.305507f, -0.638214f, 0.114833f, -0.444022f, 0.526612f, -0.604984f,
+ -0.100415f, 0.037824f, -0.106264f, 0.337615f, 0.070743f, 0.031129f,
+ 0.281954f, 0.176144f, -0.032833f, -0.073902f, -0.285492f, -0.803803f,
+ -0.015589f, 0.186077f, -0.033351f, 0.517269f, -1.878800f, -1.685210f,
+ -0.416581f, 0.158476f, -0.071929f, -0.624353f, -0.122069f, -0.075065f,
+ 0.311816f, 0.506305f, 0.383896f, 0.259450f, -0.308232f, -0.094221f,
+ -0.421885f, -0.293573f,
+};
+
+static const float weights_layer_5[] = {
+ 0.131894f, 0.078431f, 0.323121f, -0.230680f, -0.684740f, 0.020895f,
+ 0.364983f, 0.121656f, 0.132448f, -0.731198f, 0.071148f, 0.739642f,
+ 0.318437f, -0.033021f, -1.037080f, 0.135335f, 0.383582f, 0.287332f,
+ 0.054042f, -0.825482f, 0.418533f, 0.305606f, 0.041549f, 0.432422f,
+ -0.826878f, -0.593536f, 0.105657f, 0.125357f, 0.408567f, -0.293338f,
+ 0.233905f, -0.039609f, 0.547727f, -0.435806f, 0.036160f, 0.220275f,
+ -0.020337f, -0.619403f, -0.455858f, 0.681455f, 0.543846f, -0.495084f,
+ 0.251496f, -0.085686f, 0.091395f, -0.476696f, 0.453628f, -0.109663f,
+ 0.383493f, -0.456563f, -0.212935f, 0.020567f, -0.719564f, -0.377813f,
+ -0.737511f, 0.765965f, 0.624309f, -0.063679f, -0.055681f, -0.475969f,
+ -0.069902f, 0.725690f, 0.641094f, 0.439922f, -0.111544f, -0.309061f,
+ 0.280091f, 0.381416f, 0.481168f, 0.483543f, -0.901267f, -0.499230f,
+ 0.043449f, -0.372395f, 0.021216f, -0.002200f, -0.524089f, -0.071485f,
+ -0.273974f, -0.462654f, 0.042369f, -0.138679f, -0.330060f, 0.021886f,
+ -0.306075f, -0.011130f, -0.260224f, -0.288435f, -0.104039f, -0.183563f,
+ 0.118990f, -0.531160f, 0.339632f, -0.028374f, 0.159084f, -0.008824f,
+ -0.791388f, 0.245242f, 0.356510f, 0.469867f, -0.396949f, -0.476146f,
+ -0.168472f, 1.068400f, 0.474629f, -0.117554f, -0.142453f, -0.306604f,
+ 0.348525f, -0.111929f, -0.435384f, 0.019952f, -0.260185f, 0.373376f,
+ 0.109729f, -0.639168f, 0.033392f, -0.082573f, -0.196018f, 0.301637f,
+ -0.124210f, -0.202515f, -1.221920f, -0.253690f, -0.144864f, 0.287753f,
+ -0.161206f, -0.213246f, 0.373968f, 0.141397f, -0.248237f, 0.283090f,
+ -0.008977f, -0.172960f, -0.234146f, -0.720014f, -0.322451f, 0.181083f,
+ 0.310659f, -0.422646f, -0.719994f, -0.354339f, 0.352739f, 0.230923f,
+ 0.427013f, -0.660316f, 0.232140f, 0.685896f, 0.660208f, 0.225748f,
+ -0.918750f, -0.650790f, -0.674525f, -0.450305f, -0.152529f, 0.498480f,
+ 0.895092f, 0.688242f, 0.669057f, 0.612669f, 0.593484f, 0.318204f,
+ -0.169294f, 0.388789f, -0.529777f, -0.219706f, -0.044916f, 0.161697f,
+ -0.145288f, 0.196153f, -0.022212f, -0.434209f, -0.208115f, -0.117745f,
+ -0.279029f, -0.009506f, 0.137474f, 0.330148f, 0.439258f, 0.345879f,
+ -0.845131f, -0.215713f, 0.094463f, 0.638604f, 0.882254f, -0.964082f,
+ -0.383920f, 0.292645f, 0.266341f, 0.747473f, -0.645631f, -0.538896f,
+ -0.319764f, 0.521880f, 0.460091f, -0.470898f, -0.778283f, -0.061622f,
+ -0.142433f, 0.210520f, 0.804197f, 0.285840f, -0.138414f, -0.381846f,
+ -0.499991f, 0.223648f, 0.439025f, 0.321508f, -0.099560f, -0.622893f,
+ 0.750925f, 0.740994f, 0.140405f, 0.074631f, -0.270223f, -0.829049f,
+ -0.753355f, -0.258015f, 0.006285f, -0.730573f, -1.107390f, -0.538015f,
+ -1.005520f, -0.724115f, -0.440183f, -0.395239f, 0.508768f, 0.204620f,
+ -0.267331f, 0.001740f, -0.838709f, 0.659333f, 0.043739f, -0.024099f,
+ 0.262431f, 0.252433f, -0.265215f, 0.057289f, -0.428192f, -0.114350f,
+ -0.011475f, 0.463995f, 0.668833f, -0.604556f, -0.122780f, -0.441645f,
+ 0.145769f, 0.310450f, -1.003500f, 0.936069f, 0.516604f, -0.643386f,
+ -0.518571f, 0.306130f, 0.337387f, 0.583400f, -0.366025f, -0.560035f,
+ -0.262332f, 0.465242f, 0.964332f, -0.545410f, -0.637428f, -0.202695f,
+ 0.378931f, 0.834604f, 0.000970f, -0.553303f, -0.562879f, 0.221665f,
+ 0.395160f, 0.446281f, -0.184394f, -0.591780f, 0.170595f, 1.164390f,
+ 0.227068f, -0.150910f, -0.393690f, -0.131151f, 0.309956f, -0.413518f,
+ -0.768334f, -0.548975f, 0.245384f, -0.256904f, -0.514790f, -0.102616f,
+ -0.347625f, 0.420456f, 0.037804f, -0.283200f, -0.578815f, 0.319282f,
+ 0.674622f, -0.011791f, -0.339329f, 0.466705f, 0.563444f, 0.409660f,
+ 0.445784f, -0.899507f, -0.605116f, 0.622438f, 0.427385f, -0.062509f,
+ 0.666570f, 0.057105f, 0.357894f, -0.811016f, -0.421715f, -0.458397f,
+ 0.288955f, 0.005857f, 0.236331f, 0.107957f, 0.587276f, -0.375800f,
+ 0.323799f, -0.623363f, 0.254122f, -0.198478f, -0.098436f, -0.282531f,
+ 0.452453f, -0.163349f, -0.413382f, -0.448732f, -0.528770f, -0.457449f,
+ -0.619619f, -0.265919f, -0.042760f, 0.438730f, 0.501798f, -0.403851f,
+ 0.519564f, 0.817314f, 0.366203f, 0.492610f, 0.546929f, 0.853094f,
+ 0.289000f, 0.453941f, -0.076152f, 0.007226f, -0.183717f, -0.506252f,
+ -0.599989f, -0.576006f, 0.746488f, 0.631466f, -0.475599f, -0.334991f,
+ -0.879614f, 0.918957f, 0.473471f, -0.043781f, -0.688234f, -0.925875f,
+ -0.188081f, 0.050918f, 0.116855f, 0.221413f, -0.066680f, -0.674395f,
+ -0.481985f, 0.247368f, 0.271129f, 0.637979f, -1.006970f, -0.855441f,
+ 0.144874f, 0.507424f, 1.506960f, -0.338910f, 0.398203f, 0.738000f,
+ 0.263193f, -0.425908f, 0.358271f, -1.072900f, -0.816209f, -0.425519f,
+ 0.264373f, 0.694014f, 0.036333f, 0.635532f, 0.518856f, 0.047585f,
+ -0.854817f, -0.138202f, 0.006811f, -0.052020f, -0.468498f, 0.489080f,
+ -0.105778f, 0.357038f, -0.782875f, 0.649049f, -0.562652f, -0.544392f,
+ -0.328526f, -0.402121f, -0.263172f, -0.668459f, -0.526702f, -0.395829f,
+ 0.190986f, 0.307766f, -1.001830f, -0.293051f, 0.283334f, 0.572450f,
+ 0.906095f, -1.144300f, 0.180989f, 0.421092f, 0.684571f, 0.527276f,
+ -0.122287f, 0.575067f, 0.675221f, 0.755029f, 0.094957f, 0.481403f,
+ 0.825155f, 0.755035f, 0.641420f, 0.034497f, 0.518783f, 0.283800f,
+ 0.293733f, -0.074778f, -0.268720f, 0.798921f, 0.317714f, -0.236391f,
+ -0.375071f, -0.414600f, 0.223413f, -0.349044f, -0.191033f, -0.391779f,
+ -0.596894f, -0.378608f, -0.185920f, -0.822171f, -0.754962f, -0.167706f,
+ 0.755378f, 0.671847f, 0.969414f, 0.793048f, 1.078610f, -0.418963f,
+ 0.367648f, 0.217645f, 0.294232f, 0.113027f, 0.060312f, -0.327488f,
+ -0.305035f, -0.243600f, -0.020588f, -0.326324f, -0.417534f, -0.425868f,
+ -0.404614f, -0.346750f, -0.339145f, -0.348094f, -0.527290f, -0.617825f,
+ -0.258342f, -0.200753f, -0.249779f, -0.321039f, -0.023117f, -0.004167f,
+ -0.206788f, -0.612420f, -0.646428f, -0.548969f, -0.158875f, 0.213814f,
+ -0.084040f, -0.217365f, -0.511895f, -0.653285f, 0.440971f, 0.455591f,
+ -0.123900f, 0.134097f, -0.251241f, 0.682463f, 0.740614f, 0.991212f,
+ 0.565984f, 0.592690f,
+};
+
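+// Horizontally reduce four float32x4_t partial sums into one vector with a
+// pairwise tree: (a[0] + a[1]) + (a[2] + a[3]).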
+static INLINE float32x4_t add_f32x4_x4(const float32x4_t a[4]) {
+ float32x4_t sum01 = vaddq_f32(a[0], a[1]);
+ float32x4_t sum23 = vaddq_f32(a[2], a[3]);
+ return vaddq_f32(sum01, sum23);
+}
+
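+// 2x2 filter with 2x2 skip and valid padding, specialized for in_width >= 16:
+// each pass of the inner loop consumes two input channels and accumulates two
+// groups of four horizontally adjacent outputs (sum0 and sum1), so eight
+// output values are written per iteration of the w loop.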
+static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const float *bias, const int skip_width, const int skip_height,
+ const int filter_width, const int filter_height, const int in_channels,
+ const int out_channels, float **output, int out_stride, int start_idx,
+ const float *weights) {
+ assert(filter_height == 2 && filter_width == 2);
+ assert(skip_width == 2 && skip_height == 2);
+ assert(in_width >= 16);
+ const int in_size = in_height * in_width;
+
+ do {
+    const float32x4_t bias_v = vdupq_n_f32(*bias);
+ const float *weight_ptr0 = weights;
+ const float *in_ptr0 = *input;
+ float *out_ptr0 = *output;
+ int h = 0;
+
+ do {
+ const float *in_ptr1 = in_ptr0;
+ float *out_ptr1 = out_ptr0;
+ int w = 0;
+
+ do {
+ const float *weight_ptr1 = weight_ptr0;
+ const float *in_ptr2 = in_ptr1;
+ int k = 0;
+ float32x4_t sum0[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0),
+ vdupq_n_f32(0) };
+ float32x4_t sum1[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0),
+ vdupq_n_f32(0) };
+
+ do {
+ const float32x4_t weights0 = vld1q_f32(weight_ptr1);
+ const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4);
+ const float32x2_t weights0_lo = vget_low_f32(weights0);
+ const float32x2_t weights0_hi = vget_high_f32(weights0);
+ const float32x2_t weights1_lo = vget_low_f32(weights1);
+ const float32x2_t weights1_hi = vget_high_f32(weights1);
+
+ const float32x4x2_t in0_lo_0 = vld2q_f32(in_ptr2);
+ const float32x4x2_t in0_hi_0 = vld2q_f32(in_ptr2 + in_stride);
+ const float32x4x2_t in1_lo_0 = vld2q_f32(in_ptr2 + in_size);
+ const float32x4x2_t in1_hi_0 =
+ vld2q_f32(in_ptr2 + in_size + in_stride);
+
+ sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[0], weights0_lo, 0);
+ sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[1], weights0_lo, 1);
+
+ sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[0], weights0_hi, 0);
+ sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[1], weights0_hi, 1);
+
+ sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[0], weights1_lo, 0);
+ sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[1], weights1_lo, 1);
+
+ sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[0], weights1_hi, 0);
+ sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[1], weights1_hi, 1);
+
+ const float32x4x2_t in0_lo_1 = vld2q_f32(in_ptr2 + 8);
+ const float32x4x2_t in0_hi_1 = vld2q_f32(in_ptr2 + in_stride + 8);
+ const float32x4x2_t in1_lo_1 = vld2q_f32(in_ptr2 + in_size + 8);
+ const float32x4x2_t in1_hi_1 =
+ vld2q_f32(in_ptr2 + in_size + in_stride + 8);
+
+ sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[0], weights0_lo, 0);
+ sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[1], weights0_lo, 1);
+
+ sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[0], weights0_hi, 0);
+ sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[1], weights0_hi, 1);
+
+ sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[0], weights1_lo, 0);
+ sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[1], weights1_lo, 1);
+
+ sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[0], weights1_hi, 0);
+ sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[1], weights1_hi, 1);
+
+ weight_ptr1 += 8;
+ in_ptr2 += 2 * in_size;
+ k += 2;
+ } while (k < in_channels);
+
+ vst1q_f32(out_ptr1, add_f32x4_x4(sum0));
+ vst1q_f32(out_ptr1 + 4, add_f32x4_x4(sum1));
+
+ out_ptr1 += 8;
+ in_ptr1 += 8 * skip_width;
+ w += 8 * skip_width;
+ } while (w < in_width - filter_width + 1);
+
+ out_ptr0 += out_stride;
+ in_ptr0 += skip_height * in_stride;
+ h += skip_height;
+ } while (h < in_height - filter_height + 1);
+
+ ++bias;
+ ++output;
+ weights += in_channels * filter_height * filter_width;
+ } while (++start_idx < out_channels);
+}
+
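+// As above, but specialized for in_width == 8: only one group of four outputs
+// is live per iteration of the w loop.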
+static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const float *bias, const int skip_width, const int skip_height,
+ const int filter_width, const int filter_height, const int in_channels,
+ const int out_channels, float **output, int out_stride, int start_idx,
+ const float *weights) {
+ assert(filter_height == 2 && filter_width == 2);
+ assert(skip_width == 2 && skip_height == 2);
+ assert(in_width == 8);
+ const int in_size = in_height * in_width;
+ do {
+ const float32x4_t bias_v = vdupq_n_f32(*bias);
+ const float *weight_ptr0 = weights;
+ const float *in_ptr0 = *input;
+ float *out_ptr0 = *output;
+ int h = 0;
+
+ do {
+ const float *in_ptr1 = in_ptr0;
+ float *out_ptr1 = out_ptr0;
+ int w = 0;
+
+ do {
+ const float *weight_ptr1 = weight_ptr0;
+ const float *in_ptr2 = in_ptr1;
+ int k = 0;
+ float32x4_t sum[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0),
+ vdupq_n_f32(0) };
+
+ do {
+ const float32x4_t weights0 = vld1q_f32(weight_ptr1);
+ const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4);
+ const float32x2_t weights0_lo = vget_low_f32(weights0);
+ const float32x2_t weights0_hi = vget_high_f32(weights0);
+ const float32x2_t weights1_lo = vget_low_f32(weights1);
+ const float32x2_t weights1_hi = vget_high_f32(weights1);
+
+ const float32x4x2_t in0_lo = vld2q_f32(in_ptr2);
+ const float32x4x2_t in0_hi = vld2q_f32(in_ptr2 + in_stride);
+ const float32x4x2_t in1_lo = vld2q_f32(in_ptr2 + in_size);
+ const float32x4x2_t in1_hi = vld2q_f32(in_ptr2 + in_size + in_stride);
+
+ sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[0], weights0_lo, 0);
+ sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[1], weights0_lo, 1);
+
+ sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[0], weights0_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[1], weights0_hi, 1);
+
+ sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[0], weights1_lo, 0);
+ sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[1], weights1_lo, 1);
+
+ sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[0], weights1_hi, 0);
+ sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[1], weights1_hi, 1);
+
+ weight_ptr1 += 8;
+ in_ptr2 += 2 * in_size;
+ k += 2;
+ } while (k < in_channels);
+
+ vst1q_f32(out_ptr1, add_f32x4_x4(sum));
+
+ out_ptr1 += 4;
+ in_ptr1 += 4 * skip_width;
+ w += 4 * skip_width;
+ } while (w < in_width - filter_width + 1);
+
+ out_ptr0 += out_stride;
+ in_ptr0 += skip_height * in_stride;
+ h += skip_height;
+ } while (h < in_height - filter_height + 1);
+
+ ++bias;
+ ++output;
+ weights += in_channels * filter_height * filter_width;
+ } while (++start_idx < out_channels);
+}
+
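+// 5x5 filter with 4x4 skip and a single input channel (layer 0). vld4q_f32
+// de-interleaves each input row so that lane i of the accumulators tracks the
+// output whose window starts at column 4 * i; the fifth tap of each kernel
+// row comes from the vextq_f32-shifted in*_4 vectors.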
+static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const float *bias, const int skip_width, const int skip_height,
+ const int filter_width, const int filter_height, const int in_channels,
+ const int out_channels, float **output, int out_stride, int start_idx,
+ const float *weights) {
+ assert(filter_height == 5 && filter_width == 5);
+ assert(skip_width == 4 && skip_height == 4);
+ assert(in_width >= 16);
+ assert(in_channels == 1);
+ (void)in_channels;
+
+ do {
+ const float32x4_t bias_v = vdupq_n_f32(*bias);
+ const float *in_ptr0 = *input;
+ const float *weights_ptr0 = weights;
+ float *out_ptr0 = *output;
+ int h = 0;
+
+ do {
+ const float *in_ptr1 = in_ptr0;
+ float *out_ptr1 = out_ptr0;
+ int w = 0;
+
+ do {
+ float32x4_t sum[2] = { bias_v, vdupq_n_f32(0) };
+
+ const float32x4_t weight_0_3 = vld1q_f32(weights_ptr0);
+ const float32x4_t weight_4_7 = vld1q_f32(weights_ptr0 + 4);
+ const float32x4_t weight_8_11 = vld1q_f32(weights_ptr0 + 8);
+ const float32x4_t weight_12_15 = vld1q_f32(weights_ptr0 + 12);
+ const float32x4_t weight_16_19 = vld1q_f32(weights_ptr0 + 16);
+ const float32x4_t weight_20_23 = vld1q_f32(weights_ptr0 + 20);
+
+ const float32x2_t weight_0_3_lo = vget_low_f32(weight_0_3);
+ const float32x2_t weight_0_3_hi = vget_high_f32(weight_0_3);
+ const float32x2_t weight_4_7_lo = vget_low_f32(weight_4_7);
+ const float32x2_t weight_4_7_hi = vget_high_f32(weight_4_7);
+ const float32x2_t weight_8_11_lo = vget_low_f32(weight_8_11);
+ const float32x2_t weight_8_11_hi = vget_high_f32(weight_8_11);
+ const float32x2_t weight_12_15_lo = vget_low_f32(weight_12_15);
+ const float32x2_t weight_12_15_hi = vget_high_f32(weight_12_15);
+ const float32x2_t weight_16_19_lo = vget_low_f32(weight_16_19);
+ const float32x2_t weight_16_19_hi = vget_high_f32(weight_16_19);
+ const float32x2_t weight_20_23_lo = vget_low_f32(weight_20_23);
+ const float32x2_t weight_20_23_hi = vget_high_f32(weight_20_23);
+
+ const float32x4x4_t in0 = vld4q_f32(in_ptr1 + 0 * in_stride);
+ const float32x4x4_t in1 = vld4q_f32(in_ptr1 + 1 * in_stride);
+ const float32x4x4_t in2 = vld4q_f32(in_ptr1 + 2 * in_stride);
+ const float32x4x4_t in3 = vld4q_f32(in_ptr1 + 3 * in_stride);
+ const float32x4x4_t in4 = vld4q_f32(in_ptr1 + 4 * in_stride);
+
+ const float32x4_t in0_4 = vextq_f32(
+ in0.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 0 * in_stride)), 1);
+ const float32x4_t in1_4 = vextq_f32(
+ in1.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 1 * in_stride)), 1);
+ const float32x4_t in2_4 = vextq_f32(
+ in2.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 2 * in_stride)), 1);
+ const float32x4_t in3_4 = vextq_f32(
+ in3.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 3 * in_stride)), 1);
+ const float32x4_t in4_4 = vextq_f32(
+ in4.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 4 * in_stride)), 1);
+
+ // Kernel row 0.
+ sum[0] = vmlaq_lane_f32(sum[0], in0.val[0], weight_0_3_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in0.val[1], weight_0_3_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in0.val[2], weight_0_3_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in0.val[3], weight_0_3_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in0_4, weight_4_7_lo, 0);
+
+ // Kernel row 1.
+ sum[1] = vmlaq_lane_f32(sum[1], in1.val[0], weight_4_7_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in1.val[1], weight_4_7_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in1.val[2], weight_4_7_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in1.val[3], weight_8_11_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in1_4, weight_8_11_lo, 1);
+
+ // Kernel row 2.
+ sum[0] = vmlaq_lane_f32(sum[0], in2.val[0], weight_8_11_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in2.val[1], weight_8_11_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in2.val[2], weight_12_15_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in2.val[3], weight_12_15_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in2_4, weight_12_15_hi, 0);
+
+ // Kernel row 3.
+ sum[1] = vmlaq_lane_f32(sum[1], in3.val[0], weight_12_15_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in3.val[1], weight_16_19_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in3.val[2], weight_16_19_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in3.val[3], weight_16_19_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in3_4, weight_16_19_hi, 1);
+
+ // Kernel row 4.
+ sum[0] = vmlaq_lane_f32(sum[0], in4.val[0], weight_20_23_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in4.val[1], weight_20_23_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in4.val[2], weight_20_23_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in4.val[3], weight_20_23_hi, 1);
+ sum[0] = vmlaq_f32(sum[0], vdupq_n_f32(*(weights_ptr0 + 24)), in4_4);
+
+ vst1q_f32(out_ptr1, vaddq_f32(sum[0], sum[1]));
+
+ out_ptr1 += 4;
+ in_ptr1 += 4 * skip_width;
+ w += 4 * skip_width;
+ } while (w < in_width - filter_width + 1);
+
+ out_ptr0 += out_stride;
+ in_ptr0 += skip_height * in_stride;
+ h += skip_height;
+ } while (h < in_height - filter_height + 1);
+
+ ++output;
+ ++bias;
+ weights += 25;
+ } while (++start_idx < out_channels);
+}
+
+// Neon variant of av1_cnn_convolve_no_maxpool_padding_valid_c().
+// In the current encoder, av1_cnn_convolve() is called for a block size of
+// 64x64. It uses the layer config values set by
+// av1_intra_mode_cnn_partition_cnn_config. Each layer's config parameters
+// are summarized below.
+// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht
+// 0 64x64 16x16 5 5 4 4
+// 1 16x16 8x8 2 2 2 2
+// 2 8x8 4x4 2 2 2 2
+// 3 4x4 2x2 2 2 2 2
+// 4 2x2 1x1 2 2 2 2
+// Here,
+// filter_wd = filter_width and filter_ht = filter_height,
+// skip_wd = skip_width and skip_ht = skip_height.
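+//
+// For reference, each output element produced by the kernels above corresponds
+// to the scalar expression below (a sketch of a strided valid-padding
+// convolution; the index names are illustrative, not upstream identifiers):
+//   out[c][v][u] = bias[c]
+//       + sum over k, i, j of
+//             in[k][v * skip_height + i][u * skip_width + j] * w[c][k][i][j]
+// where k runs over in_channels, 0 <= i < filter_height and
+// 0 <= j < filter_width.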
+void av1_cnn_convolve_no_maxpool_padding_valid_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+ int start_idx, int cstep, int channel_step) {
+ assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
+ !layer_config->maxpool);
+ assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
+ assert(layer_config->pad == PADDING_VALID);
+ assert(channel_step == 1);
+ assert(cstep == layer_config->in_channels * layer_config->out_channels);
+
+ if (layer_config->filter_width == 5 && layer_config->filter_height == 5 &&
+ layer_config->skip_width == 4 && layer_config->skip_height == 4) {
+ av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon(
+ input, in_width, in_height, in_stride, layer_config->bias,
+ layer_config->skip_width, layer_config->skip_height,
+ layer_config->filter_width, layer_config->filter_height,
+ layer_config->in_channels, layer_config->out_channels, output,
+ out_stride, start_idx, weights_layer_5);
+ } else if (layer_config->filter_width == 2 &&
+ layer_config->filter_height == 2 &&
+ layer_config->skip_width == 2 && layer_config->skip_height == 2) {
+ const float *weights = weights_layer_1;
+ if (layer_config->output_num ==
+ av1_intra_mode_cnn_partition_cnn_config.layer_config[2].output_num) {
+ weights = weights_layer_2;
+  } else if (layer_config->output_num ==
+             av1_intra_mode_cnn_partition_cnn_config.layer_config[3]
+                 .output_num) {
+    weights = weights_layer_3;
+  } else if (layer_config->output_num ==
+             av1_intra_mode_cnn_partition_cnn_config.layer_config[4]
+                 .output_num) {
+ weights = weights_layer_4;
+ }
+ if (in_width >= 16) {
+ av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon(
+ input, in_width, in_height, in_stride, layer_config->bias,
+ layer_config->skip_width, layer_config->skip_height,
+ layer_config->filter_width, layer_config->filter_height,
+ layer_config->in_channels, layer_config->out_channels, output,
+ out_stride, start_idx, weights);
+ } else if (in_width == 8) {
+ av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon(
+ input, in_width, in_height, in_stride, layer_config->bias,
+ layer_config->skip_width, layer_config->skip_height,
+ layer_config->filter_width, layer_config->filter_height,
+ layer_config->in_channels, layer_config->out_channels, output,
+ out_stride, start_idx, weights);
+ } else {
+ av1_cnn_convolve_no_maxpool_padding_valid_c(
+ input, in_width, in_height, in_stride, layer_config, output,
+ out_stride, start_idx, cstep, channel_step);
+ }
+ } else {
+ av1_cnn_convolve_no_maxpool_padding_valid_c(
+ input, in_width, in_height, in_stride, layer_config, output, out_stride,
+ start_idx, cstep, channel_step);
+ }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c
new file mode 100644
index 0000000000..582863a27c
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c
@@ -0,0 +1,646 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encodetxb.h"
+
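+// Fill the padded levels[] buffer with the saturated absolute values of the
+// transform coefficients. The padding regions are zeroed first so that the
+// context-derivation kernels can read past the block edges without branching.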
+void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = height + TX_PAD_HOR;
+ memset(levels - TX_PAD_TOP * stride, 0,
+ sizeof(*levels) * TX_PAD_TOP * stride);
+ memset(levels + stride * width, 0,
+ sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+ const int32x4_t zeros = vdupq_n_s32(0);
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (height == 4) {
+ do {
+ const int32x4_t coeffA = vld1q_s32(cf);
+ const int32x4_t coeffB = vld1q_s32(cf + height);
+ const int16x8_t coeffAB =
+ vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+ const int16x8_t absAB = vqabsq_s16(coeffAB);
+ const int8x8_t absABs = vqmovn_s16(absAB);
+#if AOM_ARCH_AARCH64
+ const int8x16_t absAB8 =
+ vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
+ const uint8x16_t lsAB =
+ vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
+#else
+ const int32x2x2_t absAB8 =
+ vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
+ const uint8x16_t lsAB =
+ vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
+#endif
+ vst1q_u8(ls, lsAB);
+ ls += (stride << 1);
+ cf += (height << 1);
+ i += 2;
+ } while (i < width);
+ } else if (height == 8) {
+ do {
+ const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
+ const int16x8_t absAB = vqabsq_s16(coeffAB);
+ const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
+ vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
+ vst1q_u8(ls, absAB8);
+ ls += stride;
+ cf += height;
+ i += 1;
+ } while (i < width);
+ } else {
+ do {
+ int j = 0;
+ do {
+ const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
+ const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8);
+ const int16x8_t absAB = vqabsq_s16(coeffAB);
+ const int16x8_t absCD = vqabsq_s16(coeffCD);
+ const uint8x16_t absABCD = vreinterpretq_u8_s8(
+ vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
+ vst1q_u8((ls + j), absABCD);
+ j += 16;
+ cf += 16;
+ } while (j < height);
+ *(int32_t *)(ls + height) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < width);
+ }
+}
+
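+// The pos_to_offset tables below map each coefficient position within a
+// 16-element processing group to its base context offset, so that one vector
+// add replaces the per-position branching of the scalar context derivation.
+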
+// get_4_nz_map_contexts_2d coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
+ { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
+ { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_ver coefficients:
+/* clang-format off */
+#define SIG_COEF_CONTEXTS_2D_X4_051010 \
+ (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
+ ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
+/* clang-format on */
+
+// get_4_nz_map_contexts_hor coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_hor[16]) = {
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_8_coeff_contexts_2d coefficients:
+// if (width == 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
+ { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+// if (width < 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
+ { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21 },
+ { 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21 }
+};
+
+// if (width > 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
+ { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_8_coeff_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_ver[16]) = {
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_16n_coeff_contexts_2d coefficients:
+// real_width == real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
+ { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width < real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
+ { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width > real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
+ { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_16n_coeff_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = {
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// End of the coefficient table declarations.
+
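+// Gather four 32-bit chunks located byte_stride apart into one 128-bit
+// register.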
+static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
+ const int byte_stride) {
+#if AOM_ARCH_AARCH64
+ uint32x4_t v_data = vld1q_u32((uint32_t *)src);
+ v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
+ v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
+ v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3);
+
+ return vreinterpretq_u8_u32(v_data);
+#else
+ return load_unaligned_u8q(src, byte_stride);
+#endif
+}
+
+static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
+ const int byte_stride) {
+#if AOM_ARCH_AARCH64
+ uint64x2_t v_data = vld1q_u64((uint64_t *)src);
+ v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1);
+
+ return vreinterpretq_u8_u64(v_data);
+#else
+ uint8x8_t v_data_low = vld1_u8(src);
+ uint8x8_t v_data_high = vld1_u8(src + byte_stride);
+
+ return vcombine_u8(v_data_low, v_data_high);
+#endif
+}
+
+static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
+ const int byte_stride) {
+ (void)byte_stride;
+ return vld1q_u8(src);
+}
+
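+// Load the five neighbouring level vectors needed for context derivation: the
+// two immediate neighbours at src[1] and src[stride], plus three further
+// neighbours addressed via offsets[].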
+static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride,
+ const ptrdiff_t *const offsets,
+ uint8x16_t *const level) {
+ level[0] = load_8bit_4x4_to_1_reg(&src[1], stride);
+ level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride);
+ level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride);
+ level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride);
+ level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride,
+ const ptrdiff_t *const offsets,
+ uint8x16_t *const level) {
+ level[0] = load_8bit_8x2_to_1_reg(&src[1], stride);
+ level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride);
+ level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride);
+ level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride);
+ level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE void load_levels_16x1x5(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ uint8x16_t *const level) {
+ level[0] = load_8bit_16x1_to_1_reg(&src[1], stride);
+ level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride);
+ level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride);
+ level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride);
+ level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride);
+}
+
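+// For 16 positions at once: clamp each of the five neighbouring levels to 3,
+// then return min(4, (sum + 1) >> 1), matching the statistic the scalar
+// context-derivation code computes per coefficient.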
+static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
+ const uint8x16_t const_3 = vdupq_n_u8(3);
+ const uint8x16_t const_4 = vdupq_n_u8(4);
+ uint8x16_t count;
+
+ count = vminq_u8(level[0], const_3);
+ level[1] = vminq_u8(level[1], const_3);
+ level[2] = vminq_u8(level[2], const_3);
+ level[3] = vminq_u8(level[3], const_3);
+ level[4] = vminq_u8(level[4], const_3);
+ count = vaddq_u8(count, level[1]);
+ count = vaddq_u8(count, level[2]);
+ count = vaddq_u8(count, level[3]);
+ count = vaddq_u8(count, level[4]);
+
+ count = vrshrq_n_u8(count, 1);
+ count = vminq_u8(count, const_4);
+ return count;
+}
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *const coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);
+
+ uint8x16_t pos_to_offset =
+ (width == 4) ? vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]);
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+ uint8_t *cc = coeff_contexts;
+
+ assert(!(width % 4));
+
+ int col = width;
+ do {
+ load_levels_4x4x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(cc, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ cc += 16;
+ col -= 4;
+ } while (col);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+
+ const uint8x16_t pos_to_offset =
+ vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(width % 4));
+
+ int col = width;
+ do {
+ load_levels_4x4x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(coeff_contexts, count);
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ col -= 4;
+ } while (col);
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+ uint8x16_t pos_to_offset = vld1q_u8(c_4_po_hor);
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(width % 4));
+
+ int col = width;
+ do {
+ load_levels_4x4x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ col -= 4;
+ } while (col);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ uint8_t *cc = coeff_contexts;
+ uint8x16_t count;
+ uint8x16_t level[5];
+ uint8x16_t pos_to_offset[3];
+
+ assert(!(width % 2));
+
+ if (width == 8) {
+ pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]);
+ pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]);
+ } else if (width < 8) {
+ pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]);
+ pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]);
+ } else {
+ pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]);
+ pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]);
+ }
+ pos_to_offset[2] = vdupq_n_u8(21);
+
+ int col = width;
+ do {
+ load_levels_8x2x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset[0]);
+ vst1q_u8(cc, count);
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += 2 * stride;
+ cc += 16;
+ col -= 2;
+ } while (col);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+
+ const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_ver);
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(width % 2));
+
+ int col = width;
+ do {
+ load_levels_8x2x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(coeff_contexts, count);
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ col -= 2;
+ } while (col);
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+ uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0),
+ vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5));
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(width % 2));
+
+ int col = width;
+ do {
+ load_levels_8x2x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ col -= 2;
+ } while (col);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+ const int real_width,
+ const int real_height,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+ uint8_t *cc = coeff_contexts;
+ int col = width;
+ uint8x16_t pos_to_offset[5];
+ uint8x16_t pos_to_offset_large[3];
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(height % 16));
+
+ pos_to_offset_large[2] = vdupq_n_u8(21);
+ if (real_width == real_height) {
+ pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
+ pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
+ pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
+ pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
+ pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+ pos_to_offset_large[2];
+ } else if (real_width < real_height) {
+ pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
+ pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
+ pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
+ vld1q_u8(c_16_po_2d_g[2]);
+ pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+ } else { // real_width > real_height
+ pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
+ pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
+ pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
+ pos_to_offset[4] = pos_to_offset_large[2];
+ pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(16);
+ }
+
+ do {
+ int h = height;
+
+ do {
+ load_levels_16x1x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset[0]);
+ vst1q_u8(cc, count);
+ levels += 16;
+ cc += 16;
+ h -= 16;
+ pos_to_offset[0] = pos_to_offset_large[0];
+ } while (h);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ pos_to_offset[2] = pos_to_offset[3];
+ pos_to_offset[3] = pos_to_offset[4];
+ pos_to_offset_large[0] = pos_to_offset_large[1];
+ pos_to_offset_large[1] = pos_to_offset_large[2];
+ levels += TX_PAD_HOR;
+ } while (--col);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+
+ const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(height % 16));
+
+ int col = width;
+ do {
+ uint8x16_t pos_to_offset = vld1q_u8(c_16_po_ver);
+
+ int h = height;
+ do {
+ load_levels_16x1x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 16;
+ coeff_contexts += 16;
+ h -= 16;
+ } while (h);
+
+ levels += TX_PAD_HOR;
+ } while (--col);
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+
+ uint8x16_t pos_to_offset[3];
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(height % 16));
+
+ pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
+ pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
+ pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+ int col = width;
+ do {
+ int h = height;
+ do {
+ load_levels_16x1x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset[0]);
+ vst1q_u8(coeff_contexts, count);
+ levels += 16;
+ coeff_contexts += 16;
+ h -= 16;
+ } while (h);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += TX_PAD_HOR;
+ } while (--col);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size,
+ const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int last_idx = eob - 1;
+ if (!last_idx) {
+ coeff_contexts[0] = 0;
+ return;
+ }
+
+ uint8_t *const coefficients = (uint8_t *const)coeff_contexts;
+
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const int stride = height + TX_PAD_HOR;
+ ptrdiff_t offsets[3];
+
+  /* coeff_contexts must be 16-byte aligned. */
+ assert(!((intptr_t)coeff_contexts & 0xf));
+
+ if (tx_class == TX_CLASS_2D) {
+ offsets[0] = 0 * stride + 2;
+ offsets[1] = 1 * stride + 1;
+ offsets[2] = 2 * stride + 0;
+
+ if (height == 4) {
+ get_4_nz_map_contexts_2d(levels, width, offsets, coefficients);
+ } else if (height == 8) {
+ get_8_coeff_contexts_2d(levels, width, offsets, coefficients);
+ } else {
+ get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+ offsets, coefficients);
+ }
+ } else if (tx_class == TX_CLASS_HORIZ) {
+ offsets[0] = 2 * stride;
+ offsets[1] = 3 * stride;
+ offsets[2] = 4 * stride;
+ if (height == 4) {
+ get_4_nz_map_contexts_hor(levels, width, offsets, coefficients);
+ } else if (height == 8) {
+ get_8_coeff_contexts_hor(levels, width, offsets, coefficients);
+ } else {
+ get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
+ }
+ } else { // TX_CLASS_VERT
+ offsets[0] = 2;
+ offsets[1] = 3;
+ offsets[2] = 4;
+ if (height == 4) {
+ get_4_nz_map_contexts_ver(levels, width, offsets, coefficients);
+ } else if (height == 8) {
+ get_8_coeff_contexts_ver(levels, width, offsets, coefficients);
+ } else {
+ get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
+ }
+ }
+
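+  // Finally, classify the last nonzero coefficient by how deep its scan
+  // position lies within the (width << bhl) block area: first eighth -> 1,
+  // first quarter -> 2, anything later -> 3.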
+ const int bhl = get_txb_bhl(tx_size);
+ const int pos = scan[last_idx];
+ if (last_idx <= (width << bhl) / 8)
+ coeff_contexts[pos] = 1;
+ else if (last_idx <= (width << bhl) / 4)
+ coeff_contexts[pos] = 2;
+ else
+ coeff_contexts[pos] = 3;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
new file mode 100644
index 0000000000..aa64a38902
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
@@ -0,0 +1,2619 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
+
+static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
+ int32x4_t *out) {
+ // This is not quite the same as the other transposes defined in
+ // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is
+ // unused by the following row transform.
+ for (int j = 0; j < 8; ++j) {
+ for (int i = 0; i < 16; ++i) {
+ transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i);
+ }
+ }
+}
+
+// A note on butterfly helper naming:
+//
+// butterfly_[weight_indices]_neon
+// e.g. butterfly_0321_neon
+// ^ Weights are applied as indices 0, 3, 2, 1
+// (see more detail below)
+//
+// Weight indices are treated as an index into the 4-tuple of the weight
+// itself, plus related and negated constants: w=(w0, 1-w0, -w0, w0-1).
+// This is then represented in the helper naming by referring to the lane index
+// in the loaded tuple that each multiply is performed with:
+//
+// in0 in1
+// /------------
+// out0 | w[0] w[1] ==> out0 = in0 * w[0] + in1 * w[1]
+// out1 | w[2] w[3] ==> out1 = in0 * w[2] + in1 * w[3]
+//
+// So for indices 0321 from the earlier example, we end up with:
+//
+// in0 in1
+// /------------------
+// out0 | (lane 0) (lane 3) ==> out0 = in0 * w0 + in1 * (w0-1)
+// out1 | (lane 2) (lane 1) ==> out1 = in0 * -w0 + in1 * (1-w0)
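+//
+// Concretely (a hedged scalar sketch): the pair loaded from the interleaved
+// cosine table is (cospi(i), cospi(64 - i)) -- compare cospi16_48 further
+// below -- so butterfly_0112_neon computes, per 32-bit lane:
+//
+//   out0 = round_shift(in0 * cospi(i)      + in1 * cospi(64 - i), bit)
+//   out1 = round_shift(in0 * cospi(64 - i) - in1 * cospi(i),      bit)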
+
+#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit) \
+ do { \
+ int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } }; \
+ int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
+ x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2); \
+ *out = vrshlq_s32(x, v_bit); \
+ } while (false)
+
+static AOM_FORCE_INLINE void butterfly_0112_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_2312_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_0332_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_0130_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon(
+ const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
+ int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * 32);
+ butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon(
+ const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
+ int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * 32);
+ butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit);
+}
+
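+// Rectangular (non-square) transform sizes carry an extra sqrt(2) scale
+// factor, approximated as NewSqrt2 / 2^NewSqrt2Bits (5793 / 4096 with the
+// usual libaom constants): e.g. (1024 * 5793) >> 12 rounds to 1448, i.e.
+// 1024 * sqrt(2) to within a unit.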
+static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input,
+ int32x4_t *output,
+ const int size) {
+ const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
+ int i = 0;
+ do {
+ const int32x4_t r1 = vmulq_s32(input[i], sqrt2);
+ output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
+ } while (++i < size);
+}
+
+static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon(
+ const int32x4_t *input, int32x4_t *output, const int size) {
+ const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
+ int i = 0;
+ do {
+ const int32x4_t r0 = vrshrq_n_s32(input[i], 2);
+ const int32x4_t r1 = vmulq_s32(r0, sqrt2);
+ output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
+ } while (++i < size);
+}
+
+#define LOAD_BUFFER_4XH(h) \
+ static AOM_FORCE_INLINE void load_buffer_4x##h( \
+ const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
+ if (fliplr) { \
+ for (int i = 0; i < (h); ++i) { \
+ int16x4_t a = vld1_s16(input + i * stride); \
+ a = vrev64_s16(a); \
+ in[i] = vshll_n_s16(a, 2); \
+ } \
+ } else { \
+ for (int i = 0; i < (h); ++i) { \
+ int16x4_t a = vld1_s16(input + i * stride); \
+ in[i] = vshll_n_s16(a, 2); \
+ } \
+ } \
+ }
+
+// AArch32 does not permit the shift argument of vshll_n_s16 to be zero, so
+// the expression must be avoided entirely, even though the compiler can prove
+// that the offending code path is never taken when `shift == 0`.
+#define shift_left_long_s16(a, shift) \
+ ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift)))
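+
+// For example, shift_left_long_s16(a, 2) widens each s16 lane to s32 and
+// shifts left by two, while shift_left_long_s16(a, 0) reduces to vmovl_s16(a);
+// the inner `(shift) == 0 ? 1 : (shift)` only exists to keep the dead
+// vshll_n_s16 immediate legal on AArch32.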
+
+#define LOAD_BUFFER_WXH(w, h, shift) \
+ static AOM_FORCE_INLINE void load_buffer_##w##x##h( \
+ const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
+ assert(w >= 8); \
+ if (fliplr) { \
+ for (int i = 0; i < (h); ++i) { \
+ for (int j = 0; j < (w) / 8; ++j) { \
+ int16x8_t a = vld1q_s16(input + i * stride + j * 8); \
+ a = vrev64q_s16(a); \
+ int j2 = (w) / 8 - j - 1; \
+ in[i + (h) * (2 * j2 + 0)] = \
+ shift_left_long_s16(vget_high_s16(a), (shift)); \
+ in[i + (h) * (2 * j2 + 1)] = \
+ shift_left_long_s16(vget_low_s16(a), (shift)); \
+ } \
+ } \
+ } else { \
+ for (int i = 0; i < (h); ++i) { \
+ for (int j = 0; j < (w) / 8; ++j) { \
+ int16x8_t a = vld1q_s16(input + i * stride + j * 8); \
+ in[i + (h) * (2 * j + 0)] = \
+ shift_left_long_s16(vget_low_s16(a), (shift)); \
+ in[i + (h) * (2 * j + 1)] = \
+ shift_left_long_s16(vget_high_s16(a), (shift)); \
+ } \
+ } \
+ } \
+ }
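+
+// Resulting layout: in[] is grouped into panels of four columns; for panel g
+// (columns 4g..4g+3), vector in[g * h + i] holds the four samples of row i,
+// so each panel occupies h consecutive vectors.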
+
+LOAD_BUFFER_4XH(4)
+LOAD_BUFFER_4XH(8)
+LOAD_BUFFER_4XH(16)
+LOAD_BUFFER_4XH(32)
+LOAD_BUFFER_WXH(8, 8, 2)
+LOAD_BUFFER_WXH(16, 16, 2)
+LOAD_BUFFER_WXH(32, 64, 0)
+LOAD_BUFFER_WXH(64, 32, 2)
+LOAD_BUFFER_WXH(64, 64, 0)
+
+#if !CONFIG_REALTIME_ONLY
+LOAD_BUFFER_WXH(16, 64, 0)
+LOAD_BUFFER_WXH(64, 16, 2)
+#endif // !CONFIG_REALTIME_ONLY
+
+#define STORE_BUFFER_WXH(w, h) \
+ static AOM_FORCE_INLINE void store_buffer_##w##x##h( \
+ const int32x4_t *in, int32_t *out, int stride) { \
+ for (int i = 0; i < (w); ++i) { \
+ for (int j = 0; j < (h) / 4; ++j) { \
+ vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \
+ } \
+ } \
+ }
+
+STORE_BUFFER_WXH(4, 4)
+STORE_BUFFER_WXH(8, 4)
+STORE_BUFFER_WXH(8, 8)
+STORE_BUFFER_WXH(16, 4)
+STORE_BUFFER_WXH(16, 16)
+STORE_BUFFER_WXH(32, 4)
+STORE_BUFFER_WXH(32, 32)
+STORE_BUFFER_WXH(64, 32)
+
+#if !CONFIG_REALTIME_ONLY
+STORE_BUFFER_WXH(16, 32)
+STORE_BUFFER_WXH(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]);
+ const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]);
+
+ const int32x4_t a0 = vaddq_s32(in[0], in[3]);
+ const int32x4_t a1 = vsubq_s32(in[0], in[3]);
+ const int32x4_t a2 = vaddq_s32(in[1], in[2]);
+ const int32x4_t a3 = vsubq_s32(in[1], in[2]);
+
+ const int32x4_t b0 = vmulq_s32(a0, cospi32);
+ const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1);
+ const int32x4_t b2 = vmulq_s32(a2, cospi32);
+ const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1);
+
+ const int32x4_t c0 = vaddq_s32(b0, b2);
+ const int32x4_t c1 = vsubq_s32(b0, b2);
+ const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0);
+ const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0);
+
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t d0 = vrshlq_s32(c0, v_bit);
+ const int32x4_t d1 = vrshlq_s32(c1, v_bit);
+ const int32x4_t d2 = vrshlq_s32(c2, v_bit);
+ const int32x4_t d3 = vrshlq_s32(c3, v_bit);
+
+ out[0] = d0;
+ out[1] = d2;
+ out[2] = d1;
+ out[3] = d3;
+}
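+
+// In scalar terms the above computes (with cospi16/32/48 the constants loaded
+// at the top of the function):
+//   out[0] = round_shift((in0 + in1 + in2 + in3) * cospi32, bit)
+//   out[1] = round_shift((in0 - in3) * cospi16 + (in1 - in2) * cospi48, bit)
+//   out[2] = round_shift(((in0 + in3) - (in1 + in2)) * cospi32, bit)
+//   out[3] = round_shift((in0 - in3) * cospi48 - (in1 - in2) * cospi16, bit)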
+
+static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1);
+
+ const int32x4_t a0 = vaddq_s32(in[0], in[1]);
+ const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0);
+ const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1);
+ const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0);
+
+ const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1);
+ const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0);
+ const int32x4_t b2 = vsubq_s32(a0, in[3]);
+
+ const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1);
+ const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1);
+ const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0);
+
+ const int32x4_t d0 = vaddq_s32(c0, a3);
+ const int32x4_t d1 = vsubq_s32(c1, a3);
+ const int32x4_t d2 = vsubq_s32(c1, c0);
+
+ const int32x4_t e0 = vaddq_s32(d2, a3);
+
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ out[0] = vrshlq_s32(d0, v_bit);
+ out[1] = vrshlq_s32(c2, v_bit);
+ out[2] = vrshlq_s32(d1, v_bit);
+ out[3] = vrshlq_s32(e0, v_bit);
+}
+
+static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in,
+ int32x4_t *out,
+ int bit) {
+ (void)bit;
+ int32x4_t fact = vdupq_n_s32(NewSqrt2);
+
+ for (int i = 0; i < 4; i++) {
+ const int32x4_t a_low = vmulq_s32(in[i], fact);
+ out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits);
+ }
+}
+
+void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
+ int input_stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4);
+
+ // Workspace for column/row-wise transforms.
+ int32x4_t buf[4];
+
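+  // Every case follows the same two-stage pipeline: 1-D column transform,
+  // transpose, 1-D row transform, store. The FLIPADST variants differ only in
+  // the ud/lr flips applied while loading the input.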
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case ADST_DCT:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case ADST_ADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case IDTX:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case V_DCT:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case H_DCT:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case V_ADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case H_ADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ default: assert(0);
+ }
+}
+
+// Butterfly pre-processing:
+// e.g. n=4:
+// out[0] = in[0] + in[3]
+// out[1] = in[1] + in[2]
+// out[2] = in[1] - in[2]
+// out[3] = in[0] - in[3]
+
+static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input,
+ int32x4_t *output, int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vaddq_s32(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
+}
+
+// Butterfly post-processing:
+// e.g. n=8:
+// out[0] = in0[0] + in1[3];
+// out[1] = in0[1] + in1[2];
+// out[2] = in0[1] - in1[2];
+// out[3] = in0[0] - in1[3];
+// out[4] = in0[7] - in1[4];
+// out[5] = in0[6] - in1[5];
+// out[6] = in0[6] + in1[5];
+// out[7] = in0[7] + in1[4];
+
+static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0,
+ const int32x4_t *in1,
+ int32x4_t *output, int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
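+
+// Together, butterfly_dct_pre and butterfly_dct_post implement the standard
+// even/odd DCT decomposition: _pre forms the sums and differences feeding the
+// smaller sub-transforms, and _post recombines the rotated odd-half outputs.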
+
+static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+ // stage 1
+ int32x4_t a[8];
+ butterfly_dct_pre(in, a, 8);
+
+ // stage 2
+ int32x4_t b[8];
+ butterfly_dct_pre(a, b, 4);
+ butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit);
+
+ // stage 3
+ int32x4_t c[8];
+ butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit);
+ butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit);
+ butterfly_dct_post(a + 4, b + 4, c + 4, 4);
+
+ // stage 4-5
+ butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit);
+ butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit);
+
+ out[0] = c[0];
+ out[2] = c[2];
+ out[4] = c[1];
+ out[6] = c[3];
+}
+
+static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+ int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+ int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+
+ // stage 0-1
+ u0 = in[0];
+ u1 = in[7];
+ u2 = in[3];
+ u3 = in[4];
+ u4 = in[1];
+ u5 = in[6];
+ u6 = in[2];
+ u7 = in[5];
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+ butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit);
+ v4 = u4;
+ v5 = u5;
+ butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit);
+
+ // stage 3
+ u0 = vaddq_s32(v0, v2);
+ u1 = vsubq_s32(v3, v1);
+ u2 = vsubq_s32(v0, v2);
+ u3 = vaddq_s32(v1, v3);
+ u4 = vsubq_s32(v6, v4);
+ u5 = vaddq_s32(v5, v7);
+ u6 = vaddq_s32(v4, v6);
+ u7 = vsubq_s32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit);
+ butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit);
+
+ // stage 5
+ u0 = vaddq_s32(v0, v4);
+ u1 = vaddq_s32(v1, v5);
+ u2 = vaddq_s32(v2, v6);
+ u3 = vsubq_s32(v7, v3);
+ u4 = vsubq_s32(v0, v4);
+ u5 = vsubq_s32(v1, v5);
+ u6 = vsubq_s32(v2, v6);
+ u7 = vaddq_s32(v3, v7);
+
+ // stage 6
+ butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit);
+ butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit);
+ butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit);
+ butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit);
+
+ // stage 7
+ out[0] = v1;
+ out[1] = v6;
+ out[2] = v3;
+ out[3] = v4;
+ out[4] = v5;
+ out[5] = v2;
+ out[6] = v7;
+ out[7] = v0;
+}
+
+static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in,
+ int32x4_t *out,
+ int bit) {
+ (void)bit;
+ out[0] = vshlq_n_s32(in[0], 1);
+ out[1] = vshlq_n_s32(in[1], 1);
+ out[2] = vshlq_n_s32(in[2], 1);
+ out[3] = vshlq_n_s32(in[3], 1);
+ out[4] = vshlq_n_s32(in[4], 1);
+ out[5] = vshlq_n_s32(in[5], 1);
+ out[6] = vshlq_n_s32(in[6], 1);
+ out[7] = vshlq_n_s32(in[7], 1);
+}
+
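+// The *_xn wrappers below apply a 4-column kernel `howmany` times, once per
+// strip of four columns; `stride` is the number of int32x4_t vectors per
+// strip (the 1-D transform length).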
+static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in,
+ int32x4_t *out, int bit,
+ int howmany) {
+ const int stride = 8;
+ int i = 0;
+ do {
+ highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in,
+ int32x4_t *out, int bit,
+ int howmany) {
+ const int stride = 8;
+ int i = 0;
+ do {
+ highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in,
+ int32x4_t *out, int bit,
+ int howmany) {
+ (void)bit;
+ const int stride = 8;
+ int i = 0;
+ do {
+ highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+ // Workspaces for column/row-wise transforms.
+ int32x4_t buf0[16], buf1[16];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case ADST_DCT:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case ADST_ADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case IDTX:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case V_DCT:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case H_DCT:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case V_ADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case H_ADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ default: assert(0);
+ }
+}
+
+static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+ int32x4_t u[16], v[16];
+
+ // stage 1
+ butterfly_dct_pre(in, u, 16);
+
+ // stage 2
+ butterfly_dct_pre(u, v, 8);
+ v[8] = u[8];
+ v[9] = u[9];
+ butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit);
+ butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ butterfly_dct_pre(v, u, 4);
+ u[4] = v[4];
+ butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit);
+ u[7] = v[7];
+ butterfly_dct_post(v + 8, v + 8, u + 8, 8);
+
+ // stage 4
+ butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit);
+ butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit);
+ butterfly_dct_post(u + 4, u + 4, v + 4, 4);
+ v[8] = u[8];
+ butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit);
+ butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit);
+ butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit);
+ butterfly_dct_post(v + 8, v + 8, u + 8, 4);
+ butterfly_dct_post(v + 12, v + 12, u + 12, 4);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit);
+ butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit);
+ butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit);
+ butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit);
+
+ out[0] = v[0];
+ out[1] = v[8];
+ out[2] = v[4];
+ out[3] = v[12];
+ out[4] = v[2];
+ out[5] = v[10];
+ out[6] = v[6];
+ out[7] = v[14];
+ out[8] = v[1];
+ out[9] = v[9];
+ out[10] = v[5];
+ out[11] = v[13];
+ out[12] = v[3];
+ out[13] = v[11];
+ out[14] = v[7];
+ out[15] = v[15];
+}
+
+static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+ int32x4_t u[16], v[16];
+
+ // stage 0-1
+ u[0] = in[0];
+ u[1] = in[15];
+ u[2] = in[7];
+ u[3] = in[8];
+ u[4] = in[3];
+ u[5] = in[12];
+ u[6] = in[4];
+ u[7] = in[11];
+ u[8] = in[1];
+ u[9] = in[14];
+ u[10] = in[6];
+ u[11] = in[9];
+ u[12] = in[2];
+ u[13] = in[13];
+ u[14] = in[5];
+ u[15] = in[10];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit);
+ v[4] = u[4];
+ v[5] = u[5];
+ butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit);
+ v[12] = u[12];
+ v[13] = u[13];
+ butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit);
+
+ // stage 3
+ u[0] = vaddq_s32(v[0], v[2]);
+ u[1] = vsubq_s32(v[3], v[1]);
+ u[2] = vsubq_s32(v[0], v[2]);
+ u[3] = vaddq_s32(v[1], v[3]);
+ u[4] = vsubq_s32(v[6], v[4]);
+ u[5] = vaddq_s32(v[5], v[7]);
+ u[6] = vaddq_s32(v[4], v[6]);
+ u[7] = vsubq_s32(v[5], v[7]);
+ u[8] = vsubq_s32(v[10], v[8]);
+ u[9] = vaddq_s32(v[9], v[11]);
+ u[10] = vaddq_s32(v[8], v[10]);
+ u[11] = vsubq_s32(v[9], v[11]);
+ u[12] = vaddq_s32(v[12], v[14]);
+ u[13] = vsubq_s32(v[15], v[13]);
+ u[14] = vsubq_s32(v[12], v[14]);
+ u[15] = vaddq_s32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit);
+ butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit);
+ butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit);
+
+ // stage 5
+ u[0] = vaddq_s32(v[0], v[4]);
+ u[1] = vaddq_s32(v[1], v[5]);
+ u[2] = vaddq_s32(v[2], v[6]);
+ u[3] = vsubq_s32(v[7], v[3]);
+ u[4] = vsubq_s32(v[0], v[4]);
+ u[5] = vsubq_s32(v[1], v[5]);
+ u[6] = vsubq_s32(v[2], v[6]);
+ u[7] = vaddq_s32(v[3], v[7]);
+ u[8] = vaddq_s32(v[8], v[12]);
+ u[9] = vaddq_s32(v[9], v[13]);
+ u[10] = vsubq_s32(v[14], v[10]);
+ u[11] = vaddq_s32(v[11], v[15]);
+ u[12] = vsubq_s32(v[8], v[12]);
+ u[13] = vsubq_s32(v[9], v[13]);
+ u[14] = vaddq_s32(v[10], v[14]);
+ u[15] = vsubq_s32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit);
+ butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit);
+ butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit);
+ butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit);
+
+ // stage 7
+ u[0] = vaddq_s32(v[0], v[8]);
+ u[1] = vaddq_s32(v[1], v[9]);
+ u[2] = vaddq_s32(v[2], v[10]);
+ u[3] = vaddq_s32(v[3], v[11]);
+ u[4] = vaddq_s32(v[4], v[12]);
+ u[5] = vaddq_s32(v[5], v[13]);
+ u[6] = vaddq_s32(v[6], v[14]);
+ u[7] = vsubq_s32(v[15], v[7]);
+ u[8] = vsubq_s32(v[0], v[8]);
+ u[9] = vsubq_s32(v[1], v[9]);
+ u[10] = vsubq_s32(v[2], v[10]);
+ u[11] = vsubq_s32(v[3], v[11]);
+ u[12] = vsubq_s32(v[4], v[12]);
+ u[13] = vsubq_s32(v[5], v[13]);
+ u[14] = vsubq_s32(v[6], v[14]);
+ u[15] = vaddq_s32(v[7], v[15]);
+
+ // stage 8
+ butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit);
+ butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit);
+ butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit);
+ butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit);
+ butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit);
+ butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit);
+ butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit);
+ butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit);
+
+ // stage 9
+ out[0] = v[1];
+ out[1] = v[14];
+ out[2] = v[3];
+ out[3] = v[12];
+ out[4] = v[5];
+ out[5] = v[10];
+ out[6] = v[7];
+ out[7] = v[8];
+ out[8] = v[9];
+ out[9] = v[6];
+ out[10] = v[11];
+ out[11] = v[4];
+ out[12] = v[13];
+ out[13] = v[2];
+ out[14] = v[15];
+ out[15] = v[0];
+}
+
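+// The 16-point identity transform scales by 2 * NewSqrt2 / 2^NewSqrt2Bits,
+// i.e. approximately 2 * sqrt(2); the multiply, add-offset and shift below
+// spell out a rounding right shift.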
+static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ (void)bit;
+ const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
+ const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
+
+ for (int i = 0; i < 16; i++) {
+ int32x4_t a = vmulq_s32(in[i], fact);
+ a = vaddq_s32(a, offset);
+ out[i] = vshrq_n_s32(a, NewSqrt2Bits);
+ }
+}
+
+static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
+ const int howmany) {
+ const int stride = 16;
+ int i = 0;
+ do {
+ highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
+ int howmany) {
+ const int stride = 16;
+ int i = 0;
+ do {
+ highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out,
+ int bit, int howmany) {
+ const int stride = 16;
+ int i = 0;
+ do {
+ highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Workspaces for column/row-wise transforms.
+ int32x4_t buf0[64], buf1[64];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case ADST_DCT:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case ADST_ADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case IDTX:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case V_DCT:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ default: assert(0);
+ }
+}
+
+typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out,
+ int stride, int bit, int lr_flip);
+typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in,
+ int32x4_t *out, int stride,
+ int bit, int lr_flip,
+ int howmany, int hm_stride);
+
+typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out,
+ int bit, int stride);
+typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in,
+ int32_t *out, int bit,
+ int howmany, int hm_stride,
+ int stride);
+
+// Construct component kernels that include the load_buffer and store_buffer
+// stages to avoid the need to spill loaded data to the stack between these and
+// the txfm kernel calls.
+// The TRANSFORM_*_ONE variants are only ever called in situations where the
+// howmany parameter would be one, so they omit the loop entirely.
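+//
+// Callers later in this file are then expected to compose these as: column
+// kernel (load + column transform), a rounding shift where needed, transpose,
+// then row kernel (row transform + store), mirroring the explicit
+// switch-based functions above.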
+
+#define TRANSFORM_COL_ONE(name, n) \
+ static void highbd_##name##_col_neon(const int16_t *input, \
+ int32x4_t *output, int stride, \
+ int cos_bit, int lr_flip) { \
+ int32x4_t buf0[n]; \
+ load_buffer_4x##n(input, buf0, stride, lr_flip); \
+ highbd_##name##_x4_neon(buf0, output, cos_bit); \
+ }
+
+#define TRANSFORM_COL_MANY(name, n) \
+ static void highbd_##name##_col_many_neon( \
+ const int16_t *input, int32x4_t *output, int stride, int cos_bit, \
+ int lr_flip, int howmany, int hm_stride) { \
+ int i = 0; \
+ do { \
+ int32x4_t buf0[n]; \
+ load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip); \
+ highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit); \
+ } while (++i < howmany); \
+ }
+
+#define TRANSFORM_ROW_ONE(name, n) \
+ static void highbd_##name##_row_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input, buf0, cos_bit); \
+ store_buffer_##n##x4(buf0, output, stride); \
+ }
+
+#define TRANSFORM_ROW_RECT_ONE(name, n) \
+ static void highbd_##name##_row_rect_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input, buf0, cos_bit); \
+ round_rect_array_s32_neon(buf0, buf0, (n)); \
+ store_buffer_##n##x4(buf0, output, stride); \
+ }
+
+#define TRANSFORM_ROW_MANY(name, n) \
+ static void highbd_##name##_row_many_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
+ int hm_stride, int stride) { \
+ int i = 0; \
+ do { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \
+ store_buffer_##n##x4(buf0, output + 4 * i, stride); \
+ } while (++i < howmany); \
+ }
+
+#define TRANSFORM_ROW_RECT_MANY(name, n) \
+ static void highbd_##name##_row_rect_many_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
+ int hm_stride, int stride) { \
+ int i = 0; \
+ do { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \
+ round_rect_array_s32_neon(buf0, buf0, (n)); \
+ store_buffer_##n##x4(buf0, output + 4 * i, stride); \
+ } while (++i < howmany); \
+ }
+
+TRANSFORM_COL_ONE(fdct8, 8)
+TRANSFORM_COL_ONE(fadst8, 8)
+TRANSFORM_COL_ONE(fidentity8, 8)
+
+TRANSFORM_COL_MANY(fdct4, 4)
+TRANSFORM_COL_MANY(fdct8, 8)
+TRANSFORM_COL_MANY(fdct16, 16)
+TRANSFORM_COL_MANY(fadst4, 4)
+TRANSFORM_COL_MANY(fadst8, 8)
+TRANSFORM_COL_MANY(fadst16, 16)
+TRANSFORM_COL_MANY(fidentity4, 4)
+TRANSFORM_COL_MANY(fidentity8, 8)
+TRANSFORM_COL_MANY(fidentity16, 16)
+
+TRANSFORM_ROW_ONE(fdct16, 16)
+TRANSFORM_ROW_ONE(fadst16, 16)
+TRANSFORM_ROW_ONE(fidentity16, 16)
+
+TRANSFORM_ROW_RECT_ONE(fdct8, 8)
+TRANSFORM_ROW_RECT_ONE(fadst8, 8)
+TRANSFORM_ROW_RECT_ONE(fidentity8, 8)
+
+#if !CONFIG_REALTIME_ONLY
+TRANSFORM_ROW_MANY(fdct4, 4)
+TRANSFORM_ROW_MANY(fdct8, 8)
+TRANSFORM_ROW_MANY(fadst4, 4)
+TRANSFORM_ROW_MANY(fadst8, 8)
+TRANSFORM_ROW_MANY(fidentity4, 4)
+TRANSFORM_ROW_MANY(fidentity8, 8)
+#endif  // !CONFIG_REALTIME_ONLY
+
+TRANSFORM_ROW_RECT_MANY(fdct4, 4)
+TRANSFORM_ROW_RECT_MANY(fdct8, 8)
+TRANSFORM_ROW_RECT_MANY(fdct16, 16)
+TRANSFORM_ROW_RECT_MANY(fadst4, 4)
+TRANSFORM_ROW_RECT_MANY(fadst8, 8)
+TRANSFORM_ROW_RECT_MANY(fadst16, 16)
+TRANSFORM_ROW_RECT_MANY(fidentity4, 4)
+TRANSFORM_ROW_RECT_MANY(fidentity8, 8)
+TRANSFORM_ROW_RECT_MANY(fidentity16, 16)
+
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_col_many_neon, // DCT_DCT
+ highbd_fadst8_col_many_neon, // ADST_DCT
+ highbd_fdct8_col_many_neon, // DCT_ADST
+ highbd_fadst8_col_many_neon, // ADST_ADST
+ highbd_fadst8_col_many_neon, // FLIPADST_DCT
+ highbd_fdct8_col_many_neon, // DCT_FLIPADST
+ highbd_fadst8_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_col_many_neon, // ADST_FLIPADST
+ highbd_fadst8_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_col_many_neon, // IDTX
+ highbd_fdct8_col_many_neon, // V_DCT
+ highbd_fidentity8_col_many_neon, // H_DCT
+ highbd_fadst8_col_many_neon, // V_ADST
+ highbd_fidentity8_col_many_neon, // H_ADST
+ highbd_fadst8_col_many_neon, // V_FLIPADST
+ highbd_fidentity8_col_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = {
+ highbd_fdct8_col_neon, // DCT_DCT
+ highbd_fadst8_col_neon, // ADST_DCT
+ highbd_fdct8_col_neon, // DCT_ADST
+ highbd_fadst8_col_neon, // ADST_ADST
+ highbd_fadst8_col_neon, // FLIPADST_DCT
+ highbd_fdct8_col_neon, // DCT_FLIPADST
+ highbd_fadst8_col_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_col_neon, // ADST_FLIPADST
+ highbd_fadst8_col_neon, // FLIPADST_ADST
+ highbd_fidentity8_col_neon, // IDTX
+ highbd_fdct8_col_neon, // V_DCT
+ highbd_fidentity8_col_neon, // H_DCT
+ highbd_fadst8_col_neon, // V_ADST
+ highbd_fidentity8_col_neon, // H_ADST
+ highbd_fadst8_col_neon, // V_FLIPADST
+ highbd_fidentity8_col_neon // H_FLIPADST
+};
+
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_col_many_neon, // DCT_DCT
+ highbd_fadst16_col_many_neon, // ADST_DCT
+ highbd_fdct16_col_many_neon, // DCT_ADST
+ highbd_fadst16_col_many_neon, // ADST_ADST
+ highbd_fadst16_col_many_neon, // FLIPADST_DCT
+ highbd_fdct16_col_many_neon, // DCT_FLIPADST
+ highbd_fadst16_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_col_many_neon, // ADST_FLIPADST
+ highbd_fadst16_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity16_col_many_neon, // IDTX
+ highbd_fdct16_col_many_neon, // V_DCT
+ highbd_fidentity16_col_many_neon, // H_DCT
+ highbd_fadst16_col_many_neon, // V_ADST
+ highbd_fidentity16_col_many_neon, // H_ADST
+ highbd_fadst16_col_many_neon, // V_FLIPADST
+ highbd_fidentity16_col_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_col_many_neon, // DCT_DCT
+ highbd_fadst4_col_many_neon, // ADST_DCT
+ highbd_fdct4_col_many_neon, // DCT_ADST
+ highbd_fadst4_col_many_neon, // ADST_ADST
+ highbd_fadst4_col_many_neon, // FLIPADST_DCT
+ highbd_fdct4_col_many_neon, // DCT_FLIPADST
+ highbd_fadst4_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_col_many_neon, // ADST_FLIPADST
+ highbd_fadst4_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_col_many_neon, // IDTX
+ highbd_fdct4_col_many_neon, // V_DCT
+ highbd_fidentity4_col_many_neon, // H_DCT
+ highbd_fadst4_col_many_neon, // V_ADST
+ highbd_fidentity4_col_many_neon, // H_ADST
+ highbd_fadst4_col_many_neon, // V_FLIPADST
+ highbd_fidentity4_col_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_row_neon, // DCT_DCT
+ highbd_fdct16_row_neon, // ADST_DCT
+ highbd_fadst16_row_neon, // DCT_ADST
+ highbd_fadst16_row_neon, // ADST_ADST
+ highbd_fdct16_row_neon, // FLIPADST_DCT
+ highbd_fadst16_row_neon, // DCT_FLIPADST
+ highbd_fadst16_row_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_row_neon, // ADST_FLIPADST
+ highbd_fadst16_row_neon, // FLIPADST_ADST
+ highbd_fidentity16_row_neon, // IDTX
+ highbd_fidentity16_row_neon, // V_DCT
+ highbd_fdct16_row_neon, // H_DCT
+ highbd_fidentity16_row_neon, // V_ADST
+ highbd_fadst16_row_neon, // H_ADST
+ highbd_fidentity16_row_neon, // V_FLIPADST
+ highbd_fadst16_row_neon // H_FLIPADST
+};
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_row_rect_many_neon, // DCT_DCT
+ highbd_fdct16_row_rect_many_neon, // ADST_DCT
+ highbd_fadst16_row_rect_many_neon, // DCT_ADST
+ highbd_fadst16_row_rect_many_neon, // ADST_ADST
+ highbd_fdct16_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst16_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity16_row_rect_many_neon, // IDTX
+ highbd_fidentity16_row_rect_many_neon, // V_DCT
+ highbd_fdct16_row_rect_many_neon, // H_DCT
+ highbd_fidentity16_row_rect_many_neon, // V_ADST
+ highbd_fadst16_row_rect_many_neon, // H_ADST
+ highbd_fidentity16_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst16_row_rect_many_neon // H_FLIPADST
+ };
+
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_row_many_neon, // DCT_DCT
+ highbd_fdct8_row_many_neon, // ADST_DCT
+ highbd_fadst8_row_many_neon, // DCT_ADST
+ highbd_fadst8_row_many_neon, // ADST_ADST
+ highbd_fdct8_row_many_neon, // FLIPADST_DCT
+ highbd_fadst8_row_many_neon, // DCT_FLIPADST
+ highbd_fadst8_row_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_many_neon, // ADST_FLIPADST
+ highbd_fadst8_row_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_many_neon, // IDTX
+ highbd_fidentity8_row_many_neon, // V_DCT
+ highbd_fdct8_row_many_neon, // H_DCT
+ highbd_fidentity8_row_many_neon, // V_ADST
+ highbd_fadst8_row_many_neon, // H_ADST
+ highbd_fidentity8_row_many_neon, // V_FLIPADST
+ highbd_fadst8_row_many_neon // H_FLIPADST
+ };
+#endif  // !CONFIG_REALTIME_ONLY
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_row_rect_many_neon, // DCT_DCT
+ highbd_fdct8_row_rect_many_neon, // ADST_DCT
+ highbd_fadst8_row_rect_many_neon, // DCT_ADST
+ highbd_fadst8_row_rect_many_neon, // ADST_ADST
+ highbd_fdct8_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst8_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_rect_many_neon, // IDTX
+ highbd_fidentity8_row_rect_many_neon, // V_DCT
+ highbd_fdct8_row_rect_many_neon, // H_DCT
+ highbd_fidentity8_row_rect_many_neon, // V_ADST
+ highbd_fadst8_row_rect_many_neon, // H_ADST
+ highbd_fidentity8_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst8_row_rect_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = {
+ highbd_fdct8_row_rect_neon, // DCT_DCT
+ highbd_fdct8_row_rect_neon, // ADST_DCT
+ highbd_fadst8_row_rect_neon, // DCT_ADST
+ highbd_fadst8_row_rect_neon, // ADST_ADST
+ highbd_fdct8_row_rect_neon, // FLIPADST_DCT
+ highbd_fadst8_row_rect_neon, // DCT_FLIPADST
+ highbd_fadst8_row_rect_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_rect_neon, // ADST_FLIPADST
+ highbd_fadst8_row_rect_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_rect_neon, // IDTX
+ highbd_fidentity8_row_rect_neon, // V_DCT
+ highbd_fdct8_row_rect_neon, // H_DCT
+ highbd_fidentity8_row_rect_neon, // V_ADST
+ highbd_fadst8_row_rect_neon, // H_ADST
+ highbd_fidentity8_row_rect_neon, // V_FLIPADST
+ highbd_fadst8_row_rect_neon // H_FLIPADST
+};
+
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_row_many_neon, // DCT_DCT
+ highbd_fdct4_row_many_neon, // ADST_DCT
+ highbd_fadst4_row_many_neon, // DCT_ADST
+ highbd_fadst4_row_many_neon, // ADST_ADST
+ highbd_fdct4_row_many_neon, // FLIPADST_DCT
+ highbd_fadst4_row_many_neon, // DCT_FLIPADST
+ highbd_fadst4_row_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_row_many_neon, // ADST_FLIPADST
+ highbd_fadst4_row_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_row_many_neon, // IDTX
+ highbd_fidentity4_row_many_neon, // V_DCT
+ highbd_fdct4_row_many_neon, // H_DCT
+ highbd_fidentity4_row_many_neon, // V_ADST
+ highbd_fadst4_row_many_neon, // H_ADST
+ highbd_fidentity4_row_many_neon, // V_FLIPADST
+ highbd_fadst4_row_many_neon // H_FLIPADST
+ };
+#endif
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_row_rect_many_neon, // DCT_DCT
+ highbd_fdct4_row_rect_many_neon, // ADST_DCT
+ highbd_fadst4_row_rect_many_neon, // DCT_ADST
+ highbd_fadst4_row_rect_many_neon, // ADST_ADST
+ highbd_fdct4_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst4_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_row_rect_many_neon, // IDTX
+ highbd_fidentity4_row_rect_many_neon, // V_DCT
+ highbd_fdct4_row_rect_many_neon, // H_DCT
+ highbd_fidentity4_row_rect_many_neon, // V_ADST
+ highbd_fadst4_row_rect_many_neon, // H_ADST
+ highbd_fidentity4_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst4_row_rect_many_neon // H_FLIPADST
+ };
+
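+// Forward 32-point DCT over 4 lanes at a time: each int32x4_t holds one
+// element from each of 4 transforms computed in parallel.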
+static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int32_t *const cospi = cospi_arr_s32(cos_bit);
+ const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+ // Workspaces for intermediate transform steps.
+ int32x4_t buf0[32];
+ int32x4_t buf1[32];
+
+ // stage 1
+ butterfly_dct_pre(input, buf1, 32);
+
+ // stage 2
+ butterfly_dct_pre(buf1, buf0, 16);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23],
+ v_cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ butterfly_dct_pre(buf0, buf1, 8);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11],
+ v_cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16);
+
+ // stage 4
+ butterfly_dct_pre(buf1, buf0, 4);
+ buf0[4] = buf1[4];
+ butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5],
+ v_cos_bit);
+ buf0[7] = buf1[7];
+ butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26],
+ v_cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3],
+ v_cos_bit);
+ butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4);
+ buf1[8] = buf0[8];
+ butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13],
+ v_cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8);
+ butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8);
+
+ // stage 6
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+
+ butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29],
+ v_cos_bit);
+ butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4);
+ butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4);
+ buf0[16] = buf1[16];
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+
+ butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21],
+ v_cos_bit);
+ butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22],
+ v_cos_bit);
+
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12],
+ v_cos_bit);
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4);
+ butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4);
+ butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4);
+ butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4);
+
+ // stage 8
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24],
+ v_cos_bit);
+
+ // stage 9
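+  // Reorder the outputs: coefficient k is taken from buf0 at the 5-bit
+  // bit-reversal of k, the usual decimation-in-frequency permutation.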
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
+}
+
+static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output,
+ int8_t cos_bit) {
+ const int32_t *const cospi = cospi_arr_s32(cos_bit);
+ const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+ // stage 1
+ int32x4_t x1[64];
+ butterfly_dct_pre(input, x1, 64);
+
+ // stage 2
+ int32x4_t x2[64];
+ butterfly_dct_pre(x1, x2, 32);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ int32x4_t x3[64];
+ butterfly_dct_pre(x2, x3, 16);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32);
+
+ // stage 4
+ int32x4_t x4[64];
+ butterfly_dct_pre(x3, x4, 8);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ int32x4_t x5[64];
+ butterfly_dct_pre(x4, x5, 4);
+ x5[4] = x4[4];
+ butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit);
+ x5[7] = x4[7];
+ butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16);
+
+ // stage 6
+ int32x4_t x6[64];
+ butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit);
+ butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4);
+ x6[8] = x5[8];
+ butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], v_cos_bit);
+ butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit);
+ butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ int32x4_t x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit);
+ butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit);
+ butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4);
+ x7[16] = x6[16];
+ butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8);
+
+ // stage 8
+ int32x4_t x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+
+ butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit);
+ butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit);
+ butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit);
+ butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit);
+ butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4);
+ x8[32] = x7[32];
+ butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit);
+ butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit);
+ butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit);
+ butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit);
+ butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ int32x4_t x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit);
+ butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit);
+ butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit);
+ butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit);
+ butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit);
+ butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit);
+ butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit);
+ butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit);
+ butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4);
+
+ // stage 10
+ int32x4_t x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit);
+ butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit);
+ butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit);
+ butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit);
+ butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit);
+ butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit);
+ butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit);
+ butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit);
+ butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit);
+ butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit);
+ butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit);
+ butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit);
+ butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit);
+ butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit);
+ butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit);
+ butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit);
+
+ // stage 11
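+  // Reorder the outputs: coefficient k is taken from x10 at the 6-bit
+  // bit-reversal of k, the usual decimation-in-frequency permutation.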
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
+
+static void highbd_fidentity32_x4_neon(const int32x4_t *input,
+ int32x4_t *output, int cos_bit) {
+ (void)cos_bit;
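+  // The 32-point identity transform scales by 4, i.e. a left shift by 2.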
+ for (int i = 0; i < 32; i++) {
+ output[i] = vshlq_n_s32(input[i], 2);
+ }
+}
+
+TRANSFORM_COL_MANY(fdct32, 32)
+TRANSFORM_COL_MANY(fidentity32, 32)
+
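+// AV1 only allows DCT and identity for 32-point transforms, so all other
+// tx_type entries are NULL.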
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_col_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_col_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+
+TRANSFORM_ROW_MANY(fdct32, 32)
+TRANSFORM_ROW_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_row_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_row_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+
+TRANSFORM_ROW_RECT_MANY(fdct32, 32)
+TRANSFORM_ROW_RECT_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_row_rect_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_row_rect_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+
+void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm8_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm16_xn_arr[tx_type];
+ int bit = av1_fwd_cos_bit_col[2][1];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+ // Column-wise transform.
+ int32x4_t buf0[32];
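+  // For lr_flip the column results are written starting at the last group of
+  // 4 columns and stepping backwards via a negative hm_stride.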
+ if (lr_flip) {
+ col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4,
+ /*hm_stride=*/-8);
+ } else {
+ col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4,
+ /*hm_stride=*/8);
+ }
+ shift_right_2_round_s32_x4(buf0, buf0, 32);
+
+ int32x4_t buf1[32];
+ transpose_arrays_s32_16x8(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8);
+}
+
+void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm16_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm8_xn_arr[tx_type];
+ int bit = av1_fwd_cos_bit_col[1][2];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Column-wise transform.
+ int32x4_t buf0[32];
+ if (lr_flip) {
+ col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2,
+ /*hm_stride=*/-16);
+ } else {
+ col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2,
+ /*hm_stride=*/16);
+ }
+ shift_right_2_round_s32_x4(buf0, buf0, 32);
+
+ int32x4_t buf1[32];
+ transpose_arrays_s32_8x16(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16);
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int bitcol = av1_fwd_cos_bit_col[0][2];
+ int bitrow = av1_fwd_cos_bit_row[0][2];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm16_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm4_xn_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Column-wise transform.
+ int32x4_t buf0[16];
+ if (lr_flip) {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1,
+ /*hm_stride=*/0);
+ } else {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1,
+ /*hm_stride=*/0);
+ }
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+
+ int32x4_t buf1[16];
+ transpose_arrays_s32_4x16(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16);
+}
+#endif
+
+void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int bitcol = av1_fwd_cos_bit_col[2][0];
+ int bitrow = av1_fwd_cos_bit_row[2][0];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm4_xn_arr[tx_type];
+ const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+ // Column-wise transform.
+ int32x4_t buf0[16];
+ if (lr_flip) {
+ col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4,
+ /*hm_stride=*/-4);
+ } else {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
+ /*hm_stride=*/4);
+ }
+
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_4x16(buf0, buf0);
+
+ // Row-wise transform.
+ row_txfm(buf0, coeff, bitrow, /*stride=*/4);
+}
+
+void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm32_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm16_xn_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[2][3];
+ int bitrow = av1_fwd_cos_bit_row[2][3];
+
+ // Column-wise transform.
+ int32x4_t buf0[128];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
+ /*hm_stride=*/32);
+ shift_right_4_round_s32_x4(buf0, buf0, 128);
+
+ int32x4_t buf1[128];
+ transpose_arrays_s32_16x32(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ int bitcol = av1_fwd_cos_bit_col[3][4];
+ int bitrow = av1_fwd_cos_bit_row[3][4];
+
+ // Column-wise transform.
+ int32x4_t buf0[512];
+ load_buffer_32x64(input, buf0, stride, 0);
+ for (int i = 0; i < 8; i++) {
+ highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
+ }
+ shift_right_2_round_s32_x4(buf0, buf0, 512);
+
+ int32x4_t buf1[512];
+ transpose_arrays_s32_32x64(buf0, buf1);
+
+ // Row-wise transform.
+ for (int i = 0; i < 16; i++) {
+ highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow);
+ }
+ round_shift2_rect_array_s32_neon(buf1, buf1, 512);
+ store_buffer_32x32(buf1, coeff, /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ int bitcol = av1_fwd_cos_bit_col[4][3];
+ int bitrow = av1_fwd_cos_bit_row[4][3];
+
+ // Column-wise transform.
+ int32x4_t buf0[512];
+ load_buffer_64x32(input, buf0, stride, 0);
+ for (int i = 0; i < 16; i++) {
+ highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol);
+ }
+ shift_right_4_round_s32_x4(buf0, buf0, 512);
+
+ int32x4_t buf1[512];
+ transpose_arrays_s32_64x32(buf0, buf1);
+
+ // Row-wise transform.
+ for (int i = 0; i < 8; i++) {
+ highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
+ }
+ round_shift2_rect_array_s32_neon(buf1, buf1, 512);
+ store_buffer_64x32(buf1, coeff, /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm16_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm32_x4_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[3][2];
+ int bitrow = av1_fwd_cos_bit_row[3][2];
+
+ // Column-wise transform.
+ int32x4_t buf0[128];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
+ /*hm_stride=*/16);
+ shift_right_4_round_s32_x4(buf0, buf0, 128);
+
+ int32x4_t buf1[128];
+ transpose_arrays_s32_32x16(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16);
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm32_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm8_xn_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[1][3];
+ int bitrow = av1_fwd_cos_bit_row[1][3];
+
+ // Column-wise transform.
+ int32x4_t buf0[64];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
+ /*hm_stride=*/32);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+
+ int32x4_t buf1[64];
+ transpose_arrays_s32_8x32(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm8_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm32_x4_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[3][1];
+ int bitrow = av1_fwd_cos_bit_row[3][1];
+
+ // Column-wise transform.
+ int32x4_t buf0[64];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
+ /*hm_stride=*/8);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+
+ int32x4_t buf1[64];
+ transpose_arrays_s32_32x8(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/32, /*stride=*/8);
+}
+#endif
+
+void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int bitcol = av1_fwd_cos_bit_col[0][1];
+ int bitrow = av1_fwd_cos_bit_row[0][1];
+ const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm4_xn_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+ // Column-wise transform.
+ int32x4_t buf0[8];
+ col_txfm(input, buf0, stride, bitcol, lr_flip);
+ shift_right_1_round_s32_x4(buf0, buf0, 8);
+
+ int32x4_t buf1[8];
+ transpose_arrays_s32_4x8(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8);
+}
+
+void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const int bitcol = av1_fwd_cos_bit_col[1][0];
+ const int bitrow = av1_fwd_cos_bit_row[1][0];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm4_xn_arr[tx_type];
+ const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+ // Column-wise transform.
+ int32x4_t buf0[8];
+ if (lr_flip) {
+ col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2,
+ /*hm_stride=*/-4);
+ } else {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
+ /*hm_stride=*/4);
+ }
+
+ shift_right_1_round_s32_x4(buf0, buf0, 8);
+
+ int32x4_t buf1[8];
+ transpose_arrays_s32_8x4(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*stride=*/4);
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const int bitcol = av1_fwd_cos_bit_col[2][4];
+ const int bitrow = av1_fwd_cos_bit_row[2][4];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 64);
+
+ // Column-wise transform.
+ int32x4_t buf0[256];
+ load_buffer_16x64(input, buf0, stride, lr_flip);
+ for (int i = 0; i < 4; i++) {
+ highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
+ }
+ shift_right_2_round_s32_x4(buf0, buf0, 256);
+
+ int32x4_t buf1[256];
+ transpose_arrays_s32_16x64(buf0, buf1);
+
+ // Row-wise transform.
+ highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8);
+ store_buffer_16x32(buf1, coeff, /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const int bitcol = av1_fwd_cos_bit_col[4][2];
+ const int bitrow = av1_fwd_cos_bit_row[4][2];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Column-wise transform.
+ int32x4_t buf0[256];
+ load_buffer_64x16(input, buf0, stride, lr_flip);
+ highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16);
+ shift_right_4_round_s32_x4(buf0, buf0, 256);
+
+ int32x4_t buf1[256];
+ transpose_arrays_s32_64x16(buf0, buf1);
+
+ // Row-wise transform.
+ for (int i = 0; i < 4; i++) {
+ highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
+ }
+ store_buffer_64x16(buf1, coeff, /*stride=*/16);
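+  // 64-point coefficients beyond the first 32 are defined to be zero, so
+  // clear the second half of the output buffer.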
+ memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff));
+}
+#endif
+
+void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm32_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm32_x4_arr[tx_type];
+
+ // Column-wise transform.
+ int32x4_t buf0[256];
+ col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8,
+ /*hm_stride=*/32);
+ shift_right_4_round_s32_x4(buf0, buf0, 256);
+
+ int32x4_t buf1[256];
+ transpose_arrays_s32_32x32(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32,
+ /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+
+ // Column-wise transform.
+ int32x4_t buf0[1024];
+ load_buffer_64x64(input, buf0, stride, 0);
+ for (int col = 0; col < 16; col++) {
+ highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13);
+ }
+ shift_right_2_round_s32_x4(buf0, buf0, 1024);
+
+ int32x4_t buf1[1024];
+ transpose_arrays_s32_64x64(buf0, buf1);
+
+ // Row-wise transform.
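+  // Only 32 of the 64 rows are transformed: AV1 discards 64-point
+  // coefficients beyond the first 32 in each dimension.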
+ for (int col = 0; col < 8; col++) {
+ highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10);
+ }
+ shift_right_2_round_s32_x4(buf1, buf1, 512);
+ store_buffer_64x32(buf1, output, /*stride=*/32);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c
new file mode 100644
index 0000000000..47b5f5cfb7
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c
@@ -0,0 +1,1207 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void highbd_calc_proj_params_r0_r1_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
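+  // Separate 64-bit accumulators for the low and high halves of each 8-wide
+  // block of pixels; they are combined at the end.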
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t h01_lo = vdupq_n_s64(0);
+ int64x2_t h01_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt0_ptr = flt0;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int32x4_t u_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f0_lo = vsubq_s32(f0_lo, u_lo);
+ f0_hi = vsubq_s32(f0_hi, u_hi);
+ f1_lo = vsubq_s32(f1_lo, u_lo);
+ f1_hi = vsubq_s32(f1_hi, u_hi);
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
+ h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
+ h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
+ h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
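+  // Reduce the vector accumulators and normalize by the number of pixels.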
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ H[1][0] = H[0][1];
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+static INLINE void highbd_calc_proj_params_r0_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+
+ do {
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt0_ptr = flt0;
+ int w = width;
+
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+
+ int32x4_t u_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f0_lo = vsubq_s32(f0_lo, u_lo);
+ f0_hi = vsubq_s32(f0_hi, u_hi);
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+}
+
+static INLINE void highbd_calc_proj_params_r1_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int32x4_t u_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f1_lo = vsubq_s32(f1_lo, u_lo);
+ f1_hi = vsubq_s32(f1_hi, u_hi);
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+// The function calls 3 subfunctions for the following cases:
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+// of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
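+//
+// H is the auto-covariance matrix and C the cross-correlation vector of the
+// least-squares normal equations H * x = C, whose solution gives the
+// self-guided projection coefficients.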
+void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
+
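+// Byte-wise table lookup across two 128-bit vectors: gathers the 16-bit
+// elements of { a, b } selected by the byte indices in idx.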
+static INLINE int16x8_t tbl2q(int16x8_t a, int16x8_t b, uint8x16_t idx) {
+#if AOM_ARCH_AARCH64
+ uint8x16x2_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b) } };
+ return vreinterpretq_s16_u8(vqtbl2q_u8(table, idx));
+#else
+ uint8x8x4_t table = { { vreinterpret_u8_s16(vget_low_s16(a)),
+ vreinterpret_u8_s16(vget_high_s16(a)),
+ vreinterpret_u8_s16(vget_low_s16(b)),
+ vreinterpret_u8_s16(vget_high_s16(b)) } };
+ return vreinterpretq_s16_u8(vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)),
+ vtbl4_u8(table, vget_high_u8(idx))));
+#endif
+}
+
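+// As tbl2q, but gathers across three source vectors.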
+static INLINE int16x8_t tbl3q(int16x8_t a, int16x8_t b, int16x8_t c,
+ uint8x16_t idx) {
+#if AOM_ARCH_AARCH64
+ uint8x16x3_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b),
+ vreinterpretq_u8_s16(c) } };
+ return vreinterpretq_s16_u8(vqtbl3q_u8(table, idx));
+#else
+  // This specific implementation only works for compute stats with
+  // wiener_win == 5.
+ uint8x8x3_t table_lo = { { vreinterpret_u8_s16(vget_low_s16(a)),
+ vreinterpret_u8_s16(vget_high_s16(a)),
+ vreinterpret_u8_s16(vget_low_s16(b)) } };
+ uint8x8x3_t table_hi = { { vreinterpret_u8_s16(vget_low_s16(b)),
+ vreinterpret_u8_s16(vget_high_s16(b)),
+ vreinterpret_u8_s16(vget_low_s16(c)) } };
+ return vreinterpretq_s16_u8(vcombine_u8(
+ vtbl3_u8(table_lo, vget_low_u8(idx)),
+ vtbl3_u8(table_hi, vsub_u8(vget_high_u8(idx), vdup_n_u8(16)))));
+#endif
+}
+
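+// Divide by 2^power, rounding towards zero: the bias added for negative
+// inputs makes the arithmetic right shift match C integer division.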
+static INLINE int64_t div_shift_s64(int64_t x, int power) {
+ return (x < 0 ? x + (1ll << power) - 1 : x) >> power;
+}
+
+// The M matrix is accumulated in a bitdepth-dependent number of steps to
+// speed up the computation. This function computes the final M from the
+// accumulated part (src_s64) and the residual part (src_s32). It also
+// transposes the result, as the output needs to be column-major.
+static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64,
+ const int32_t *src_s32, const int wiener_win,
+ int shift) {
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = 0; j < wiener_win; ++j) {
+ int tr_idx = j * wiener_win + i;
+ *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift);
+ }
+ }
+}
+
+// The resulting H is a column-major matrix accumulated from the transposed
+// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single
+// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This
+// function transforms back to the originally expected format (double
+// transpose). The H matrix is accumulated in a bitdepth-dependent number of
+// steps to speed up the computation. This function computes the final H from
+// the accumulated part (src_s64) and the residual part (src_s32). The computed
+// H is only an upper-triangle matrix; this function also fills in the lower
+// triangle of the resulting matrix.
+static INLINE void update_H(int64_t *dst, const int64_t *src_s64,
+ const int32_t *src_s32, const int wiener_win,
+ int stride, int shift) {
+ // For a simplified theoretical 3x3 case where `wiener_win` is 3 and
+ // `wiener_win2` is 9, the M matrix is 3x3:
+ // 0, 3, 6
+ // 1, 4, 7
+ // 2, 5, 8
+ //
+ // This is viewed as a vector to compute H (9x9) by vector outer product:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8
+ //
+ // Double transpose and upper triangle remapping for 3x3 -> 9x9 case:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8,
+ // 3, 30, 33, 12, 31, 34, 21, 32, 35,
+ // 6, 33, 60, 15, 42, 61, 24, 51, 62,
+ // 1, 12, 15, 10, 13, 16, 11, 14, 17,
+ // 4, 31, 42, 13, 40, 43, 22, 41, 44,
+ // 7, 34, 61, 16, 43, 70, 25, 52, 71,
+ // 2, 21, 24, 11, 22, 25, 20, 23, 26,
+ // 5, 32, 51, 14, 41, 52, 23, 50, 53,
+ // 8, 35, 62, 17, 44, 71, 26, 53, 80,
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ // Loop through the indices according to the remapping above, along the
+ // columns:
+ // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ...,
+ // wiener_win - 1, wiener_win - 1 + wiener_win, ...
+ // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8.
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = i; j < wiener_win2; j += wiener_win) {
+ // These two inner loops are the same as the two outer loops, but running
+ // along rows instead of columns. For the 3x3 case `l` will be:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8.
+ for (int k = 0; k < wiener_win; ++k) {
+ for (int l = k; l < wiener_win2; l += wiener_win) {
+          // The nominal double transpose indexing would be:
+          //   int idx = stride * j + l;
+          // However, we need the upper-right triangle, which is easy to
+          // obtain with some min/max operations.
+ int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l);
+
+ // Resulting matrix is filled by combining the 64-bit and the residual
+ // 32-bit matrices together with scaling.
+ *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift);
+ }
+ }
+ }
+ }
+}
+
+// Load a 7x7 matrix into 7 128-bit vectors from consecutive rows; the last
+// load address is offset back by one element to prevent out-of-bounds access.
+static INLINE void load_and_pack_s16_8x7(int16x8_t dst[7], const int16_t *src,
+ ptrdiff_t stride) {
+ dst[0] = vld1q_s16(src);
+ src += stride;
+ dst[1] = vld1q_s16(src);
+ src += stride;
+ dst[2] = vld1q_s16(src);
+ src += stride;
+ dst[3] = vld1q_s16(src);
+ src += stride;
+ dst[4] = vld1q_s16(src);
+ src += stride;
+ dst[5] = vld1q_s16(src);
+ src += stride;
+ dst[6] = vld1q_s16(src - 1);
+}
+
+static INLINE void highbd_compute_stats_win7_neon(
+ const uint16_t *dgd, const uint16_t *src, int avg, int width, int height,
+ int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+ DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+  // Look-up tables to create an 8x6 matrix with consecutive elements from two
+  // 7x7 matrices.
+ // clang-format off
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7_highbd[192]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21,
+ 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23,
+ 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 10, 11, 12, 13, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23,
+ 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25,
+ 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
+ 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ };
+ // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats7_highbd + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats7_highbd + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats7_highbd + 32);
+ const uint8x16_t lut3 = vld1q_u8(shuffle_stats7_highbd + 48);
+ const uint8x16_t lut4 = vld1q_u8(shuffle_stats7_highbd + 64);
+ const uint8x16_t lut5 = vld1q_u8(shuffle_stats7_highbd + 80);
+ const uint8x16_t lut6 = vld1q_u8(shuffle_stats7_highbd + 96);
+ const uint8x16_t lut7 = vld1q_u8(shuffle_stats7_highbd + 112);
+ const uint8x16_t lut8 = vld1q_u8(shuffle_stats7_highbd + 128);
+ const uint8x16_t lut9 = vld1q_u8(shuffle_stats7_highbd + 144);
+ const uint8x16_t lut10 = vld1q_u8(shuffle_stats7_highbd + 160);
+ const uint8x16_t lut11 = vld1q_u8(shuffle_stats7_highbd + 176);
+
+  // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
+  // in a 32-bit accumulator. Since 2 pixels are processed at a time, the
+  // iteration limit is halved to 32768/2048/128 for the stats computation.
+ const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+ int acc_cnt = acc_cnt_max;
+ const int src_next = src_stride - width;
+ const int dgd_next = dgd_stride - width;
+ const int16x8_t avg_s16 = vdupq_n_s16(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
+      // middle 6x7 elements shared.
+ int16x8_t dgd_rows[7];
+ load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6;
+ dgd += 2;
+
+ dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16);
+ dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16);
+ dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16);
+ dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16);
+ dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16);
+ dgd_rows[5] = vsubq_s16(dgd_rows[5], avg_s16);
+ dgd_rows[6] = vsubq_s16(dgd_rows[6], avg_s16);
+
+      // Re-arrange the combined 8x7 matrix so that the 2 whole 7x7 matrices
+      // (1 for each of the 2 pixels) are separated into distinct
+      // int16x8_t[6] arrays holding 48 of the 49 (7x7) `dgd - avg` elements.
+      // Each DGD_AVG buffer stores 49 consecutive elements.
+ int16x8_t dgd_avg0[6];
+ int16x8_t dgd_avg1[6];
+
+ dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut6);
+ dgd_avg0[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
+ dgd_avg1[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut7);
+ dgd_avg0[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
+ dgd_avg1[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut8);
+ dgd_avg0[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut3);
+ dgd_avg1[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut9);
+ dgd_avg0[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut4);
+ dgd_avg1[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut10);
+ dgd_avg0[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut5);
+ dgd_avg1[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut11);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG1, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+ vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]);
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+ vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]);
+
+ // The remaining last (49th) elements of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+ DGD_AVG1[48] = dgd_ptr[7] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 7 * 7. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+ update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3],
+ dgd_avg1[3]);
+ update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4],
+ dgd_avg1[4]);
+ update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5],
+ dgd_avg1[5]);
+
+      // Last (49th) element of M_s32 can be computed as a scalar more
+      // efficiently for 2 output pixels.
+ M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1;
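+      // Taken together, the update_M_2pixels calls and the scalar update
+      // above are equivalent to (illustration only):
+      //   for (int k = 0; k < 49; ++k)
+      //     M_s32[k] += src_avg0 * DGD_AVG0[k] + src_avg1 * DGD_AVG1[k];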
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 49 * 49. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
+ DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];
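+      // Taken together, the helper call and the scalar update above are
+      // equivalent to the following upper-triangle accumulation (illustration
+      // only; the helper works on 4x4 tiles, so each row's accumulation
+      // actually starts at a 4-element-aligned column):
+      //   for (int i = 0; i < 49; ++i)
+      //     for (int j = i; j < 49; ++j)
+      //       H_s32[i * WIENER_WIN2_ALIGN2 + j] +=
+      //           DGD_AVG0[i] * DGD_AVG0[j] + DGD_AVG1[i] * DGD_AVG1[j];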
+
+ // Accumulate into 64-bit after a bit depth dependent number of iterations
+ // to prevent overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = acc_cnt_max;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2; ++k) {
+        // The widening accumulation is only run for the relevant part of
+        // each row (the upper-right triangle), starting at a 4-element
+        // aligned column.
+ int k4 = k / 4 * 4;
+ accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4);
+
+ // Last element of the row is computed separately.
+ lh[48] += lh32[48];
+ lh32[48] = 0;
+
+ lh += WIENER_WIN2_ALIGN2;
+ lh32 += WIENER_WIN2_ALIGN2;
+ }
+ }
+
+ j -= 2;
+ }
+
+    // Computations for the remaining odd pixel in the row.
+ if (width & 1) {
+      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
+ // middle 6x7 elements being shared.
+ int16x8_t dgd_rows[7];
+ load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6;
+ ++dgd;
+
+ // Re-arrange the combined 8x7 matrix to have a whole 7x7 matrix tightly
+      // packed into an int16x8_t[6] array. This array contains 48 elements of
+ // the 49 (7x7). Compute `dgd - avg` for the whole buffer. The DGD_AVG
+ // buffer contains 49 consecutive elements.
+ int16x8_t dgd_avg0[6];
+
+ dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16);
+ dgd_avg0[1] = vsubq_s16(tbl2q(dgd_rows[1], dgd_rows[2], lut1), avg_s16);
+ dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[2], dgd_rows[3], lut2), avg_s16);
+ dgd_avg0[3] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut3), avg_s16);
+ dgd_avg0[4] = vsubq_s16(tbl2q(dgd_rows[4], dgd_rows[5], lut4), avg_s16);
+ dgd_avg0[5] = vsubq_s16(tbl2q(dgd_rows[5], dgd_rows[6], lut5), avg_s16);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+
+ // The remaining last (49th) element of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+ update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
+ update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
+ update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);
+
+      // Last (49th) element of M_s32 can be computed as a scalar more
+      // efficiently for 1 output pixel.
+ M_s32[48] += DGD_AVG0[48] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 49 * 49. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+      // column-major. It is not efficient to work with column-major matrices,
+      // so we accumulate into a row-major matrix H_s32. At the end of the
+      // algorithm a double transpose transformation will convert H_s32 back
+      // to the expected output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
+
+ // The last element of the triangle of H_s32 matrix can be computed as
+      // a scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
+ }
+
+ src += src_next;
+ dgd += dgd_next;
+ } while (--height != 0);
+
+ int bit_depth_shift = bit_depth - AOM_BITS_8;
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, bit_depth_shift);
+
+ update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, bit_depth_shift);
+}
+
+// Load a 6x5 matrix into 5 128-bit vectors from consecutive rows; the last
+// load address is offset backwards to prevent an out-of-bounds access.
+static INLINE void load_and_pack_s16_6x5(int16x8_t dst[5], const int16_t *src,
+ ptrdiff_t stride) {
+ dst[0] = vld1q_s16(src);
+ src += stride;
+ dst[1] = vld1q_s16(src);
+ src += stride;
+ dst[2] = vld1q_s16(src);
+ src += stride;
+ dst[3] = vld1q_s16(src);
+ src += stride;
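+  // The last row is loaded 3 elements back so that the 8-element load cannot
+  // read past the end of the buffer; the shuffle tables compensate by
+  // indexing this row's elements 3 positions later.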
+ dst[4] = vld1q_s16(src - 3);
+}
+
+static void highbd_compute_stats_win5_neon(const uint16_t *dgd,
+ const uint16_t *src, int avg,
+ int width, int height,
+ int dgd_stride, int src_stride,
+ int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t,
+ H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+ DECLARE_ALIGNED(64, int64_t,
+ H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+  // Look-up tables to create an 8x3 matrix with consecutive elements from the
+  // 5x5 matrix.
+  // clang-format off
+  DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5_highbd[96]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21,
+ 6, 7, 8, 9, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 32, 33,
+ 2, 3, 4, 5, 6, 7, 8, 9, 22, 23, 24, 25, 26, 27, 28, 29,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23,
+ 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 34, 35,
+ 4, 5, 6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31,
+  };
+  // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats5_highbd + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats5_highbd + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats5_highbd + 32);
+ const uint8x16_t lut3 = vld1q_u8(shuffle_stats5_highbd + 48);
+ const uint8x16_t lut4 = vld1q_u8(shuffle_stats5_highbd + 64);
+ const uint8x16_t lut5 = vld1q_u8(shuffle_stats5_highbd + 80);
+
+ // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
+ // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can
+ // be as high as 32768/2048/128 for the compute stats.
+ const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+ int acc_cnt = acc_cnt_max;
+ const int src_next = src_stride - width;
+ const int dgd_next = dgd_stride - width;
+ const int16x8_t avg_s16 = vdupq_n_s16(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ int16x8_t dgd_rows[5];
+ load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4;
+ dgd += 2;
+
+ dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16);
+ dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16);
+ dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16);
+ dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16);
+ dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16);
+
+ // Re-arrange the combined 6x5 matrix to have the 2 whole 5x5 matrices (1
+ // for each of the 2 pixels) separated into distinct int16x8_t[3] arrays.
+ // These arrays contain 24 elements of the 25 (5x5). Compute `dgd - avg`
+ // for both buffers. Each DGD_AVG buffer contains 25 consecutive elements.
+ int16x8_t dgd_avg0[3];
+ int16x8_t dgd_avg1[3];
+
+ dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut3);
+ dgd_avg0[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1);
+ dgd_avg1[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut4);
+ dgd_avg0[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut2);
+ dgd_avg1[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut5);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG1, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+
+ // The remaining last (25th) elements of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+ DGD_AVG1[24] = dgd_ptr[5] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 5 * 5. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+
+      // Last (25th) element of M_s32 can be computed as a scalar more
+      // efficiently for 2 output pixels.
+ M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 25 * 25. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];
+
+ // Accumulate into 64-bit after a bit depth dependent number of iterations
+ // to prevent overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = acc_cnt_max;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) {
+        // The widening accumulation is only run for the relevant part of
+        // each row (the upper-right triangle), starting at a 4-element
+        // aligned column.
+ int k4 = k / 4 * 4;
+ accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);
+
+ // Last element of the row is computed separately.
+ lh[24] += lh32[24];
+ lh32[24] = 0;
+
+ lh += WIENER_WIN2_REDUCED_ALIGN2;
+ lh32 += WIENER_WIN2_REDUCED_ALIGN2;
+ }
+ }
+
+ j -= 2;
+ }
+
+    // Computations for the remaining odd pixel in the row.
+ if (width & 1) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ int16x8_t dgd_rows[5];
+ load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4;
+ ++dgd;
+
+      // Re-arrange the combined 6x5 matrix to have a whole 5x5 matrix tightly
+      // packed into an int16x8_t[3] array. This array contains 24 elements of
+      // the 25 (5x5). Compute `dgd - avg` for the whole buffer. The DGD_AVG
+      // buffer contains 25 consecutive elements.
+ int16x8_t dgd_avg0[3];
+
+ dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16);
+ dgd_avg0[1] = vsubq_s16(
+ tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1), avg_s16);
+ dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut2), avg_s16);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+
+ // The remaining last (25th) element of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+
+      // Last (25th) element of M_s32 can be computed as a scalar more
+      // efficiently for 1 output pixel.
+ M_s32[24] += DGD_AVG0[24] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 25 * 25. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24];
+ }
+
+ src += src_next;
+ dgd += dgd_next;
+ } while (--height != 0);
+
+ int bit_depth_shift = bit_depth - AOM_BITS_8;
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, bit_depth_shift);
+
+ update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
+ bit_depth_shift);
+}
+
+static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride,
+ int width, int height) {
+ assert(width > 0);
+ assert(height > 0);
+
+ uint64x2_t sum_u64 = vdupq_n_u64(0);
+ uint64_t sum = 0;
+
+ int h = height;
+ do {
+ uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int w = width;
+ const uint16_t *row = src;
+ while (w >= 32) {
+ uint16x8_t s0 = vld1q_u16(row + 0);
+ uint16x8_t s1 = vld1q_u16(row + 8);
+ uint16x8_t s2 = vld1q_u16(row + 16);
+ uint16x8_t s3 = vld1q_u16(row + 24);
+
+ s0 = vaddq_u16(s0, s1);
+ s2 = vaddq_u16(s2, s3);
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+ sum_u32[1] = vpadalq_u16(sum_u32[1], s2);
+
+ row += 32;
+ w -= 32;
+ }
+
+ if (w >= 16) {
+ uint16x8_t s0 = vld1q_u16(row + 0);
+ uint16x8_t s1 = vld1q_u16(row + 8);
+
+ s0 = vaddq_u16(s0, s1);
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+
+ row += 16;
+ w -= 16;
+ }
+
+ if (w >= 8) {
+ uint16x8_t s0 = vld1q_u16(row);
+ sum_u32[1] = vpadalq_u16(sum_u32[1], s0);
+
+ row += 8;
+ w -= 8;
+ }
+
+ if (w >= 4) {
+ uint16x8_t s0 = vcombine_u16(vld1_u16(row), vdup_n_u16(0));
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+
+ row += 4;
+ w -= 4;
+ }
+
+ while (w-- > 0) {
+ sum += *row++;
+ }
+
+ sum_u64 = vpadalq_u32(sum_u64, vaddq_u32(sum_u32[0], sum_u32[1]));
+
+ src += src_stride;
+ } while (--h != 0);
+
+ return (uint16_t)((horizontal_add_u64x2(sum_u64) + sum) / (height * width));
+}
+
+void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8,
+ const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED);
+
+ const int wiener_halfwin = wiener_win >> 1;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const int height = v_end - v_start;
+ const int width = h_end - h_start;
+
+ const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride;
+ const uint16_t *src_start = src + h_start + v_start * src_stride;
+
+ // The wiener window will slide along the dgd frame, centered on each pixel.
+ // For the top left pixel and all the pixels on the side of the frame this
+ // means half of the window will be outside of the frame. As such the actual
+ // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+ // wider and 2 * wiener_halfwin higher than the original dgd buffer.
+ const int vert_offset = v_start - wiener_halfwin;
+ const int horiz_offset = h_start - wiener_halfwin;
+ const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
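+  // e.g. for wiener_win == 7, wiener_halfwin == 3, so dgd_win starts 3 rows
+  // above and 3 columns to the left of dgd_start.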
+
+ uint16_t avg = highbd_find_average_neon(dgd_start, dgd_stride, width, height);
+
+ if (wiener_win == WIENER_WIN) {
+ highbd_compute_stats_win7_neon(dgd_win, src_start, avg, width, height,
+ dgd_stride, src_stride, M, H, bit_depth);
+ } else {
+ highbd_compute_stats_win5_neon(dgd_win, src_start, avg, width, height,
+ dgd_stride, src_stride, M, H, bit_depth);
+ }
+}
+
+int64_t av1_highbd_pixel_proj_error_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ int64_t sse = 0;
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ int32x2_t xq_v = vld1_s32(xq);
+ int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4);
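+    // Each output pixel is reconstructed as
+    //   dat + ((xq[0] * (flt0 - (dat << SGRPROJ_RST_BITS)) +
+    //           xq[1] * (flt1 - (dat << SGRPROJ_RST_BITS)) + rounding) >>
+    //          (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS))
+    // The two (dat << SGRPROJ_RST_BITS) terms are folded into a single
+    // multiply by 16 * (xq[0] + xq[1]), which is what xq_sum_v holds.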
+
+ do {
+ int j = 0;
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+
+ do {
+ const uint16x8_t d = vld1q_u16(&dat[j]);
+ const uint16x8_t s = vld1q_u16(&src[j]);
+ int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
+ int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
+ int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
+ int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);
+
+ int32x4_t d_s32_lo = vreinterpretq_s32_u32(
+ vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0));
+ int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16(
+ vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0));
+
+ int32x4_t v0 = vsubq_s32(
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)),
+ d_s32_lo);
+ int32x4_t v1 = vsubq_s32(
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)),
+ d_s32_hi);
+
+ v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0);
+ v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0);
+ v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
+ v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);
+
+ int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+ int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+ int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
+ vreinterpretq_s16_u16(vsubq_u16(d, s)));
+ int16x4_t e_lo = vget_low_s16(e);
+ int16x4_t e_hi = vget_high_s16(e);
+
+ sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+ sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+ j += 8;
+ } while (j <= width - 8);
+
+ for (int k = j; k < width; ++k) {
+ int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+ v += xq[0] * (flt0[k]) + xq[1] * (flt1[k]);
+ v -= (xq[1] + xq[0]) * (int32_t)(dat[k] << 4);
+ int32_t e =
+ (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+ sse += ((int64_t)e * e);
+ }
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+ int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ int32x4_t xq_v = vdupq_n_s32(xq_active);
+
+ do {
+ int j = 0;
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ do {
+ const uint16x8_t d0 = vld1q_u16(&dat[j]);
+ const uint16x8_t s0 = vld1q_u16(&src[j]);
+ int32x4_t flt0_0 = vld1q_s32(&flt[j]);
+ int32x4_t flt0_1 = vld1q_s32(&flt[j + 4]);
+
+ uint16x8_t d_u16 = vshlq_n_u16(d0, 4);
+ int32x4_t sub0 = vreinterpretq_s32_u32(
+ vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16)));
+ int32x4_t sub1 = vreinterpretq_s32_u32(
+ vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16)));
+
+ int32x4_t v0 = vmlaq_s32(
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub0,
+ xq_v);
+ int32x4_t v1 = vmlaq_s32(
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub1,
+ xq_v);
+
+ int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+ int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+ int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
+ vreinterpretq_s16_u16(vsubq_u16(d0, s0)));
+ int16x4_t e_lo = vget_low_s16(e);
+ int16x4_t e_hi = vget_high_s16(e);
+
+ sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+ sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+ j += 8;
+ } while (j <= width - 8);
+
+ for (int k = j; k < width; ++k) {
+ int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+        v += xq_active * (int32_t)((uint32_t)flt[k] - (uint16_t)(dat[k] << 4));
+ const int32_t e =
+ (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+ sse += ((int64_t)e * e);
+ }
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+ dat += dat_stride;
+ flt += flt_stride;
+ src += src_stride;
+ } while (--height != 0);
+ } else {
+ do {
+ int j = 0;
+
+ do {
+ const uint16x8_t d = vld1q_u16(&dat[j]);
+ const uint16x8_t s = vld1q_u16(&src[j]);
+
+ uint16x8_t diff = vabdq_u16(d, s);
+ uint16x4_t diff_lo = vget_low_u16(diff);
+ uint16x4_t diff_hi = vget_high_u16(diff);
+
+ uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo);
+ uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi);
+
+ sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo));
+ sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi));
+
+ j += 8;
+ } while (j <= width - 8);
+
+ for (int k = j; k < width; ++k) {
+ int32_t e = dat[k] - src[k];
+ sse += e * e;
+ }
+
+ dat += dat_stride;
+ src += src_stride;
+ } while (--height != 0);
+ }
+
+ sse += horizontal_add_s64x2(sse_s64);
+ return sse;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c
new file mode 100644
index 0000000000..4bf7ae6ce4
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+int64_t av1_highbd_block_error_neon(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz, int bd) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+ int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+ const int shift = 2 * (bd - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
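+  // e.g. bd == 10 gives shift == 4, so the final sums are rounded and scaled
+  // back to 8-bit-equivalent units: (x + 8) >> 4.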
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int32x4_t c = vld1q_s32(coeff);
+ const int32x4_t d = vld1q_s32(dqcoeff);
+
+ const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d));
+
+ err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff));
+ err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff));
+
+ ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c));
+ ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c));
+
+ coeff += 4;
+ dqcoeff += 4;
+ block_size -= 4;
+ } while (block_size != 0);
+
+ *ssz = (horizontal_add_s64x2(ssz_s64) + rounding) >> shift;
+ return ((int64_t)horizontal_add_u64x2(err_u64) + rounding) >> shift;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
new file mode 100644
index 0000000000..88e176f56c
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_squared_error(
+ const uint16_t *frame1, const uint32_t stride1, const uint16_t *frame2,
+ const uint32_t stride2, const uint32_t block_width,
+ const uint32_t block_height, uint32_t *frame_sse,
+ const unsigned int dst_stride) {
+ uint32_t *dst = frame_sse;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(frame1 + i * stride1 + j);
+ uint16x8_t r = vld1q_u16(frame2 + i * stride2 + j);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint32x4_t sse_lo =
+ vmull_u16(vget_low_u16(abs_diff), vget_low_u16(abs_diff));
+ uint32x4_t sse_hi =
+ vmull_u16(vget_high_u16(abs_diff), vget_high_u16(abs_diff));
+
+ vst1q_u32(dst + j, sse_lo);
+ vst1q_u32(dst + j + 4, sse_hi);
+
+ j += 8;
+ } while (j < block_width);
+
+ dst += dst_stride;
+ i++;
+ } while (i < block_height);
+}
+
+static uint32_t sum_kernel5x5_mask_single(const uint32x4_t vsrc[5][2],
+ const uint32x4_t mask_single) {
+ uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[1][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[2][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[3][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[4][0], mask_single);
+ return horizontal_add_u32x4(vsums);
+}
+
+static uint32x4_t sum_kernel5x5_mask_double(const uint32x4_t vsrc[5][2],
+ const uint32x4_t mask1,
+ const uint32x4_t mask2) {
+ uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[1][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[2][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[3][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[4][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[0][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[1][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[2][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[3][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[4][1], mask2);
+ return vsums;
+}
+
+static void highbd_apply_temporal_filter(
+ const uint16_t *frame, const unsigned int stride,
+ const uint32_t block_width, const uint32_t block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ const uint32_t *frame_sse, const uint32_t frame_sse_stride,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl,
+ int bd) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW] = { 0 };
+ const int half_window = TF_WINDOW_LENGTH >> 1;
+
+ uint32x4_t vsrc[5][2] = { 0 };
+ const uint32x4_t k0000 = vdupq_n_u32(0);
+ const uint32x4_t k1111 = vdupq_n_u32(1);
+ const uint32_t k3110_u32[4] = { 0, 1, 1, 3 };
+ const uint32_t k2111_u32[4] = { 1, 1, 1, 2 };
+ const uint32_t k1112_u32[4] = { 2, 1, 1, 1 };
+ const uint32_t k0113_u32[4] = { 3, 1, 1, 0 };
+ const uint32x4_t k3110 = vld1q_u32(k3110_u32);
+ const uint32x4_t k2111 = vld1q_u32(k2111_u32);
+ const uint32x4_t k1112 = vld1q_u32(k1112_u32);
+ const uint32x4_t k0113 = vld1q_u32(k0113_u32);
+
+ uint32x4_t vmask1[4], vmask2[4];
+ vmask1[0] = k1111;
+ vmask2[0] = vextq_u32(k1111, k0000, 3);
+ vmask1[1] = vextq_u32(k0000, k1111, 3);
+ vmask2[1] = vextq_u32(k1111, k0000, 2);
+ vmask1[2] = vextq_u32(k0000, k1111, 2);
+ vmask2[2] = vextq_u32(k1111, k0000, 1);
+ vmask1[3] = vextq_u32(k0000, k1111, 1);
+ vmask2[3] = k1111;
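+  // The k0113/k1112 and k2111/k3110 kernels fold the clamped out-of-range
+  // columns of the 5-tap window into the edge columns: e.g. for column 0 the
+  // window covers columns { -2, -1, 0, 1, 2 }, which clamp to
+  // { 0, 0, 0, 1, 2 }, hence the lane weights { 3, 1, 1, 0 }. The
+  // vmask1/vmask2 pairs are the all-ones interior window shifted across the
+  // two 4-lane halves.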
+
+ uint32_t row = 0;
+ do {
+ uint32_t col = 0;
+ const uint32_t *src = frame_sse + row * frame_sse_stride;
+ if (row == 0) {
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+ // First 2 rows of the 5x5 matrix are padded from the 1st.
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[1][0] = vsrc[2][0];
+ } else if (row == 1) {
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+      // First row of the 5x5 matrix is padded from the 1st.
+ vsrc[0][0] = vsrc[1][0];
+ } else if (row == block_height - 2) {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+
+      // Last row of the 5x5 matrix is padded from the one before.
+ vsrc[4][0] = vsrc[3][0];
+ } else if (row == block_height - 1) {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+
+ // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+ vsrc[3][0] = vsrc[2][0];
+ vsrc[4][0] = vsrc[2][0];
+ } else {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+ }
+
+ acc_5x5_neon[row][0] = sum_kernel5x5_mask_single(vsrc, k0113);
+ acc_5x5_neon[row][1] = sum_kernel5x5_mask_single(vsrc, k1112);
+
+ col += 4;
+ src += 4;
+ // Traverse 4 columns at a time
+ do {
+ if (row == 0) {
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+ // First 2 rows of the 5x5 matrix are padded from the 1st.
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][1] = vsrc[2][1];
+ } else if (row == 1) {
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+        // First row of the 5x5 matrix is padded from the 1st.
+ vsrc[0][1] = vsrc[1][1];
+ } else if (row == block_height - 2) {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+
+        // Last row of the 5x5 matrix is padded from the one before.
+ vsrc[4][1] = vsrc[3][1];
+ } else if (row == block_height - 1) {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+
+ // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+ vsrc[3][1] = vsrc[2][1];
+ vsrc[4][1] = vsrc[2][1];
+ } else {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+ }
+
+ uint32x4_t sums[4];
+ sums[0] = sum_kernel5x5_mask_double(vsrc, vmask1[0], vmask2[0]);
+ sums[1] = sum_kernel5x5_mask_double(vsrc, vmask1[1], vmask2[1]);
+ sums[2] = sum_kernel5x5_mask_double(vsrc, vmask1[2], vmask2[2]);
+ sums[3] = sum_kernel5x5_mask_double(vsrc, vmask1[3], vmask2[3]);
+ vst1q_u32(&acc_5x5_neon[row][col - half_window],
+ horizontal_add_4d_u32x4(sums));
+
+ vsrc[0][0] = vsrc[0][1];
+ vsrc[1][0] = vsrc[1][1];
+ vsrc[2][0] = vsrc[2][1];
+ vsrc[3][0] = vsrc[3][1];
+ vsrc[4][0] = vsrc[4][1];
+
+ src += 4;
+ col += 4;
+ } while (col <= block_width - 4);
+
+ acc_5x5_neon[row][col - half_window] =
+ sum_kernel5x5_mask_single(vsrc, k2111);
+ acc_5x5_neon[row][col - half_window + 1] =
+ sum_kernel5x5_mask_single(vsrc, k3110);
+
+ row++;
+ } while (row < block_height);
+
+ // Perform filtering.
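+  // The two branches below are identical except for the filter weight
+  // computation: tf_wgt_calc_lvl == 0 uses the exact exp(), otherwise the
+  // faster approx_exp() approximation is used.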
+ if (tf_wgt_calc_lvl == 0) {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ // Scale down the difference for high bit depth input.
+ const uint32_t diff_sse =
+ (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ // Scale down the difference for high bit depth input.
+ const uint32_t diff_sse =
+ (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
+
+void av1_highbd_apply_temporal_filter_neon(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred8, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+ assert(is_high_bitdepth);
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint32_t frame_sse[BW * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const uint32_t frame_sse_stride = plane_w;
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint16_t *ref =
+ CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ const int ww = frame_sse_stride
+ << ss_x_shift; // Width of Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * ww + xx];
+ }
+ }
+ }
+ }
+ }
+ get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_sse, frame_sse_stride);
+
+ highbd_apply_temporal_filter(
+ pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses,
+ accum + plane_offset, count + plane_offset, frame_sse, frame_sse_stride,
+ luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl, mbd->bd);
+
+ plane_offset += plane_h * plane_w;
+ }
+}
+
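+// This appears to follow Immerkaer's fast noise variance estimate: pixels
+// with a small Sobel gradient magnitude are treated as flat, and the mean
+// absolute response of the 3x3 Laplacian kernel
+//   1 -2  1
+//  -2  4 -2
+//   1 -2  1
+// over those pixels is scaled by sqrt(pi / 2) / 6 to estimate the noise
+// standard deviation.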
+double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src,
+ int height, int width,
+ int stride,
+ int bitdepth,
+ int edge_thresh) {
+ uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+ uint64x2_t acc = vdupq_n_u64(0);
+ // Count is in theory positive as it counts the number of times we're under
+ // the threshold, but it will be counted negatively in order to make best use
+ // of the vclt instruction, which sets every bit of a lane to 1 when the
+ // condition is true.
+ int32x4_t count = vdupq_n_s32(0);
+ int final_count = 0;
+ uint64_t final_acc = 0;
+ const uint16_t *src_start = src + stride + 1;
+ int h = 1;
+
+ do {
+ int w = 1;
+ const uint16_t *src_ptr = src_start;
+
+ while (w <= (width - 1) - 8) {
+ uint16x8_t mat[3][3];
+ mat[0][0] = vld1q_u16(src_ptr - stride - 1);
+ mat[0][1] = vld1q_u16(src_ptr - stride);
+ mat[0][2] = vld1q_u16(src_ptr - stride + 1);
+ mat[1][0] = vld1q_u16(src_ptr - 1);
+ mat[1][1] = vld1q_u16(src_ptr);
+ mat[1][2] = vld1q_u16(src_ptr + 1);
+ mat[2][0] = vld1q_u16(src_ptr + stride - 1);
+ mat[2][1] = vld1q_u16(src_ptr + stride);
+ mat[2][2] = vld1q_u16(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddq_u16(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddq_u16(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddq_u16(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddq_u16(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddq_u16(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddq_u16(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddq_u16(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddq_u16(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+ ga = vrshlq_u16(ga, vdupq_n_s16(8 - bitdepth));
+
+ // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+ uint16x8_t center = vshlq_n_u16(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddq_u16(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddq_u16(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddq_u16(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddq_u16(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vabdq_u16(vaddq_u16(center, diag), adj);
+ v = vandq_u16(vrshlq_u16(v, vdupq_n_s16(8 - bitdepth)), thresh_u16);
+ uint32x4_t v_u32 = vpaddlq_u16(v);
+
+ acc = vpadalq_u32(acc, v_u32);
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 8;
+ src_ptr += 8;
+ }
+
+ if (w <= (width - 1) - 4) {
+ uint16x4_t mat[3][3];
+ mat[0][0] = vld1_u16(src_ptr - stride - 1);
+ mat[0][1] = vld1_u16(src_ptr - stride);
+ mat[0][2] = vld1_u16(src_ptr - stride + 1);
+ mat[1][0] = vld1_u16(src_ptr - 1);
+ mat[1][1] = vld1_u16(src_ptr);
+ mat[1][2] = vld1_u16(src_ptr + 1);
+ mat[2][0] = vld1_u16(src_ptr + stride - 1);
+ mat[2][1] = vld1_u16(src_ptr + stride);
+ mat[2][2] = vld1_u16(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x4_t gxa = vadd_u16(mat[0][0], mat[2][0]);
+ uint16x4_t gxb = vadd_u16(mat[0][2], mat[2][2]);
+ gxa = vadd_u16(gxa, vadd_u16(mat[1][0], mat[1][0]));
+ gxb = vadd_u16(gxb, vadd_u16(mat[1][2], mat[1][2]));
+
+ uint16x4_t gya = vadd_u16(mat[0][0], mat[0][2]);
+ uint16x4_t gyb = vadd_u16(mat[2][0], mat[2][2]);
+ gya = vadd_u16(gya, vadd_u16(mat[0][1], mat[0][1]));
+ gyb = vadd_u16(gyb, vadd_u16(mat[2][1], mat[2][1]));
+
+ uint16x4_t ga = vaba_u16(vabd_u16(gxa, gxb), gya, gyb);
+ ga = vrshl_u16(ga, vdup_n_s16(8 - bitdepth));
+
+ // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x4_t thresh_u16 = vclt_u16(ga, vget_low_u16(thresh));
+
+ uint16x4_t center = vshl_n_u16(mat[1][1], 2);
+
+ uint16x4_t adj0 = vadd_u16(mat[0][1], mat[2][1]);
+ uint16x4_t adj1 = vadd_u16(mat[1][0], mat[1][2]);
+ uint16x4_t adj = vadd_u16(adj0, adj1);
+ adj = vadd_u16(adj, adj);
+
+ uint16x4_t diag0 = vadd_u16(mat[0][0], mat[0][2]);
+ uint16x4_t diag1 = vadd_u16(mat[2][0], mat[2][2]);
+ uint16x4_t diag = vadd_u16(diag0, diag1);
+
+ uint16x4_t v = vabd_u16(vadd_u16(center, diag), adj);
+ v = vand_u16(v, thresh_u16);
+ uint32x4_t v_u32 = vmovl_u16(vrshl_u16(v, vdup_n_s16(8 - bitdepth)));
+
+ acc = vpadalq_u32(acc, v_u32);
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vaddw_s16(count, vreinterpret_s16_u16(thresh_u16));
+
+ w += 4;
+ src_ptr += 4;
+ }
+
+ while (w < width - 1) {
+ int mat[3][3];
+ mat[0][0] = *(src_ptr - stride - 1);
+ mat[0][1] = *(src_ptr - stride);
+ mat[0][2] = *(src_ptr - stride + 1);
+ mat[1][0] = *(src_ptr - 1);
+ mat[1][1] = *(src_ptr);
+ mat[1][2] = *(src_ptr + 1);
+ mat[2][0] = *(src_ptr + stride - 1);
+ mat[2][1] = *(src_ptr + stride);
+ mat[2][2] = *(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int ga = ROUND_POWER_OF_TWO(abs(gx) + abs(gy), bitdepth - 8);
+
+ // Accumulate Laplacian.
+ const int is_under = ga < edge_thresh;
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ final_acc += ROUND_POWER_OF_TWO(abs(v), bitdepth - 8) * is_under;
+ final_count += is_under;
+
+ src_ptr++;
+ w++;
+ }
+ src_start += stride;
+ } while (++h < height - 1);
+
+ // We counted negatively, so subtract to get the final value.
+ final_count -= horizontal_add_s32x4(count);
+ final_acc += horizontal_add_u64x2(acc);
+ return (final_count < 16)
+ ? -1.0
+ : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c b/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
new file mode 100644
index 0000000000..6cf835a243
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
+ int32x4x2_t b0 =
+ vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+ int16x4x2_t c0 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+ vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+ int16x4x2_t c1 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+ vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+ out[0] = c0.val[0];
+ out[1] = c0.val[1];
+ out[2] = c1.val[0];
+ out[3] = c1.val[1];
+}
+
+void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride) {
+ // Load the 4x4 source in transposed form.
+ int16x4_t a1, b1, c1, d1, e;
+ a1 = vld1_s16(&input[0]);
+ b1 = vld1_s16(&input[1 * stride]);
+ c1 = vld1_s16(&input[2 * stride]);
+ d1 = vld1_s16(&input[3 * stride]);
+
+ // WHT.
+
+ // Row transforms.
+ a1 = vadd_s16(a1, b1);
+ d1 = vsub_s16(d1, c1);
+ e = vhsub_s16(a1, d1);
+ b1 = vsub_s16(e, b1);
+ c1 = vsub_s16(e, c1);
+ a1 = vsub_s16(a1, c1);
+ d1 = vadd_s16(d1, b1);
+
+ int16x8_t x[2];
+ x[0] = vcombine_s16(a1, c1);
+ x[1] = vcombine_s16(d1, b1);
+
+ int16x4_t s[4];
+ transpose4x4(x, s);
+
+ a1 = s[0];
+ b1 = s[1];
+ c1 = s[2];
+ d1 = s[3];
+
+ // Row transforms.
+ a1 = vadd_s16(a1, b1);
+ d1 = vsub_s16(d1, c1);
+ e = vhsub_s16(a1, d1);
+ b1 = vsub_s16(e, b1);
+ c1 = vsub_s16(e, c1);
+ a1 = vsub_s16(a1, c1);
+ d1 = vadd_s16(d1, b1);
+
+ vst1q_s32(&output[0], vshll_n_s16(a1, UNIT_QUANT_SHIFT));
+ vst1q_s32(&output[4], vshll_n_s16(c1, UNIT_QUANT_SHIFT));
+ vst1q_s32(&output[8], vshll_n_s16(d1, UNIT_QUANT_SHIFT));
+ vst1q_s32(&output[12], vshll_n_s16(b1, UNIT_QUANT_SHIFT));
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/ml_neon.c b/third_party/aom/av1/encoder/arm/neon/ml_neon.c
new file mode 100644
index 0000000000..be6ddfd763
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/ml_neon.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+
+static void nn_activate8(float32x4_t *out_h, float32x4_t *out_l,
+ const float32x4_t *zero) {
+ *out_h = vmaxq_f32(*out_h, *zero);
+ *out_l = vmaxq_f32(*out_l, *zero);
+}
+
+static void nn_activate4(float32x4_t *x, const float32x4_t *zero) {
+ *x = vmaxq_f32(*x, *zero);
+}
+
+#define CLAMP_0(x) (x = x > 0 ? x : 0)
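+// CLAMP_0 implements the ReLU activation: it clamps x to max(0, x) in place.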
+
+static void nn_propagate_8to1(int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes, bool output_layer) {
+ const float32x4_t zero = vdupq_n_f32(0);
+ float32x4_t vadd = zero;
+ float total = *layer_bias;
+
+ for (int in = 0; in < num_inputs; in += 8) {
+ const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+ const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+
+ const float32x4_t weights_h = vld1q_f32(&weights[in + 4]);
+ const float32x4_t weights_l = vld1q_f32(&weights[in]);
+
+ vadd = vmlaq_f32(vadd, inputs_h, weights_h);
+ vadd = vmlaq_f32(vadd, inputs_l, weights_l);
+ }
+#if AOM_ARCH_AARCH64
+ total += vaddvq_f32(vadd);
+#else
+ float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+ vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+ total += vget_lane_f32(vadd_lo, 0);
+#endif
+
+ if (!output_layer) CLAMP_0(total);
+ *output_nodes = total;
+}
+
+static void nn_propagate_xto1(int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes) {
+ float32x4_t vadd = vdupq_n_f32(0);
+
+ float total = *layer_bias;
+ int j = num_inputs;
+ int in = 0;
+ while (j > 7) {
+ const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+ const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+
+ const float32x4_t weights_h = vld1q_f32(&weights[in + 4]);
+ const float32x4_t weights_l = vld1q_f32(&weights[in]);
+
+ vadd = vmlaq_f32(vadd, inputs_h, weights_h);
+ vadd = vmlaq_f32(vadd, inputs_l, weights_l);
+ in += 8;
+ j -= 8;
+ }
+
+#if AOM_ARCH_AARCH64
+ total += vaddvq_f32(vadd);
+
+#else
+ float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+ vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+ total += vget_lane_f32(vadd_lo, 0);
+#endif
+ for (; in < num_inputs; in++) total += weights[in] * inputs[in];
+
+ *output_nodes = CLAMP_0(total);
+}
+
+static void nn_propagate_xsto1(int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes) {
+ float total = *layer_bias;
+#if AOM_ARCH_AARCH64
+ const float32x4_t v_inputs = vld1q_f32(inputs);
+ const float32x4_t v_weights = vld1q_f32(weights);
+ const float32x4_t vadd = vmulq_f32(v_inputs, v_weights);
+ total += vaddvq_f32(vadd);
+ int in = 4;
+#else
+ int in = 0;
+#endif
+ for (; in < num_inputs; in++) total += weights[in] * inputs[in];
+
+ *output_nodes = CLAMP_0(total);
+}
+
+static void nn_propagate_4to1(int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes, bool output_layer) {
+ const float32x4_t zero = vdupq_n_f32(0);
+ float32x4_t vadd = zero;
+ float total = *layer_bias;
+
+ for (int in = 0; in < num_inputs; in += 4) {
+ const float32x4_t v_inputs = vld1q_f32(&inputs[in]);
+ const float32x4_t v_weights = vld1q_f32(&weights[in]);
+ vadd = vmlaq_f32(vadd, v_inputs, v_weights);
+ }
+
+#if AOM_ARCH_AARCH64
+ total += vaddvq_f32(vadd);
+#else
+ float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+ vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+ total += vget_lane_f32(vadd_lo, 0);
+#endif
+
+ if (!output_layer) CLAMP_0(total);
+ *output_nodes = total;
+}
+
+static void nn_propagate_4to4(int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes, bool output_layer) {
+ float32x4_t outputs = vld1q_f32(layer_bias);
+ const float32x4_t zero = vdupq_n_f32(0);
+
+ float32x4_t mul0[2] = { zero, zero };
+ float32x4_t mul1[2] = { zero, zero };
+ for (int in = 0; in < num_inputs; in += 4) {
+ const float32x4_t v_input = vld1q_f32(&inputs[in]);
+
+ for (int i = 0; i < 2; i++) {
+ const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]);
+ mul0[i] = vmlaq_f32(mul0[i], weight0, v_input);
+ const float32x4_t weight1 =
+ vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]);
+ mul1[i] = vmlaq_f32(mul1[i], weight1, v_input);
+ }
+ }
+ for (int i = 0; i < 2; i++)
+#if AOM_ARCH_AARCH64
+ mul0[i] = vpaddq_f32(mul0[i], mul1[i]);
+ const float32x4_t hh = vpaddq_f32(mul0[0], mul0[1]);
+#else
+ mul0[i] =
+ vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])),
+ vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i])));
+ const float32x4_t hh =
+ vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])),
+ vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1])));
+#endif
+
+ outputs = vaddq_f32(outputs, hh);
+ if (!output_layer) nn_activate4(&outputs, &zero);
+ vst1q_f32(output_nodes, outputs);
+}
+
+static void nn_propagate_4to8(const int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes, bool output_layer) {
+ float32x4_t out_h = vld1q_f32(&layer_bias[4]);
+ float32x4_t out_l = vld1q_f32(layer_bias);
+ const float32x4_t zero = vdupq_n_f32(0);
+ float32x4_t mul0[4] = { zero, zero, zero, zero };
+ float32x4_t mul1[4] = { zero, zero, zero, zero };
+
+ for (int in = 0; in < num_inputs; in += 4) {
+ const float32x4_t v_input = vld1q_f32(&inputs[in]);
+ for (int i = 0; i < 4; i++) {
+ const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]);
+ const float32x4_t weight1 =
+ vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]);
+ mul0[i] = vmlaq_f32(mul0[i], v_input, weight0);
+ mul1[i] = vmlaq_f32(mul1[i], v_input, weight1);
+ }
+ }
+ for (int i = 0; i < 4; i++)
+#if AOM_ARCH_AARCH64
+ mul0[i] = vpaddq_f32(mul0[i], mul1[i]);
+ const float32x4_t hh0 = vpaddq_f32(mul0[0], mul0[1]);
+ const float32x4_t hh1 = vpaddq_f32(mul0[2], mul0[3]);
+#else
+ mul0[i] =
+ vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])),
+ vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i])));
+ const float32x4_t hh0 =
+ vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])),
+ vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1])));
+ const float32x4_t hh1 =
+ vcombine_f32(vpadd_f32(vget_low_f32(mul0[2]), vget_high_f32(mul0[2])),
+ vpadd_f32(vget_low_f32(mul0[3]), vget_high_f32(mul0[3])));
+#endif
+
+ out_h = vaddq_f32(out_h, hh1);
+ out_l = vaddq_f32(out_l, hh0);
+
+ if (!output_layer) nn_activate8(&out_h, &out_l, &zero);
+ vst1q_f32(&output_nodes[4], out_h);
+ vst1q_f32(output_nodes, out_l);
+}
+
+static void nn_propagate_8to4(const int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes, bool output_layer) {
+ float32x4_t outputs = vld1q_f32(layer_bias);
+ const float32x4_t zero = vdupq_n_f32(0);
+ float32x4_t add[4] = { zero, zero, zero, zero };
+ for (int in = 0; in < num_inputs; in += 8) {
+ const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+ const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+
+ for (int i = 0; i < 4; i++) {
+ const float32x4_t weight_l = vld1q_f32(&weights[in + i * num_inputs]);
+ const float32x4_t weight_h = vld1q_f32(&weights[in + i * num_inputs + 4]);
+ add[i] = vmlaq_f32(add[i], inputs_l, weight_l);
+ add[i] = vmlaq_f32(add[i], inputs_h, weight_h);
+ }
+ }
+#if AOM_ARCH_AARCH64
+ const float32x4_t hadd_h = vpaddq_f32(add[2], add[3]);
+ const float32x4_t hadd_l = vpaddq_f32(add[0], add[1]);
+ const float32x4_t haddhadd = vpaddq_f32(hadd_l, hadd_h);
+#else
+ const float32x4_t hadd_h =
+ vcombine_f32(vpadd_f32(vget_low_f32(add[2]), vget_high_f32(add[2])),
+ vpadd_f32(vget_low_f32(add[3]), vget_high_f32(add[3])));
+ const float32x4_t hadd_l =
+ vcombine_f32(vpadd_f32(vget_low_f32(add[0]), vget_high_f32(add[0])),
+ vpadd_f32(vget_low_f32(add[1]), vget_high_f32(add[1])));
+ const float32x4_t haddhadd =
+ vcombine_f32(vpadd_f32(vget_low_f32(hadd_l), vget_high_f32(hadd_l)),
+ vpadd_f32(vget_low_f32(hadd_h), vget_high_f32(hadd_h)));
+#endif
+
+ outputs = vaddq_f32(outputs, haddhadd);
+ if (!output_layer) nn_activate4(&outputs, &zero);
+ vst1q_f32(output_nodes, outputs);
+}
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_neon(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ int buf_index = 0;
+ int num_inputs = nn_config->num_inputs;
+  // Propagate the hidden layers; the final iteration handles the output layer.
+ for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ bool output_layer = (layer == nn_config->num_hidden_layers);
+ float *const output_nodes = output_layer ? output : buf[buf_index];
+ const int num_outputs = output_layer ? nn_config->num_outputs
+ : nn_config->num_hidden_nodes[layer];
+
+ if (num_inputs % 4 == 0 && num_outputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out += 8) {
+ nn_propagate_4to8(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out], output_layer);
+ }
+ } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ nn_propagate_8to4(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out], output_layer);
+ }
+ } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ nn_propagate_4to4(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out], output_layer);
+ }
+ } else if (num_inputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ nn_propagate_8to1(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out], output_layer);
+ }
+ } else if (num_inputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ nn_propagate_4to1(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out], output_layer);
+ }
+ } else if (num_inputs > 8) {
+ for (int out = 0; out < num_outputs; out++) {
+ nn_propagate_xto1(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out]);
+ }
+ } else if (num_inputs >= 4) {
+ for (int out = 0; out < num_outputs; out++) {
+ nn_propagate_xsto1(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out]);
+ }
+ } else {
+ for (int node = 0; node < num_outputs; ++node) {
+ float val = layer_bias[node];
+ for (int i = 0; i < num_inputs; ++i)
+ val += layer_weights[node * num_inputs + i] * input_nodes[i];
+ // ReLU as activation function.
+ val = val > 0.0f ? val : 0.0f; // Could use AOMMAX().
+ output_nodes[node] = val;
+ }
+ }
+ input_nodes = output_nodes;
+ num_inputs = num_outputs;
+ buf_index = 1 - buf_index;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
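+
+// Dispatch summary for av1_nn_predict_neon above (first matching case wins):
+//   inputs % 4 == 0 && outputs % 8 == 0 -> nn_propagate_4to8
+//   inputs % 8 == 0 && outputs % 4 == 0 -> nn_propagate_8to4
+//   inputs % 4 == 0 && outputs % 4 == 0 -> nn_propagate_4to4
+//   inputs % 8 == 0                     -> nn_propagate_8to1
+//   inputs % 4 == 0                     -> nn_propagate_4to1
+//   inputs > 8                          -> nn_propagate_xto1
+//   inputs >= 4                         -> nn_propagate_xsto1
+//   otherwise                           -> scalar fallback
+//
+// Illustrative usage sketch (editor's addition, not part of upstream aom).
+// The tiny 4-4-1 topology and the zeroed weights are hypothetical; only the
+// NN_CONFIG fields used by av1_nn_predict_neon above (declared in ml.h,
+// included at the top of this file) are relied upon.
+#if 0
+static float predict_example(const float feats[4]) {
+  static const float w_hidden[16] = { 0 };  // 4x4 hidden-layer weights.
+  static const float w_out[4] = { 0 };      // 4x1 output-layer weights.
+  static const float b_hidden[4] = { 0 };
+  static const float b_out[1] = { 0 };
+  NN_CONFIG cfg = { 0 };
+  cfg.num_inputs = 4;  // 4 inputs, 4 hidden nodes: takes the 4to4 path.
+  cfg.num_outputs = 1;  // 4 inputs, 1 output: takes the 4to1 path.
+  cfg.num_hidden_layers = 1;
+  cfg.num_hidden_nodes[0] = 4;
+  cfg.weights[0] = w_hidden;
+  cfg.weights[1] = w_out;
+  cfg.bias[0] = b_hidden;
+  cfg.bias[1] = b_out;
+  float out;
+  av1_nn_predict_neon(feats, &cfg, /*reduce_prec=*/0, &out);
+  return out;
+}
+#endif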
diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c
new file mode 100644
index 0000000000..2e4761f9a4
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c
@@ -0,0 +1,1217 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+int64_t av1_lowbd_pixel_proj_error_neon(
+ const uint8_t *src, int width, int height, int src_stride,
+ const uint8_t *dat, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int64_t sse = 0;
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ int32x2_t xq_v = vld1_s32(xq);
+ int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), SGRPROJ_RST_BITS);
+
+ do {
+ int j = 0;
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+
+ do {
+ const uint8x8_t d = vld1_u8(&dat[j]);
+ const uint8x8_t s = vld1_u8(&src[j]);
+ int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
+ int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
+ int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
+ int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);
+
+ int32x4_t offset =
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));
+ int32x4_t v0 = vmlaq_lane_s32(offset, flt0_0, xq_v, 0);
+ int32x4_t v1 = vmlaq_lane_s32(offset, flt0_1, xq_v, 0);
+
+ v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
+ v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);
+
+ int16x8_t d_s16 = vreinterpretq_s16_u16(vmovl_u8(d));
+ v0 = vmlsl_lane_s16(v0, vget_low_s16(d_s16),
+ vreinterpret_s16_s32(xq_sum_v), 0);
+ v1 = vmlsl_lane_s16(v1, vget_high_s16(d_s16),
+ vreinterpret_s16_s32(xq_sum_v), 0);
+
+ int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+ int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
+ int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
+ int16x4_t e_lo = vget_low_s16(e);
+ int16x4_t e_hi = vget_high_s16(e);
+
+ sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+ sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+ j += 8;
+ } while (j <= width - 8);
+
+ for (int k = j; k < width; ++k) {
+ int32_t u = (dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) +
+ xq[0] * flt0[k] + xq[1] * flt1[k] - u * (xq[0] + xq[1]);
+ int32_t e =
+ (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+ sse += e * e;
+ }
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+ int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ int32x2_t xq_v = vdup_n_s32(xq_active);
+
+ do {
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ int j = 0;
+
+ do {
+ const uint8x8_t d = vld1_u8(&dat[j]);
+ const uint8x8_t s = vld1_u8(&src[j]);
+ int32x4_t flt_0 = vld1q_s32(&flt[j]);
+ int32x4_t flt_1 = vld1q_s32(&flt[j + 4]);
+ int16x8_t d_s16 =
+ vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+
+ int32x4_t sub_0 = vsubw_s16(flt_0, vget_low_s16(d_s16));
+ int32x4_t sub_1 = vsubw_s16(flt_1, vget_high_s16(d_s16));
+
+ int32x4_t offset =
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));
+ int32x4_t v0 = vmlaq_lane_s32(offset, sub_0, xq_v, 0);
+ int32x4_t v1 = vmlaq_lane_s32(offset, sub_1, xq_v, 0);
+
+ int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+ int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
+ int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
+ int16x4_t e_lo = vget_low_s16(e);
+ int16x4_t e_hi = vget_high_s16(e);
+
+ sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+ sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+ j += 8;
+ } while (j <= width - 8);
+
+ for (int k = j; k < width; ++k) {
+ int32_t u = dat[k] << SGRPROJ_RST_BITS;
+ int32_t v = xq_active * (flt[k] - u);
+ int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) +
+ dat[k] - src[k];
+ sse += e * e;
+ }
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ } while (--height != 0);
+ } else {
+ uint32x4_t sse_s32 = vdupq_n_u32(0);
+
+ do {
+ int j = 0;
+
+ do {
+ const uint8x16_t d = vld1q_u8(&dat[j]);
+ const uint8x16_t s = vld1q_u8(&src[j]);
+
+ uint8x16_t diff = vabdq_u8(d, s);
+ uint8x8_t diff_lo = vget_low_u8(diff);
+ uint8x8_t diff_hi = vget_high_u8(diff);
+
+ sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_lo, diff_lo));
+ sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_hi, diff_hi));
+
+ j += 16;
+ } while (j <= width - 16);
+
+ for (int k = j; k < width; ++k) {
+ int32_t e = dat[k] - src[k];
+ sse += e * e;
+ }
+
+ dat += dat_stride;
+ src += src_stride;
+ } while (--height != 0);
+
+ sse_s64 = vreinterpretq_s64_u64(vpaddlq_u32(sse_s32));
+ }
+
+ sse += horizontal_add_s64x2(sse_s64);
+ return sse;
+}
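+
+// Editor's note: per pixel, the kernel above computes
+//   e = ROUND_POWER_OF_TWO(xq[0] * (flt0 - u) + xq[1] * (flt1 - u),
+//                          SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) + dat - src
+// with u = dat << SGRPROJ_RST_BITS, and returns the sum of e^2 over the
+// block; the three branches only specialize which of flt0/flt1 contribute.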
+
+// We can accumulate up to 65536 8-bit multiplication results in a 32-bit
+// accumulator (2^32 / 2^16 = 65536). Since we process 2 pixels at a time,
+// the accumulator limit for the compute stats is 65536 / 2 = 32768.
+#define STAT_ACCUMULATOR_MAX 32768
+
+static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) {
+#if AOM_ARCH_AARCH64
+ uint8x16x2_t table = { { a, b } };
+ return vqtbl2_u8(table, idx);
+#else
+ uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b),
+ vget_high_u8(b) } };
+ return vtbl4_u8(table, idx);
+#endif
+}
+
+static INLINE uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) {
+#if AOM_ARCH_AARCH64
+ uint8x16x2_t table = { { a, b } };
+ return vqtbl2q_u8(table, idx);
+#else
+ uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b),
+ vget_high_u8(b) } };
+ return vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)),
+ vtbl4_u8(table, vget_high_u8(idx)));
+#endif
+}
+
+// The M matrix is accumulated in STAT_ACCUMULATOR_MAX steps to speed-up the
+// computation. This function computes the final M from the accumulated
+// (src_s64) and the residual parts (src_s32). It also transposes the result as
+// the output needs to be column-major.
+static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64,
+ const int32_t *src_s32, const int wiener_win,
+ int scale) {
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = 0; j < wiener_win; ++j) {
+ int tr_idx = j * wiener_win + i;
+ *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale;
+ }
+ }
+}
+
+// The resulting H is a column-major matrix accumulated from the transposed
+// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single
+// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This
+// function transforms back to the originally expected format (double
+// transpose). The H matrix is accumulated in STAT_ACCUMULATOR_MAX steps to
+// speed-up the computation. This function computes the final H from the
+// accumulated (src_s64) and the residual parts (src_s32). The computed H is
+// only an upper triangle matrix, this function also fills the lower triangle of
+// the resulting matrix.
+static void update_H(int64_t *dst, const int64_t *src_s64,
+ const int32_t *src_s32, const int wiener_win, int stride,
+ int scale) {
+ // For a simplified theoretical 3x3 case where `wiener_win` is 3 and
+ // `wiener_win2` is 9, the M matrix is 3x3:
+ // 0, 3, 6
+ // 1, 4, 7
+ // 2, 5, 8
+ //
+ // This is viewed as a vector to compute H (9x9) by vector outer product:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8
+ //
+ // Double transpose and upper triangle remapping for 3x3 -> 9x9 case:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8,
+ // 3, 30, 33, 12, 31, 34, 21, 32, 35,
+ // 6, 33, 60, 15, 42, 61, 24, 51, 62,
+ // 1, 12, 15, 10, 13, 16, 11, 14, 17,
+ // 4, 31, 42, 13, 40, 43, 22, 41, 44,
+ // 7, 34, 61, 16, 43, 70, 25, 52, 71,
+ // 2, 21, 24, 11, 22, 25, 20, 23, 26,
+ // 5, 32, 51, 14, 41, 52, 23, 50, 53,
+ // 8, 35, 62, 17, 44, 71, 26, 53, 80,
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ // Loop through the indices according to the remapping above, along the
+ // columns:
+ // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ...,
+ // wiener_win - 1, wiener_win - 1 + wiener_win, ...
+ // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8.
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = i; j < wiener_win2; j += wiener_win) {
+ // These two inner loops are the same as the two outer loops, but running
+ // along rows instead of columns. For the 3x3 case `l` will be:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8.
+ for (int k = 0; k < wiener_win; ++k) {
+ for (int l = k; l < wiener_win2; l += wiener_win) {
+ // The nominal double transpose indexing would be:
+ // int idx = stride * j + l;
+ // However we need the upper-triangle indices, it is easy with some
+ // min/max operations.
+ int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l);
+
+ // Resulting matrix is filled by combining the 64-bit and the residual
+ // 32-bit matrices together with scaling.
+ *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale;
+ }
+ }
+ }
+ }
+}
+
+// Load a 7x7 matrix into three and a half 128-bit vectors from consecutive
+// rows; the last load address is offset to prevent out-of-bounds access.
+static INLINE void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src,
+ ptrdiff_t stride) {
+ dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+ src += 2 * stride;
+ dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+ src += 2 * stride;
+ dst[2] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+ src += 2 * stride;
+ dst[3] = vcombine_u8(vld1_u8(src - 1), vdup_n_u8(0));
+}
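+// Editor's note: dst[3] above holds row 6 loaded from src - 1, so the lanes
+// of shuffle_stats7 below that index into it (values >= 16 in the third and
+// sixth table rows) are shifted by +1 to compensate for the offset.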
+
+static INLINE void compute_stats_win7_neon(const uint8_t *dgd,
+ const uint8_t *src, int width,
+ int height, int dgd_stride,
+ int src_stride, int avg, int64_t *M,
+ int64_t *H, int downsample_factor) {
+ // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+ DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+ // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7
+ // matrices.
+ // clang-format off
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7[96]) = {
+ 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17,
+ 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19,
+ 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22,
+ 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18,
+ 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20,
+ 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23,
+ };
+ // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats7 + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats7 + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats7 + 32);
+ const uint8x16_t lut3 = vld1q_u8(shuffle_stats7 + 48);
+ const uint8x16_t lut4 = vld1q_u8(shuffle_stats7 + 64);
+ const uint8x16_t lut5 = vld1q_u8(shuffle_stats7 + 80);
+
+ int acc_cnt = STAT_ACCUMULATOR_MAX;
+ const int src_next = downsample_factor * src_stride - width;
+ const int dgd_next = downsample_factor * dgd_stride - width;
+ const uint8x8_t avg_u8 = vdup_n_u8(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
+ // middle 6x7 elements being shared.
+ uint8x16_t dgd_rows[4];
+ load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 6;
+ dgd += 2;
+
+ // Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7
+ // matrices (1 for each of the 2 pixels) separated into distinct
+ // int16x8_t[6] arrays. These arrays contain 48 elements of the 49 (7x7).
+ // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49
+ // consecutive elements.
+ int16x8_t dgd_avg0[6];
+ int16x8_t dgd_avg1[6];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3);
+
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ dgd_avg1[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf3), avg_u8));
+ dgd_avg1[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf3), avg_u8));
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG1, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+
+ uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
+ uint8x16_t dgd_shuf4 = tbl2q(dgd_rows[1], dgd_rows[2], lut4);
+
+ dgd_avg0[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
+ dgd_avg0[3] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
+ dgd_avg1[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf4), avg_u8));
+ dgd_avg1[3] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf4), avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+ vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]);
+
+ uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
+ uint8x16_t dgd_shuf5 = tbl2q(dgd_rows[2], dgd_rows[3], lut5);
+
+ dgd_avg0[4] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
+ dgd_avg0[5] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
+ dgd_avg1[4] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf5), avg_u8));
+ dgd_avg1[5] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf5), avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+ vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]);
+ vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]);
+
+ // The remaining last (49th) elements of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+ DGD_AVG1[48] = dgd_ptr[7] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 7 * 7. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+ update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3],
+ dgd_avg1[3]);
+ update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4],
+ dgd_avg1[4]);
+ update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5],
+ dgd_avg1[5]);
+
+ // Last (49th) element of M_s32 can be computed as scalar more efficiently
+ // for 2 output pixels.
+ M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1;
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 49 * 49. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
+ DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];
+
+ // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent
+ // overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = STAT_ACCUMULATOR_MAX;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2; ++k) {
+ // The widening accumulation is only run for the relevant parts
+ // (upper-right triangle) in a row 4-element aligned.
+ int k4 = k / 4 * 4;
+ accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4);
+
+ // Last element of the row is computed separately.
+ lh[48] += lh32[48];
+ lh32[48] = 0;
+
+ lh += WIENER_WIN2_ALIGN2;
+ lh32 += WIENER_WIN2_ALIGN2;
+ }
+ }
+
+ j -= 2;
+ }
+
+ // Computations for odd pixel in the row.
+ if (width & 1) {
+      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
+ // middle 6x7 elements being shared.
+ uint8x16_t dgd_rows[4];
+ load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 6;
+ ++dgd;
+
+ // Re-arrange (and widen) the combined 8x7 matrix to have a whole 7x7
+ // matrix tightly packed into a int16x8_t[6] array. This array contains
+ // 48 elements of the 49 (7x7). Compute `dgd - avg` for the whole buffer.
+ // The DGD_AVG buffer contains 49 consecutive elements.
+ int16x8_t dgd_avg0[6];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+
+ uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
+ dgd_avg0[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
+ dgd_avg0[3] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+
+ uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
+ dgd_avg0[4] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
+ dgd_avg0[5] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+
+ // The remaining last (49th) element of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+ update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
+ update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
+ update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);
+
+ // Last (49th) element of M_s32 can be computed as scalar more efficiently
+ // for 1 output pixel.
+ M_s32[48] += DGD_AVG0[48] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 49 * 49. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+      // column-major. It is not efficient to work with column-major matrices,
+      // so we
+ // accumulate into a row-major matrix H_s32. At the end of the algorithm a
+ // double transpose transformation will convert H_s32 back to the expected
+ // output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
+
+ // The last element of the triangle of H_s32 matrix can be computed as
+ // scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
+ }
+
+ src += src_next;
+ dgd += dgd_next;
+ } while (--height != 0);
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, downsample_factor);
+
+ update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, downsample_factor);
+}
+
+// Load a 5x5 matrix into two and a half 128-bit vectors from consecutive
+// rows; the last load address is offset to prevent out-of-bounds access.
+static INLINE void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src,
+ ptrdiff_t stride) {
+ dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+ src += 2 * stride;
+ dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+ src += 2 * stride;
+ dst[2] = vcombine_u8(vld1_u8(src - 3), vdup_n_u8(0));
+}
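+// Editor's note: dst[2] above holds row 4 loaded from src - 3, so the lanes
+// of shuffle_stats5 below that index into it (values >= 16 in the third
+// table row) are shifted by +3 to compensate for the offset.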
+
+static INLINE void compute_stats_win5_neon(const uint8_t *dgd,
+ const uint8_t *src, int width,
+ int height, int dgd_stride,
+ int src_stride, int avg, int64_t *M,
+ int64_t *H, int downsample_factor) {
+ // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t,
+ H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+ DECLARE_ALIGNED(64, int64_t,
+ H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+ // Look-up tables to create 8x3 matrix with consecutive elements from two 5x5
+ // matrices.
+ // clang-format off
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5[48]) = {
+ 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24,
+ 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 25,
+ 9, 10, 11, 12, 19, 20, 21, 22, 10, 11, 12, 13, 20, 21, 22, 23,
+ };
+ // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats5 + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats5 + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats5 + 32);
+
+ int acc_cnt = STAT_ACCUMULATOR_MAX;
+ const int src_next = downsample_factor * src_stride - width;
+ const int dgd_next = downsample_factor * dgd_stride - width;
+ const uint8x8_t avg_u8 = vdup_n_u8(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ uint8x16_t dgd_rows[3];
+ load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 4;
+ dgd += 2;
+
+ // Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5
+ // matrices (1 for each of the 2 pixels) separated into distinct
+ // int16x8_t[3] arrays. These arrays contain 24 elements of the 25 (5x5).
+ // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25
+ // consecutive elements.
+ int16x8_t dgd_avg0[3];
+ int16x8_t dgd_avg1[3];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1);
+ uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2);
+
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
+ dgd_avg1[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
+ dgd_avg1[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
+ dgd_avg1[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG1 + 0, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+
+ // The remaining last (25th) elements of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+ DGD_AVG1[24] = dgd_ptr[5] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 5 * 5. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+
+ // Last (25th) element of M_s32 can be computed as scalar more efficiently
+ // for 2 output pixels.
+ M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 25 * 25. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];
+
+ // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent
+ // overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = STAT_ACCUMULATOR_MAX;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) {
+ // The widening accumulation is only run for the relevant parts
+ // (upper-right triangle) in a row 4-element aligned.
+ int k4 = k / 4 * 4;
+ accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);
+
+ // Last element of the row is computed separately.
+ lh[24] += lh32[24];
+ lh32[24] = 0;
+
+ lh += WIENER_WIN2_REDUCED_ALIGN2;
+ lh32 += WIENER_WIN2_REDUCED_ALIGN2;
+ }
+ }
+
+ j -= 2;
+ }
+
+ // Computations for odd pixel in the row.
+ if (width & 1) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ uint8x16_t dgd_rows[3];
+ load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 4;
+ ++dgd;
+
+ // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5
+ // matrix tightly packed into a int16x8_t[3] array. This array contains
+ // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer.
+ // The DGD_AVG buffer contains 25 consecutive elements.
+ int16x8_t dgd_avg0[3];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2));
+
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(dgd_shuf1, avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+
+ // The remaining last (25th) element of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+
+ // Last (25th) element of M_s32 can be computed as scalar more efficiently
+ // for 1 output pixel.
+ M_s32[24] += DGD_AVG0[24] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 25 * 25. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+      // column-major. It is not efficient to work with column-major matrices,
+      // so we
+ // accumulate into a row-major matrix H_s32. At the end of the algorithm a
+ // double transpose transformation will convert H_s32 back to the expected
+ // output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24];
+ }
+
+ src += src_next;
+ dgd += dgd_next;
+ } while (--height != 0);
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, downsample_factor);
+
+ update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
+ downsample_factor);
+}
+
+static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+
+ if (width >= 16) {
+ int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit element
+    // (255 * 257 = 65535 <= UINT16_MAX). Since each 16-bit vector has 8
+    // elements, we can process up to int(257*8/width) rows before we need to
+    // widen to 32-bit vector elements.
+ int h_overflow = 257 * 8 / width;
+ int h_limit = height > h_overflow ? h_overflow : height;
+ uint32x4_t avg_u32 = vdupq_n_u32(0);
+ do {
+ uint16x8_t avg_u16 = vdupq_n_u16(0);
+ do {
+ int j = width;
+ const uint8_t *src_ptr = src;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ avg_u16 = vpadalq_u8(avg_u16, s);
+ j -= 16;
+ src_ptr += 16;
+ } while (j >= 16);
+ if (j >= 8) {
+ uint8x8_t s = vld1_u8(src_ptr);
+ avg_u16 = vaddw_u8(avg_u16, s);
+ j -= 8;
+ src_ptr += 8;
+ }
+ // Scalar tail case.
+ while (j > 0) {
+ sum += src[width - j];
+ j--;
+ }
+ src += src_stride;
+ } while (++h < h_limit);
+ avg_u32 = vpadalq_u16(avg_u32, avg_u16);
+
+ h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+ } while (h < height);
+ return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) /
+ (width * height));
+ }
+ if (width >= 8) {
+ int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit element
+    // (255 * 257 = 65535 <= UINT16_MAX). Since each 16-bit vector has 4
+    // elements, we can process up to int(257*4/width) rows before we need to
+    // widen to 32-bit vector elements.
+ int h_overflow = 257 * 4 / width;
+ int h_limit = height > h_overflow ? h_overflow : height;
+ uint32x2_t avg_u32 = vdup_n_u32(0);
+ do {
+ uint16x4_t avg_u16 = vdup_n_u16(0);
+ do {
+ int j = width;
+ const uint8_t *src_ptr = src;
+ uint8x8_t s = vld1_u8(src_ptr);
+ avg_u16 = vpadal_u8(avg_u16, s);
+ j -= 8;
+ src_ptr += 8;
+ // Scalar tail case.
+ while (j > 0) {
+ sum += src[width - j];
+ j--;
+ }
+ src += src_stride;
+ } while (++h < h_limit);
+ avg_u32 = vpadal_u16(avg_u32, avg_u16);
+
+ h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+ } while (h < height);
+ return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) /
+ (width * height));
+ }
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sum += src[j];
+ } while (++j < width);
+ src += src_stride;
+ } while (--i != 0);
+ return (uint8_t)(sum / (width * height));
+}
+
+void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+ assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
+ (void)dgd_avg;
+ (void)src_avg;
+
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = wiener_win >> 1;
+ const int width = h_end - h_start;
+ const int height = v_end - v_start;
+
+ const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride;
+ const uint8_t *src_start = src + h_start + v_start * src_stride;
+
+ // The wiener window will slide along the dgd frame, centered on each pixel.
+ // For the top left pixel and all the pixels on the side of the frame this
+ // means half of the window will be outside of the frame. As such the actual
+ // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+ // wider and 2 * wiener_halfwin higher than the original dgd buffer.
+ const int vert_offset = v_start - wiener_halfwin;
+ const int horiz_offset = h_start - wiener_halfwin;
+ const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
+
+ uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height);
+
+ // Since the height is not necessarily a multiple of the downsample factor,
+ // the last line of src will be scaled according to how many rows remain.
+ int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+ int downsampled_height = height / downsample_factor;
+ int downsample_remainder = height % downsample_factor;
+
+ memset(M, 0, wiener_win2 * sizeof(*M));
+ memset(H, 0, wiener_win2 * wiener_win2 * sizeof(*H));
+
+ // Calculate the M and H matrices for the normal and downsampled cases.
+ if (downsampled_height > 0) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_neon(dgd_win, src_start, width, downsampled_height,
+ dgd_stride, src_stride, avg, M, H,
+ downsample_factor);
+ } else {
+ compute_stats_win5_neon(dgd_win, src_start, width, downsampled_height,
+ dgd_stride, src_stride, avg, M, H,
+ downsample_factor);
+ }
+ }
+
+ // Accumulate the remaining last rows in the downsampled case.
+ if (downsample_remainder > 0) {
+ int remainder_offset = height - downsample_remainder;
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_neon(dgd_win + remainder_offset * dgd_stride,
+ src_start + remainder_offset * src_stride, width,
+ 1, dgd_stride, src_stride, avg, M, H,
+ downsample_remainder);
+ } else {
+ compute_stats_win5_neon(dgd_win + remainder_offset * dgd_stride,
+ src_start + remainder_offset * src_stride, width,
+ 1, dgd_stride, src_stride, avg, M, H,
+ downsample_remainder);
+ }
+ }
+}
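+// Editor's note, a worked example for the downsampling above: with
+// height = 18 and WIENER_STATS_DOWNSAMPLE_FACTOR = 4, downsampled_height = 4
+// rows are accumulated with scale 4, and the remainder pass accumulates 1
+// more row with scale downsample_remainder = 2.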
+
+static INLINE void calc_proj_params_r0_r1_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t h01_lo = vdupq_n_s64(0);
+ int64x2_t h01_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint8_t *src_ptr = src8;
+ const uint8_t *dat_ptr = dat8;
+ int32_t *flt0_ptr = flt0;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t d = vld1_u8(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+ int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+ int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+ f0_lo = vsubw_s16(f0_lo, vget_low_s16(u));
+ f0_hi = vsubw_s16(f0_hi, vget_high_s16(u));
+ f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
+ f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
+ h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
+ h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
+ h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src8 += src_stride;
+ dat8 += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ H[1][0] = H[0][1];
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+
+ do {
+ const uint8_t *src_ptr = src8;
+ const uint8_t *dat_ptr = dat8;
+ int32_t *flt0_ptr = flt0;
+ int w = width;
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t d = vld1_u8(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+
+ int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+ int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+ int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+ f0_lo = vsubw_s16(f0_lo, vget_low_s16(u));
+ f0_hi = vsubw_s16(f0_hi, vget_high_s16(u));
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src8 += src_stride;
+ dat8 += dat_stride;
+ flt0 += flt0_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+}
+
+static INLINE void calc_proj_params_r1_neon(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint8_t *src_ptr = src8;
+ const uint8_t *dat_ptr = dat8;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t d = vld1_u8(dat_ptr);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+ int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+ int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+ f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
+ f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src8 += src_stride;
+ dat8 += dat_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+// The function calls 3 subfunctions for the following cases:
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+// of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2],
+ int64_t C[2], const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
+ flt1, flt1_stride, H, C);
+ }
+}
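+
+// Editor's note: the accumulated H and C above form the normal equations of
+// a least-squares fit. With u = dat << SGRPROJ_RST_BITS, f_k = flt_k - u and
+// s = (src << SGRPROJ_RST_BITS) - u, the code computes
+// H[i][j] = sum(f_i * f_j) / size and C[i] = sum(f_i * s) / size, which the
+// caller then solves as H * xq = C for the projection coefficients.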
diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h
new file mode 100644
index 0000000000..7b72dca34d
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+
+#include <arm_neon.h>
+
+#include "av1/common/restoration.h"
+
+// Aligned sizes for Wiener filters.
+#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2)
+#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3)
+#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED))
+#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2)
+#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3)
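+// For reference: WIENER_WIN2 = 49, giving WIENER_WIN2_ALIGN2 = 52 and
+// WIENER_WIN2_ALIGN3 = 56; WIENER_WIN2_REDUCED = 25, giving
+// WIENER_WIN2_REDUCED_ALIGN2 = 28 and WIENER_WIN2_REDUCED_ALIGN3 = 32.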
+
+// Compute 8 values of M (cross correlation) for a single source pixel and
+// accumulate.
+static INLINE void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg,
+ int16x8_t dgd_avg) {
+ int32x4_t lo = vld1q_s32(M_s32 + 0);
+ int32x4_t hi = vld1q_s32(M_s32 + 4);
+
+ lo = vmlal_s16(lo, vget_low_s16(dgd_avg), src_avg);
+ hi = vmlal_s16(hi, vget_high_s16(dgd_avg), src_avg);
+
+ vst1q_s32(M_s32 + 0, lo);
+ vst1q_s32(M_s32 + 4, hi);
+}
+
+// Compute 8 values of M (cross correlation) for two source pixels and
+// accumulate.
+static INLINE void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0,
+ int16x4_t src_avg1, int16x8_t dgd_avg0,
+ int16x8_t dgd_avg1) {
+ int32x4_t lo = vld1q_s32(M_s32 + 0);
+ int32x4_t hi = vld1q_s32(M_s32 + 4);
+
+ lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0);
+ hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0);
+ lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1);
+ hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1);
+
+ vst1q_s32(M_s32 + 0, lo);
+ vst1q_s32(M_s32 + 4, hi);
+}
+
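+// Accumulate the auto-covariance matrix H from one pixel's window of
+// dgd_avg samples: H[i][j] += d[i] * d[j], updating the upper-triangular
+// blocks 4 rows by 4 columns at a time.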
+static INLINE void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg,
+ int width, int height) {
+ for (int i = 0; i < height; i += 4) {
+ int16x4_t di = vld1_s16(dgd_avg + i);
+
+ for (int j = i; j < width; j += 4) {
+ int16x4_t dj = vld1_s16(dgd_avg + j);
+ int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j);
+ int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j);
+ int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j);
+ int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j);
+
+ h0 = vmlal_lane_s16(h0, dj, di, 0);
+ h1 = vmlal_lane_s16(h1, dj, di, 1);
+ h2 = vmlal_lane_s16(h2, dj, di, 2);
+ h3 = vmlal_lane_s16(h3, dj, di, 3);
+
+ vst1q_s32(H_s32 + 0 * width + j, h0);
+ vst1q_s32(H_s32 + 1 * width + j, h1);
+ vst1q_s32(H_s32 + 2 * width + j, h2);
+ vst1q_s32(H_s32 + 3 * width + j, h3);
+ }
+ H_s32 += 4 * width;
+ }
+}
+
+static INLINE void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
+ const int16_t *dgd_avg1) {
+ for (int i = 0; i < 24; i += 4) {
+ int16x4_t di0 = vld1_s16(dgd_avg0 + i);
+ int16x4_t di1 = vld1_s16(dgd_avg1 + i);
+
+ for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) {
+ int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
+ int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
+ int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+ int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+ int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+ int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+
+ h0 = vmlal_lane_s16(h0, dj0, di0, 0);
+ h0 = vmlal_lane_s16(h0, dj1, di1, 0);
+ h1 = vmlal_lane_s16(h1, dj0, di0, 1);
+ h1 = vmlal_lane_s16(h1, dj1, di1, 1);
+ h2 = vmlal_lane_s16(h2, dj0, di0, 2);
+ h2 = vmlal_lane_s16(h2, dj1, di1, 2);
+ h3 = vmlal_lane_s16(h3, dj0, di0, 3);
+ h3 = vmlal_lane_s16(h3, dj1, di1, 3);
+
+ vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0);
+ vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1);
+ vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2);
+ vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3);
+ }
+ H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2;
+ }
+}
+
+static INLINE void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
+ const int16_t *dgd_avg1) {
+ for (int i = 0; i < 48; i += 4) {
+ int16x4_t di0 = vld1_s16(dgd_avg0 + i);
+ int16x4_t di1 = vld1_s16(dgd_avg1 + i);
+
+ int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i);
+ int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i);
+ int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i);
+ int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i);
+
+ h0 = vmlal_lane_s16(h0, di0, di0, 0);
+ h0 = vmlal_lane_s16(h0, di1, di1, 0);
+ h1 = vmlal_lane_s16(h1, di0, di0, 1);
+ h1 = vmlal_lane_s16(h1, di1, di1, 1);
+ h2 = vmlal_lane_s16(h2, di0, di0, 2);
+ h2 = vmlal_lane_s16(h2, di1, di1, 2);
+ h3 = vmlal_lane_s16(h3, di0, di0, 3);
+ h3 = vmlal_lane_s16(h3, di1, di1, 3);
+
+ vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0);
+ vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1);
+ vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2);
+ vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3);
+
+ for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) {
+ int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
+ int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
+ h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j);
+ h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j);
+ h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j);
+ h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j);
+
+ h0 = vmlal_lane_s16(h0, dj0, di0, 0);
+ h0 = vmlal_lane_s16(h0, dj1, di1, 0);
+ h1 = vmlal_lane_s16(h1, dj0, di0, 1);
+ h1 = vmlal_lane_s16(h1, dj1, di1, 1);
+ h2 = vmlal_lane_s16(h2, dj0, di0, 2);
+ h2 = vmlal_lane_s16(h2, dj1, di1, 2);
+ h3 = vmlal_lane_s16(h3, dj0, di0, 3);
+ h3 = vmlal_lane_s16(h3, dj1, di1, 3);
+
+ vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0);
+ vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1);
+ vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2);
+ vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3);
+ }
+ H_s32 += 4 * WIENER_WIN2_ALIGN2;
+ }
+}
+
+// Widen 32-bit src data and accumulate into 64-bit dst. Clear src data.
+static INLINE void accumulate_and_clear(int64_t *dst, int32_t *src,
+ int length) {
+ do {
+ int32x4_t s32 = vld1q_s32(src);
+ vst1q_s32(src, vdupq_n_s32(0));
+ src += 4;
+
+ int64x2_t d_lo = vld1q_s64(dst + 0);
+ int64x2_t d_hi = vld1q_s64(dst + 2);
+
+ d_lo = vaddw_s32(d_lo, vget_low_s32(s32));
+ d_hi = vaddw_s32(d_hi, vget_high_s32(s32));
+
+ vst1q_s64(dst + 0, d_lo);
+ vst1q_s64(dst + 2, d_hi);
+
+ dst += 4;
+ length -= 4;
+ } while (length > 0);
+}
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
diff --git a/third_party/aom/av1/encoder/arm/neon/quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
new file mode 100644
index 0000000000..c3b57ce206
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
@@ -0,0 +1,928 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if AOM_ARCH_AARCH64
+ return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
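+  // Armv7 has no horizontal-max instruction, so fold the vector instead:
+  // pairwise-max the two halves, then shift the upper lanes down and max
+  // again until lane 0 holds the overall maximum.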
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+ int16x8_t v_eobmax,
+ uint16x8_t v_mask) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
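+  // Add 1 to the scan indices of lanes holding non-zero coefficients (zero
+  // lanes contribute 0), so the running maximum tracks the end-of-block
+  // count directly.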
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
+ const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+ return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static INLINE uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ int16x8_t v_quant, int16x8_t v_dequant,
+ int16x8_t v_round, int16x8_t v_zero) {
+ const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs = vabsq_s16(v_coeff);
+ const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
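+  // vqdmulhq_s16 computes (2 * a * b) >> 16, so one further shift right
+  // yields the scalar (tmp * quant) >> 16.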
+ const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
+ return v_nz_mask;
+}
+
+void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+  // TODO(jingning): Decide whether these arguments are still needed once the
+  // quantization process is finalized.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+  // Quantization pass: quantize the DC coefficient with the DC constants,
+  // then the remaining AC coefficients with the AC constants.
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ int16x8_t v_quant = vld1q_s16(quant_ptr);
+ int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+ int16x8_t v_round = vld1q_s16(round_ptr);
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ uint16x8_t v_nz_mask;
+ // process dc and the first seven ac coeffs
+ v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero);
+ v_eobmax_76543210 = get_max_lane_eob(&iscan[0], v_eobmax_76543210, v_nz_mask);
+ // overwrite the dc constants with ac constants
+ v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+ v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+ v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+ count -= 8;
+ // now process the rest of the ac coeffs
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero);
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ count -= 8;
+ } while (count > 0);
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+static INLINE uint16x8_t quantize_lp_8(const int16_t *coeff_ptr,
+ int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, int16x8_t v_quant,
+ int16x8_t v_dequant, int16x8_t v_round,
+ int16x8_t v_zero) {
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs = vabsq_s16(v_coeff);
+ const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+ const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ vst1q_s16(qcoeff_ptr, v_qcoeff);
+ vst1q_s16(dqcoeff_ptr, v_dqcoeff);
+ return v_nz_mask;
+}
+
+void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+  // Quantization pass: quantize the DC coefficient with the DC constants,
+  // then the remaining AC coefficients with the AC constants.
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ int16x8_t v_quant = vld1q_s16(quant_ptr);
+ int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+ int16x8_t v_round = vld1q_s16(round_ptr);
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ uint16x8_t v_nz_mask;
+ intptr_t count = n_coeffs;
+
+ // process dc and the first seven ac coeffs
+ v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero);
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ // overwrite the dc constants with ac constants
+ v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+ v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+ v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+ count -= 8;
+ // now process the rest of the ac coeffs
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero);
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ count -= 8;
+ } while (count != 0);
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
+ int16x8_t v_round, int16x8_t v_zero, int log_scale) {
+ const int16x8_t v_log_scale_minus_1 = vdupq_n_s16(log_scale - 1);
+ const int16x8_t v_neg_log_scale_plus_1 = vdupq_n_s16(-(1 + log_scale));
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
+ const uint16x8_t v_mask =
+ vcgeq_s16(v_abs_coeff, vshlq_s16(v_dequant, v_neg_log_scale_plus_1));
+ // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
+ vreinterpretq_s16_u16(v_mask));
+ const int16x8_t v_tmp2 =
+ vqdmulhq_s16(vshlq_s16(v_tmp, v_log_scale_minus_1), v_quant);
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff =
+ vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
+ // Multiplying by dequant here will use all 16 bits. Cast to unsigned before
+ // shifting right. (vshlq_s16 will shift right if shift value is negative)
+ const uint16x8_t v_abs_dqcoeff =
+ vshlq_u16(vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)),
+ vdupq_n_s16(-log_scale));
+ const int16x8_t v_dqcoeff =
+ vsubq_s16(veorq_s16(vreinterpretq_s16_u16(v_abs_dqcoeff), v_coeff_sign),
+ v_coeff_sign);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+ store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
+ return v_nz_mask;
+}
+
+static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
+ int16x8_t v_round, int16x8_t v_zero) {
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
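+  // Zero-bin check: abs_coeff >= dequant >> (1 + log_scale) with
+  // log_scale == 2, evaluated as (abs_coeff << 1) >= (dequant >> 2) to
+  // preserve precision in 16 bits.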
+ const uint16x8_t v_mask =
+ vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(v_abs_coeff), 1),
+ vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2));
+ // abs_coeff = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
+ vreinterpretq_s16_u16(v_mask));
+ // tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
+ const int16x8_t v_tmp2 =
+ vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1),
+ vreinterpretq_s16_u16(vshrq_n_u16(
+ vreinterpretq_u16_s16(vmulq_s16(v_tmp, v_quant)), 14)));
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff =
+ vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
+ // const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
+ const int16x8_t v_abs_dqcoeff =
+ vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp2, v_dequant), 13),
+ vreinterpretq_s16_u16(vshrq_n_u16(
+ vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), 2)));
+ const int16x8_t v_dqcoeff =
+ vsubq_s16(veorq_s16(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+ store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
+ return v_nz_mask;
+}
+
+static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan,
+ int log_scale) {
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ int16x8_t v_quant = vld1q_s16(quant_ptr);
+ int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+ const int16x8_t v_round_no_scale = vld1q_s16(round_ptr);
+ int16x8_t v_round =
+ vqrdmulhq_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ intptr_t non_zero_count = n_coeffs;
+
+ assert(n_coeffs > 16);
+ // Pre-scan pass
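+  // Walk backwards over the coefficients in groups of 16 and trim trailing
+  // groups that fall entirely inside the zero-bin, so the main loop below
+  // only quantizes coefficients that may be non-zero.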
+ const int16x8_t v_dequant_scaled =
+ vshlq_s16(v_dequant, vdupq_n_s16(-(1 + log_scale)));
+ const int16x8_t v_zbin_s16 =
+ vdupq_lane_s16(vget_low_s16(v_dequant_scaled), 1);
+ intptr_t i = n_coeffs;
+ do {
+ const int16x8_t v_coeff_a = load_tran_low_to_s16q(coeff_ptr + i - 8);
+ const int16x8_t v_coeff_b = load_tran_low_to_s16q(coeff_ptr + i - 16);
+ const int16x8_t v_abs_coeff_a = vabsq_s16(v_coeff_a);
+ const int16x8_t v_abs_coeff_b = vabsq_s16(v_coeff_b);
+ const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16);
+ const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16);
+      // If all 16 coefficients are within the base ZBIN range, discard them.
+ if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) {
+ non_zero_count -= 16;
+ } else {
+ break;
+ }
+ i -= 16;
+ } while (i > 0);
+
+ const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
+ memset(qcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*dqcoeff_ptr));
+
+ // process dc and the first seven ac coeffs
+ uint16x8_t v_nz_mask;
+ if (log_scale == 2) {
+ v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant, v_dequant, v_round, v_zero);
+ } else {
+ v_nz_mask =
+ quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero, log_scale);
+ }
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ // overwrite the dc constants with ac constants
+ v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+ v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+ v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+ for (intptr_t count = non_zero_count - 8; count > 0; count -= 8) {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ if (log_scale == 2) {
+ v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant, v_dequant, v_round, v_zero);
+ } else {
+ v_nz_mask =
+ quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero, log_scale);
+ }
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+ quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
+ iscan, 1);
+}
+
+void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+ quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
+ iscan, 2);
+}
+
+void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ const int16x8_t zero = vdupq_n_s16(0);
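+  // vceqq_s16(zero, zero) sets every lane to all-ones, i.e. -1: the eob
+  // sentinel, so an all-zero block yields eob 0 after the final "+ 1".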
+ int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+ int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
+ int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+
+ vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+
+ uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ vround = vsetq_lane_s16(round_ptr[0], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+ int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+ store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+ vround = vsetq_lane_s16(round_ptr[1], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+ v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_abs = vabsq_s16(v_coeff);
+ vcond = vcgeq_s16(v_abs, vzbins);
+
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+ int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+ store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
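+// Multiply the 16-bit values in x0 by the 16-bit quantization-matrix weights
+// in x1 and shift right by AOM_QM_BITS: the saturating-doubling high-half
+// product supplies the upper bits and the low 16-bit product, shifted right,
+// supplies the rest.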
+#define QM_MULL_SHIFT(x0, x1) \
+ vreinterpretq_s16_u16(vorrq_u16( \
+ vreinterpretq_u16_s16(vshlq_n_s16( \
+ vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \
+ vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS)))
+
+static void aom_quantize_b_helper_16x16_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ (void)scan;
+
+ uint16x8_t vwt, viwt;
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ const int16x8_t zero = vdupq_n_s16(0);
+ int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+ int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
+ int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+ vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+ uint16x8_t vcond;
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ vround = vsetq_lane_s16(round_ptr[0], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+ store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+ vround = vsetq_lane_s16(round_ptr[1], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+ v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_abs = vabsq_s16(v_coeff);
+
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+ store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+static void aom_quantize_b_helper_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ (void)scan;
+
+ uint16x8_t vwt, viwt;
+ const int log_scale = 1;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ const int16x8_t zero = vdupq_n_s16(0);
+ int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
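+  // Reuse the all-ones (-1) lanes as a shift count: vshlq_u16 with a
+  // negative count shifts right, implementing >> log_scale (== 1) in the
+  // dequant path below.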
+ const int16x8_t v_log_scale = v_eobmax_76543210;
+
+ int16x8_t vzbins = vdupq_n_s16(zbins[1]),
+ vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
+ int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+ vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+ uint16x8_t vcond;
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ vround =
+ vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+ store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+ vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+ vround =
+ vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+ v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_abs = vabsq_s16(v_coeff);
+
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+ vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
+
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+ store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+ vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+static void aom_quantize_b_helper_64x64_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ (void)scan;
+
+ uint16x8_t vwt, viwt;
+ const int log_scale = 2;
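+  // Each 16-bit lane of v_log_scale holds -2 (0xFFFE) so that vshlq_u16
+  // below shifts right by log_scale == 2 in the dequant path.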
+ const int16x8_t v_log_scale =
+ vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE));
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ const int16x8_t zero = vdupq_n_s16(0);
+ int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+ int16x8_t v_ones = vnegq_s16(v_eobmax_76543210);
+
+ int16x8_t vzbins = vdupq_n_s16(zbins[1]),
+ vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
+ int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+ vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+ uint16x8_t vcond;
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ vround =
+ vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ int16x8_t ones =
+ vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
+ vtmp2 =
+ vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+ store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+ vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+ v_deq_abs =
+ vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+ vround =
+ vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+ v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_abs = vabsq_s16(v_coeff);
+
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ int16x8_t ones =
+ vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
+ vtmp2 =
+ vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+ store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+ vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+ v_deq_abs =
+ vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+void aom_quantize_b_helper_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+  switch (log_scale) { // log_scale for the AV1 encoder can only be 0, 1, or 2
+ case 0:
+ aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, qm_ptr, iqm_ptr);
+ break;
+ case 1:
+ aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, qm_ptr, iqm_ptr);
+ break;
+ case 2:
+ aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, qm_ptr, iqm_ptr);
+ break;
+ }
+}
+
+void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 1);
+}
+
+void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 2);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c b/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c
new file mode 100644
index 0000000000..7d3bd4c606
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include "av1/encoder/rdopt.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
+static INLINE void horver_correlation_4x4(const int16_t *diff, int stride,
+ int32x4_t *xy_sum_32,
+ int32x4_t *xz_sum_32,
+ int32x4_t *x_sum_32,
+ int32x4_t *x2_sum_32) {
+ // Pixels in this 4x4 [ a b c d ]
+ // are referred to as: [ e f g h ]
+ // [ i j k l ]
+ // [ m n o p ]
+
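+  // Reinterpreting each row as a 64-bit lane and shifting left by 16 bits
+  // moves every pixel up one slot (zeroing the first), so multiplying a row
+  // by its shifted copy accumulates the neighbour products in one vmlal.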
+ const int16x4_t pixelsa_2_lo = vld1_s16(diff + (0 * stride));
+ const int16x4_t pixelsa_2_sli =
+ vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_2_lo), 16));
+ const int16x4_t pixelsb_2_lo = vld1_s16(diff + (1 * stride));
+ const int16x4_t pixelsb_2_sli =
+ vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_2_lo), 16));
+ const int16x4_t pixelsa_1_lo = vld1_s16(diff + (2 * stride));
+ const int16x4_t pixelsa_1_sli =
+ vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_1_lo), 16));
+ const int16x4_t pixelsb_1_lo = vld1_s16(diff + (3 * stride));
+ const int16x4_t pixelsb_1_sli =
+ vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_1_lo), 16));
+
+ const int16x8_t slli_a = vcombine_s16(pixelsa_1_sli, pixelsa_2_sli);
+
+ *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_1_lo, pixelsa_1_sli);
+ *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_2_lo, pixelsa_2_sli);
+ *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsb_2_lo, pixelsb_2_sli);
+
+ *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_1_sli);
+ *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_2_sli, pixelsb_2_sli);
+ *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_2_sli);
+
+ // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k
+ // (sum up every element in slli_a and swap_b)
+ *x_sum_32 = vpadalq_s16(*x_sum_32, slli_a);
+ *x_sum_32 = vaddw_s16(*x_sum_32, pixelsb_2_sli);
+
+ // Also sum their squares
+ *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_1_sli, pixelsa_1_sli);
+ *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_2_sli, pixelsa_2_sli);
+ *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsb_2_sli, pixelsb_2_sli);
+}
+
+void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride,
+ int width, int height, float *hcorr,
+ float *vcorr) {
+ // The following notation is used:
+ // x - current pixel
+ // y - right neighbour pixel
+ // z - below neighbour pixel
+ // w - down-right neighbour pixel
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, x2_sum = 0;
+ int32x4_t zero = vdupq_n_s32(0);
+ int64x2_t v_x_sum = vreinterpretq_s64_s32(zero);
+ int64x2_t v_xy_sum = vreinterpretq_s64_s32(zero);
+ int64x2_t v_xz_sum = vreinterpretq_s64_s32(zero);
+ int64x2_t v_x2_sum = vreinterpretq_s64_s32(zero);
+ // Process horizontal and vertical correlations through the body in 4x4
+  // blocks. This excludes the final row and column, and possibly one extra
+  // row and column, depending on how 3 divides into the width and height.
+
+ for (int i = 0; i <= height - 4; i += 3) {
+ int32x4_t xy_sum_32 = zero;
+ int32x4_t xz_sum_32 = zero;
+ int32x4_t x_sum_32 = zero;
+ int32x4_t x2_sum_32 = zero;
+ for (int j = 0; j <= width - 4; j += 3) {
+ horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+ &xz_sum_32, &x_sum_32, &x2_sum_32);
+ }
+ v_xy_sum = vpadalq_s32(v_xy_sum, xy_sum_32);
+ v_xz_sum = vpadalq_s32(v_xz_sum, xz_sum_32);
+ v_x_sum = vpadalq_s32(v_x_sum, x_sum_32);
+ v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32);
+ }
+#if AOM_ARCH_AARCH64
+ xy_sum = vaddvq_s64(v_xy_sum);
+ xz_sum = vaddvq_s64(v_xz_sum);
+ x2_sum = vaddvq_s64(v_x2_sum);
+ x_sum = vaddvq_s64(v_x_sum);
+#else
+ xy_sum = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_xy_sum), vget_high_s64(v_xy_sum)), 0);
+ xz_sum = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_xz_sum), vget_high_s64(v_xz_sum)), 0);
+ x2_sum = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_x2_sum), vget_high_s64(v_x2_sum)), 0);
+ x_sum =
+ vget_lane_s64(vadd_s64(vget_low_s64(v_x_sum), vget_high_s64(v_x_sum)), 0);
+#endif
+ // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+ int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+ // Do we have 2 rows remaining or just the one? Note that width and height
+ // are powers of 2, so each modulo 3 must be 1 or 2.
+ if (height % 3 == 1) { // Just horiz corrs on the final row
+ const int16_t x0 = diff[(height - 1) * stride];
+ x_sum += x0;
+ x_finalrow += x0;
+ x2_sum += x0 * x0;
+ x2_finalrow += x0 * x0;
+ if (width >= 8) {
+ int32x4_t v_y_sum = zero;
+ int32x4_t v_y2_sum = zero;
+ int32x4_t v_xy_sum_a = zero;
+ int k = width - 1;
+ int j = 0;
+ while ((k - 8) > 0) {
+ const int16x8_t v_x = vld1q_s16(&diff[(height - 1) * stride + j]);
+ const int16x8_t v_y = vld1q_s16(&diff[(height - 1) * stride + j + 1]);
+ const int16x4_t v_x_lo = vget_low_s16(v_x);
+ const int16x4_t v_x_hi = vget_high_s16(v_x);
+ const int16x4_t v_y_lo = vget_low_s16(v_y);
+ const int16x4_t v_y_hi = vget_high_s16(v_y);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+ v_y_sum = vpadalq_s16(v_y_sum, v_y);
+ k -= 8;
+ j += 8;
+ }
+
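+      // Handle the last 7 neighbour pairs of the row: shift the final load
+      // so the lane past the row edge is zeroed before accumulating.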
+ const int16x8_t v_l = vld1q_s16(&diff[(height - 1) * stride] + j);
+ const int16x8_t v_x =
+ vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
+ vreinterpretq_s16_s32(zero), 1);
+ const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
+ const int16x4_t v_x_lo = vget_low_s16(v_x);
+ const int16x4_t v_x_hi = vget_high_s16(v_x);
+ const int16x4_t v_y_lo = vget_low_s16(v_y);
+ const int16x4_t v_y_hi = vget_high_s16(v_y);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+ const int32x4_t v_y_sum_a = vpadalq_s16(v_y_sum, v_y);
+ const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
+#if AOM_ARCH_AARCH64
+ const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
+ xy_sum += vaddvq_s64(v_xy_sum2);
+ const int32_t y = vaddvq_s32(v_y_sum_a);
+ const int64_t y2 = vaddvq_s64(v_y2_sum_a);
+#else
+ xy_sum += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
+ const int64x2_t v_y_a = vpaddlq_s32(v_y_sum_a);
+ const int64_t y =
+ vget_lane_s64(vadd_s64(vget_low_s64(v_y_a), vget_high_s64(v_y_a)), 0);
+ const int64x2_t v_y2_sum_b = vpaddlq_s32(v_y2_sum);
+ int64_t y2 = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_y2_sum_b), vget_high_s64(v_y2_sum_b)), 0);
+#endif
+ x_sum += y;
+ x2_sum += y2;
+ x_finalrow += y;
+ x2_finalrow += y2;
+ } else {
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 1) * stride + j];
+ const int16_t y = diff[(height - 1) * stride + j + 1];
+ xy_sum += x * y;
+ x_sum += y;
+ x2_sum += y * y;
+ x_finalrow += y;
+ x2_finalrow += y * y;
+ }
+ }
+ } else { // Two rows remaining to do
+ const int16_t x0 = diff[(height - 2) * stride];
+ const int16_t z0 = diff[(height - 1) * stride];
+ x_sum += x0 + z0;
+ x2_sum += x0 * x0 + z0 * z0;
+ x_finalrow += z0;
+ x2_finalrow += z0 * z0;
+ if (width >= 8) {
+ int32x4_t v_y2_sum = zero;
+ int32x4_t v_w2_sum = zero;
+ int32x4_t v_xy_sum_a = zero;
+ int32x4_t v_xz_sum_a = zero;
+ int32x4_t v_x_sum_a = zero;
+ int32x4_t v_w_sum = zero;
+ int k = width - 1;
+ int j = 0;
+ while ((k - 8) > 0) {
+ const int16x8_t v_x = vld1q_s16(&diff[(height - 2) * stride + j]);
+ const int16x8_t v_y = vld1q_s16(&diff[(height - 2) * stride + j + 1]);
+ const int16x8_t v_z = vld1q_s16(&diff[(height - 1) * stride + j]);
+ const int16x8_t v_w = vld1q_s16(&diff[(height - 1) * stride + j + 1]);
+
+ const int16x4_t v_x_lo = vget_low_s16(v_x);
+ const int16x4_t v_y_lo = vget_low_s16(v_y);
+ const int16x4_t v_z_lo = vget_low_s16(v_z);
+ const int16x4_t v_w_lo = vget_low_s16(v_w);
+ const int16x4_t v_x_hi = vget_high_s16(v_x);
+ const int16x4_t v_y_hi = vget_high_s16(v_y);
+ const int16x4_t v_z_hi = vget_high_s16(v_z);
+ const int16x4_t v_w_hi = vget_high_s16(v_w);
+
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);
+
+ v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
+ v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);
+
+ v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
+ v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+
+ v_w_sum = vpadalq_s16(v_w_sum, v_w);
+ v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
+ v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
+
+ k -= 8;
+ j += 8;
+ }
+ const int16x8_t v_l = vld1q_s16(&diff[(height - 2) * stride] + j);
+ const int16x8_t v_x =
+ vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
+ vreinterpretq_s16_s32(zero), 1);
+ const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
+ const int16x8_t v_l_2 = vld1q_s16(&diff[(height - 1) * stride] + j);
+ const int16x8_t v_z =
+ vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l_2, 7),
+ vreinterpretq_s16_s32(zero), 1);
+ const int16x8_t v_w = vextq_s16(v_l_2, vreinterpretq_s16_s32(zero), 1);
+
+ const int16x4_t v_x_lo = vget_low_s16(v_x);
+ const int16x4_t v_y_lo = vget_low_s16(v_y);
+ const int16x4_t v_z_lo = vget_low_s16(v_z);
+ const int16x4_t v_w_lo = vget_low_s16(v_w);
+ const int16x4_t v_x_hi = vget_high_s16(v_x);
+ const int16x4_t v_y_hi = vget_high_s16(v_y);
+ const int16x4_t v_z_hi = vget_high_s16(v_z);
+ const int16x4_t v_w_hi = vget_high_s16(v_w);
+
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);
+
+ v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
+ v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);
+
+ v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
+ v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+
+ v_w_sum = vpadalq_s16(v_w_sum, v_w);
+ v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
+ v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
+
+#if AOM_ARCH_AARCH64
+ xy_sum += vaddvq_s64(vpaddlq_s32(v_xy_sum_a));
+ xz_sum += vaddvq_s64(vpaddlq_s32(v_xz_sum_a));
+ x_sum += vaddvq_s32(v_x_sum_a);
+ x_finalrow += vaddvq_s32(v_w_sum);
+ int64_t y2 = vaddvq_s64(vpaddlq_s32(v_y2_sum));
+ int64_t w2 = vaddvq_s64(vpaddlq_s32(v_w2_sum));
+#else
+ const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
+ xy_sum += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
+ const int64x2_t v_xz_sum2 = vpaddlq_s32(v_xz_sum_a);
+ xz_sum += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_xz_sum2), vget_high_s64(v_xz_sum2)), 0);
+ const int64x2_t v_x_sum2 = vpaddlq_s32(v_x_sum_a);
+ x_sum += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_x_sum2), vget_high_s64(v_x_sum2)), 0);
+ const int64x2_t v_w_sum_a = vpaddlq_s32(v_w_sum);
+ x_finalrow += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_w_sum_a), vget_high_s64(v_w_sum_a)), 0);
+ const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
+ int64_t y2 = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_y2_sum_a), vget_high_s64(v_y2_sum_a)), 0);
+ const int64x2_t v_w2_sum_a = vpaddlq_s32(v_w2_sum);
+ int64_t w2 = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_w2_sum_a), vget_high_s64(v_w2_sum_a)), 0);
+#endif
+ x2_sum += y2 + w2;
+ x2_finalrow += w2;
+ } else {
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 2) * stride + j];
+ const int16_t y = diff[(height - 2) * stride + j + 1];
+ const int16_t z = diff[(height - 1) * stride + j];
+ const int16_t w = diff[(height - 1) * stride + j + 1];
+
+ // Horizontal and vertical correlations for the penultimate row:
+ xy_sum += x * y;
+ xz_sum += x * z;
+
+ // Now just horizontal correlations for the final row:
+ xy_sum += z * w;
+
+ x_sum += y + w;
+ x2_sum += y * y + w * w;
+ x_finalrow += w;
+ x2_finalrow += w * w;
+ }
+ }
+ }
+
+ // Do we have 2 columns remaining or just the one?
+ if (width % 3 == 1) { // Just vert corrs on the final col
+ const int16_t x0 = diff[width - 1];
+ x_sum += x0;
+ x_finalcol += x0;
+ x2_sum += x0 * x0;
+ x2_finalcol += x0 * x0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 1];
+ xz_sum += x * z;
+ x_finalcol += z;
+ x2_finalcol += z * z;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z;
+ x2_sum += z * z;
+ }
+ }
+ } else { // Two cols remaining
+ const int16_t x0 = diff[width - 2];
+ const int16_t y0 = diff[width - 1];
+ x_sum += x0 + y0;
+ x2_sum += x0 * x0 + y0 * y0;
+ x_finalcol += y0;
+ x2_finalcol += y0 * y0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 2];
+ const int16_t y = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 2];
+ const int16_t w = diff[(i + 1) * stride + width - 1];
+
+ // Horizontal and vertical correlations for the penultimate col:
+ // Skip these on the last iteration of this loop if we also had two
+      // rows remaining; otherwise the final horizontal and vertical
+      // correlations get erroneously processed twice.
+ if (i < height - 2 || height % 3 == 1) {
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+
+ x_finalcol += w;
+ x2_finalcol += w * w;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z + w;
+ x2_sum += z * z + w * w;
+ }
+
+ // Now just vertical correlations for the final column:
+ xz_sum += y * w;
+ }
+ }
+
+ // Calculate the simple sums and squared-sums
+ int64_t x_firstrow = 0, x_firstcol = 0;
+ int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+ if (width >= 8) {
+ int32x4_t v_x_firstrow = zero;
+ int32x4_t v_x2_firstrow = zero;
+ for (int j = 0; j < width; j += 8) {
+ const int16x8_t v_diff = vld1q_s16(diff + j);
+ const int16x4_t v_diff_lo = vget_low_s16(v_diff);
+ const int16x4_t v_diff_hi = vget_high_s16(v_diff);
+ v_x_firstrow = vpadalq_s16(v_x_firstrow, v_diff);
+ v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo);
+ v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi);
+ }
+#if AOM_ARCH_AARCH64
+ x_firstrow += vaddvq_s32(v_x_firstrow);
+ x2_firstrow += vaddvq_s32(v_x2_firstrow);
+#else
+ const int64x2_t v_x_firstrow_64 = vpaddlq_s32(v_x_firstrow);
+ x_firstrow += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_x_firstrow_64), vget_high_s64(v_x_firstrow_64)),
+ 0);
+ const int64x2_t v_x2_firstrow_64 = vpaddlq_s32(v_x2_firstrow);
+ x2_firstrow += vget_lane_s64(vadd_s64(vget_low_s64(v_x2_firstrow_64),
+ vget_high_s64(v_x2_firstrow_64)),
+ 0);
+#endif
+ } else {
+ for (int j = 0; j < width; ++j) {
+ x_firstrow += diff[j];
+ x2_firstrow += diff[j] * diff[j];
+ }
+ }
+ for (int i = 0; i < height; ++i) {
+ x_firstcol += diff[i * stride];
+ x2_firstcol += diff[i * stride] * diff[i * stride];
+ }
+
+ int64_t xhor_sum = x_sum - x_finalcol;
+ int64_t xver_sum = x_sum - x_finalrow;
+ int64_t y_sum = x_sum - x_firstcol;
+ int64_t z_sum = x_sum - x_firstrow;
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ const float num_hor = (float)(height * (width - 1));
+ const float num_ver = (float)((height - 1) * width);
+
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
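The reductions above feed a standard Pearson correlation: with var_n(x) = sum(x^2) - sum(x)^2 / n and cov_n(x, y) = sum(xy) - sum(x)sum(y) / n, each correlation is cov / sqrt(var * var), clamped to [0, 1]. A minimal scalar sketch of that final normalization step (the helper name is hypothetical, for illustration only):

#include <math.h>
#include <stdint.h>

// Hypothetical scalar helper mirroring the normalization above: a Pearson
// correlation computed from raw sums, clamped to [0, 1] as in the source.
static float correlation_from_sums(int64_t x_sum, int64_t y_sum,
                                   int64_t x2_sum, int64_t y2_sum,
                                   int64_t xy_sum, float n) {
  const float x_var_n = x2_sum - (x_sum * x_sum) / n;
  const float y_var_n = y2_sum - (y_sum * y_sum) / n;
  const float cov_n = xy_sum - (x_sum * y_sum) / n;
  if (x_var_n > 0 && y_var_n > 0) {
    const float r = cov_n / sqrtf(x_var_n * y_var_n);
    return r < 0 ? 0 : r;  // Negative correlation is clamped to 0.
  }
  return 1.0f;  // Degenerate (constant) input is treated as fully correlated.
}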
diff --git a/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c b/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c
new file mode 100644
index 0000000000..3d17723224
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+
+#include "av1/encoder/reconinter_enc.h"
+
+void aom_upsampled_pred_neon(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref, int ref_stride,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter_params = av1_get_filter(subpel_search);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ if (width > 8) {
+ assert(width % 16 == 0);
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t r = vld1q_u8(ref + j);
+ vst1q_u8(comp_pred + j, r);
+ j += 16;
+ } while (j < width);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else if (width == 8) {
+ int i = height;
+ do {
+ uint8x8_t r = vld1_u8(ref);
+ vst1_u8(comp_pred, r);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else {
+ assert(width == 4);
+ int i = height / 2;
+ do {
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+ vst1_u8(comp_pred, r);
+ ref += 2 * ref_stride;
+ comp_pred += 2 * width;
+ } while (--i != 0);
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const filter_x =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1);
+ aom_convolve8_horiz(ref, ref_stride, comp_pred, width, filter_x, 16, NULL,
+ -1, width, height);
+ } else if (!subpel_x_q3) {
+ const int16_t *const filter_y =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1);
+ aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, filter_y,
+ 16, width, height);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t,
+ im_block[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+
+ const int16_t *const filter_x =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1);
+ const int16_t *const filter_y =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1);
+
+ const int im_stride = MAX_SB_SIZE;
+ const int im_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + SUBPEL_TAPS;
+
+ const int ref_vert_offset = ref_stride * ((SUBPEL_TAPS >> 1) - 1);
+ const int im_vert_offset = im_stride * ((filter_params->taps >> 1) - 1);
+
+ assert(im_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_convolve8_horiz(ref - ref_vert_offset, ref_stride, im_block,
+ MAX_SB_SIZE, filter_x, 16, NULL, -1, width, im_height);
+ aom_convolve8_vert(im_block + im_vert_offset, MAX_SB_SIZE, comp_pred, width,
+ NULL, -1, filter_y, 16, width, height);
+ }
+}
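In the two-pass path above, the horizontal filter must emit enough intermediate rows to feed the vertical taps that follow. A worked instance of the sizing arithmetic, with input values assumed purely for illustration:

// Worked example of im_height above, assuming height = 32, subpel_y_q3 = 4
// and SUBPEL_TAPS = 8 (the standard 8-tap case):
//   im_height = (((32 - 1) * 8 + 4) >> 3) + 8
//             = (252 >> 3) + 8
//             = 31 + 8 = 39
// The vertical pass needs SUBPEL_TAPS - 1 extra source rows, which is why
// the horizontal pass reads from ref - ref_vert_offset, i.e. starting
// (SUBPEL_TAPS / 2 - 1) rows above the block.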
+
+void aom_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd,
+ const AV1_COMMON *const cm, int mi_row,
+ int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+
+ aom_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, width);
+}
+
+void aom_dist_wtd_comp_avg_upsampled_pred_neon(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+ aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+
+ aom_dist_wtd_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred,
+ width, jcp_param);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd,
+ const struct AV1Common *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ if (width > 4) {
+ assert(width % 8 == 0);
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t r = vld1q_u16(ref + j);
+ vst1q_u16(comp_pred + j, r);
+ j += 8;
+ } while (j < width);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else if (width == 4) {
+ int i = height;
+ do {
+ uint16x4_t r = vld1_u16(ref);
+ vst1_u16(comp_pred, r);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else {
+ assert(width == 2);
+ int i = height / 2;
+ do {
+ uint16x4_t r = load_u16_2x2(ref, ref_stride);
+ store_u16x2_strided_x2(comp_pred, width, r);
+ ref += 2 * ref_stride;
+ comp_pred += 2 * width;
+ } while (--i != 0);
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_highbd_convolve8_horiz_neon(ref8, ref_stride, comp_pred8, width, kernel,
+ 16, NULL, -1, width, height, bd);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_highbd_convolve8_vert_neon(ref8, ref_stride, comp_pred8, width, NULL,
+ -1, kernel, 16, width, height, bd);
+ } else {
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_highbd_convolve8_horiz_neon(
+ ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride,
+ CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+ intermediate_height, bd);
+ aom_highbd_convolve8_vert_neon(
+ CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
+ }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_neon(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, int subpel_search) {
+ aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
+
+ aom_highbd_comp_avg_pred_neon(comp_pred8, pred8, width, height, comp_pred8,
+ width);
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+ int subpel_search) {
+ aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
+
+ aom_highbd_dist_wtd_comp_avg_pred_neon(comp_pred8, pred8, width, height,
+ comp_pred8, width, jcp_param);
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/arm/neon/shift_neon.h b/third_party/aom/av1/encoder/arm/neon/shift_neon.h
new file mode 100644
index 0000000000..d73aef2f25
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/shift_neon.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h" // For AOM_INLINE.
+
+#define SHIFT_LOOP_HELPER(name, type, intrinsic, arg) \
+ static AOM_INLINE void name(const type *in, type *out, int size) { \
+ int i = 0; \
+ do { \
+ out[i] = intrinsic(in[i], arg); \
+ } while (++i < size); \
+ }
+
+SHIFT_LOOP_HELPER(shift_left_2_s16_x4, int16x4_t, vshl_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_left_2_s16_x8, int16x8_t, vshlq_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_left_2_s32_x4, int32x4_t, vshlq_n_s32, 2)
+SHIFT_LOOP_HELPER(shift_right_2_round_s16_x8, int16x8_t, vrshrq_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_right_2_round_s32_x4, int32x4_t, vrshrq_n_s32, 2)
+SHIFT_LOOP_HELPER(shift_right_4_round_s16_x8, int16x8_t, vrshrq_n_s16, 4)
+SHIFT_LOOP_HELPER(shift_right_4_round_s32_x4, int32x4_t, vrshrq_n_s32, 4)
+
+// Addition instructions have slightly better performance compared to shift
+// instructions on some micro-architectures, so use these for shifts by one.
+
+SHIFT_LOOP_HELPER(shift_left_1_s16_x4, int16x4_t, vadd_s16, in[i])
+SHIFT_LOOP_HELPER(shift_left_1_s16_x8, int16x8_t, vaddq_s16, in[i])
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x4, int16x4_t, vrhadd_s16,
+ vdup_n_s16(0))
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x8, int16x8_t, vrhaddq_s16,
+ vdupq_n_s16(0))
+SHIFT_LOOP_HELPER(shift_right_1_round_s32_x4, int32x4_t, vrhaddq_s32,
+ vdupq_n_s32(0))
+
+#undef SHIFT_LOOP_HELPER
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
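The add-based variants rest on two lane-wise identities: x << 1 == x + x, and a rounding right shift by one equals a rounding halving add against zero. A scalar sketch of the second identity, assuming 16-bit lanes:

// Scalar model of vrhadd_s16(x, vdup_n_s16(0)): the rounding halving add
// computes (a + b + 1) >> 1 without intermediate overflow, so with b == 0 it
// matches the rounding shift right by one (vrshr_n_s16(x, 1)) exactly.
static inline int16_t rounding_shift_right_1(int16_t x) {
  return (int16_t)((x + 1) >> 1);  // x promotes to int, so no overflow here.
}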
diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c
new file mode 100644
index 0000000000..986f143864
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c
@@ -0,0 +1,548 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+// When using vld1q_u16_x4, compilers may insert an alignment hint of 256
+// bits, hence the 32-byte alignment of this table.
+DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = {
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF
+};
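Each 8-lane row of the mask table keeps five consecutive lanes, at offsets 0 through 3. A sketch of how the masks are consumed by the window loop below:

// For one padded row r of squared errors and output column base col:
//   sum_i = horizontal_add(r & mask row i)   // i = 0..3
// Mask row i keeps lanes [i, i + 4], so sum_i is the horizontal 5-tap sum
// for output column col + i; accumulating five such rows gives the full
// 5x5 windowed sum of squared errors.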
+
+static INLINE void get_squared_error(
+ const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2,
+ const uint32_t stride2, const uint32_t block_width,
+ const uint32_t block_height, uint16_t *frame_sse,
+ const unsigned int dst_stride) {
+ uint16_t *dst = frame_sse;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+ uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ uint16x8_t sse_lo =
+ vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
+ uint16x8_t sse_hi =
+ vmull_u8(vget_high_u8(abs_diff), vget_high_u8(abs_diff));
+
+ vst1q_u16(dst + j + 2, sse_lo);
+ vst1q_u16(dst + j + 10, sse_hi);
+
+ j += 16;
+ } while (j < block_width);
+
+ dst += dst_stride;
+ } while (++i < block_height);
+}
+
+static INLINE uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col,
+ const uint32_t block_width) {
+ uint16x8_t s = vld1q_u16(src);
+
+ if (col == 0) {
+ const uint16_t lane2 = vgetq_lane_u16(s, 2);
+ s = vsetq_lane_u16(lane2, s, 0);
+ s = vsetq_lane_u16(lane2, s, 1);
+ } else if (col >= block_width - 4) {
+ const uint16_t lane5 = vgetq_lane_u16(s, 5);
+ s = vsetq_lane_u16(lane5, s, 6);
+ s = vsetq_lane_u16(lane5, s, 7);
+ }
+ return s;
+}
+
+static void apply_temporal_filter(
+ const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
+ const uint32_t block_height, const int *subblock_mses,
+ unsigned int *accumulator, uint16_t *count, const uint16_t *frame_sse,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW];
+ const uint16x8x4_t vmask = vld1q_u16_x4(kSlidingWindowMask);
+
+ // Traverse 4 columns at a time - first and last two columns need padding.
+ for (uint32_t col = 0; col < block_width; col += 4) {
+ uint16x8_t vsrc[5];
+ const uint16_t *src = frame_sse + col;
+
+ // Load and pad (for first and last two columns) 3 rows from the top.
+ for (int i = 2; i < 5; i++) {
+ vsrc[i] = load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Pad the top 2 rows.
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (unsigned int row = 0; row < block_height; row++) {
+ for (int i = 0; i < 4; i++) {
+ uint32x4_t vsum = vdupq_n_u32(0);
+ for (int j = 0; j < 5; j++) {
+ vsum = vpadalq_u16(vsum, vandq_u16(vsrc[j], vmask.val[i]));
+ }
+ acc_5x5_neon[row][col + i] = horizontal_add_u32x4(vsum);
+ }
+
+ // Push all rows in the sliding window up one.
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ if (row <= block_height - 4) {
+ // Load next row into the bottom of the sliding window.
+ vsrc[4] = load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ } else {
+ // Pad the bottom 2 rows.
+ vsrc[4] = vsrc[3];
+ }
+ }
+ }
+
+ // Perform filtering.
+ if (tf_wgt_calc_lvl == 0) {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
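Per pixel, both branches above evaluate the same scalar weight formula, differing only in whether exp() or the approx_exp() polynomial is used. A minimal sketch of the tf_wgt_calc_lvl == 0 form, with all inputs precomputed exactly as in the caller (illustration only, not a replacement for the vector path):

#include <math.h>

// Scalar sketch of the per-pixel filter weight (tf_wgt_calc_lvl == 0 branch);
// AOMMIN and TF_WEIGHT_SCALE are as defined in the aom headers.
static int tf_pixel_weight(uint32_t diff_sse, double inv_num_ref_pixels,
                           double block_error, double weight_factor,
                           double inv_factor, double d_factor,
                           double decay_factor) {
  const double window_error = diff_sse * inv_num_ref_pixels;
  const double combined_error =
      weight_factor * window_error + block_error * inv_factor;
  const double scaled_error =
      AOMMIN(combined_error * d_factor * decay_factor, 7);
  return (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
}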
+
+void av1_apply_temporal_filter_neon(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
+ }
+ }
+ }
+ }
+
+ get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_sse, SSE_STRIDE);
+
+ apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_sse, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+
+ plane_offset += plane_h * plane_w;
+ }
+}
+
+double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height,
+ int width, int stride,
+ int edge_thresh) {
+ uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+ uint32x4_t acc = vdupq_n_u32(0);
+  // The count is logically positive, since it tallies how often we are under
+  // the threshold, but it is accumulated negatively to make the best use of
+  // the vclt instruction, which sets every bit of a lane to 1 (i.e. -1) when
+  // the condition is true.
+ int32x4_t count = vdupq_n_s32(0);
+ int final_count = 0;
+ int64_t final_acc = 0;
+ const uint8_t *src_start = src + stride + 1;
+ int h = 1;
+
+ do {
+ int w = 1;
+ const uint8_t *src_ptr = src_start;
+
+ while (w <= (width - 1) - 16) {
+ uint8x16_t mat[3][3];
+ mat[0][0] = vld1q_u8(src_ptr - stride - 1);
+ mat[0][1] = vld1q_u8(src_ptr - stride);
+ mat[0][2] = vld1q_u8(src_ptr - stride + 1);
+ mat[1][0] = vld1q_u8(src_ptr - 1);
+ mat[1][1] = vld1q_u8(src_ptr);
+ mat[1][2] = vld1q_u8(src_ptr + 1);
+ mat[2][0] = vld1q_u8(src_ptr + stride - 1);
+ mat[2][1] = vld1q_u8(src_ptr + stride);
+ mat[2][2] = vld1q_u8(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[2][0]));
+ uint16x8_t gxa_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[2][0]));
+ uint16x8_t gxb_lo =
+ vaddl_u8(vget_low_u8(mat[0][2]), vget_low_u8(mat[2][2]));
+ uint16x8_t gxb_hi =
+ vaddl_u8(vget_high_u8(mat[0][2]), vget_high_u8(mat[2][2]));
+ gxa_lo = vaddq_u16(
+ gxa_lo, vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][0])));
+ gxa_hi = vaddq_u16(
+ gxa_hi, vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][0])));
+ gxb_lo = vaddq_u16(
+ gxb_lo, vaddl_u8(vget_low_u8(mat[1][2]), vget_low_u8(mat[1][2])));
+ gxb_hi = vaddq_u16(
+ gxb_hi, vaddl_u8(vget_high_u8(mat[1][2]), vget_high_u8(mat[1][2])));
+
+ uint16x8_t gya_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
+ uint16x8_t gya_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
+ uint16x8_t gyb_lo =
+ vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
+ uint16x8_t gyb_hi =
+ vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
+ gya_lo = vaddq_u16(
+ gya_lo, vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[0][1])));
+ gya_hi = vaddq_u16(
+ gya_hi, vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[0][1])));
+ gyb_lo = vaddq_u16(
+ gyb_lo, vaddl_u8(vget_low_u8(mat[2][1]), vget_low_u8(mat[2][1])));
+ gyb_hi = vaddq_u16(
+ gyb_hi, vaddl_u8(vget_high_u8(mat[2][1]), vget_high_u8(mat[2][1])));
+
+ uint16x8_t ga_lo = vabaq_u16(vabdq_u16(gxa_lo, gxb_lo), gya_lo, gyb_lo);
+ uint16x8_t ga_hi = vabaq_u16(vabdq_u16(gxa_hi, gxb_hi), gya_hi, gyb_hi);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16_lo = vcltq_u16(ga_lo, thresh);
+ uint16x8_t thresh_u16_hi = vcltq_u16(ga_hi, thresh);
+
+ uint16x8_t center_lo = vshll_n_u8(vget_low_u8(mat[1][1]), 2);
+ uint16x8_t center_hi = vshll_n_u8(vget_high_u8(mat[1][1]), 2);
+
+ uint16x8_t adj0_lo =
+ vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[2][1]));
+ uint16x8_t adj0_hi =
+ vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[2][1]));
+ uint16x8_t adj1_lo =
+ vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][2]));
+ uint16x8_t adj1_hi =
+ vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][2]));
+ uint16x8_t adj_lo = vaddq_u16(adj0_lo, adj1_lo);
+ adj_lo = vaddq_u16(adj_lo, adj_lo);
+ uint16x8_t adj_hi = vaddq_u16(adj0_hi, adj1_hi);
+ adj_hi = vaddq_u16(adj_hi, adj_hi);
+
+ uint16x8_t diag0_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
+ uint16x8_t diag0_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
+ uint16x8_t diag1_lo =
+ vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
+ uint16x8_t diag1_hi =
+ vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
+ uint16x8_t diag_lo = vaddq_u16(diag0_lo, diag1_lo);
+ uint16x8_t diag_hi = vaddq_u16(diag0_hi, diag1_hi);
+
+ uint16x8_t v_lo = vaddq_u16(center_lo, diag_lo);
+ v_lo = vabdq_u16(v_lo, adj_lo);
+ uint16x8_t v_hi = vaddq_u16(center_hi, diag_hi);
+ v_hi = vabdq_u16(v_hi, adj_hi);
+
+ acc = vpadalq_u16(acc, vandq_u16(v_lo, thresh_u16_lo));
+ acc = vpadalq_u16(acc, vandq_u16(v_hi, thresh_u16_hi));
+
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_lo));
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_hi));
+
+ w += 16;
+ src_ptr += 16;
+ }
+
+ if (w <= (width - 1) - 8) {
+ uint8x8_t mat[3][3];
+ mat[0][0] = vld1_u8(src_ptr - stride - 1);
+ mat[0][1] = vld1_u8(src_ptr - stride);
+ mat[0][2] = vld1_u8(src_ptr - stride + 1);
+ mat[1][0] = vld1_u8(src_ptr - 1);
+ mat[1][1] = vld1_u8(src_ptr);
+ mat[1][2] = vld1_u8(src_ptr + 1);
+ mat[2][0] = vld1_u8(src_ptr + stride - 1);
+ mat[2][1] = vld1_u8(src_ptr + stride);
+ mat[2][2] = vld1_u8(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+ uint16x8_t center = vshll_n_u8(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vaddq_u16(center, diag);
+ v = vabdq_u16(v, adj);
+
+ acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 8;
+ src_ptr += 8;
+ }
+
+ if (w <= (width - 1) - 4) {
+ uint16x8_t mask = vcombine_u16(vdup_n_u16(65535), vdup_n_u16(0));
+ uint8x8_t mat[3][3];
+ mat[0][0] = load_u8_4x1(src_ptr - stride - 1);
+ mat[0][1] = load_u8_4x1(src_ptr - stride);
+ mat[0][2] = load_u8_4x1(src_ptr - stride + 1);
+ mat[1][0] = load_u8_4x1(src_ptr - 1);
+ mat[1][1] = load_u8_4x1(src_ptr);
+ mat[1][2] = load_u8_4x1(src_ptr + 1);
+ mat[2][0] = load_u8_4x1(src_ptr + stride - 1);
+ mat[2][1] = load_u8_4x1(src_ptr + stride);
+ mat[2][2] = load_u8_4x1(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vandq_u16(vcltq_u16(ga, thresh), mask);
+
+ uint16x8_t center = vshll_n_u8(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vaddq_u16(center, diag);
+ v = vabdq_u16(v, adj);
+
+ acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 4;
+ src_ptr += 4;
+ }
+
+ while (w < width - 1) {
+ int mat[3][3];
+ mat[0][0] = *(src_ptr - stride - 1);
+ mat[0][1] = *(src_ptr - stride);
+ mat[0][2] = *(src_ptr - stride + 1);
+ mat[1][0] = *(src_ptr - 1);
+ mat[1][1] = *(src_ptr);
+ mat[1][2] = *(src_ptr + 1);
+ mat[2][0] = *(src_ptr + stride - 1);
+ mat[2][1] = *(src_ptr + stride);
+ mat[2][2] = *(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int ga = abs(gx) + abs(gy);
+
+ // Accumulate Laplacian.
+ const int is_under = ga < edge_thresh;
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ final_acc += abs(v) * is_under;
+ final_count += is_under;
+
+ src_ptr++;
+ w++;
+ }
+ src_start += stride;
+ } while (++h < height - 1);
+
+ // We counted negatively, so subtract to get the final value.
+ final_count -= horizontal_add_s32x4(count);
+ final_acc += horizontal_long_add_u32x4(acc);
+ return (final_count < 16)
+ ? -1.0
+ : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
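The closing expression implements Immerkaer's fast noise estimation method; a sketch of the math it encodes:

// The return value computes
//   sigma ~= sqrt(pi / 2) * (1 / (6 * count)) * sum(|L(p)|)
// where L is the 3x3 Laplacian-style kernel
//   [  1 -2  1 ]
//   [ -2  4 -2 ]
//   [  1 -2  1 ]
// and the sum runs only over pixels whose Sobel gradient magnitude is below
// edge_thresh, so strong edges do not inflate the estimate. Fewer than 16
// qualifying pixels is reported as -1.0, i.e. "no reliable estimate".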
diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
new file mode 100644
index 0000000000..5a52e701a2
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+// clang-format off
+
+DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
+ 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
+ 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
+ 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+// clang-format on
+
+static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
+ const uint8_t *frame2, const uint32_t stride2,
+ const uint32_t block_width,
+ const uint32_t block_height,
+ uint8_t *frame_abs_diff,
+ const unsigned int dst_stride) {
+ uint8_t *dst = frame_abs_diff;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+ uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ vst1q_u8(dst + j + 2, abs_diff);
+ j += 16;
+ } while (j < block_width);
+
+ dst += dst_stride;
+ } while (++i < block_height);
+}
+
+static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col,
+ const uint32_t block_width) {
+ uint8x8_t s = vld1_u8(src);
+
+ if (col == 0) {
+ const uint8_t lane2 = vget_lane_u8(s, 2);
+ s = vset_lane_u8(lane2, s, 0);
+ s = vset_lane_u8(lane2, s, 1);
+ } else if (col >= block_width - 4) {
+ const uint8_t lane5 = vget_lane_u8(s, 5);
+ s = vset_lane_u8(lane5, s, 6);
+ s = vset_lane_u8(lane5, s, 7);
+ }
+ return vcombine_u8(s, s);
+}
+
+static void apply_temporal_filter(
+ const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
+ const uint32_t block_height, const int *subblock_mses,
+ unsigned int *accumulator, uint16_t *count, const uint8_t *frame_abs_diff,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW];
+ const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
+
+ // Traverse 4 columns at a time - first and last two columns need padding.
+ for (uint32_t col = 0; col < block_width; col += 4) {
+ uint8x16_t vsrc[5][2];
+ const uint8_t *src = frame_abs_diff + col;
+
+ // Load, pad (for first and last two columns) and mask 3 rows from the top.
+ for (int i = 2; i < 5; i++) {
+ const uint8x16_t s = load_and_pad(src, col, block_width);
+ vsrc[i][0] = vandq_u8(s, vmask.val[0]);
+ vsrc[i][1] = vandq_u8(s, vmask.val[1]);
+ src += SSE_STRIDE;
+ }
+
+ // Pad the top 2 rows.
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+
+ for (unsigned int row = 0; row < block_height; row++) {
+ uint32x4_t sum_01 = vdupq_n_u32(0);
+ uint32x4_t sum_23 = vdupq_n_u32(0);
+
+ sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]);
+
+ sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]);
+
+ vst1q_u32(&acc_5x5_neon[row][col], vpaddq_u32(sum_01, sum_23));
+
+ // Push all rows in the sliding window up one.
+ for (int i = 0; i < 4; i++) {
+ vsrc[i][0] = vsrc[i + 1][0];
+ vsrc[i][1] = vsrc[i + 1][1];
+ }
+
+ if (row <= block_height - 4) {
+ // Load next row into the bottom of the sliding window.
+ uint8x16_t s = load_and_pad(src, col, block_width);
+ vsrc[4][0] = vandq_u8(s, vmask.val[0]);
+ vsrc[4][1] = vandq_u8(s, vmask.val[1]);
+ src += SSE_STRIDE;
+ } else {
+ // Pad the bottom 2 rows.
+ vsrc[4][0] = vsrc[3][0];
+ vsrc[4][1] = vsrc[3][1];
+ }
+ }
+ }
+
+ // Perform filtering.
+ if (tf_wgt_calc_lvl == 0) {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
+
+void av1_apply_temporal_filter_neon_dotprod(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] +=
+ (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
+ frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
+ }
+ }
+ }
+ }
+ }
+
+ get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_abs_diff, SSE_STRIDE);
+
+ apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_abs_diff, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+
+ plane_offset += plane_h * plane_w;
+ }
+}
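The dotprod variant stores 8-bit absolute differences rather than 16-bit squared errors because the UDOT instruction can square and accumulate in a single step. A sketch of what each vdotq_u32 call above contributes:

// Per 32-bit lane k, vdotq_u32(sum, v, v) computes
//   sum[k] += v[4k]^2 + v[4k+1]^2 + v[4k+2]^2 + v[4k+3]^2
// Squaring the masked absolute differences against themselves and
// accumulating five window rows therefore yields the same 5x5 sum of squared
// errors as the plain Neon version, with no 16-bit widening multiplies.
// This cannot overflow: each |diff| <= 255 and 25 * 255^2 < 2^32.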
diff --git a/third_party/aom/av1/encoder/arm/neon/txfm_neon.h b/third_party/aom/av1/encoder/arm/neon/txfm_neon.h
new file mode 100644
index 0000000000..635364f46a
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/txfm_neon.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
+
+#include "aom/aom_integer.h" // For AOM_INLINE.
+
+static AOM_INLINE void ud_adjust_input_and_stride(int ud_flip,
+ const int16_t **input,
+ int *stride, int out_size) {
+ if (ud_flip) {
+ *input = *input + (out_size - 1) * *stride;
+ *stride = -*stride;
+ }
+}
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
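A short usage sketch of the helper above: for an up-down flipped transform the input pointer is moved to the last row and the stride negated, so row indexing walks the block bottom-up. The buffer names here are hypothetical:

// Read a 4-row block bottom-up when ud_flip is set.
const int16_t *in = buf;  // buf and buf_stride are illustrative names.
int stride = buf_stride;
ud_adjust_input_and_stride(/*ud_flip=*/1, &in, &stride, /*out_size=*/4);
// in now points at row 3 and stride is negative, so in[i * stride]
// visits rows 3, 2, 1, 0 for i = 0..3.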
diff --git a/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c b/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c
new file mode 100644
index 0000000000..1b35269b33
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c for details of the parameters and
+ * computation.
+ */
+uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ assert(N % 64 == 0);
+
+ uint64x2_t v_csse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ int i = 0;
+ do {
+ int32x4_t sum[4];
+ int32x4_t sse[2];
+ int16x4_t sum_s16[4];
+
+ const int16x8_t r1_l = vld1q_s16(r1 + i);
+ const int16x8_t r1_h = vld1q_s16(r1 + i + 8);
+ const int16x8_t d_l = vld1q_s16(d + i);
+ const int16x8_t d_h = vld1q_s16(d + i + 8);
+ // The following three lines are a bit inelegant compared to using a pair
+    // of vmovl_u8()... but they force the compiler to emit a ZIP1, ZIP2 pair,
+    // which can be executed in parallel with the subsequent SSHL instructions.
+ // (SSHL can only be executed on half of the Neon pipes in modern Arm
+ // cores, whereas ZIP1/2 can be executed on all of them.)
+ const uint8x16x2_t m_u16 = vzipq_u8(vld1q_u8(m + i), vdupq_n_u8(0));
+ const int16x8_t m_l = vreinterpretq_s16_u8(m_u16.val[0]);
+ const int16x8_t m_h = vreinterpretq_s16_u8(m_u16.val[1]);
+
+ sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS);
+ sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS);
+ sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS);
+ sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS);
+
+ sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l));
+ sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l));
+ sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h));
+ sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h));
+
+ sum_s16[0] = vqmovn_s32(sum[0]);
+ sum_s16[1] = vqmovn_s32(sum[1]);
+ sum_s16[2] = vqmovn_s32(sum[2]);
+ sum_s16[3] = vqmovn_s32(sum[3]);
+
+ sse[0] = vmull_s16(sum_s16[0], sum_s16[0]);
+ sse[1] = vmull_s16(sum_s16[2], sum_s16[2]);
+ sse[0] = vmlal_s16(sse[0], sum_s16[1], sum_s16[1]);
+ sse[1] = vmlal_s16(sse[1], sum_s16[3], sum_s16[3]);
+
+ v_csse[0] = vpadalq_u32(v_csse[0], vreinterpretq_u32_s32(sse[0]));
+ v_csse[1] = vpadalq_u32(v_csse[1], vreinterpretq_u32_s32(sse[1]));
+
+ i += 16;
+ } while (i < N);
+
+ uint64_t csse = horizontal_add_u64x2(vaddq_u64(v_csse[0], v_csse[1]));
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
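A sketch of the scalar form this vector code mirrors (compare av1_wedge_sse_from_residuals_c): each sample blends the residuals in Q(WEDGE_WEIGHT_BITS) mask precision, saturates to int16 (the vqmovn_s32 step above), then squares and accumulates:

// Scalar sketch; the clamp mirrors the vqmovn_s32 saturation above.
uint64_t csse = 0;
for (int i = 0; i < N; ++i) {
  int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
  t = t < INT16_MIN ? INT16_MIN : (t > INT16_MAX ? INT16_MAX : t);
  csse += (uint64_t)((int64_t)t * t);
}
return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);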
+
+int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ do {
+ int16x8_t ds_l = vld1q_s16(ds);
+ int16x8_t ds_h = vld1q_s16(ds + 8);
+
+ int8x16_t m_s8 = vreinterpretq_s8_u8(vld1q_u8(m));
+ int16x8_t m_l = vmovl_s8(vget_low_s8(m_s8));
+ int16x8_t m_h = vmovl_s8(vget_high_s8(m_s8));
+
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(ds_l), vget_low_s16(m_l));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(ds_l), vget_high_s16(m_l));
+ acc[2] = vmlal_s16(acc[2], vget_low_s16(ds_h), vget_low_s16(m_h));
+ acc[3] = vmlal_s16(acc[3], vget_high_s16(ds_h), vget_high_s16(m_h));
+
+ ds += 16;
+ m += 16;
+ N -= 16;
+ } while (N != 0);
+
+ int64x2_t sum = vpaddlq_s32(acc[0]);
+ sum = vpadalq_s32(sum, acc[1]);
+ sum = vpadalq_s32(sum, acc[2]);
+ sum = vpadalq_s32(sum, acc[3]);
+
+ return (horizontal_add_s64x2(sum) > limit);
+}
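The decision above reduces to a single mask-weighted sum:

// Scalar form of the sign decision:
//   sign = (sum over i of m[i] * ds[i]) > limit
// where ds[i] is the precomputed difference of squared residuals
// (a[i]^2 - b[i]^2, see av1_wedge_compute_delta_squares below) and limit is
// a caller-supplied threshold.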
+
+void av1_wedge_compute_delta_squares_neon(int16_t *d_ptr, const int16_t *a_ptr,
+ const int16_t *b_ptr, int N) {
+ do {
+ int16x8_t a = vld1q_s16(a_ptr);
+ int16x8_t b = vld1q_s16(b_ptr);
+
+ int32x4_t sq_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a));
+ int32x4_t sq_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a));
+
+ sq_lo = vmlsl_s16(sq_lo, vget_low_s16(b), vget_low_s16(b));
+ sq_hi = vmlsl_s16(sq_hi, vget_high_s16(b), vget_high_s16(b));
+
+ int16x8_t res = vcombine_s16(vqmovn_s32(sq_lo), vqmovn_s32(sq_hi));
+
+ vst1q_s16(d_ptr, res);
+
+ d_ptr += 8;
+ a_ptr += 8;
+ b_ptr += 8;
+ N -= 8;
+ } while (N != 0);
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
new file mode 100644
index 0000000000..6601c19ab3
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
@@ -0,0 +1,1885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[3];
+ bf1[1] = input[1] + input[2];
+ bf1[2] = -input[2] + input[1];
+ bf1[3] = -input[3] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[2];
+ bf1[2] = bf0[1];
+ bf1[3] = bf0[3];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
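Every half_btf call in these transforms is a fixed-point rotation butterfly: a weighted sum of two inputs rounded back down by cos_bit bits, where cospi[j] holds cos(j * pi / 128) scaled by 2^cos_bit. A sketch mirroring half_btf / round_shift from av1/common/av1_txfm.h:

static inline int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                                      int32_t in1, int bit) {
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  // round_shift: add half an LSB of the result, then shift down.
  return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
}

For example, stage 2 above computes bf1[0] = round(cos(pi / 4) * (bf0[0] + bf0[1])) in cos_bit fixed point, the DC basis of the 4-point DCT-II.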
+
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[7];
+ bf1[1] = input[1] + input[6];
+ bf1[2] = input[2] + input[5];
+ bf1[3] = input[3] + input[4];
+ bf1[4] = -input[4] + input[3];
+ bf1[5] = -input[5] + input[2];
+ bf1[6] = -input[6] + input[1];
+ bf1[7] = -input[7] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[4];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[6];
+ bf1[4] = bf0[1];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[3];
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[15];
+ bf1[1] = input[1] + input[14];
+ bf1[2] = input[2] + input[13];
+ bf1[3] = input[3] + input[12];
+ bf1[4] = input[4] + input[11];
+ bf1[5] = input[5] + input[10];
+ bf1[6] = input[6] + input[9];
+ bf1[7] = input[7] + input[8];
+ bf1[8] = -input[8] + input[7];
+ bf1[9] = -input[9] + input[6];
+ bf1[10] = -input[10] + input[5];
+ bf1[11] = -input[11] + input[4];
+ bf1[12] = -input[12] + input[3];
+ bf1[13] = -input[13] + input[2];
+ bf1[14] = -input[14] + input[1];
+ bf1[15] = -input[15] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[8];
+ bf1[2] = bf0[4];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[2];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[14];
+ bf1[8] = bf0[1];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[5];
+ bf1[11] = bf0[13];
+ bf1[12] = bf0[3];
+ bf1[13] = bf0[11];
+ bf1[14] = bf0[7];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[31];
+ bf1[1] = input[1] + input[30];
+ bf1[2] = input[2] + input[29];
+ bf1[3] = input[3] + input[28];
+ bf1[4] = input[4] + input[27];
+ bf1[5] = input[5] + input[26];
+ bf1[6] = input[6] + input[25];
+ bf1[7] = input[7] + input[24];
+ bf1[8] = input[8] + input[23];
+ bf1[9] = input[9] + input[22];
+ bf1[10] = input[10] + input[21];
+ bf1[11] = input[11] + input[20];
+ bf1[12] = input[12] + input[19];
+ bf1[13] = input[13] + input[18];
+ bf1[14] = input[14] + input[17];
+ bf1[15] = input[15] + input[16];
+ bf1[16] = -input[16] + input[15];
+ bf1[17] = -input[17] + input[14];
+ bf1[18] = -input[18] + input[13];
+ bf1[19] = -input[19] + input[12];
+ bf1[20] = -input[20] + input[11];
+ bf1[21] = -input[21] + input[10];
+ bf1[22] = -input[22] + input[9];
+ bf1[23] = -input[23] + input[8];
+ bf1[24] = -input[24] + input[7];
+ bf1[25] = -input[25] + input[6];
+ bf1[26] = -input[26] + input[5];
+ bf1[27] = -input[27] + input[4];
+ bf1[28] = -input[28] + input[3];
+ bf1[29] = -input[29] + input[2];
+ bf1[30] = -input[30] + input[1];
+ bf1[31] = -input[31] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[16];
+ bf1[2] = bf0[8];
+ bf1[3] = bf0[24];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[20];
+ bf1[6] = bf0[12];
+ bf1[7] = bf0[28];
+ bf1[8] = bf0[2];
+ bf1[9] = bf0[18];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[26];
+ bf1[12] = bf0[6];
+ bf1[13] = bf0[22];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[30];
+ bf1[16] = bf0[1];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[9];
+ bf1[19] = bf0[25];
+ bf1[20] = bf0[5];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[13];
+ bf1[23] = bf0[29];
+ bf1[24] = bf0[3];
+ bf1[25] = bf0[19];
+ bf1[26] = bf0[11];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[7];
+ bf1[29] = bf0[23];
+ bf1[30] = bf0[15];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
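+// The 4-point ADST does not use the butterfly pattern above: it is computed
+// directly from the sinpi_arr() fixed-point sine constants, with an early
+// exit when all four inputs are zero.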
+void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ int bit = cos_bit;
+ const int32_t *sinpi = sinpi_arr(bit);
+ int32_t x0, x1, x2, x3;
+ int32_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // stage 0
+ av1_range_check_buf(0, input, input, 4, stage_range[0]);
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]);
+ s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]);
+ s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]);
+ s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]);
+ s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]);
+ s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]);
+ s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]);
+ s7 = range_check_value(x0 + x1, stage_range[1]);
+
+ // stage 2
+ s7 = range_check_value(s7 - x3, stage_range[2]);
+
+ // stage 3
+ x0 = range_check_value(s0 + s2, bit + stage_range[3]);
+ x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]);
+ x2 = range_check_value(s1 - s3, bit + stage_range[3]);
+ x3 = range_check_value(s4, bit + stage_range[3]);
+
+ // stage 4
+ x0 = range_check_value(x0 + s5, bit + stage_range[4]);
+ x2 = range_check_value(x2 + s6, bit + stage_range[4]);
+
+ // stage 5
+ s0 = range_check_value(x0 + x3, bit + stage_range[5]);
+ s1 = range_check_value(x1, bit + stage_range[5]);
+ s2 = range_check_value(x2 - x3, bit + stage_range[5]);
+ s3 = range_check_value(x2 - x0, bit + stage_range[5]);
+
+ // stage 6
+ s3 = range_check_value(s3 + x3, bit + stage_range[6]);
+
+ // 1-D transform scaling factor is sqrt(2).
+ output[0] = round_shift(s0, bit);
+ output[1] = round_shift(s1, bit);
+ output[2] = round_shift(s2, bit);
+ output[3] = round_shift(s3, bit);
+ av1_range_check_buf(6, input, output, 4, stage_range[6]);
+}
+
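+// The 8- and 16-point ADSTs reuse the butterfly/ping-pong scheme of the
+// DCTs above, with ADST-specific sign flips in stage 1 and a final output
+// permutation that is not the DCT's bit-reversed order.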
+void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
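+ // Stage 1 reads input[] after it has started writing output[], so the
+ // transform cannot be computed in place.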
+ assert(output != input);
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[7];
+ bf1[2] = -input[3];
+ bf1[3] = input[4];
+ bf1[4] = -input[1];
+ bf1[5] = input[6];
+ bf1[6] = input[2];
+ bf1[7] = -input[5];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[6];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[4];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[2];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ assert(output != input);
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[15];
+ bf1[2] = -input[7];
+ bf1[3] = input[8];
+ bf1[4] = -input[3];
+ bf1[5] = input[12];
+ bf1[6] = input[4];
+ bf1[7] = -input[11];
+ bf1[8] = -input[1];
+ bf1[9] = input[14];
+ bf1[10] = input[6];
+ bf1[11] = -input[9];
+ bf1[12] = input[2];
+ bf1[13] = -input[13];
+ bf1[14] = -input[5];
+ bf1[15] = input[10];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = bf0[8] - bf0[10];
+ bf1[11] = bf0[9] - bf0[11];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = bf0[12] - bf0[14];
+ bf1[15] = bf0[13] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = bf0[8] - bf0[12];
+ bf1[13] = bf0[9] - bf0[13];
+ bf1[14] = bf0[10] - bf0[14];
+ bf1[15] = bf0[11] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = bf0[0] - bf0[8];
+ bf1[9] = bf0[1] - bf0[9];
+ bf1[10] = bf0[2] - bf0[10];
+ bf1[11] = bf0[3] - bf0[11];
+ bf1[12] = bf0[4] - bf0[12];
+ bf1[13] = bf0[5] - bf0[13];
+ bf1[14] = bf0[6] - bf0[14];
+ bf1[15] = bf0[7] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[14];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[8];
+ bf1[8] = bf0[9];
+ bf1[9] = bf0[6];
+ bf1[10] = bf0[11];
+ bf1[11] = bf0[4];
+ bf1[12] = bf0[13];
+ bf1[13] = bf0[2];
+ bf1[14] = bf0[15];
+ bf1[15] = bf0[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
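+// The identity "transforms" only rescale: by sqrt(2) for N = 4, by 2 for
+// N = 8, by 2 * sqrt(2) for N = 16 and by 4 for N = 32, i.e. by
+// sqrt(N / 2), matching the per-basis-vector gain of the other 1-D kernels.
+// NewSqrt2 is sqrt(2) in Q(NewSqrt2Bits) fixed point.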
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 4; ++i)
+ output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
+ assert(stage_range[0] + NewSqrt2Bits <= 32);
+ av1_range_check_buf(0, input, output, 4, stage_range[0]);
+}
+
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 8; ++i) output[i] = input[i] * 2;
+ av1_range_check_buf(0, input, output, 8, stage_range[0]);
+}
+
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 16; ++i)
+ output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
+ assert(stage_range[0] + NewSqrt2Bits <= 32);
+ av1_range_check_buf(0, input, output, 16, stage_range[0]);
+}
+
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
+ av1_range_check_buf(0, input, output, 32, stage_range[0]);
+}
+
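+// The 64-point DCT extends the same construction through eleven stages;
+// as above, its final stage emits coefficients in bit-reversed order.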
+void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 64;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+ // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[63];
+ bf1[1] = input[1] + input[62];
+ bf1[2] = input[2] + input[61];
+ bf1[3] = input[3] + input[60];
+ bf1[4] = input[4] + input[59];
+ bf1[5] = input[5] + input[58];
+ bf1[6] = input[6] + input[57];
+ bf1[7] = input[7] + input[56];
+ bf1[8] = input[8] + input[55];
+ bf1[9] = input[9] + input[54];
+ bf1[10] = input[10] + input[53];
+ bf1[11] = input[11] + input[52];
+ bf1[12] = input[12] + input[51];
+ bf1[13] = input[13] + input[50];
+ bf1[14] = input[14] + input[49];
+ bf1[15] = input[15] + input[48];
+ bf1[16] = input[16] + input[47];
+ bf1[17] = input[17] + input[46];
+ bf1[18] = input[18] + input[45];
+ bf1[19] = input[19] + input[44];
+ bf1[20] = input[20] + input[43];
+ bf1[21] = input[21] + input[42];
+ bf1[22] = input[22] + input[41];
+ bf1[23] = input[23] + input[40];
+ bf1[24] = input[24] + input[39];
+ bf1[25] = input[25] + input[38];
+ bf1[26] = input[26] + input[37];
+ bf1[27] = input[27] + input[36];
+ bf1[28] = input[28] + input[35];
+ bf1[29] = input[29] + input[34];
+ bf1[30] = input[30] + input[33];
+ bf1[31] = input[31] + input[32];
+ bf1[32] = -input[32] + input[31];
+ bf1[33] = -input[33] + input[30];
+ bf1[34] = -input[34] + input[29];
+ bf1[35] = -input[35] + input[28];
+ bf1[36] = -input[36] + input[27];
+ bf1[37] = -input[37] + input[26];
+ bf1[38] = -input[38] + input[25];
+ bf1[39] = -input[39] + input[24];
+ bf1[40] = -input[40] + input[23];
+ bf1[41] = -input[41] + input[22];
+ bf1[42] = -input[42] + input[21];
+ bf1[43] = -input[43] + input[20];
+ bf1[44] = -input[44] + input[19];
+ bf1[45] = -input[45] + input[18];
+ bf1[46] = -input[46] + input[17];
+ bf1[47] = -input[47] + input[16];
+ bf1[48] = -input[48] + input[15];
+ bf1[49] = -input[49] + input[14];
+ bf1[50] = -input[50] + input[13];
+ bf1[51] = -input[51] + input[12];
+ bf1[52] = -input[52] + input[11];
+ bf1[53] = -input[53] + input[10];
+ bf1[54] = -input[54] + input[9];
+ bf1[55] = -input[55] + input[8];
+ bf1[56] = -input[56] + input[7];
+ bf1[57] = -input[57] + input[6];
+ bf1[58] = -input[58] + input[5];
+ bf1[59] = -input[59] + input[4];
+ bf1[60] = -input[60] + input[3];
+ bf1[61] = -input[61] + input[2];
+ bf1[62] = -input[62] + input[1];
+ bf1[63] = -input[63] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = -bf0[16] + bf0[15];
+ bf1[17] = -bf0[17] + bf0[14];
+ bf1[18] = -bf0[18] + bf0[13];
+ bf1[19] = -bf0[19] + bf0[12];
+ bf1[20] = -bf0[20] + bf0[11];
+ bf1[21] = -bf0[21] + bf0[10];
+ bf1[22] = -bf0[22] + bf0[9];
+ bf1[23] = -bf0[23] + bf0[8];
+ bf1[24] = -bf0[24] + bf0[7];
+ bf1[25] = -bf0[25] + bf0[6];
+ bf1[26] = -bf0[26] + bf0[5];
+ bf1[27] = -bf0[27] + bf0[4];
+ bf1[28] = -bf0[28] + bf0[3];
+ bf1[29] = -bf0[29] + bf0[2];
+ bf1[30] = -bf0[30] + bf0[1];
+ bf1[31] = -bf0[31] + bf0[0];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
+ bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[47];
+ bf1[33] = bf0[33] + bf0[46];
+ bf1[34] = bf0[34] + bf0[45];
+ bf1[35] = bf0[35] + bf0[44];
+ bf1[36] = bf0[36] + bf0[43];
+ bf1[37] = bf0[37] + bf0[42];
+ bf1[38] = bf0[38] + bf0[41];
+ bf1[39] = bf0[39] + bf0[40];
+ bf1[40] = -bf0[40] + bf0[39];
+ bf1[41] = -bf0[41] + bf0[38];
+ bf1[42] = -bf0[42] + bf0[37];
+ bf1[43] = -bf0[43] + bf0[36];
+ bf1[44] = -bf0[44] + bf0[35];
+ bf1[45] = -bf0[45] + bf0[34];
+ bf1[46] = -bf0[46] + bf0[33];
+ bf1[47] = -bf0[47] + bf0[32];
+ bf1[48] = -bf0[48] + bf0[63];
+ bf1[49] = -bf0[49] + bf0[62];
+ bf1[50] = -bf0[50] + bf0[61];
+ bf1[51] = -bf0[51] + bf0[60];
+ bf1[52] = -bf0[52] + bf0[59];
+ bf1[53] = -bf0[53] + bf0[58];
+ bf1[54] = -bf0[54] + bf0[57];
+ bf1[55] = -bf0[55] + bf0[56];
+ bf1[56] = bf0[56] + bf0[55];
+ bf1[57] = bf0[57] + bf0[54];
+ bf1[58] = bf0[58] + bf0[53];
+ bf1[59] = bf0[59] + bf0[52];
+ bf1[60] = bf0[60] + bf0[51];
+ bf1[61] = bf0[61] + bf0[50];
+ bf1[62] = bf0[62] + bf0[49];
+ bf1[63] = bf0[63] + bf0[48];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
+ bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
+ bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[39];
+ bf1[33] = bf0[33] + bf0[38];
+ bf1[34] = bf0[34] + bf0[37];
+ bf1[35] = bf0[35] + bf0[36];
+ bf1[36] = -bf0[36] + bf0[35];
+ bf1[37] = -bf0[37] + bf0[34];
+ bf1[38] = -bf0[38] + bf0[33];
+ bf1[39] = -bf0[39] + bf0[32];
+ bf1[40] = -bf0[40] + bf0[47];
+ bf1[41] = -bf0[41] + bf0[46];
+ bf1[42] = -bf0[42] + bf0[45];
+ bf1[43] = -bf0[43] + bf0[44];
+ bf1[44] = bf0[44] + bf0[43];
+ bf1[45] = bf0[45] + bf0[42];
+ bf1[46] = bf0[46] + bf0[41];
+ bf1[47] = bf0[47] + bf0[40];
+ bf1[48] = bf0[48] + bf0[55];
+ bf1[49] = bf0[49] + bf0[54];
+ bf1[50] = bf0[50] + bf0[53];
+ bf1[51] = bf0[51] + bf0[52];
+ bf1[52] = -bf0[52] + bf0[51];
+ bf1[53] = -bf0[53] + bf0[50];
+ bf1[54] = -bf0[54] + bf0[49];
+ bf1[55] = -bf0[55] + bf0[48];
+ bf1[56] = -bf0[56] + bf0[63];
+ bf1[57] = -bf0[57] + bf0[62];
+ bf1[58] = -bf0[58] + bf0[61];
+ bf1[59] = -bf0[59] + bf0[60];
+ bf1[60] = bf0[60] + bf0[59];
+ bf1[61] = bf0[61] + bf0[58];
+ bf1[62] = bf0[62] + bf0[57];
+ bf1[63] = bf0[63] + bf0[56];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
+ bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
+ bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[35];
+ bf1[33] = bf0[33] + bf0[34];
+ bf1[34] = -bf0[34] + bf0[33];
+ bf1[35] = -bf0[35] + bf0[32];
+ bf1[36] = -bf0[36] + bf0[39];
+ bf1[37] = -bf0[37] + bf0[38];
+ bf1[38] = bf0[38] + bf0[37];
+ bf1[39] = bf0[39] + bf0[36];
+ bf1[40] = bf0[40] + bf0[43];
+ bf1[41] = bf0[41] + bf0[42];
+ bf1[42] = -bf0[42] + bf0[41];
+ bf1[43] = -bf0[43] + bf0[40];
+ bf1[44] = -bf0[44] + bf0[47];
+ bf1[45] = -bf0[45] + bf0[46];
+ bf1[46] = bf0[46] + bf0[45];
+ bf1[47] = bf0[47] + bf0[44];
+ bf1[48] = bf0[48] + bf0[51];
+ bf1[49] = bf0[49] + bf0[50];
+ bf1[50] = -bf0[50] + bf0[49];
+ bf1[51] = -bf0[51] + bf0[48];
+ bf1[52] = -bf0[52] + bf0[55];
+ bf1[53] = -bf0[53] + bf0[54];
+ bf1[54] = bf0[54] + bf0[53];
+ bf1[55] = bf0[55] + bf0[52];
+ bf1[56] = bf0[56] + bf0[59];
+ bf1[57] = bf0[57] + bf0[58];
+ bf1[58] = -bf0[58] + bf0[57];
+ bf1[59] = -bf0[59] + bf0[56];
+ bf1[60] = -bf0[60] + bf0[63];
+ bf1[61] = -bf0[61] + bf0[62];
+ bf1[62] = bf0[62] + bf0[61];
+ bf1[63] = bf0[63] + bf0[60];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
+ bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
+ bf1[32] = bf0[32] + bf0[33];
+ bf1[33] = -bf0[33] + bf0[32];
+ bf1[34] = -bf0[34] + bf0[35];
+ bf1[35] = bf0[35] + bf0[34];
+ bf1[36] = bf0[36] + bf0[37];
+ bf1[37] = -bf0[37] + bf0[36];
+ bf1[38] = -bf0[38] + bf0[39];
+ bf1[39] = bf0[39] + bf0[38];
+ bf1[40] = bf0[40] + bf0[41];
+ bf1[41] = -bf0[41] + bf0[40];
+ bf1[42] = -bf0[42] + bf0[43];
+ bf1[43] = bf0[43] + bf0[42];
+ bf1[44] = bf0[44] + bf0[45];
+ bf1[45] = -bf0[45] + bf0[44];
+ bf1[46] = -bf0[46] + bf0[47];
+ bf1[47] = bf0[47] + bf0[46];
+ bf1[48] = bf0[48] + bf0[49];
+ bf1[49] = -bf0[49] + bf0[48];
+ bf1[50] = -bf0[50] + bf0[51];
+ bf1[51] = bf0[51] + bf0[50];
+ bf1[52] = bf0[52] + bf0[53];
+ bf1[53] = -bf0[53] + bf0[52];
+ bf1[54] = -bf0[54] + bf0[55];
+ bf1[55] = bf0[55] + bf0[54];
+ bf1[56] = bf0[56] + bf0[57];
+ bf1[57] = -bf0[57] + bf0[56];
+ bf1[58] = -bf0[58] + bf0[59];
+ bf1[59] = bf0[59] + bf0[58];
+ bf1[60] = bf0[60] + bf0[61];
+ bf1[61] = -bf0[61] + bf0[60];
+ bf1[62] = -bf0[62] + bf0[63];
+ bf1[63] = bf0[63] + bf0[62];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
+ bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
+ bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
+ bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
+ bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
+ bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
+ bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
+ bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
+ bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
+ bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
+ bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
+ bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
+ bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
+ bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
+ bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
+ bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
+ bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
+ bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
+ bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
+ bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
+ bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
+ bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
+ bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
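+  // Reorder the butterfly results into natural coefficient order via a
+  // 6-bit bit-reversal permutation: output[k] = step[bitrev6(k)].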
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[32];
+ bf1[2] = bf0[16];
+ bf1[3] = bf0[48];
+ bf1[4] = bf0[8];
+ bf1[5] = bf0[40];
+ bf1[6] = bf0[24];
+ bf1[7] = bf0[56];
+ bf1[8] = bf0[4];
+ bf1[9] = bf0[36];
+ bf1[10] = bf0[20];
+ bf1[11] = bf0[52];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[44];
+ bf1[14] = bf0[28];
+ bf1[15] = bf0[60];
+ bf1[16] = bf0[2];
+ bf1[17] = bf0[34];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[50];
+ bf1[20] = bf0[10];
+ bf1[21] = bf0[42];
+ bf1[22] = bf0[26];
+ bf1[23] = bf0[58];
+ bf1[24] = bf0[6];
+ bf1[25] = bf0[38];
+ bf1[26] = bf0[22];
+ bf1[27] = bf0[54];
+ bf1[28] = bf0[14];
+ bf1[29] = bf0[46];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[62];
+ bf1[32] = bf0[1];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[17];
+ bf1[35] = bf0[49];
+ bf1[36] = bf0[9];
+ bf1[37] = bf0[41];
+ bf1[38] = bf0[25];
+ bf1[39] = bf0[57];
+ bf1[40] = bf0[5];
+ bf1[41] = bf0[37];
+ bf1[42] = bf0[21];
+ bf1[43] = bf0[53];
+ bf1[44] = bf0[13];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[29];
+ bf1[47] = bf0[61];
+ bf1[48] = bf0[3];
+ bf1[49] = bf0[35];
+ bf1[50] = bf0[19];
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[11];
+ bf1[53] = bf0[43];
+ bf1[54] = bf0[27];
+ bf1[55] = bf0[59];
+ bf1[56] = bf0[7];
+ bf1[57] = bf0[39];
+ bf1[58] = bf0[23];
+ bf1[59] = bf0[55];
+ bf1[60] = bf0[15];
+ bf1[61] = bf0[47];
+ bf1[62] = bf0[31];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
new file mode 100644
index 0000000000..9ef54fe4de
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
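+// 1-D forward transform kernels. Each transforms one column or row of
+// `input` into `output` at the precision given by cos_bit, with per-stage
+// intermediate range bounds supplied in stage_range.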
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
new file mode 100644
index 0000000000..2777cc25bc
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#include "av1/common/enums.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+extern const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL];
+extern const int8_t av1_fwd_cos_bit_col[5][5];
+extern const int8_t av1_fwd_cos_bit_row[5][5];
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm2d.c b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
new file mode 100644
index 0000000000..12a9535a7c
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT4: return av1_fdct4;
+ case TXFM_TYPE_DCT8: return av1_fdct8;
+ case TXFM_TYPE_DCT16: return av1_fdct16;
+ case TXFM_TYPE_DCT32: return av1_fdct32;
+ case TXFM_TYPE_DCT64: return av1_fdct64;
+ case TXFM_TYPE_ADST4: return av1_fadst4;
+ case TXFM_TYPE_ADST8: return av1_fadst8;
+ case TXFM_TYPE_ADST16: return av1_fadst16;
+ case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c;
+ case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
+ case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
+ case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c;
+ default: assert(0); return NULL;
+ }
+}
+
+void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, int bd) {
+  // cfg->shift holds the per-tx-size round-shift amounts (see
+  // av1_fwd_txfm_shift_ls).
+ const int8_t *shift = cfg->shift;
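+  // stage_range_col/row[] bound the bit width of intermediate values at each
+  // 1-D transform stage; av1_range_check_buf() checks buffers against them.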
+  // The i < MAX_TXFM_STAGE_NUM bound quiets the compiler's array-bounds
+  // warning.
+ for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
+ stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1;
+ }
+
+  // The i < MAX_TXFM_STAGE_NUM bound quiets the compiler's array-bounds
+  // warning.
+ for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+ stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1;
+ }
+}
+
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+ const int stride, const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *buf, int bd) {
+ int c, r;
+  // Note: txfm_size_col is the transform width (the length of each row
+  // transform) and txfm_size_row is the transform height (the length of
+  // each column transform); the two differ only for rectangular sizes.
+ const int txfm_size_col = tx_size_wide[cfg->tx_size];
+ const int txfm_size_row = tx_size_high[cfg->tx_size];
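+  // e.g. TX_4X8: txfm_size_col = 4 and txfm_size_row = 8, i.e. an 8-point
+  // column transform followed by a 4-point row transform.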
+  // cfg->shift holds the per-tx-size round-shift amounts (see
+  // av1_fwd_txfm_shift_ls).
+ const int8_t *shift = cfg->shift;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
+ assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
+ av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd);
+
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ // use output buffer as temp buffer
+ int32_t *temp_in = output;
+ int32_t *temp_out = output + txfm_size_row;
+
+ // Columns
+ for (c = 0; c < txfm_size_col; ++c) {
+ if (cfg->ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c];
+ } else {
+ for (r = 0; r < txfm_size_row; ++r)
+ // flip upside down
+ temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
+ }
+ av1_round_shift_array(temp_in, txfm_size_row, -shift[0]);
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+ if (cfg->lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ buf[r * txfm_size_col + c] = temp_out[r];
+ } else {
+ for (r = 0; r < txfm_size_row; ++r)
+ // flip from left to right
+ buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
+ }
+ }
+
+ DECLARE_ALIGNED(16, int32_t, row_buffer[MAX_TX_SIZE]);
+
+ // Rows
+ for (r = 0; r < txfm_size_row; ++r) {
+ txfm_func_row(buf + r * txfm_size_col, row_buffer, cos_bit_row,
+ stage_range_row);
+ av1_round_shift_array(row_buffer, txfm_size_col, -shift[2]);
+ if (abs(rect_type) == 1) {
+ // Multiply everything by Sqrt2 if the transform is rectangular and the
+ // size difference is a factor of 2.
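+      // NewSqrt2 / (1 << NewSqrt2Bits) = 5793 / 4096 ~= sqrt(2).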
+ for (c = 0; c < txfm_size_col; ++c) {
+ row_buffer[c] =
+ round_shift((int64_t)row_buffer[c] * NewSqrt2, NewSqrt2Bits);
+ }
+ }
+ for (c = 0; c < txfm_size_col; ++c) {
+ output[c * txfm_size_row + r] = row_buffer[c];
+ }
+ }
+}
+
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[8 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[4 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[8 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 32];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 64];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+
+ // Zero out top-right 32x32 area.
+ for (int col = 0; col < 32; ++col) {
+ memset(output + col * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Zero out the bottom 64x32 area.
+ memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
+ // Re-pack non-zero coeffs in the first 32x32 indices.
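+  // The block at col 0 is already in place, so copying starts at col 1.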
+ for (int col = 1; col < 32; ++col) {
+ memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output));
+ }
+}
+
+void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out right 32x32 area.
+ for (int col = 0; col < 32; ++col) {
+ memset(output + col * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int col = 1; col < 32; ++col) {
+ memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output));
+ }
+}
+
+void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 32];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out the bottom 32x32 area.
+ memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
+ // Note: no repacking needed here.
+}
+
+void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out right 32x16 area.
+ for (int row = 0; row < 16; ++row) {
+ memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Re-pack non-zero coeffs in the first 32x16 indices.
+ for (int row = 1; row < 16; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+ }
+}
+
+void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+ // Note: no repacking needed here.
+}
+
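+// fwd_shift_WxH[0..2] are the signed shifts applied to the coefficients:
+// [0] before the column transform, [1] between the column and row
+// transforms, [2] after the row transform. Positive values shift up,
+// negative values round-shift down (see fwd_txfm2d_c above).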
+static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 };
+static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 };
+static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 };
+static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 };
+
+const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL] = {
+ fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
+ fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16,
+ fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
+ fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32,
+ fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
+};
+
+const int8_t av1_fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = {
+ { 13, 13, 13, 0, 0 },
+ { 13, 13, 13, 12, 0 },
+ { 13, 13, 13, 12, 13 },
+ { 0, 13, 13, 12, 13 },
+ { 0, 0, 13, 12, 13 }
+ };
+
+const int8_t av1_fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = {
+ { 13, 13, 12, 0, 0 },
+ { 13, 13, 13, 12, 0 },
+ { 13, 13, 12, 13, 12 },
+ { 0, 12, 13, 12, 11 },
+ { 0, 0, 12, 11, 10 }
+ };
+
+static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 };
+static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 };
+static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 };
+static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 };
+static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10,
+ 11, 11, 11, 11, 11, 11 };
+
+static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 };
+static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 };
+static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 };
+
+static const int8_t fidtx4_range_mult2[1] = { 1 };
+static const int8_t fidtx8_range_mult2[1] = { 2 };
+static const int8_t fidtx16_range_mult2[1] = { 3 };
+static const int8_t fidtx32_range_mult2[1] = { 4 };
+
+static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {
+ fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2,
+ fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2,
+ fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2,
+ fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2
+};
+
+static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) {
+ av1_zero(cfg->stage_range_col);
+ av1_zero(cfg->stage_range_row);
+
+ const int8_t *const range_mult2_col =
+ fwd_txfm_range_mult2_list[cfg->txfm_type_col];
+ const int stage_num_col = cfg->stage_num_col;
+ // i < MAX_TXFM_STAGE_NUM will quiet -Wstringop-overflow.
+ for (int i = 0; i < stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i)
+ cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1;
+
+ const int8_t *const range_mult2_row =
+ fwd_txfm_range_mult2_list[cfg->txfm_type_row];
+ const int stage_num_row = cfg->stage_num_row;
+ // i < MAX_TXFM_STAGE_NUM will quiet -Wstringop-overflow.
+ for (int i = 0; i < stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+ cfg->stage_range_row[i] =
+ (range_mult2_col[stage_num_col - 1] + range_mult2_row[i] + 1) >> 1;
+ }
+}
+
+void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+ TXFM_2D_FLIP_CFG *cfg) {
+ assert(cfg != NULL);
+ cfg->tx_size = tx_size;
+ set_flip_cfg(tx_type, cfg);
+ const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+ const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ cfg->shift = av1_fwd_txfm_shift_ls[tx_size];
+ cfg->cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ cfg->cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
+ assert(cfg->txfm_type_col != TXFM_TYPE_INVALID);
+ cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
+ assert(cfg->txfm_type_row != TXFM_TYPE_INVALID);
+ cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
+ cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
+ set_fwd_txfm_non_scale_range(cfg);
+}
diff --git a/third_party/aom/av1/encoder/av1_ml_partition_models.h b/third_party/aom/av1/encoder/av1_ml_partition_models.h
new file mode 100644
index 0000000000..2572b138d5
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_ml_partition_models.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
+#define AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// TODO(kyslov): Replace with proper weights after training AV1 models
+
+#define FEATURES 6
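+// Each model below is a two-layer perceptron: FEATURES inputs, one hidden
+// layer of 8 nodes, and a single output. The *_means/*_vars arrays hold
+// per-feature statistics used to normalize the inputs before inference.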
+static const float av1_var_part_nn_weights_64_layer0[FEATURES * 8] = {
+ 0.35755366f, 0.86281112f, -0.20871686f, 0.0409634f, 0.97305766f,
+ 0.75510254f, 0.04860447f, 0.77095283f, -0.44105278f, -0.3755049f,
+ -0.08456618f, 1.1821136f, -0.73956301f, 1.30016453f, 0.45566902f,
+ 0.4742967f, 0.44213975f, 0.4876028f, 0.26720522f, -0.34429858f,
+ -0.25148252f, -0.49623932f, -0.46747941f, -0.36656624f, 0.10213375f,
+ 0.60262819f, -0.54788715f, -0.27272022f, 1.0995462f, -0.36338376f,
+ -0.64836313f, 0.16057039f, 1.02782791f, 0.9985311f, 0.90607883f,
+ 0.80570411f, -0.07750863f, -0.74006402f, 1.72839526f, 1.72355343f,
+ 1.69288916f, 1.59102043f, 0.14140216f, -1.47262839f, 0.4262519f,
+ -0.33805936f, -0.02449707f, 0.67203692f
+};
+
+static const float av1_var_part_nn_bias_64_layer0[8] = {
+ 0.39995694f, 0.65593756f, 1.12876737f, 1.28790576f,
+ 0.53468556f, 0.3177908f, -0.74388266f, -1.81131248f
+};
+
+static const float av1_var_part_nn_weights_64_layer1[8] = {
+ -1.31174053f, 0.69696917f, 0.78721456f, 0.45326379f,
+ 0.79258322f, 1.74626188f, -5.41831f, 3.33887435f
+};
+
+static const float av1_var_part_nn_bias_64_layer1[1] = { -0.90951047f };
+
+static const float av1_var_part_means_64[FEATURES] = {
+ 5.36750249f, 11.58023127f, 0.25550964f, 0.23809917f, 0.24650665f, 0.22117687f
+};
+static const float av1_var_part_vars_64[FEATURES] = {
+ 0.89599769f, 2.2686018f, 0.02568608f, 0.02523411f, 0.02443085f, 0.01922085f
+};
+
+static const NN_CONFIG av1_var_part_nnconfig_64 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ av1_var_part_nn_weights_64_layer0,
+ av1_var_part_nn_weights_64_layer1,
+ },
+ {
+ av1_var_part_nn_bias_64_layer0,
+ av1_var_part_nn_bias_64_layer1,
+ },
+};
+
+static const float av1_var_part_nn_weights_32_layer0[FEATURES * 8] = {
+ 0.97886049f, -1.66262011f, 0.94902798f, 0.7080922f, 0.91181186f,
+ 0.35222601f, -0.04428585f, 0.42086472f, -0.0206325f, -0.77937809f,
+ -0.70947522f, -1.24463119f, 0.23739497f, -1.34327359f, 0.01024804f,
+ 0.4544633f, -0.96907661f, 0.67279522f, 0.23180693f, 1.54063368f,
+ -0.15700707f, 0.18597331f, 0.34167589f, 0.40736558f, 0.69213366f,
+ -1.33584593f, 1.21190814f, 1.26725267f, 1.21284802f, 1.26611399f,
+ 0.17546514f, -0.30248399f, -1.32589316f, -1.37432674f, -1.37423023f,
+ -1.26890855f, 0.12166347f, -0.94565678f, -1.47475267f, -0.69279948f,
+ -0.10166587f, -0.23489881f, 0.57123565f, 0.80051137f, -1.28411946f,
+ -1.36576732f, -1.30257508f, -1.30575106f
+};
+
+static const float av1_var_part_nn_bias_32_layer0[8] = {
+ -1.6301435f, 0.61879037f, -1.68612662f, 1.66960165f,
+ -0.0838243f, 0.32253287f, -0.65755282f, 0.96661531f
+};
+
+static const float av1_var_part_nn_weights_32_layer1[8] = {
+ 1.99257161f, 0.7331492f, 1.33539961f, 1.13501456f,
+ -2.21154528f, 1.85858542f, -0.85565298f, -1.96410246f
+};
+
+static const float av1_var_part_nn_bias_32_layer1[1] = { -0.14880827f };
+
+static const float av1_var_part_means_32[FEATURES] = {
+ 5.36360686f, 9.88421868f, 0.23543671f, 0.23621205f, 0.23409667f, 0.22855539f
+};
+
+static const float av1_var_part_vars_32[FEATURES] = {
+ 0.89077225f, 2.32312894f, 0.02167654f, 0.02392842f, 0.02466495f, 0.02047641f
+};
+
+static const NN_CONFIG av1_var_part_nnconfig_32 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ av1_var_part_nn_weights_32_layer0,
+ av1_var_part_nn_weights_32_layer1,
+ },
+ {
+ av1_var_part_nn_bias_32_layer0,
+ av1_var_part_nn_bias_32_layer1,
+ },
+};
+
+static const float av1_var_part_nn_weights_16_layer0[FEATURES * 8] = {
+ 0.45118305f, -0.22068295f, 0.4604435f, -0.1446326f, -0.15765035f,
+ 0.42260198f, -0.0945916f, 0.49544996f, 0.62781567f, -0.41564372f,
+ -0.39103292f, 0.44407624f, 0.48382613f, -0.85424238f, -0.00961433f,
+ 0.25383582f, 0.14403897f, 0.00901859f, -0.83201967f, -0.19323284f,
+ 0.59271213f, 0.69487457f, 0.6897112f, 0.62768521f, 0.9204492f,
+ -1.42448347f, -0.16491054f, -0.10114424f, -0.1069687f, -0.11289049f,
+ 0.26290832f, -0.41850393f, 0.17239733f, 0.41770622f, 0.43725942f,
+ 0.19362467f, -0.35955731f, -0.899446f, 0.49726389f, 0.66569571f,
+ 0.65893982f, 0.53199654f, -0.1158694f, -0.26472603f, 0.4155923f,
+ 0.15059544f, 0.09596755f, 0.26247133f
+};
+
+static const float av1_var_part_nn_bias_16_layer0[8] = {
+ 1.64486321f, -0.11851574f, 1.29322833f, -0.61193136f,
+ 0.33027532f, 1.04197232f, -0.80716674f, 0.88681233f
+};
+
+static const float av1_var_part_nn_weights_16_layer1[8] = {
+ -1.02832118f, 0.72800106f, -0.42904783f, 1.44490586f,
+ -1.03888227f, -0.9023916f, -1.51543102f, -0.43059521f
+};
+
+static const float av1_var_part_nn_bias_16_layer1[1] = { -0.85087946f };
+
+static const float av1_var_part_means_16[FEATURES] = {
+ 5.32551326f, 8.218448f, 0.21954822f, 0.22808377f, 0.23019798f, 0.22320699f
+};
+
+static const float av1_var_part_vars_16[FEATURES] = { 0.86806032f, 2.39938956f,
+ 0.01958579f, 0.02437927f,
+ 0.02420755f, 0.0192003f };
+
+static const NN_CONFIG av1_var_part_nnconfig_16 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ av1_var_part_nn_weights_16_layer0,
+ av1_var_part_nn_weights_16_layer1,
+ },
+ {
+ av1_var_part_nn_bias_16_layer0,
+ av1_var_part_nn_bias_16_layer1,
+ },
+};
+
+#undef FEATURES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
diff --git a/third_party/aom/av1/encoder/av1_noise_estimate.c b/third_party/aom/av1/encoder/av1_noise_estimate.c
new file mode 100644
index 0000000000..25007bb6d4
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_noise_estimate.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/encoder.h"
+#if CONFIG_AV1_TEMPORAL_DENOISING
+#include "av1/encoder/av1_temporal_denoiser.h"
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// For SVC: only do noise estimation on top spatial layer.
+static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) {
+ return (!cpi->ppi->use_svc ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
+}
+#endif
+
+void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
+ const int64_t area = (int64_t)width * height;
+ ne->enabled = 0;
+ ne->level = (area < 1280 * 720) ? kLowLow : kLow;
+ ne->value = 0;
+ ne->count = 0;
+ ne->thresh = 90;
+ ne->last_w = 0;
+ ne->last_h = 0;
+ if (area >= 1920 * 1080) {
+ ne->thresh = 200;
+ } else if (area >= 1280 * 720) {
+ ne->thresh = 140;
+ } else if (area >= 640 * 360) {
+ ne->thresh = 115;
+ }
+ ne->num_frames_estimate = 15;
+ ne->adapt_thresh = (3 * ne->thresh) >> 1;
+}
+
+static int enable_noise_estimation(AV1_COMP *const cpi) {
+ const int resize_pending = is_frame_resize_pending(cpi);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cpi->common.seq_params->use_highbitdepth) return 0;
+#endif
+// Enable noise estimation if denoising is on.
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+ cpi->common.width >= 320 && cpi->common.height >= 180)
+ return 1;
+#endif
+  // Only allow noise estimation under certain encoding modes:
+  // 1-pass CBR with cyclic-refresh AQ at speed >= 5, no resize pending,
+  // resolution at least 640x360, and neither SVC nor screen content.
+ if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
+ resize_pending == 0 && !cpi->ppi->use_svc &&
+ cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ cpi->common.width * cpi->common.height >= 640 * 360)
+ return 1;
+ else
+ return 0;
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+ const YV12_BUFFER_CONFIG *const src) {
+ const uint8_t *srcbuf = src->y_buffer;
+ uint8_t *destbuf = dest->y_buffer;
+
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+
+ for (int r = 0; r < dest->y_height; ++r) {
+ memcpy(destbuf, srcbuf, dest->y_width);
+ destbuf += dest->y_stride;
+ srcbuf += src->y_stride;
+ }
+}
+#endif // CONFIG_AV1_TEMPORAL_DENOISING
+
+NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
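+  // Map the running noise value onto a level using thresh:
+  //   value > 2 * thresh -> kHigh, value > thresh -> kMedium,
+  //   value > thresh / 2 -> kLow, otherwise kLowLow.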
+ int noise_level = kLowLow;
+ if (ne->value > (ne->thresh << 1)) {
+ noise_level = kHigh;
+ } else {
+ if (ne->value > ne->thresh)
+ noise_level = kMedium;
+ else if (ne->value > (ne->thresh >> 1))
+ noise_level = kLow;
+ else
+ noise_level = kLowLow;
+ }
+ return noise_level;
+}
+
+void av1_update_noise_estimate(AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+ // Estimate of noise level every frame_period frames.
+ int frame_period = 8;
+ int thresh_consec_zeromv = 2;
+ int frame_counter = cm->current_frame.frame_number;
+ // Estimate is between current source and last source.
+ YV12_BUFFER_CONFIG *last_source = cpi->last_source;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) {
+ last_source = &cpi->denoiser.last_source;
+ // Tune these thresholds for different resolutions when denoising is
+ // enabled.
+ if (cm->width > 640 && cm->width <= 1920) {
+ thresh_consec_zeromv = 2;
+ }
+ }
+#endif
+ ne->enabled = enable_noise_estimation(cpi);
+ if (cpi->svc.number_spatial_layers > 1)
+ frame_counter = cpi->svc.current_superframe;
+ if (!ne->enabled || frame_counter % frame_period != 0 ||
+ last_source == NULL ||
+ (cpi->svc.number_spatial_layers == 1 &&
+ (ne->last_w != cm->width || ne->last_h != cm->height))) {
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ copy_frame(&cpi->denoiser.last_source, cpi->source);
+#endif
+ if (last_source != NULL) {
+ ne->last_w = cm->width;
+ ne->last_h = cm->height;
+ }
+ return;
+ } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 &&
+ cpi->rc.frames_since_key > cpi->svc.number_spatial_layers &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+ cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) {
+ // Force noise estimation to 0 and denoiser off if content has high motion.
+ ne->level = kLowLow;
+ ne->count = 0;
+ ne->num_frames_estimate = 10;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+ cpi->svc.current_superframe > 1) {
+ av1_denoiser_set_noise_level(cpi, ne->level);
+ copy_frame(&cpi->denoiser.last_source, cpi->source);
+ }
+#endif
+ return;
+ } else {
+ unsigned int bin_size = 100;
+ unsigned int hist[MAX_VAR_HIST_BINS] = { 0 };
+ unsigned int hist_avg[MAX_VAR_HIST_BINS];
+ unsigned int max_bin = 0;
+ unsigned int max_bin_count = 0;
+ unsigned int bin_cnt;
+ BLOCK_SIZE bsize = BLOCK_16X16;
+    // Loop over a sub-sample of the frame's 16x16 blocks and, for blocks
+    // that have been encoded with zero/small motion for at least
+    // thresh_consec_zeromv consecutive frames, compute the variance between
+    // the current and last source to update the noise estimate.
+ const uint8_t *src_y = cpi->source->y_buffer;
+ const int src_ystride = cpi->source->y_stride;
+ const uint8_t *last_src_y = last_source->y_buffer;
+ const int last_src_ystride = last_source->y_stride;
+ int mi_row, mi_col;
+ int num_low_motion = 0;
+ int frame_low_motion = 1;
+ for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += 2) {
+ for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += 2) {
+ int bl_index =
+ (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1);
+ if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+ num_low_motion++;
+ }
+ }
+ if (num_low_motion <
+ (((3 * (mi_params->mi_rows * mi_params->mi_cols) >> 2)) >> 3))
+ frame_low_motion = 0;
+ for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) {
+ // 16x16 blocks, 1/4 sample of frame.
+ if (mi_row % 8 == 0 && mi_col % 8 == 0 &&
+ mi_row < mi_params->mi_rows - 3 &&
+ mi_col < mi_params->mi_cols - 3) {
+ int bl_index =
+ (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1);
+ int bl_index1 = bl_index + 1;
+ int bl_index2 = bl_index + (mi_params->mi_cols >> 1);
+ int bl_index3 = bl_index2 + 1;
+ int consec_zeromv =
+ AOMMIN(cpi->consec_zero_mv[bl_index],
+ AOMMIN(cpi->consec_zero_mv[bl_index1],
+ AOMMIN(cpi->consec_zero_mv[bl_index2],
+ cpi->consec_zero_mv[bl_index3])));
+          // Only consider blocks that are likely steady background, i.e.,
+          // that have been encoded with zero/low motion for more than
+          // thresh_consec_zeromv frames in a row. consec_zero_mv[] is
+          // defined for 8x8 blocks, so check all 4 sub-blocks of the 16x16
+          // block. Also exclude the frame if high_source_sad is set (i.e., a
+          // scene/content change).
+ if (frame_low_motion && consec_zeromv > thresh_consec_zeromv &&
+ !cpi->rc.high_source_sad) {
+ unsigned int sse;
+ // Compute variance between co-located blocks from current and
+ // last input frames.
+ unsigned int variance = cpi->ppi->fn_ptr[bsize].vf(
+ src_y, src_ystride, last_src_y, last_src_ystride, &sse);
+ unsigned int hist_index = variance / bin_size;
+ if (hist_index < MAX_VAR_HIST_BINS)
+ hist[hist_index]++;
+ else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1))
+ hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail
+ }
+ }
+ src_y += 4;
+ last_src_y += 4;
+ }
+ src_y += (src_ystride << 2) - (mi_params->mi_cols << 2);
+ last_src_y += (last_src_ystride << 2) - (mi_params->mi_cols << 2);
+ }
+ ne->last_w = cm->width;
+ ne->last_h = cm->height;
+ // Adjust histogram to account for effect that histogram flattens
+ // and shifts to zero as scene darkens.
+ if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) {
+ hist[0] = 0;
+ hist[1] >>= 2;
+ hist[2] >>= 2;
+ hist[3] >>= 2;
+ hist[4] >>= 1;
+ hist[5] >>= 1;
+ hist[6] = 3 * hist[6] >> 1;
+ hist[MAX_VAR_HIST_BINS - 1] >>= 1;
+ }
+
+ // Average hist[] and find largest bin
+ for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) {
+ if (bin_cnt == 0)
+ hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3;
+ else if (bin_cnt == MAX_VAR_HIST_BINS - 1)
+ hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2;
+ else if (bin_cnt == MAX_VAR_HIST_BINS - 2)
+ hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] +
+ (hist[bin_cnt + 1] >> 1) + 2) >>
+ 2;
+ else
+ hist_avg[bin_cnt] =
+ (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >>
+ 2;
+
+ if (hist_avg[bin_cnt] > max_bin_count) {
+ max_bin_count = hist_avg[bin_cnt];
+ max_bin = bin_cnt;
+ }
+ }
+    // Scale the peak bin by 40 to match the existing thresholds, and blend
+    // it into a 3:1 running average.
+ ne->value = (int)((3 * ne->value + max_bin * 40) >> 2);
+ // Quickly increase VNR strength when the noise level increases suddenly.
+ if (ne->level < kMedium && ne->value > ne->adapt_thresh) {
+ ne->count = ne->num_frames_estimate;
+ } else {
+ ne->count++;
+ }
+ if (ne->count == ne->num_frames_estimate) {
+ // Reset counter and check noise level condition.
+ ne->num_frames_estimate = 30;
+ ne->count = 0;
+ ne->level = av1_noise_estimate_extract_level(ne);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ av1_denoiser_set_noise_level(cpi, ne->level);
+#endif
+ }
+ }
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ copy_frame(&cpi->denoiser.last_source, cpi->source);
+#endif
+}
diff --git a/third_party/aom/av1/encoder/av1_noise_estimate.h b/third_party/aom/av1/encoder/av1_noise_estimate.h
new file mode 100644
index 0000000000..85530666f6
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_noise_estimate.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
+#define AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
+
+#include "av1/encoder/block.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_VAR_HIST_BINS 20
+
+typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL;
+
+typedef struct noise_estimate {
+ int enabled;
+ NOISE_LEVEL level;
+ int value;
+ int thresh;
+ int adapt_thresh;
+ int count;
+ int last_w;
+ int last_h;
+ int num_frames_estimate;
+} NOISE_ESTIMATE;
+
+struct AV1_COMP;
+
+void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height);
+
+NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne);
+
+void av1_update_noise_estimate(struct AV1_COMP *const cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
new file mode 100644
index 0000000000..110d17f434
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ *eob_ptr = 0;
+}
+
+int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2],
+ const int16_t dequant_ptr[2],
+ const int16_t round_ptr[2], int log_scale,
+ const int16_t *scan, int coeff_count,
+ const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr) {
+ memset(qcoeff_ptr, 0, coeff_count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, coeff_count * sizeof(*dqcoeff_ptr));
+ const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+ int eob = 0;
+ for (int i = 0; i < coeff_count; i++) {
+ const int rc = scan[i];
+ const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]);
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32 = 0;
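+    // Dead-zone test: quantize only when (abs_coeff << (1 + log_scale)) >=
+    // dequant, i.e. the coefficient is at least half a (scaled) step.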
+ if ((abs_coeff << (1 + log_scale)) >= thresh) {
+ abs_coeff = clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX);
+ tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
+ if (tmp32) {
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff =
+ (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
+ dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+ }
+ }
+ if (tmp32) eob = i + 1;
+ }
+ return eob;
+}
+
+static void quantize_fp_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, int log_scale) {
+ int i, eob = -1;
+ const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+  // TODO(jingning): Decide whether these arguments are still needed once the
+  // quantization process is finalized.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (qm_ptr == NULL && iqm_ptr == NULL) {
+ *eob_ptr = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr,
+ log_scale, scan, (int)n_coeffs,
+ coeff_ptr, qcoeff_ptr, dqcoeff_ptr);
+ } else {
+    // Quantize each coefficient in scan order; eob tracks the index of the
+    // last nonzero quantized coefficient.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const int coeff_sign = AOMSIGN(coeff);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32 = 0;
+ if (abs_coeff * wt >=
+ (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+ abs_coeff += rounding[rc != 0];
+ abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX);
+ tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+ }
+
+ if (tmp32) eob = i;
+ }
+ *eob_ptr = eob + 1;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void highbd_quantize_fp_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, int log_scale) {
+ int i;
+ int eob = -1;
+ const int shift = 16 - log_scale;
+  // TODO(jingning): Decide whether these arguments are still needed once the
+  // quantization process is finalized.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ if (qm_ptr || iqm_ptr) {
+    // Quantize each coefficient in scan order; eob tracks the index of the
+    // last nonzero quantized coefficient.
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int abs_qcoeff = 0;
+ if (abs_coeff * wt >=
+ (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+ const int64_t tmp =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ abs_qcoeff =
+ (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = i;
+ } else {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ }
+ } else {
+ const int log_scaled_round_arr[2] = {
+ ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale),
+ };
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int rc01 = (rc != 0);
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int log_scaled_round = log_scaled_round_arr[rc01];
+ if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) {
+ const int quant = quant_ptr[rc01];
+ const int dequant = dequant_ptr[rc01];
+ const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
+ const int abs_qcoeff = (int)((tmp * quant) >> shift);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ if (abs_qcoeff) eob = i;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ } else {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 0);
+}
+
+void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)iscan;
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Quantize each coefficient in scan order; eob tracks the index of the
+  // last nonzero quantized coefficient.
+ for (int i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+ if (tmp) eob = i;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);
+}
+
+void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);
+}
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
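+  // qparam->log_scale is 0, 1 or 2; the 32- and 64-point transforms use
+  // extra down-scaling, selected below.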
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 2:
+ av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#if !CONFIG_REALTIME_ONLY
+ if (qparam->use_quant_b_adapt) {
+ // TODO(sarahparker) These quantize_b optimizations need SIMD
+ // implementations
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ aom_quantize_b_adaptive_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX,
+ p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 1:
+ aom_quantize_b_32x32_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_quantize_b_64x64_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+ return;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 2:
+ aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int64_t tmp;
+ int eob = -1;
+ int32_t tmp32;
+ int dequant;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (tmp32) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+ (void)sc;
+  assert(qparam->log_scale >= 0 && qparam->log_scale < 3);
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,
+ p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0],
+ eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ highbd_quantize_fp_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qparam->log_scale);
+ }
+}
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#if !CONFIG_REALTIME_ONLY
+ if (qparam->use_quant_b_adapt) {
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ aom_highbd_quantize_b_adaptive_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_highbd_quantize_b_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_highbd_quantize_b_64x64_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+ return;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ aom_highbd_quantize_b_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_highbd_quantize_b_64x64(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+static INLINE void highbd_quantize_dc(
+ const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) {
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+ const int64_t tmpw = tmp * wt;
+ const int abs_qcoeff =
+ (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int dequant =
+ (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ (void)sc;
+
+ highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,
+ p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr,
+ qparam->log_scale);
+}
+
+void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+ int log_scale) {
+ highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan, NULL, NULL,
+ log_scale);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
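+// Compute a 16-bit fixed-point reciprocal of the quantizer step d so the
+// quantizers can divide by d with multiplies and shifts:
+//   l = msb(d), m = 1 + 2^(16+l)/d, quant = m - 2^16, shift = 2^(16-l).
+// A kernel such as aom_quantize_b() then evaluates
+//   (((x * quant) >> 16) + x) ~= x * 2^l / d
+// and multiplies by shift (with a further >> 16) to divide by 2^l, giving
+// approximately x / d. Illustrative example for d = 4: l = 2, m = 65537,
+// quant = 1, shift = 1 << 14.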
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
+ uint32_t t;
+ int l, m;
+ t = d;
+ l = get_msb(t);
+ m = 1 + (1 << (16 + l)) / d;
+ *quant = (int16_t)(m - (1 << 16));
+ *shift = 1 << (16 - l);
+}
+
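+// The zero-bin factor is expressed in 1/128 units of the quantizer step:
+// av1_build_quantizer() below derives zbin = ROUND_POWER_OF_TWO(factor *
+// quant, 7), i.e. roughly 0.66 * quant (84/128) for small steps, 0.625 *
+// quant (80/128) for larger ones, and 0.5 * quant (64/128) at q == 0.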
+static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {
+ const int quant = av1_dc_quant_QTX(q, 0, bit_depth);
+ switch (bit_depth) {
+ case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+ case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
+ case AOM_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+}
+
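+// Build the per-qindex quantizer tables. For each qindex, entries [0] and [1]
+// hold the DC and AC parameters of each plane; the _fp variants use a plain
+// 2^16 / quant multiplier with a fixed rounding of quant / 2. The AC entry is
+// then replicated into slots [2..7] so SIMD kernels can load full 8-lane
+// vectors.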
+void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
+ int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+ int v_ac_delta_q, QUANTS *const quants,
+ Dequants *const deq) {
+ int i, q, quant_QTX;
+
+ for (q = 0; q < QINDEX_RANGE; q++) {
+ const int qzbin_factor = get_qzbin_factor(q, bit_depth);
+ const int qrounding_factor = q == 0 ? 64 : 48;
+
+ for (i = 0; i < 2; ++i) {
+ const int qrounding_factor_fp = 64;
+ // y quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, 0, bit_depth);
+ invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i],
+ quant_QTX);
+ quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->y_dequant_QTX[q][i] = quant_QTX;
+
+ // u quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth);
+ invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i],
+ quant_QTX);
+ quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->u_dequant_QTX[q][i] = quant_QTX;
+
+ // v quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth);
+ invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i],
+ quant_QTX);
+ quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->v_dequant_QTX[q][i] = quant_QTX;
+ }
+
+ for (i = 2; i < 8; i++) { // 8: SIMD width
+ quants->y_quant[q][i] = quants->y_quant[q][1];
+ quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+ quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+ quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+ quants->y_zbin[q][i] = quants->y_zbin[q][1];
+ quants->y_round[q][i] = quants->y_round[q][1];
+ deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1];
+
+ quants->u_quant[q][i] = quants->u_quant[q][1];
+ quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1];
+ quants->u_round_fp[q][i] = quants->u_round_fp[q][1];
+ quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1];
+ quants->u_zbin[q][i] = quants->u_zbin[q][1];
+ quants->u_round[q][i] = quants->u_round[q][1];
+ deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
+
+ quants->v_quant[q][i] = quants->v_quant[q][1];
+ quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
+ quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
+ quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1];
+ quants->v_zbin[q][i] = quants->v_zbin[q][1];
+ quants->v_round[q][i] = quants->v_round[q][1];
+ deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1];
+ }
+ }
+}
+
+static INLINE bool deltaq_params_have_changed(
+ const DeltaQuantParams *prev_deltaq_params,
+ const CommonQuantParams *quant_params) {
+ return (prev_deltaq_params->y_dc_delta_q != quant_params->y_dc_delta_q ||
+ prev_deltaq_params->u_dc_delta_q != quant_params->u_dc_delta_q ||
+ prev_deltaq_params->v_dc_delta_q != quant_params->v_dc_delta_q ||
+ prev_deltaq_params->u_ac_delta_q != quant_params->u_ac_delta_q ||
+ prev_deltaq_params->v_ac_delta_q != quant_params->v_ac_delta_q);
+}
+
+void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
+ const CommonQuantParams *quant_params,
+ aom_bit_depth_t bit_depth) {
+ DeltaQuantParams *const prev_deltaq_params =
+ &enc_quant_dequant_params->prev_deltaq_params;
+
+ // Re-initialize the quantizer only if any of the dc/ac deltaq parameters
+ // change.
+ if (!deltaq_params_have_changed(prev_deltaq_params, quant_params)) return;
+ QUANTS *const quants = &enc_quant_dequant_params->quants;
+ Dequants *const dequants = &enc_quant_dequant_params->dequants;
+ av1_build_quantizer(bit_depth, quant_params->y_dc_delta_q,
+ quant_params->u_dc_delta_q, quant_params->u_ac_delta_q,
+ quant_params->v_dc_delta_q, quant_params->v_ac_delta_q,
+ quants, dequants);
+
+ // Record the state of deltaq parameters.
+ prev_deltaq_params->y_dc_delta_q = quant_params->y_dc_delta_q;
+ prev_deltaq_params->u_dc_delta_q = quant_params->u_dc_delta_q;
+ prev_deltaq_params->v_dc_delta_q = quant_params->v_dc_delta_q;
+ prev_deltaq_params->u_ac_delta_q = quant_params->u_ac_delta_q;
+ prev_deltaq_params->v_ac_delta_q = quant_params->v_ac_delta_q;
+}
+
+void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params,
+ int qindex, MACROBLOCK *x) {
+ const QUANTS *const quants = &enc_quant_dequant_params->quants;
+ const Dequants *const dequants = &enc_quant_dequant_params->dequants;
+ x->qindex = qindex;
+ x->seg_skip_block =
+ 0; // TODO(angiebird): Find a proper place to init this variable.
+
+ // Y
+ x->plane[0].quant_QTX = quants->y_quant[qindex];
+ x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex];
+ x->plane[0].round_fp_QTX = quants->y_round_fp[qindex];
+ x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex];
+ x->plane[0].zbin_QTX = quants->y_zbin[qindex];
+ x->plane[0].round_QTX = quants->y_round[qindex];
+ x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex];
+
+ // U
+ x->plane[1].quant_QTX = quants->u_quant[qindex];
+ x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
+ x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
+ x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex];
+ x->plane[1].zbin_QTX = quants->u_zbin[qindex];
+ x->plane[1].round_QTX = quants->u_round[qindex];
+ x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex];
+
+ // V
+ x->plane[2].quant_QTX = quants->v_quant[qindex];
+ x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
+ x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
+ x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex];
+ x->plane[2].zbin_QTX = quants->v_zbin[qindex];
+ x->plane[2].round_QTX = quants->v_round[qindex];
+ x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex];
+}
+
+void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id,
+ MACROBLOCKD *xd) {
+ const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id);
+ const int qmlevel_y =
+ use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;
+ const int qmlevel_u =
+ use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
+ const int qmlevel_v =
+ use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
+ const int qmlevel_ls[MAX_MB_PLANE] = { qmlevel_y, qmlevel_u, qmlevel_v };
+ for (int i = 0; i < MAX_MB_PLANE; ++i) {
+ const int qmlevel = qmlevel_ls[i];
+ memcpy(&xd->plane[i].seg_qmatrix[segment_id],
+ quant_params->gqmatrix[qmlevel][i],
+ sizeof(quant_params->gqmatrix[qmlevel][i]));
+ memcpy(&xd->plane[i].seg_iqmatrix[segment_id],
+ quant_params->giqmatrix[qmlevel][i],
+ sizeof(quant_params->giqmatrix[qmlevel][i]));
+ }
+}
+
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id, const int do_update) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonQuantParams *const quant_params = &cm->quant_params;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ int qindex_rd;
+
+ const int current_qindex = AOMMAX(
+ 0,
+ AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
+ ? quant_params->base_qindex + x->delta_qindex
+ : quant_params->base_qindex));
+ const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
+
+ if (cpi->oxcf.sb_qp_sweep) {
+ const int current_rd_qindex =
+ AOMMAX(0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
+ ? quant_params->base_qindex +
+ x->rdmult_delta_qindex
+ : quant_params->base_qindex));
+ qindex_rd = av1_get_qindex(&cm->seg, segment_id, current_rd_qindex);
+ } else {
+ qindex_rd = qindex;
+ }
+
+ const int qindex_rdmult = qindex_rd + quant_params->y_dc_delta_q;
+ const int rdmult = av1_compute_rd_mult(
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ const int qindex_change = x->qindex != qindex;
+ if (qindex_change || do_update) {
+ av1_set_q_index(&cpi->enc_quant_dequant_params, qindex, x);
+ }
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if ((segment_id != x->prev_segment_id) ||
+ av1_use_qmatrix(quant_params, xd, segment_id)) {
+ av1_set_qmatrix(quant_params, segment_id, xd);
+ }
+
+ x->seg_skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+
+ av1_set_error_per_bit(&x->errorperbit, rdmult);
+ av1_set_sad_per_bit(cpi, &x->sadperbit, qindex_rd);
+
+ x->prev_segment_id = segment_id;
+}
+
+void av1_frame_init_quantizer(AV1_COMP *cpi) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ x->prev_segment_id = -1;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 1);
+}
+
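+// HDR chroma deltaq helpers (used by av1_set_quantizer() below): map the luma
+// qindex to a QP, derive a chroma QP with a linear model, and convert back to
+// a qindex delta, clamped to [-12 * QP_SCALE_FACTOR, 0] so chroma is never
+// quantized more coarsely than luma. The CHROMA_*_QP_* constants and
+// QP_SCALE_FACTOR are defined elsewhere in the encoder.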
+static int adjust_hdr_cb_deltaq(int base_qindex) {
+ double baseQp = base_qindex / QP_SCALE_FACTOR;
+ const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+ const double dcbQP = CHROMA_CB_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+ int dqpCb = (int)(dcbQP + (dcbQP < 0 ? -0.5 : 0.5));
+ dqpCb = AOMMIN(0, dqpCb);
+ dqpCb = (int)CLIP(dqpCb, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+ return dqpCb;
+}
+
+static int adjust_hdr_cr_deltaq(int base_qindex) {
+ double baseQp = base_qindex / QP_SCALE_FACTOR;
+ const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+ const double dcrQP = CHROMA_CR_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+ int dqpCr = (int)(dcrQP + (dcrQP < 0 ? -0.5 : 0.5));
+ dqpCr = AOMMIN(0, dqpCr);
+ dqpCr = (int)CLIP(dqpCr, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+ return dqpCr;
+}
+
+void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel,
+ int q, int enable_chroma_deltaq, int enable_hdr_deltaq) {
+  // The quantizer has to be reinitialized with av1_init_quantizer() if any
+ // delta_q changes.
+ CommonQuantParams *quant_params = &cm->quant_params;
+ quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q);
+ quant_params->y_dc_delta_q = 0;
+
+ if (enable_chroma_deltaq) {
+ // TODO(aomedia:2717): need to design better delta
+ quant_params->u_dc_delta_q = 2;
+ quant_params->u_ac_delta_q = 2;
+ quant_params->v_dc_delta_q = 2;
+ quant_params->v_ac_delta_q = 2;
+ } else {
+ quant_params->u_dc_delta_q = 0;
+ quant_params->u_ac_delta_q = 0;
+ quant_params->v_dc_delta_q = 0;
+ quant_params->v_ac_delta_q = 0;
+ }
+
+  // Following section 8.3.2 of the T-REC-H.Sup15 document, applied to the
+  // AV1 qindex range [0, 255].
+ if (enable_hdr_deltaq) {
+ int dqpCb = adjust_hdr_cb_deltaq(quant_params->base_qindex);
+ int dqpCr = adjust_hdr_cr_deltaq(quant_params->base_qindex);
+ quant_params->u_dc_delta_q = quant_params->u_ac_delta_q = dqpCb;
+ quant_params->v_dc_delta_q = quant_params->v_ac_delta_q = dqpCr;
+ if (dqpCb != dqpCr) {
+ cm->seq_params->separate_uv_delta_q = 1;
+ }
+ }
+
+ quant_params->qmatrix_level_y =
+ aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel);
+ quant_params->qmatrix_level_u =
+ aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q,
+ min_qmlevel, max_qmlevel);
+
+ if (!cm->seq_params->separate_uv_delta_q)
+ quant_params->qmatrix_level_v = quant_params->qmatrix_level_u;
+ else
+ quant_params->qmatrix_level_v =
+ aom_get_qmlevel(quant_params->base_qindex + quant_params->v_ac_delta_q,
+ min_qmlevel, max_qmlevel);
+}
+
+// Table that converts 0-63 Q-range values passed in from outside to the
+// Qindex range used internally.
+static const int quantizer_to_qindex[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48,
+ 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100,
+ 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152,
+ 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204,
+ 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+int av1_quantizer_to_qindex(int quantizer) {
+ return quantizer_to_qindex[quantizer];
+}
+
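+// Inverse of av1_quantizer_to_qindex(): returns the smallest 0-63 quantizer
+// whose qindex is at least the given qindex (the mapping is roughly
+// qindex / 4, with the top two entries stretched to 249 and 255).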
+int av1_qindex_to_quantizer(int qindex) {
+ int quantizer;
+
+ for (quantizer = 0; quantizer < 64; ++quantizer)
+ if (quantizer_to_qindex[quantizer] >= qindex) return quantizer;
+
+ return 63;
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
new file mode 100644
index 0000000000..040973376d
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
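+// Per-call quantization parameters for the facades in av1_quantize.c.
+// log_scale selects the extra transform scaling: the facades dispatch
+// log_scale 1 to the 32x32 kernels and log_scale 2 to the 64x64 kernels.
+// qmatrix/iqmatrix are NULL when quantization matrices are not in use.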
+typedef struct QUANT_PARAM {
+ int log_scale;
+ TX_SIZE tx_size;
+ const qm_val_t *qmatrix;
+ const qm_val_t *iqmatrix;
+ int use_quant_b_adapt;
+ int use_optimize_b;
+ int xform_quant_idx;
+} QUANT_PARAM;
+
+typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+// The QUANTS structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// All of its fields use the same coefficient shift/scaling as TX.
+typedef struct {
+  // 0: dc 1: ac 2-7: ac repeated to SIMD width
+ DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+  // TODO(jingning): Re-working of the quantization is in progress; decide
+  // whether to deprecate the current use of y_quant.
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]);
+
+ DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]);
+} QUANTS;
+
+// The Dequants structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// Fields are suffixed according to whether or not they're expressed in
+// the same coefficient shift/precision as TX or a fixed Q3 format.
+typedef struct {
+ DECLARE_ALIGNED(16, int16_t,
+ y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t,
+ u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t,
+ v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+} Dequants;
+
+// The DeltaQuantParams structure holds the dc/ac deltaq parameters.
+typedef struct {
+ int y_dc_delta_q;
+ int u_dc_delta_q;
+ int u_ac_delta_q;
+ int v_dc_delta_q;
+ int v_ac_delta_q;
+} DeltaQuantParams;
+
+typedef struct {
+ // Quantization parameters for internal quantizer setup.
+ QUANTS quants;
+ // Dequantization parameters for internal quantizer setup.
+ Dequants dequants;
+ // Deltaq parameters to track the state of the dc/ac deltaq parameters in
+ // cm->quant_params. It is used to decide whether the quantizer tables need
+ // to be re-initialized.
+ DeltaQuantParams prev_deltaq_params;
+} EncQuantDequantParams;
+
+struct AV1_COMP;
+struct AV1Common;
+
+void av1_frame_init_quantizer(struct AV1_COMP *cpi);
+
+void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id, const int do_update);
+
+void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
+ int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+ int v_ac_delta_q, QUANTS *const quants,
+ Dequants *const deq);
+
+void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
+ const CommonQuantParams *quant_params,
+ aom_bit_depth_t bit_depth);
+
+void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel,
+ int max_qmlevel, int q, int enable_chroma_deltaq,
+ int enable_hdr_deltaq);
+
+int av1_quantizer_to_qindex(int quantizer);
+
+int av1_qindex_to_quantizer(int qindex);
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+
+/*!\brief Quantize transform coefficients without using qmatrix
+ *
+ * quant_ptr, dequant_ptr and round_ptr are size 2 arrays,
+ * where index 0 corresponds to dc coeff and index 1 corresponds to ac coeffs.
+ *
+ * \param[in] quant_ptr 16-bit fixed point representation of inverse
+ * quantize step size, i.e. 2^16/dequant
+ * \param[in] dequant_ptr quantize step size
+ * \param[in] round_ptr rounding
+ * \param[in] log_scale the relative log scale of the transform
+ * coefficients
+ * \param[in] scan scan[i] indicates the position of ith to-be-coded
+ * coefficient
+ * \param[in] coeff_count number of coefficients
+ * \param[out] qcoeff_ptr quantized coefficients
+ * \param[out] dqcoeff_ptr dequantized coefficients
+ *
+ * \return The last non-zero coefficient's scan index plus 1
+ */
+int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2],
+ const int16_t dequant_ptr[2],
+ const int16_t round_ptr[2], int log_scale,
+ const int16_t *scan, int coeff_count,
+ const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr);
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+/*!\brief Update quantize parameters in MACROBLOCK
+ *
+ * \param[in]  enc_quant_dequant_params This parameter caches the quantize and
+ * dequantize parameters for all q
+ * indices.
+ * \param[in] qindex Quantize index used for the current
+ * superblock.
+ * \param[out] x A superblock data structure for
+ * encoder.
+ */
+void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params,
+ int qindex, MACROBLOCK *x);
+
+/*!\brief Update quantize matrix in MACROBLOCKD based on segment id
+ *
+ * \param[in] quant_params Quantize parameters used by encoder and decoder
+ * \param[in] segment_id Segment id.
+ * \param[out] xd A superblock data structure used by encoder and
+ * decoder.
+ */
+void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id,
+ MACROBLOCKD *xd);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_
diff --git a/third_party/aom/av1/encoder/av1_temporal_denoiser.c b/third_party/aom/av1/encoder/av1_temporal_denoiser.c
new file mode 100644
index 0000000000..3012df6311
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_temporal_denoiser.c
@@ -0,0 +1,805 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
+#endif
+
+static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ return 3 + (increase_denoising ? 1 : 0);
+}
+
+static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ (void)increase_denoising;
+ return 4;
+}
+
+static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ (void)increase_denoising;
+ return 625;
+}
+
+static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40);
+}
+
+static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
+ int motion_magnitude) {
+ if (motion_magnitude > noise_motion_thresh(bs, increase_denoising)) {
+ if (increase_denoising)
+ return (1 << num_pels_log2_lookup[bs]) << 2;
+ else
+ return 0;
+ } else {
+ return (1 << num_pels_log2_lookup[bs]) << 4;
+ }
+}
+
+static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+
+// TODO(kyslov): If increase_denoising is enabled in the future,
+// we might need to update the code for calculating 'total_adj' in
+// case the C code is not bit-exact with the corresponding SSE2 code.
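+// Two-pass temporal filter. The first (strong) pass moves each pixel toward
+// the motion-compensated average: small differences are taken verbatim,
+// larger ones are adjusted by 3/4/6 (plus shift_inc). If the accumulated
+// adjustment exceeds the strong threshold, a second pass dampens the result
+// by up to 'delta' per pixel; if the total is still too large, the caller is
+// told to copy the block instead of filtering it.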
+int av1_denoiser_filter_c(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude) {
+ int r, c;
+ const uint8_t *sig_start = sig;
+ const uint8_t *mc_avg_start = mc_avg;
+ uint8_t *avg_start = avg;
+ int diff, adj, absdiff, delta;
+ int adj_val[] = { 3, 4, 6 };
+ int total_adj = 0;
+ int shift_inc = 1;
+
+  // If motion_magnitude is small, make the denoiser more aggressive by
+  // increasing the adjustment for each level. Add another increment for
+  // blocks that are labeled for increased denoising.
+ if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+ if (increase_denoising) {
+ shift_inc = 2;
+ }
+ adj_val[0] += shift_inc;
+ adj_val[1] += shift_inc;
+ adj_val[2] += shift_inc;
+ }
+
+ // First attempt to apply a strong temporal denoising filter.
+ for (r = 0; r < block_size_high[bs]; ++r) {
+ for (c = 0; c < block_size_wide[bs]; ++c) {
+ diff = mc_avg[c] - sig[c];
+ absdiff = abs(diff);
+
+ if (absdiff <= absdiff_thresh(bs, increase_denoising)) {
+ avg[c] = mc_avg[c];
+ total_adj += diff;
+ } else {
+ switch (absdiff) {
+ case 4:
+ case 5:
+ case 6:
+ case 7: adj = adj_val[0]; break;
+ case 8:
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15: adj = adj_val[1]; break;
+ default: adj = adj_val[2];
+ }
+ if (diff > 0) {
+ avg[c] = AOMMIN(UINT8_MAX, sig[c] + adj);
+ total_adj += adj;
+ } else {
+ avg[c] = AOMMAX(0, sig[c] - adj);
+ total_adj -= adj;
+ }
+ }
+ }
+ sig += sig_stride;
+ avg += avg_stride;
+ mc_avg += mc_avg_stride;
+ }
+
+ // If the strong filter did not modify the signal too much, we're all set.
+ if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) {
+ return FILTER_BLOCK;
+ }
+
+ // Otherwise, we try to dampen the filter if the delta is not too high.
+ delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) >>
+ num_pels_log2_lookup[bs]) +
+ 1;
+
+ if (delta >= delta_thresh(bs, increase_denoising)) {
+ return COPY_BLOCK;
+ }
+
+ mc_avg = mc_avg_start;
+ avg = avg_start;
+ sig = sig_start;
+ for (r = 0; r < block_size_high[bs]; ++r) {
+ for (c = 0; c < block_size_wide[bs]; ++c) {
+ diff = mc_avg[c] - sig[c];
+ adj = abs(diff);
+ if (adj > delta) {
+ adj = delta;
+ }
+ if (diff > 0) {
+        // A positive diff means we made a positive adjustment in the first
+        // (strong) pass, so now make a negative adjustment to bring the
+        // denoised signal down.
+ avg[c] = AOMMAX(0, avg[c] - adj);
+ total_adj -= adj;
+ } else {
+        // A negative diff means we made a negative adjustment in the first
+        // (strong) pass, so now make a positive adjustment to bring the
+        // denoised signal up.
+ avg[c] = AOMMIN(UINT8_MAX, avg[c] + adj);
+ total_adj += adj;
+ }
+ }
+ sig += sig_stride;
+ avg += avg_stride;
+ mc_avg += mc_avg_stride;
+ }
+
+ // We can use the filter if it has been sufficiently dampened
+ if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) {
+ return FILTER_BLOCK;
+ }
+ return COPY_BLOCK;
+}
+
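+// mi_row/mi_col are in 4x4 mode-info units, so shift left by 2 to get the
+// pixel offset of the block's top-left corner within the frame buffer.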
+static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row,
+ int mi_col) {
+ return framebuf + (stride * mi_row << 2) + (mi_col << 2);
+}
+
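+// Decide whether the block can be denoised and, if so, build the motion
+// compensated prediction from the denoiser's running average into
+// mc_running_avg_y. Falls back to the zero-mv (GLOBALMV) reference, biased
+// toward LAST_FRAME, when the best inter mode is not clearly better; returns
+// COPY_BLOCK when the block is too small, the SSE or motion magnitude is too
+// large, or the needed denoiser buffer is unallocated.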
+static AV1_DENOISER_DECISION perform_motion_compensation(
+ AV1_COMMON *const cm, AV1_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
+ int motion_magnitude, int *zeromv_filter, int num_spatial_layers, int width,
+ int lst_fb_idx, int gld_fb_idx, int use_svc, int spatial_layer,
+ int use_gf_temporal_ref) {
+ const int sse_diff = (ctx->newmv_sse == UINT_MAX)
+ ? 0
+ : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
+ int frame;
+ int denoise_layer_idx = 0;
+ MACROBLOCKD *filter_mbd = &mb->e_mbd;
+ MB_MODE_INFO *mi = filter_mbd->mi[0];
+ MB_MODE_INFO saved_mi;
+ int i;
+ struct buf_2d saved_dst[MAX_MB_PLANE];
+ struct buf_2d saved_pre[MAX_MB_PLANE];
+ // const RefBuffer *saved_block_refs[2];
+ MV_REFERENCE_FRAME saved_frame;
+
+ frame = ctx->best_reference_frame;
+
+ saved_mi = *mi;
+
+  // Avoid denoising small blocks: 16x16 blocks are denoised only when the
+  // noise level is above kDenLow or the frame width is at most 480.
+ if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 ||
+ (bs == BLOCK_16X16 && width > 480 &&
+ denoiser->denoising_level <= kDenLow))
+ return COPY_BLOCK;
+
+ // If the best reference frame uses inter-prediction and there is enough of a
+ // difference in sum-squared-error, use it.
+ if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME &&
+ sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
+ mi->ref_frame[0] = ctx->best_reference_frame;
+ mi->mode = ctx->best_sse_inter_mode;
+ mi->mv[0] = ctx->best_sse_mv;
+ } else {
+ // Otherwise, use the zero reference frame.
+ frame = ctx->best_zeromv_reference_frame;
+ ctx->newmv_sse = ctx->zeromv_sse;
+ // Bias to last reference.
+ if ((num_spatial_layers > 1 && !use_gf_temporal_ref) ||
+ frame == ALTREF_FRAME ||
+ (frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
+ (frame != LAST_FRAME &&
+ ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
+ denoiser->denoising_level >= kDenHigh))) {
+ frame = LAST_FRAME;
+ ctx->newmv_sse = ctx->zeromv_lastref_sse;
+ }
+ mi->ref_frame[0] = frame;
+ mi->mode = GLOBALMV;
+ mi->mv[0].as_int = 0;
+ ctx->best_sse_inter_mode = GLOBALMV;
+ ctx->best_sse_mv.as_int = 0;
+ *zeromv_filter = 1;
+ if (denoiser->denoising_level > kDenMedium) {
+ motion_magnitude = 0;
+ }
+ }
+
+ saved_frame = frame;
+ // When using SVC, we need to map REF_FRAME to the frame buffer index.
+ if (use_svc) {
+ if (frame == LAST_FRAME)
+ frame = lst_fb_idx + 1;
+ else if (frame == GOLDEN_FRAME)
+ frame = gld_fb_idx + 1;
+ // Shift for the second spatial layer.
+ if (num_spatial_layers - spatial_layer == 2)
+ frame = frame + denoiser->num_ref_frames;
+ denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
+ }
+
+  // Force a copy (no denoising; copy the source into the denoised buffer) if
+  // running_avg_y[frame] is NULL.
+ if (denoiser->running_avg_y[frame].buffer_alloc == NULL) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+
+ if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+ if (motion_magnitude > (noise_motion_thresh(bs, increase_denoising) << 3)) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+
+ // We will restore these after motion compensation.
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ saved_pre[i] = filter_mbd->plane[i].pre[0];
+ saved_dst[i] = filter_mbd->plane[i].dst;
+ }
+
+ // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
+ // struct.
+ set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
+ av1_setup_pre_planes(filter_mbd, 0, &(denoiser->running_avg_y[frame]), mi_row,
+ mi_col, filter_mbd->block_ref_scale_factors[0], 1);
+ av1_setup_dst_planes(filter_mbd->plane, bs,
+ &(denoiser->mc_running_avg_y[denoise_layer_idx]), mi_row,
+ mi_col, 0, 1);
+
+ av1_enc_build_inter_predictor_y(filter_mbd, mi_row, mi_col);
+
+ // Restore everything to its original state
+ *mi = saved_mi;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ filter_mbd->plane[i].pre[0] = saved_pre[i];
+ filter_mbd->plane[i].dst = saved_dst[i];
+ }
+
+ return FILTER_BLOCK;
+}
+
+void av1_denoiser_denoise(AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
+ BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
+ AV1_DENOISER_DECISION *denoiser_decision,
+ int use_gf_temporal_ref) {
+ int mv_col, mv_row;
+ int motion_magnitude = 0;
+ int zeromv_filter = 0;
+ AV1_DENOISER *denoiser = &cpi->denoiser;
+ AV1_DENOISER_DECISION decision = COPY_BLOCK;
+
+ const int shift =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
+ ? denoiser->num_ref_frames
+ : 0;
+ YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift];
+ const int denoise_layer_index =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1;
+ YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index];
+ uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+
+ uint8_t *mc_avg_start =
+ block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
+ struct buf_2d src = mb->plane[0].src;
+ int increase_denoising = 0;
+ int last_is_reference = cpi->ref_frame_flags & AOM_LAST_FLAG;
+ mv_col = ctx->best_sse_mv.as_mv.col;
+ mv_row = ctx->best_sse_mv.as_mv.row;
+ motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+
+ if (denoiser->denoising_level == kDenHigh) increase_denoising = 1;
+
+  // Copy the block if LAST_FRAME is not a reference.
+  // The last reference doesn't always exist when SVC layers change
+  // dynamically, e.g. the top spatial layer has no last reference when it is
+  // brought up on the fly for the first time.
+ if (last_is_reference && denoiser->denoising_level >= kDenLow &&
+ !ctx->sb_skip_denoising)
+ decision = perform_motion_compensation(
+ &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
+ motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers,
+ cpi->source->y_width, cpi->ppi->rtc_ref.ref_idx[0],
+ cpi->ppi->rtc_ref.ref_idx[3], cpi->ppi->use_svc,
+ cpi->svc.spatial_layer_id, use_gf_temporal_ref);
+
+ if (decision == FILTER_BLOCK) {
+ decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start,
+ mc_avg.y_stride, avg_start, avg.y_stride,
+ increase_denoising, bs, motion_magnitude);
+ }
+
+ if (decision == FILTER_BLOCK) {
+ aom_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+ block_size_wide[bs], block_size_high[bs]);
+ } else { // COPY_BLOCK
+ aom_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+ block_size_wide[bs], block_size_high[bs]);
+ }
+ *denoiser_decision = decision;
+ if (decision == FILTER_BLOCK && zeromv_filter == 1)
+ *denoiser_decision = FILTER_ZEROMV_BLOCK;
+}
+
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+ const YV12_BUFFER_CONFIG *const src) {
+ int r;
+ const uint8_t *srcbuf = src->y_buffer;
+ uint8_t *destbuf = dest->y_buffer;
+
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+
+ for (r = 0; r < dest->y_height; ++r) {
+ memcpy(destbuf, srcbuf, dest->y_width);
+ destbuf += dest->y_stride;
+ srcbuf += src->y_stride;
+ }
+}
+
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest,
+ YV12_BUFFER_CONFIG *const src) {
+ uint8_t *tmp_buf = dest->y_buffer;
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+ dest->y_buffer = src->y_buffer;
+ src->y_buffer = tmp_buf;
+}
+
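+// Keep the denoiser's running-average buffers in sync with the encoder's
+// reference updates: when exactly one reference slot is refreshed the buffers
+// are swapped (a cheap pointer swap); when several slots take the same
+// denoised frame they must each be copied.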
+void av1_denoiser_update_frame_info(
+ AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref,
+ struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame,
+ int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx, int resized,
+ int svc_refresh_denoiser_buffers, int second_spatial_layer) {
+ const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
+  // Copy the source into the denoised reference buffers on KEY_FRAME, or if
+  // the just-encoded frame was resized. For SVC, copy the source if the base
+  // spatial layer was a key frame.
+ if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset ||
+ svc_refresh_denoiser_buffers) {
+ int i;
+ // Start at 1 so as not to overwrite the INTRA_FRAME
+ for (i = 1; i < denoiser->num_ref_frames; ++i) {
+ if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL)
+ copy_frame(&denoiser->running_avg_y[i + shift], &src);
+ }
+ denoiser->reset = 0;
+ return;
+ }
+
+ if (rtc_ref->set_ref_frame_config) {
+ int i;
+ for (i = 0; i < REF_FRAMES; i++) {
+ if (rtc_ref->refresh[svc->spatial_layer_id] & (1 << i))
+ copy_frame(&denoiser->running_avg_y[i + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ } else {
+    // If more than one refresh occurs, we must copy the frame buffer.
+ if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) >
+ 1) {
+ if (refresh_alt_ref_frame) {
+ copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_golden_frame) {
+ copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_last_frame) {
+ copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ } else {
+ if (refresh_alt_ref_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_golden_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_last_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ }
+ }
+}
+
+void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
+ ctx->zeromv_sse = INT64_MAX;
+ ctx->newmv_sse = INT64_MAX;
+ ctx->zeromv_lastref_sse = INT64_MAX;
+ ctx->best_sse_mv.as_int = 0;
+}
+
+void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse,
+ PREDICTION_MODE mode,
+ PICK_MODE_CONTEXT *ctx) {
+ if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
+ ctx->zeromv_sse = sse;
+ ctx->best_zeromv_reference_frame = mi->ref_frame[0];
+ if (mi->ref_frame[0] == LAST_FRAME) ctx->zeromv_lastref_sse = sse;
+ }
+
+ if (mi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
+ ctx->newmv_sse = sse;
+ ctx->best_sse_inter_mode = mode;
+ ctx->best_sse_mv = mi->mv[0];
+ ctx->best_reference_frame = mi->ref_frame[0];
+ }
+}
+
+static int av1_denoiser_realloc_svc_helper(AV1_COMMON *cm,
+ AV1_DENOISER *denoiser, int fb_idx) {
+ int fail = 0;
+ if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) {
+ fail = aom_alloc_frame_buffer(
+ &denoiser->running_avg_y[fb_idx], cm->width, cm->height,
+ cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+ cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, 0, 0);
+ if (fail) {
+ av1_denoiser_free(denoiser);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser,
+ struct RTC_REF *rtc_ref, struct SVC *svc,
+ int svc_buf_shift, int refresh_alt,
+ int refresh_gld, int refresh_lst, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx) {
+ int fail = 0;
+ if (rtc_ref->set_ref_frame_config) {
+ int i;
+ for (i = 0; i < REF_FRAMES; i++) {
+ if (cm->current_frame.frame_type == KEY_FRAME ||
+ rtc_ref->refresh[svc->spatial_layer_id] & (1 << i)) {
+ fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+ i + 1 + svc_buf_shift);
+ }
+ }
+ } else {
+ if (refresh_alt) {
+ // Increase the frame buffer index by 1 to map it to the buffer index in
+ // the denoiser.
+ fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+ alt_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ if (refresh_gld) {
+ fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+ gld_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ if (refresh_lst) {
+ fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+ lst_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ }
+ return 0;
+}
+
+int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser,
+ int use_svc, int noise_sen, int width, int height,
+ int ssx, int ssy, int use_highbitdepth, int border) {
+ int i, layer, fail, init_num_ref_frames;
+ const int legacy_byte_alignment = 0;
+ int num_layers = 1;
+ int scaled_width = width;
+ int scaled_height = height;
+ if (use_svc) {
+ LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ av1_get_layer_resolution(width, height, lc->scaling_factor_num,
+ lc->scaling_factor_den, &scaled_width,
+ &scaled_height);
+    // For SVC: denoise at most the 2 highest spatial layers.
+ if (noise_sen >= 2)
+ // Denoise from one spatial layer below the top.
+ svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 2, 0);
+ else
+ // Only denoise the top spatial layer.
+ svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 1, 0);
+ num_layers = svc->number_spatial_layers - svc->first_layer_denoise;
+ }
+ assert(denoiser != NULL);
+ denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
+ init_num_ref_frames = use_svc ? REF_FRAMES : NONSVC_REF_FRAMES;
+ denoiser->num_layers = num_layers;
+ CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
+ aom_calloc(denoiser->num_ref_frames * num_layers,
+ sizeof(denoiser->running_avg_y[0])));
+ CHECK_MEM_ERROR(
+ cm, denoiser->mc_running_avg_y,
+ aom_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
+
+ for (layer = 0; layer < num_layers; ++layer) {
+ const int denoise_width = (layer == 0) ? width : scaled_width;
+ const int denoise_height = (layer == 0) ? height : scaled_height;
+ for (i = 0; i < init_num_ref_frames; ++i) {
+ fail = aom_alloc_frame_buffer(
+ &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer],
+ denoise_width, denoise_height, ssx, ssy, use_highbitdepth, border,
+ legacy_byte_alignment, 0, 0);
+ if (fail) {
+ av1_denoiser_free(denoiser);
+ return 1;
+ }
+#ifdef OUTPUT_YUV_DENOISED
+ make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+ }
+
+ fail = aom_alloc_frame_buffer(
+ &denoiser->mc_running_avg_y[layer], denoise_width, denoise_height, ssx,
+ ssy, use_highbitdepth, border, legacy_byte_alignment, 0, 0);
+ if (fail) {
+ av1_denoiser_free(denoiser);
+ return 1;
+ }
+ }
+
+  // denoiser->last_source is only used for noise estimation, so allocate it
+  // only for the top layer.
+ fail = aom_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy,
+ use_highbitdepth, border, legacy_byte_alignment,
+ 0, 0);
+ if (fail) {
+ av1_denoiser_free(denoiser);
+ return 1;
+ }
+#ifdef OUTPUT_YUV_DENOISED
+ make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+ denoiser->frame_buffer_initialized = 1;
+ denoiser->denoising_level = kDenMedium;
+ denoiser->prev_denoising_level = kDenMedium;
+ denoiser->reset = 0;
+ denoiser->current_denoiser_frame = 0;
+ return 0;
+}
+
+void av1_denoiser_free(AV1_DENOISER *denoiser) {
+ int i;
+ if (denoiser == NULL) {
+ return;
+ }
+ denoiser->frame_buffer_initialized = 0;
+ for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) {
+ aom_free_frame_buffer(&denoiser->running_avg_y[i]);
+ }
+ aom_free(denoiser->running_avg_y);
+ denoiser->running_avg_y = NULL;
+
+ for (i = 0; i < denoiser->num_layers; ++i) {
+ aom_free_frame_buffer(&denoiser->mc_running_avg_y[i]);
+ }
+
+ aom_free(denoiser->mc_running_avg_y);
+ denoiser->mc_running_avg_y = NULL;
+ aom_free_frame_buffer(&denoiser->last_source);
+}
+
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+static void force_refresh_longterm_ref(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ // If long term reference is used, force refresh of that slot, so
+ // denoiser buffer for long term reference stays in sync.
+ if (svc->use_gf_temporal_ref_current_layer) {
+ int index = svc->spatial_layer_id;
+ if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+ assert(index >= 0);
+ cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+ cpi->refresh_alt_ref_frame = 1;
+ }
+}
+#endif
+
+void av1_denoiser_set_noise_level(AV1_COMP *const cpi, int noise_level) {
+ AV1_DENOISER *const denoiser = &cpi->denoiser;
+ denoiser->denoising_level = noise_level;
+ if (denoiser->denoising_level > kDenLowLow &&
+ denoiser->prev_denoising_level == kDenLowLow) {
+ denoiser->reset = 1;
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+ force_refresh_longterm_ref(cpi);
+#endif
+ } else {
+ denoiser->reset = 0;
+ }
+ denoiser->prev_denoising_level = denoiser->denoising_level;
+}
+
+// Scale/increase the partition threshold for denoiser speed-up.
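+// The threshold is scaled by 5/4 by default, and by 3/2 (temporal layers 0-1)
+// or 7/4 (higher layers) for low-sumdiff content, lighting changes, kDenHigh
+// noise, or enhancement temporal layers.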
+int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level,
+ CONTENT_STATE_SB content_state,
+ int temporal_layer_id) {
+ if ((content_state.source_sad_nonrd <= kLowSad &&
+ content_state.low_sumdiff) ||
+ (content_state.source_sad_nonrd == kHighSad &&
+ content_state.low_sumdiff) ||
+ (content_state.lighting_change && !content_state.low_sumdiff) ||
+ (noise_level == kDenHigh) || (temporal_layer_id != 0)) {
+ int64_t scaled_thr =
+ (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2;
+ return scaled_thr;
+ } else {
+ return (5 * threshold) >> 2;
+ }
+}
+
+// Scale/increase the ac skip threshold for denoiser speed-up.
+int64_t av1_scale_acskip_thresh(int64_t threshold,
+ AV1_DENOISER_LEVEL noise_level, int abs_sumdiff,
+ int temporal_layer_id) {
+ if (noise_level >= kDenLow && abs_sumdiff < 5)
+ threshold *= (noise_level == kDenLow) ? 2
+ : (temporal_layer_id == 2) ? 10
+ : 6;
+ return threshold;
+}
+
+void av1_denoiser_reset_on_first_frame(AV1_COMP *const cpi) {
+ if (/*av1_denoise_svc_non_key(cpi) &&*/
+ cpi->denoiser.current_denoiser_frame == 0) {
+ cpi->denoiser.reset = 1;
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+ force_refresh_longterm_ref(cpi);
+#endif
+ }
+}
+
+void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ SVC *const svc = &cpi->svc;
+
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->denoiser.denoising_level > kDenLowLow) {
+ int svc_refresh_denoiser_buffers = 0;
+ int denoise_svc_second_layer = 0;
+ FRAME_TYPE frame_type = cm->current_frame.frame_type == INTRA_ONLY_FRAME
+ ? KEY_FRAME
+ : cm->current_frame.frame_type;
+ cpi->denoiser.current_denoiser_frame++;
+ const int resize_pending = is_frame_resize_pending(cpi);
+
+ if (cpi->ppi->use_svc) {
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+ const int svc_buf_shift =
+ svc->number_spatial_layers - svc->spatial_layer_id == 2
+ ? cpi->denoiser.num_ref_frames
+ : 0;
+ int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ svc_refresh_denoiser_buffers =
+ lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id];
+ denoise_svc_second_layer =
+ svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0;
+ // Check if we need to allocate extra buffers in the denoiser
+ // for refreshed frames.
+ if (av1_denoiser_realloc_svc(cm, &cpi->denoiser, rtc_ref,
+ svc, svc_buf_shift,
+ cpi->refresh_alt_ref_frame,
+ cpi->refresh_golden_frame,
+ cpi->refresh_last_frame, cpi->alt_fb_idx,
+ cpi->gld_fb_idx, cpi->lst_fb_idx))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to re-allocate denoiser for SVC");
+#endif
+ }
+ av1_denoiser_update_frame_info(
+ &cpi->denoiser, *cpi->source, rtc_ref, svc, frame_type,
+ cpi->refresh_frame.alt_ref_frame, cpi->refresh_frame.golden_frame, 1,
+ rtc_ref->ref_idx[6], rtc_ref->ref_idx[3], rtc_ref->ref_idx[0],
+ resize_pending, svc_refresh_denoiser_buffers, denoise_svc_second_layer);
+ }
+}
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
+ int r, c;
+ uint8_t *u = yuv->u_buffer;
+ uint8_t *v = yuv->v_buffer;
+
+ for (r = 0; r < yuv->uv_height; ++r) {
+ for (c = 0; c < yuv->uv_width; ++c) {
+ u[c] = UINT8_MAX / 2;
+ v[c] = UINT8_MAX / 2;
+ }
+ u += yuv->uv_stride;
+ v += yuv->uv_stride;
+ }
+}
+
+void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
+ unsigned char *src = s->y_buffer;
+ int h = s->y_crop_height;
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_crop_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_crop_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+}
+#endif
diff --git a/third_party/aom/av1/encoder/av1_temporal_denoiser.h b/third_party/aom/av1/encoder/av1_temporal_denoiser.h
new file mode 100644
index 0000000000..14dcccce69
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_temporal_denoiser.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
+#define AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
+
+#include "av1/encoder/block.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+
+// The denoiser is used in non-SVC real-time mode, which does not use alt-ref,
+// so there is no need to allocate a buffer for it; hence REF_FRAMES - 1.
+#define NONSVC_REF_FRAMES (REF_FRAMES - 1)
+
+// Number of frame buffers when SVC is used: [0] is the current denoised
+// buffer and [1..8] map to REF_FRAMES.
+#define SVC_REF_FRAMES 9
+
+typedef enum av1_denoiser_decision {
+ COPY_BLOCK,
+ FILTER_BLOCK,
+ FILTER_ZEROMV_BLOCK
+} AV1_DENOISER_DECISION;
+
+typedef enum av1_denoiser_level {
+ kDenLowLow,
+ kDenLow,
+ kDenMedium,
+ kDenHigh
+} AV1_DENOISER_LEVEL;
+
+typedef struct av1_denoiser {
+ YV12_BUFFER_CONFIG *running_avg_y;
+ YV12_BUFFER_CONFIG *mc_running_avg_y;
+ YV12_BUFFER_CONFIG last_source;
+ int frame_buffer_initialized;
+ int reset;
+ int num_ref_frames;
+ int num_layers;
+ unsigned int current_denoiser_frame;
+ AV1_DENOISER_LEVEL denoising_level;
+ AV1_DENOISER_LEVEL prev_denoising_level;
+} AV1_DENOISER;
+
+typedef struct {
+ int64_t zero_last_cost_orig;
+ unsigned int *ref_frame_cost;
+ int_mv (*frame_mv)[REF_FRAMES];
+ int reuse_inter_pred;
+ TX_SIZE best_tx_size;
+ PREDICTION_MODE best_mode;
+ MV_REFERENCE_FRAME best_ref_frame;
+ int_interpfilters best_pred_filter;
+ uint8_t best_mode_skip_txfm;
+} AV1_PICKMODE_CTX_DEN;
+
+struct AV1_COMP;
+struct SVC;
+struct RTC_REF;
+
+void av1_denoiser_update_frame_info(
+ AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref,
+ struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame,
+ int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx, int resized,
+ int svc_refresh_denoiser_buffers, int second_spatial_layer);
+
+void av1_denoiser_denoise(struct AV1_COMP *cpi, MACROBLOCK *mb, int mi_row,
+ int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
+ AV1_DENOISER_DECISION *denoiser_decision,
+ int use_gf_temporal_ref);
+
+void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
+
+void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse,
+ PREDICTION_MODE mode,
+ PICK_MODE_CONTEXT *ctx);
+
+int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser,
+ struct RTC_REF *rtc, struct SVC *svc,
+ int svc_buf_shift, int refresh_alt,
+ int refresh_gld, int refresh_lst, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx);
+
+int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser,
+ int use_svc, int noise_sen, int width, int height,
+ int ssx, int ssy, int use_highbitdepth, int border);
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// This function is used by both the C and SSE2 denoiser implementations.
+// Define it as a static function within the scope where this header is
+// referenced.
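+// For example, at BLOCK_16X16 (256 pixels) the threshold evaluates to
+// 256 * 3 = 768 when increase_denoising is set and 256 * 2 = 512 otherwise.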
+static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs,
+ int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+#endif
+
+void av1_denoiser_free(AV1_DENOISER *denoiser);
+
+void av1_denoiser_set_noise_level(struct AV1_COMP *const cpi, int noise_level);
+
+void av1_denoiser_reset_on_first_frame(struct AV1_COMP *const cpi);
+
+int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level,
+ CONTENT_STATE_SB content_state,
+ int temporal_layer_id);
+
+int64_t av1_scale_acskip_thresh(int64_t threshold,
+ AV1_DENOISER_LEVEL noise_level, int abs_sumdiff,
+ int temporal_layer_id);
+
+void av1_denoiser_update_ref_frame(struct AV1_COMP *const cpi);
+
+void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
new file mode 100644
index 0000000000..219784fedf
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -0,0 +1,4248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem_ops.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/cdef.h"
+#include "av1/common/cfl.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+
+#define ENC_MISMATCH_DEBUG 0
+#define SETUP_TIME_OH_CONST 5 // Setup time overhead constant per worker
+#define JOB_DISP_TIME_OH_CONST 1 // Job dispatch time overhead per tile
+
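+// Writes a value v in [0, n) with a truncated binary code: the m = (1 << l) - n
+// smallest values use l - 1 bits and the rest use l bits. For example, with
+// n = 5 (l = 3, m = 3), v = 0..2 take 2 bits while v = 3..4 take 3 bits.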
+static INLINE void write_uniform(aom_writer *w, int n, int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return;
+ if (v < m) {
+ aom_write_literal(w, v, l - 1);
+ } else {
+ aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+ aom_write_literal(w, (v - m) & 1, 1);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void loop_restoration_write_sb_coeffs(
+ const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
+ aom_writer *const w, int plane, FRAME_COUNTS *counts);
+#endif
+
+static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+ const MB_MODE_INFO *mi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi,
+ PREDICTION_MODE mode,
+ aom_writer *w) {
+ assert(!is_intrabc_block(mi));
+ (void)mi;
+ aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
+ INTRA_MODES);
+}
+
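+// The single-reference inter mode is signaled as a cascade of binary
+// decisions, each with its own context: first NEWMV vs. the rest, then
+// GLOBALMV vs. the rest, and finally NEARESTMV vs. NEARMV.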
+static AOM_INLINE void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
+ FRAME_CONTEXT *ec_ctx,
+ const int16_t mode_ctx) {
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+
+ aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2);
+
+ if (mode != NEWMV) {
+ const int16_t zeromv_ctx =
+ (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2);
+
+ if (mode != GLOBALMV) {
+ int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2);
+ }
+ }
+}
+
+static AOM_INLINE void write_drl_idx(
+ FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
+ const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
+ assert(mbmi->ref_mv_idx < 3);
+
+ const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+ if (new_mv) {
+ int idx;
+ for (idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext_frame->ref_mv_count > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx);
+
+ aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx],
+ 2);
+ if (mbmi->ref_mv_idx == idx) return;
+ }
+ }
+ return;
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ int idx;
+    // TODO(jingning): Temporary solution to compensate for the NEARESTMV
+    // offset.
+ for (idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext_frame->ref_mv_count > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx);
+ aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1),
+ ec_ctx->drl_cdf[drl_ctx], 2);
+ if (mbmi->ref_mv_idx == (idx - 1)) return;
+ }
+ }
+ return;
+ }
+}
+
+static AOM_INLINE void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
+ PREDICTION_MODE mode,
+ const int16_t mode_ctx) {
+ assert(is_inter_compound_mode(mode));
+ aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode),
+ xd->tile_ctx->inter_compound_mode_cdf[mode_ctx],
+ INTER_COMPOUND_MODES);
+}
+
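+// Recursively signals the transform partition tree. At each node a binary
+// symbol says whether the coded tx_size is kept (0) or split (1) into the
+// next smaller size, recursing until the coded depth or MAX_VARTX_DEPTH is
+// reached.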
+static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi,
+ TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col,
+ aom_writer *w) {
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+ const int max_blocks_high = max_block_high(xd, mbmi->bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->bsize, 0);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (depth == MAX_VARTX_DEPTH) {
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row,
+ mbmi->bsize, tx_size);
+ const int txb_size_index =
+ av1_get_txb_size_index(mbmi->bsize, blk_row, blk_col);
+ const int write_txfm_partition =
+ tx_size == mbmi->inter_tx_size[txb_size_index];
+ if (write_txfm_partition) {
+ aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2);
+
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ // TODO(yuec): set correct txfm partition update for qttx
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+ aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2);
+
+ if (sub_txs == TX_4X4) {
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, sub_txs, tx_size);
+ return;
+ }
+
+ assert(bsw > 0 && bsh > 0);
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetc = blk_col + col;
+ write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w);
+ }
+ }
+ }
+}
+
+static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ if (block_signals_txsize(bsize)) {
+ const TX_SIZE tx_size = mbmi->tx_size;
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int depth = tx_size_to_depth(tx_size, bsize);
+ const int max_depths = bsize_to_max_depth(bsize);
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+
+ assert(depth >= 0 && depth <= max_depths);
+ assert(!is_inter_block(mbmi));
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+
+ aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+ max_depths + 1);
+ }
+}
+
+static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ uint8_t segment_id, const MB_MODE_INFO *mi,
+ aom_writer *w) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int skip_txfm = mi->skip_txfm;
+ const int ctx = av1_get_skip_txfm_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, skip_txfm, ec_ctx->skip_txfm_cdfs[ctx], 2);
+ return skip_txfm;
+ }
+}
+
+static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ uint8_t segment_id, const MB_MODE_INFO *mi,
+ aom_writer *w) {
+ if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0;
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 0;
+ }
+ const int skip_mode = mi->skip_mode;
+ if (!is_comp_ref_allowed(mi->bsize)) {
+ assert(!skip_mode);
+ return 0;
+ }
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ // These features imply single-reference mode, while skip mode implies
+ // compound reference. Hence, the two are mutually exclusive.
+ // In other words, skip_mode is implicitly 0 here.
+ assert(!skip_mode);
+ return 0;
+ }
+ const int ctx = av1_get_skip_mode_context(xd);
+ aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2);
+ return skip_mode;
+}
+
+static AOM_INLINE void write_is_inter(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd, uint8_t segment_id,
+ aom_writer *w, const int is_inter) {
+ if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ assert(is_inter);
+ return;
+ }
+ const int ctx = av1_get_intra_inter_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2);
+ }
+}
+
+static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi,
+ aom_writer *w) {
+ MOTION_MODE last_motion_mode_allowed =
+ cm->features.switchable_motion_mode
+ ? motion_mode_allowed(cm->global_motion, xd, mbmi,
+ cm->features.allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+ assert(mbmi->motion_mode <= last_motion_mode_allowed);
+ switch (last_motion_mode_allowed) {
+ case SIMPLE_TRANSLATION: break;
+ case OBMC_CAUSAL:
+ aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
+ xd->tile_ctx->obmc_cdf[mbmi->bsize], 2);
+ break;
+ default:
+ aom_write_symbol(w, mbmi->motion_mode,
+ xd->tile_ctx->motion_mode_cdf[mbmi->bsize],
+ MOTION_MODES);
+ }
+}
+
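+// Signals a quantizer delta: min(|delta|, DELTA_Q_SMALL) is coded as a symbol;
+// larger magnitudes additionally send an explicit remainder bit count (3 bits)
+// plus the remainder, and a sign bit follows any nonzero delta.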
+static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd,
+ int delta_qindex, aom_writer *w) {
+ int sign = delta_qindex < 0;
+ int abs = sign ? -delta_qindex : delta_qindex;
+ int rem_bits, thr;
+ int smallval = abs < DELTA_Q_SMALL ? 1 : 0;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf,
+ DELTA_Q_PROBS + 1);
+
+ if (!smallval) {
+ rem_bits = get_msb(abs - 1);
+ thr = (1 << rem_bits) + 1;
+ aom_write_literal(w, rem_bits - 1, 3);
+ aom_write_literal(w, abs - thr, rem_bits);
+ }
+ if (abs > 0) {
+ aom_write_bit(w, sign);
+ }
+}
+
+static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd, int lf_id,
+ int delta_lflevel,
+ int delta_lf_multi, aom_writer *w) {
+ int sign = delta_lflevel < 0;
+ int abs = sign ? -delta_lflevel : delta_lflevel;
+ int rem_bits, thr;
+ int smallval = abs < DELTA_LF_SMALL ? 1 : 0;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+
+ if (delta_lf_multi) {
+ assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
+ : FRAME_LF_COUNT - 2));
+ aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL),
+ ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1);
+ } else {
+ aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf,
+ DELTA_LF_PROBS + 1);
+ }
+
+ if (!smallval) {
+ rem_bits = get_msb(abs - 1);
+ thr = (1 << rem_bits) + 1;
+ aom_write_literal(w, rem_bits - 1, 3);
+ aom_write_literal(w, abs - thr, rem_bits);
+ }
+ if (abs > 0) {
+ aom_write_bit(w, sign);
+ }
+}
+
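+// Packs the palette color-index map tokens. The first (top-left) index has no
+// coded neighbors, so it uses a truncated binary code; each subsequent index
+// is coded with a CDF conditioned on its neighbor color context.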
+static AOM_INLINE void pack_map_tokens(aom_writer *w, const TokenExtra **tp,
+ int n, int num, MapCdf map_pb_cdf) {
+ const TokenExtra *p = *tp;
+ const int palette_size_idx = n - PALETTE_MIN_SIZE;
+ write_uniform(w, n, p->token); // The first color index.
+ ++p;
+ --num;
+ for (int i = 0; i < num; ++i) {
+ assert((p->color_ctx >= 0) &&
+ (p->color_ctx < PALETTE_COLOR_INDEX_CONTEXTS));
+ aom_cdf_prob *color_map_cdf = map_pb_cdf[palette_size_idx][p->color_ctx];
+ aom_write_symbol(w, p->token, color_map_cdf, n);
+ ++p;
+ }
+ *tp = p;
+}
+
+static AOM_INLINE void pack_txb_tokens(
+ aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TokenExtra **tp,
+ const TokenExtra *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+ int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block,
+ int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) {
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+
+ if (tx_size == plane_tx_size || plane) {
+ av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, block, tx_size);
+#if CONFIG_RD_DEBUG
+ TOKEN_STATS tmp_token_stats;
+ init_token_stats(&tmp_token_stats);
+ token_stats->cost += tmp_token_stats.cost;
+#endif
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsh * bsw;
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int r = 0; r < row_end; r += bsh) {
+ const int offsetr = blk_row + r;
+ for (int c = 0; c < col_end; c += bsw) {
+ const int offsetc = blk_col + c;
+ pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize,
+ bit_depth, block, offsetr, offsetc, sub_txs,
+ token_stats);
+ block += step;
+ }
+ }
+ }
+}
+
+static INLINE void set_spatial_segment_id(
+ const CommonModeInfoParams *const mi_params, uint8_t *segment_ids,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, uint8_t segment_id) {
+ const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
+
+ const int mi_stride = mi_params->mi_cols;
+
+ set_segment_id(segment_ids, mi_offset, xmis, ymis, mi_stride, segment_id);
+}
+
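+// Maps x onto a code in [0, max) in which values close to the predictor "ref"
+// get the smallest indices: x == ref maps to 0 and the codes alternate on
+// either side of ref. For example, with max = 8 and ref = 3, x = 3, 4, 2, 5, 1
+// map to 0, 1, 2, 3, 4. Used to code segment ids against the spatial
+// prediction.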
+int av1_neg_interleave(int x, int ref, int max) {
+ assert(x < max);
+ const int diff = x - ref;
+ if (!ref) return x;
+ if (ref >= (max - 1)) return -x + max - 1;
+ if (2 * ref < max) {
+ if (abs(diff) <= ref) {
+ if (diff > 0)
+ return (diff << 1) - 1;
+ else
+ return ((-diff) << 1);
+ }
+ return x;
+ } else {
+ if (abs(diff) < (max - ref)) {
+ if (diff > 0)
+ return (diff << 1) - 1;
+ else
+ return ((-diff) << 1);
+ }
+ return (max - x) - 1;
+ }
+}
+
+static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd,
+ const MB_MODE_INFO *const mbmi,
+ aom_writer *w,
+ const struct segmentation *seg,
+ struct segmentation_probs *segp,
+ int skip_txfm) {
+ if (!seg->enabled || !seg->update_map) return;
+
+ AV1_COMMON *const cm = &cpi->common;
+ int cdf_num;
+ const uint8_t pred = av1_get_spatial_seg_pred(
+ cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ if (skip_txfm) {
+    // The tx size still needs to be transmitted for intra blocks even if
+    // skip_txfm is true. Changing segment_id may make the tx size invalid,
+    // e.g., changing from lossless to lossy.
+ assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment);
+
+ set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
+ mi_row, mi_col, pred);
+ set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->bsize,
+ mi_row, mi_col, pred);
+ /* mbmi is read only but we need to update segment_id */
+ ((MB_MODE_INFO *)mbmi)->segment_id = pred;
+ return;
+ }
+
+ const int coded_id =
+ av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
+ aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
+ aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
+ set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
+ mi_row, mi_col, mbmi->segment_id);
+}
+
+#define WRITE_REF_BIT(bname, pname) \
+ aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2)
+
+// This function encodes the reference frame(s) of the current block.
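+// Single references are coded with a bit tree: first forward (LAST..GOLDEN)
+// vs. backward (BWDREF..ALTREF), then down to the individual frame. Compound
+// references first signal whether the pair is unidirectional or bidirectional.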
+static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd, aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ const uint8_t segment_id = mbmi->segment_id;
+
+  // If segment-level coding of this signal is disabled, or the segment allows
+  // multiple reference frame options, the reference frame is coded explicitly.
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] ==
+ get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+ } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] == LAST_FRAME);
+ } else {
+      // Signal whether compound prediction is used, unless that is already
+      // determined at the frame or segment level.
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+ if (is_comp_ref_allowed(mbmi->bsize))
+ aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2);
+ } else {
+ assert((!is_compound) ==
+ (cm->current_frame.reference_mode == SINGLE_REFERENCE));
+ }
+
+ if (is_compound) {
+ const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+ ? UNIDIR_COMP_REFERENCE
+ : BIDIR_COMP_REFERENCE;
+ aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd),
+ 2);
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ const int bit = mbmi->ref_frame[0] == BWDREF_FRAME;
+ WRITE_REF_BIT(bit, uni_comp_ref_p);
+
+ if (!bit) {
+ assert(mbmi->ref_frame[0] == LAST_FRAME);
+ const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME ||
+ mbmi->ref_frame[1] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit1, uni_comp_ref_p1);
+ if (bit1) {
+ const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit2, uni_comp_ref_p2);
+ }
+ } else {
+ assert(mbmi->ref_frame[1] == ALTREF_FRAME);
+ }
+
+ return;
+ }
+
+ assert(comp_ref_type == BIDIR_COMP_REFERENCE);
+
+ const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME);
+ WRITE_REF_BIT(bit, comp_ref_p);
+
+ if (!bit) {
+ const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME;
+ WRITE_REF_BIT(bit1, comp_ref_p1);
+ } else {
+ const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit2, comp_ref_p2);
+ }
+
+ const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
+ WRITE_REF_BIT(bit_bwd, comp_bwdref_p);
+
+ if (!bit_bwd) {
+ WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1);
+ }
+
+ } else {
+ const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME &&
+ mbmi->ref_frame[0] >= BWDREF_FRAME);
+ WRITE_REF_BIT(bit0, single_ref_p1);
+
+ if (bit0) {
+ const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
+ WRITE_REF_BIT(bit1, single_ref_p2);
+
+ if (!bit1) {
+ WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6);
+ }
+ } else {
+ const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[0] == GOLDEN_FRAME);
+ WRITE_REF_BIT(bit2, single_ref_p3);
+
+ if (!bit2) {
+ const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
+ WRITE_REF_BIT(bit3, single_ref_p4);
+ } else {
+ const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
+ WRITE_REF_BIT(bit4, single_ref_p5);
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void write_filter_intra_mode_info(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ aom_writer *w) {
+ if (av1_filter_intra_allowed(cm, mbmi)) {
+ aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra,
+ xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2);
+ if (mbmi->filter_intra_mode_info.use_filter_intra) {
+ const FILTER_INTRA_MODE mode =
+ mbmi->filter_intra_mode_info.filter_intra_mode;
+ aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf,
+ FILTER_INTRA_MODES);
+ }
+ }
+}
+
+static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta,
+ aom_cdf_prob *cdf) {
+ aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf,
+ 2 * MAX_ANGLE_DELTA + 1);
+}
+
+static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm,
+ ThreadData *td, aom_writer *w) {
+ const MACROBLOCKD *xd = &td->mb.e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (!av1_is_interp_needed(xd)) {
+ int_interpfilters filters = av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->features.interp_filter));
+ assert(mbmi->interp_filters.as_int == filters.as_int);
+ (void)filters;
+ return;
+ }
+ if (cm->features.interp_filter == SWITCHABLE) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ InterpFilter filter =
+ av1_extract_interp_filter(mbmi->interp_filters, dir);
+ aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
+ SWITCHABLE_FILTERS);
+ ++td->interp_filter_selected[filter];
+ if (cm->seq_params->enable_dual_filter == 0) return;
+ }
+ }
+}
+
+// Transmit color values with delta encoding. Write the first value as a
+// literal and then the delta between each value and the previous one.
+// "min_val" is the smallest possible value of the deltas.
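+// For example, colors {10, 15, 22} at 8-bit depth with min_val = 1 send the
+// literal 10 followed by the biased deltas 5 - 1 = 4 and 7 - 1 = 6; the delta
+// bit width is derived from max_delta (floored at bit_depth - 3) and may
+// shrink as the remaining value range narrows.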
+static AOM_INLINE void delta_encode_palette_colors(const int *colors, int num,
+ int bit_depth, int min_val,
+ aom_writer *w) {
+ if (num <= 0) return;
+ assert(colors[0] < (1 << bit_depth));
+ aom_write_literal(w, colors[0], bit_depth);
+ if (num == 1) return;
+ int max_delta = 0;
+ int deltas[PALETTE_MAX_SIZE];
+ memset(deltas, 0, sizeof(deltas));
+ for (int i = 1; i < num; ++i) {
+ assert(colors[i] < (1 << bit_depth));
+ const int delta = colors[i] - colors[i - 1];
+ deltas[i - 1] = delta;
+ assert(delta >= min_val);
+ if (delta > max_delta) max_delta = delta;
+ }
+ const int min_bits = bit_depth - 3;
+ int bits = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits);
+ assert(bits <= bit_depth);
+ int range = (1 << bit_depth) - colors[0] - min_val;
+ aom_write_literal(w, bits - min_bits, 2);
+ for (int i = 0; i < num - 1; ++i) {
+ aom_write_literal(w, deltas[i] - min_val, bits);
+ range -= deltas[i];
+ bits = AOMMIN(bits, av1_ceil_log2(range));
+ }
+}
+
+// Transmit luma palette color values. First signal if each color in the color
+// cache is used. Those colors that are not in the cache are transmitted with
+// delta encoding.
+static AOM_INLINE void write_palette_colors_y(
+ const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ const int n = pmi->palette_size[0];
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache =
+ av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n,
+ cache_color_found, out_cache_colors);
+ int n_in_cache = 0;
+ for (int i = 0; i < n_cache && n_in_cache < n; ++i) {
+ const int found = cache_color_found[i];
+ aom_write_bit(w, found);
+ n_in_cache += found;
+ }
+ assert(n_in_cache + n_out_cache == n);
+ delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w);
+}
+
+// Write chroma palette color values. The U channel is handled like the luma
+// channel. For the V channel, either delta encoding or raw values are used,
+// whichever costs fewer bits.
+static AOM_INLINE void write_palette_colors_uv(
+ const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ const int n = pmi->palette_size[1];
+ const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE;
+ const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
+ // U channel colors.
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache = av1_index_color_cache(
+ color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors);
+ int n_in_cache = 0;
+ for (int i = 0; i < n_cache && n_in_cache < n; ++i) {
+ const int found = cache_color_found[i];
+ aom_write_bit(w, found);
+ n_in_cache += found;
+ }
+ delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w);
+
+ // V channel colors. Don't use color cache as the colors are not sorted.
+ const int max_val = 1 << bit_depth;
+ int zero_count = 0, min_bits_v = 0;
+ int bits_v =
+ av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
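+  // Rate with delta coding: 2 bits for the bit-width header, bit_depth bits
+  // for the first color, and (bits_v + 1) bits (magnitude plus sign) for each
+  // of the remaining n - 1 colors, minus the sign bits saved on zero deltas.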
+ const int rate_using_delta =
+ 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+ const int rate_using_raw = bit_depth * n;
+ if (rate_using_delta < rate_using_raw) { // delta encoding
+ assert(colors_v[0] < (1 << bit_depth));
+ aom_write_bit(w, 1);
+ aom_write_literal(w, bits_v - min_bits_v, 2);
+ aom_write_literal(w, colors_v[0], bit_depth);
+ for (int i = 1; i < n; ++i) {
+ assert(colors_v[i] < (1 << bit_depth));
+ if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit.
+ aom_write_literal(w, 0, bits_v);
+ continue;
+ }
+ const int delta = abs((int)colors_v[i] - colors_v[i - 1]);
+ const int sign_bit = colors_v[i] < colors_v[i - 1];
+ if (delta <= max_val - delta) {
+ aom_write_literal(w, delta, bits_v);
+ aom_write_bit(w, sign_bit);
+ } else {
+ aom_write_literal(w, max_val - delta, bits_v);
+ aom_write_bit(w, !sign_bit);
+ }
+ }
+ } else { // Transmit raw values.
+ aom_write_bit(w, 0);
+ for (int i = 0; i < n; ++i) {
+ assert(colors_v[i] < (1 << bit_depth));
+ aom_write_literal(w, colors_v[i], bit_depth);
+ }
+ }
+}
+
+static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *const mbmi,
+ aom_writer *w) {
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+ if (mbmi->mode == DC_PRED) {
+ const int n = pmi->palette_size[0];
+ const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd);
+ aom_write_symbol(
+ w, n > 0,
+ xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2);
+ if (n > 0) {
+ aom_write_symbol(w, n - PALETTE_MIN_SIZE,
+ xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
+ PALETTE_SIZES);
+ write_palette_colors_y(xd, pmi, cm->seq_params->bit_depth, w);
+ }
+ }
+
+ const int uv_dc_pred =
+ num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref;
+ if (uv_dc_pred) {
+ const int n = pmi->palette_size[1];
+ const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+ aom_write_symbol(w, n > 0,
+ xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2);
+ if (n > 0) {
+ aom_write_symbol(w, n - PALETTE_MIN_SIZE,
+ xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
+ PALETTE_SIZES);
+ write_palette_colors_uv(xd, pmi, cm->seq_params->bit_depth, w);
+ }
+ }
+}
+
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+ TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const FeatureFlags *const features = &cm->features;
+ const int is_inter = is_inter_block(mbmi);
+ if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) > 1 &&
+ ((!cm->seg.enabled && cm->quant_params.base_qindex > 0) ||
+ (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+ !mbmi->skip_txfm &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+ const TxSetType tx_set_type = av1_get_ext_tx_set_type(
+ tx_size, is_inter, features->reduced_tx_set_used);
+ const int eset =
+ get_ext_tx_set(tx_size, is_inter, features->reduced_tx_set_used);
+ // eset == 0 should correspond to a set with only DCT_DCT and there
+ // is no need to send the tx_type
+ assert(eset > 0);
+ assert(av1_ext_tx_used[tx_set_type][tx_type]);
+ if (is_inter) {
+ aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type],
+ ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+ av1_num_ext_tx_set[tx_set_type]);
+ } else {
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir =
+ fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+ aom_write_symbol(
+ w, av1_ext_tx_ind[tx_set_type][tx_type],
+ ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+ }
+}
+
+static AOM_INLINE void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx,
+ BLOCK_SIZE bsize,
+ PREDICTION_MODE mode,
+ aom_writer *w) {
+ aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]],
+ INTRA_MODES);
+}
+
+static AOM_INLINE void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
+ UV_PREDICTION_MODE uv_mode,
+ PREDICTION_MODE y_mode,
+ CFL_ALLOWED_TYPE cfl_allowed,
+ aom_writer *w) {
+ aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode],
+ UV_INTRA_MODES - !cfl_allowed);
+}
+
+static AOM_INLINE void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx,
+ uint8_t idx, int8_t joint_sign,
+ aom_writer *w) {
+ aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS);
+ // Magnitudes are only signaled for nonzero codes.
+ if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+ aom_write_symbol(w, CFL_IDX_U(idx), cdf_u, CFL_ALPHABET_SIZE);
+ }
+ if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+ aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE);
+ }
+}
+
+static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd,
+ aom_writer *w, int skip) {
+ if (cm->features.coded_lossless || cm->features.allow_intrabc) return;
+
+ // At the start of a superblock, mark that we haven't yet written CDEF
+ // strengths for any of the CDEF units contained in this superblock.
+ const int sb_mask = (cm->seq_params->mib_size - 1);
+ const int mi_row_in_sb = (xd->mi_row & sb_mask);
+ const int mi_col_in_sb = (xd->mi_col & sb_mask);
+ if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
+ xd->cdef_transmitted[0] = xd->cdef_transmitted[1] =
+ xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false;
+ }
+
+ // CDEF unit size is 64x64 irrespective of the superblock size.
+ const int cdef_size = 1 << (6 - MI_SIZE_LOG2);
+
+ // Find index of this CDEF unit in this superblock.
+ const int index_mask = cdef_size;
+ const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
+ const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
+ const int index = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
+ : 0;
+
+ // Write CDEF strength to the first non-skip coding block in this CDEF unit.
+ if (!xd->cdef_transmitted[index] && !skip) {
+ // CDEF strength for this CDEF unit needs to be stored in the MB_MODE_INFO
+ // of the 1st block in this CDEF unit.
+ const int first_block_mask = ~(cdef_size - 1);
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int grid_idx =
+ get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask,
+ xd->mi_col & first_block_mask);
+ const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx];
+ aom_write_literal(w, mbmi->cdef_strength, cm->cdef_info.cdef_bits);
+ xd->cdef_transmitted[index] = true;
+ }
+}
+
+static AOM_INLINE void write_inter_segment_id(
+ AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w,
+ const struct segmentation *const seg, struct segmentation_probs *const segp,
+ int skip, int preskip) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ if (seg->update_map) {
+ if (preskip) {
+ if (!seg->segid_preskip) return;
+ } else {
+ if (seg->segid_preskip) return;
+ if (skip) {
+ write_segment_id(cpi, xd, mbmi, w, seg, segp, 1);
+ if (seg->temporal_update) mbmi->seg_id_predicted = 0;
+ return;
+ }
+ }
+ if (seg->temporal_update) {
+ const int pred_flag = mbmi->seg_id_predicted;
+ aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
+ aom_write_symbol(w, pred_flag, pred_cdf, 2);
+ if (!pred_flag) {
+ write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
+ }
+ if (pred_flag) {
+ set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map,
+ mbmi->bsize, mi_row, mi_col, mbmi->segment_id);
+ }
+ } else {
+ write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
+ }
+ }
+}
+
+// If delta-q signaling is enabled, writes the delta_q index. Also writes the
+// delta loop filter levels, if those are enabled.
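+// The delta is transmitted in units of delta_q_res; e.g., with delta_q_res = 4
+// a qindex change of +8 relative to the running base is coded as +2.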
+static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, int skip,
+ aom_writer *w) {
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+
+ if (delta_q_info->delta_q_present_flag) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int super_block_upper_left =
+ ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+ ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0);
+
+ if ((bsize != cm->seq_params->sb_size || skip == 0) &&
+ super_block_upper_left) {
+ assert(mbmi->current_qindex > 0);
+ const int reduced_delta_qindex =
+ (mbmi->current_qindex - xd->current_base_qindex) /
+ delta_q_info->delta_q_res;
+ write_delta_qindex(xd, reduced_delta_qindex, w);
+ xd->current_base_qindex = mbmi->current_qindex;
+ if (delta_q_info->delta_lf_present_flag) {
+ if (delta_q_info->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ int reduced_delta_lflevel =
+ (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
+ delta_q_info->delta_lf_res;
+ write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, 1, w);
+ xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+ }
+ } else {
+ int reduced_delta_lflevel =
+ (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+ delta_q_info->delta_lf_res;
+ write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, 0, w);
+ xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm,
+ MACROBLOCKD *const xd,
+ int is_keyframe,
+ aom_writer *w) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PREDICTION_MODE mode = mbmi->mode;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+
+ // Y mode.
+ if (is_keyframe) {
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w);
+ } else {
+ write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w);
+ }
+
+ // Y angle delta.
+ const int use_angle_delta = av1_use_angle_delta(bsize);
+ if (use_angle_delta && av1_is_directional_mode(mode)) {
+ write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
+ ec_ctx->angle_delta_cdf[mode - V_PRED]);
+ }
+
+ // UV mode and UV angle delta.
+ if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
+ if (uv_mode == UV_CFL_PRED)
+ write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
+ const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ if (use_angle_delta && av1_is_directional_mode(intra_mode)) {
+ write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
+ ec_ctx->angle_delta_cdf[intra_mode - V_PRED]);
+ }
+ }
+
+ // Palette.
+ if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+ write_palette_mode_info(cm, xd, mbmi, w);
+ }
+
+ // Filter intra.
+ write_filter_intra_mode_info(cm, xd, mbmi, w);
+}
+
+static INLINE int16_t mode_context_analyzer(
+ const int16_t mode_context, const MV_REFERENCE_FRAME *const rf) {
+ if (rf[1] <= INTRA_FRAME) return mode_context;
+
+ const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+ const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+ const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(
+ newmv_ctx, COMP_NEWMV_CTXS - 1)];
+ return comp_ctx;
+}
+
+static INLINE int_mv get_ref_mv_from_stack(
+ int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx,
+ const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame) {
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack;
+
+ if (ref_frame[1] > INTRA_FRAME) {
+ assert(ref_idx == 0 || ref_idx == 1);
+ return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv
+ : curr_ref_mv_stack[ref_mv_idx].this_mv;
+ }
+
+ assert(ref_idx == 0);
+ return ref_mv_idx < mbmi_ext_frame->ref_mv_count
+ ? curr_ref_mv_stack[ref_mv_idx].this_mv
+ : mbmi_ext_frame->global_mvs[ref_frame_type];
+}
+
+static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) {
+ assert(has_second_ref(mbmi));
+ ref_mv_idx += 1;
+ }
+ return get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx,
+ x->mbmi_ext_frame);
+}
+
+static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td,
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame;
+ const PREDICTION_MODE mode = mbmi->mode;
+ const uint8_t segment_id = mbmi->segment_id;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int allow_hp = cm->features.allow_high_precision_mv;
+ const int is_inter = is_inter_block(mbmi);
+ const int is_compound = has_second_ref(mbmi);
+ int ref;
+
+ write_inter_segment_id(cpi, xd, w, seg, segp, 0, 1);
+
+ write_skip_mode(cm, xd, segment_id, mbmi, w);
+
+ assert(IMPLIES(mbmi->skip_mode, mbmi->skip_txfm));
+ const int skip =
+ mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
+
+ write_inter_segment_id(cpi, xd, w, seg, segp, skip, 0);
+
+ write_cdef(cm, xd, w, skip);
+
+ write_delta_q_params(cm, xd, skip, w);
+
+ if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+
+ if (mbmi->skip_mode) return;
+
+ if (!is_inter) {
+ write_intra_prediction_modes(cm, xd, 0, w);
+ } else {
+ int16_t mode_ctx;
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ write_ref_frames(cm, xd, w);
+
+ mode_ctx =
+ mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame);
+
+ // If segment skip is not enabled code the mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+ if (is_inter_compound_mode(mode))
+ write_inter_compound_mode(xd, w, mode, mode_ctx);
+ else if (is_inter_singleref_mode(mode))
+ write_inter_mode(w, mode, ec_ctx, mode_ctx);
+
+ if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode))
+ write_drl_idx(ec_ctx, mbmi, mbmi_ext_frame, w);
+ else
+ assert(mbmi->ref_mv_idx == 0);
+ }
+
+ if (mode == NEWMV || mode == NEW_NEWMV) {
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = get_ref_mv(x, ref);
+ av1_encode_mv(cpi, w, td, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = get_ref_mv(x, 1);
+ av1_encode_mv(cpi, w, td, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = get_ref_mv(x, 0);
+ av1_encode_mv(cpi, w, td, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ }
+
+ if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE &&
+ cpi->common.seq_params->enable_interintra_compound &&
+ is_interintra_allowed(mbmi)) {
+ const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
+ const int bsize_group = size_group_lookup[bsize];
+ aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2);
+ if (interintra) {
+ aom_write_symbol(w, mbmi->interintra_mode,
+ ec_ctx->interintra_mode_cdf[bsize_group],
+ INTERINTRA_MODES);
+ if (av1_is_wedge_used(bsize)) {
+ aom_write_symbol(w, mbmi->use_wedge_interintra,
+ ec_ctx->wedge_interintra_cdf[bsize], 2);
+ if (mbmi->use_wedge_interintra) {
+ aom_write_symbol(w, mbmi->interintra_wedge_index,
+ ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES);
+ }
+ }
+ }
+ }
+
+ if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w);
+
+    // First write comp_group_idx to indicate the compound inter prediction
+    // mode group in use:
+    // Group A (0): dist_wtd_comp, compound_average
+    // Group B (1): interintra, compound_diffwtd, wedge
+ if (has_second_ref(mbmi)) {
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params->enable_masked_compound;
+
+ if (masked_compound_used) {
+ const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
+ aom_write_symbol(w, mbmi->comp_group_idx,
+ ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2);
+ } else {
+ assert(mbmi->comp_group_idx == 0);
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ if (mbmi->compound_idx)
+ assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE);
+
+ if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ aom_write_symbol(w, mbmi->compound_idx,
+ ec_ctx->compound_index_cdf[comp_index_ctx], 2);
+ } else {
+ assert(mbmi->compound_idx == 1);
+ }
+ } else {
+ assert(cpi->common.current_frame.reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+ assert(masked_compound_used);
+ // compound_diffwtd, wedge
+ assert(mbmi->interinter_comp.type == COMPOUND_WEDGE ||
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+ aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE,
+ ec_ctx->compound_type_cdf[bsize],
+ MASKED_COMPOUND_TYPES);
+
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+ aom_write_symbol(w, mbmi->interinter_comp.wedge_index,
+ ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES);
+ aom_write_bit(w, mbmi->interinter_comp.wedge_sign);
+ } else {
+ assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+ aom_write_literal(w, mbmi->interinter_comp.mask_type,
+ MAX_DIFFWTD_MASK_BITS);
+ }
+ }
+ }
+ write_mb_interp_filter(cm, td, w);
+ }
+}
+
+static AOM_INLINE void write_intrabc_info(
+ MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ int use_intrabc = is_intrabc_block(mbmi);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2);
+ if (use_intrabc) {
+ assert(mbmi->mode == DC_PRED);
+ assert(mbmi->uv_mode == UV_DC_PRED);
+ assert(mbmi->motion_mode == SIMPLE_TRANSLATION);
+ int_mv dv_ref = mbmi_ext_frame->ref_mv_stack[0].this_mv;
+ av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc);
+ }
+}
+
+static AOM_INLINE void write_mb_modes_kf(
+ AV1_COMP *cpi, MACROBLOCKD *xd,
+ const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ if (seg->segid_preskip && seg->update_map)
+ write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
+
+ const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w);
+
+ if (!seg->segid_preskip && seg->update_map)
+ write_segment_id(cpi, xd, mbmi, w, seg, segp, skip);
+
+ write_cdef(cm, xd, w, skip);
+
+ write_delta_q_params(cm, xd, skip, w);
+
+ if (av1_allow_intrabc(cm)) {
+ write_intrabc_info(xd, mbmi_ext_frame, w);
+ if (is_intrabc_block(mbmi)) return;
+ }
+
+ write_intra_prediction_modes(cm, xd, 1, w);
+}
+
+#if CONFIG_RD_DEBUG
+static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) {
+ printf("\nmi->mi_row == %d\n", mi->mi_row);
+ printf("&& mi->mi_col == %d\n", mi->mi_col);
+ printf("&& mi->bsize == %d\n", mi->bsize);
+ printf("&& mi->tx_size == %d\n", mi->tx_size);
+ printf("&& mi->mode == %d\n", mi->mode);
+}
+
+static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
+ int plane) {
+ if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) {
+ printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n",
+ plane, rd_stats->txb_coeff_cost[plane], token_stats->cost);
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+#if ENC_MISMATCH_DEBUG
+static AOM_INLINE void enc_dump_logs(
+ const AV1_COMMON *const cm,
+ const MBMIExtFrameBufferInfo *const mbmi_ext_info, int mi_row, int mi_col) {
+ const MB_MODE_INFO *const mbmi = *(
+ cm->mi_params.mi_grid_base + (mi_row * cm->mi_params.mi_stride + mi_col));
+ const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame =
+ mbmi_ext_info->frame_base + get_mi_ext_idx(mi_row, mi_col,
+ cm->mi_params.mi_alloc_bsize,
+ mbmi_ext_info->stride);
+ if (is_inter_block(mbmi)) {
+#define FRAME_TO_CHECK 11
+ if (cm->current_frame.frame_number == FRAME_TO_CHECK &&
+ cm->show_frame == 1) {
+ const BLOCK_SIZE bsize = mbmi->bsize;
+
+ int_mv mv[2] = { 0 };
+ const int is_comp_ref = has_second_ref(mbmi);
+
+ for (int ref = 0; ref < 1 + is_comp_ref; ++ref)
+ mv[ref].as_mv = mbmi->mv[ref].as_mv;
+
+ if (!is_comp_ref) {
+ mv[1].as_int = 0;
+ }
+
+ const int16_t mode_ctx =
+ is_comp_ref ? 0
+ : mode_context_analyzer(mbmi_ext_frame->mode_context,
+ mbmi->ref_frame);
+
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+ int16_t zeromv_ctx = -1;
+ int16_t refmv_ctx = -1;
+
+ if (mbmi->mode != NEWMV) {
+ zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ if (mbmi->mode != GLOBALMV)
+ refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ }
+
+ printf(
+ "=== ENCODER ===: "
+ "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, "
+ "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
+ "ref[1]=%d, motion_mode=%d, mode_ctx=%d, "
+ "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n",
+ cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode,
+ mbmi->mode, bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col,
+ mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0],
+ mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx,
+ zeromv_ctx, refmv_ctx, mbmi->tx_size);
+ }
+ }
+}
+#endif // ENC_MISMATCH_DEBUG
+
+static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td,
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ MB_MODE_INFO *m = xd->mi[0];
+
+ if (frame_is_intra_only(cm)) {
+ write_mb_modes_kf(cpi, xd, td->mb.mbmi_ext_frame, w);
+ } else {
+    // has_subpel_mv_component needs the ref frame buffers set up to look
+    // up if they are scaled. has_subpel_mv_component is in turn needed by
+    // write_mb_interp_filter, which is called by pack_inter_mode_mvs.
+ set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]);
+
+#if ENC_MISMATCH_DEBUG
+ enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col);
+#endif // ENC_MISMATCH_DEBUG
+
+ pack_inter_mode_mvs(cpi, td, w);
+ }
+}
+
+static AOM_INLINE void write_inter_txb_coeff(
+ AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi,
+ aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end,
+ TOKEN_STATS *token_stats, const int row, const int col, int *block,
+ const int plane) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ const int bkw = tx_size_wide_unit[max_tx_size];
+ const int bkh = tx_size_high_unit[max_tx_size];
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+ const int num_4x4_w = mi_size_wide[plane_bsize];
+ const int num_4x4_h = mi_size_high[plane_bsize];
+ const int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ const int mu_blocks_high = mi_size_high[max_unit_bsize];
+ const int unit_height = AOMMIN(mu_blocks_high + (row >> ss_y), num_4x4_h);
+ const int unit_width = AOMMIN(mu_blocks_wide + (col >> ss_x), num_4x4_w);
+ for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) {
+ for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) {
+ pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
+ cm->seq_params->bit_depth, *block, blk_row, blk_col,
+ max_tx_size, token_stats);
+ *block += step;
+ }
+ }
+}
+
+static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x,
+ aom_writer *w, const TokenExtra **tok,
+ const TokenExtra *const tok_end) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+
+ assert(!mbmi->skip_txfm);
+
+ const int is_inter = is_inter_block(mbmi);
+ if (!is_inter) {
+ av1_write_intra_coeffs_mb(cm, x, w, bsize);
+ } else {
+ int block[MAX_MB_PLANE] = { 0 };
+ assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ const int num_4x4_w = mi_size_wide[bsize];
+ const int num_4x4_h = mi_size_high[bsize];
+ TOKEN_STATS token_stats;
+ init_token_stats(&token_stats);
+
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ assert(max_unit_bsize == get_plane_block_size(BLOCK_64X64,
+ xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+ mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high);
+
+ const int num_planes = av1_num_planes(cm);
+ for (int row = 0; row < num_4x4_h; row += mu_blocks_high) {
+ for (int col = 0; col < num_4x4_w; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, row,
+ col, &block[plane], plane);
+ }
+ }
+ }
+#if CONFIG_RD_DEBUG
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (mbmi->bsize >= BLOCK_8X8 &&
+ rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
+ dump_mode_info(mbmi);
+ assert(0);
+ }
+ }
+#endif // CONFIG_RD_DEBUG
+ }
+}
+
+static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td,
+ const TileInfo *const tile, aom_writer *w,
+ const TokenExtra **tok,
+ const TokenExtra *const tok_end,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCKD *xd = &td->mb.e_mbd;
+ FRAME_CONTEXT *tile_ctx = xd->tile_ctx;
+ const int grid_idx = mi_row * mi_params->mi_stride + mi_col;
+ xd->mi = mi_params->mi_grid_base + grid_idx;
+ td->mb.mbmi_ext_frame =
+ cpi->mbmi_ext_info.frame_base +
+ get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize,
+ cpi->mbmi_ext_info.stride);
+ xd->tx_type_map = mi_params->tx_type_map + grid_idx;
+ xd->tx_type_map_stride = mi_params->mi_stride;
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ assert(bsize <= cm->seq_params->sb_size ||
+ (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL));
+
+ const int bh = mi_size_high[bsize];
+ const int bw = mi_size_wide[bsize];
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
+ mi_params->mi_cols);
+
+ xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ write_mbmi_b(cpi, td, w);
+
+ for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) {
+ const uint8_t palette_size_plane =
+ mbmi->palette_mode_info.palette_size[plane];
+ assert(!mbmi->skip_mode || !palette_size_plane);
+ if (palette_size_plane > 0) {
+ assert(mbmi->use_intrabc == 0);
+ assert(av1_allow_palette(cm->features.allow_screen_content_tools,
+ mbmi->bsize));
+ assert(!plane || xd->is_chroma_ref);
+ int rows, cols;
+ av1_get_block_dimensions(mbmi->bsize, plane, xd, NULL, NULL, &rows,
+ &cols);
+ assert(*tok < tok_end);
+ MapCdf map_pb_cdf = plane ? tile_ctx->palette_uv_color_index_cdf
+ : tile_ctx->palette_y_color_index_cdf;
+ pack_map_tokens(w, tok, palette_size_plane, rows * cols, map_pb_cdf);
+ }
+ }
+
+ const int is_inter_tx = is_inter_block(mbmi);
+ const int skip_txfm = mbmi->skip_txfm;
+ const uint8_t segment_id = mbmi->segment_id;
+ if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+ !(is_inter_tx && skip_txfm) && !xd->lossless[segment_id]) {
+ if (is_inter_tx) { // This implies skip flag is 0.
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+ const int txbh = tx_size_high_unit[max_tx_size];
+ const int txbw = tx_size_wide_unit[max_tx_size];
+ const int width = mi_size_wide[bsize];
+ const int height = mi_size_high[bsize];
+ for (int idy = 0; idy < height; idy += txbh) {
+ for (int idx = 0; idx < width; idx += txbw) {
+ write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w);
+ }
+ }
+ } else {
+ write_selected_tx_size(xd, w);
+ set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd);
+ }
+ } else {
+ set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
+ skip_txfm && is_inter_tx, xd);
+ }
+
+ if (!mbmi->skip_txfm) {
+ int start = aom_tell_size(w);
+
+ write_tokens_b(cpi, &td->mb, w, tok, tok_end);
+
+ const int end = aom_tell_size(w);
+ td->coefficient_size += end - start;
+ }
+}
+
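+// Signals the partition type of a block. Blocks that extend past the right
+// or bottom frame edge can only use a restricted set of partitions, so a
+// binary CDF gathered from the full partition CDF is written instead; blocks
+// past both edges are implicitly PARTITION_SPLIT and nothing is written.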
+static AOM_INLINE void write_partition(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd, int hbs,
+ int mi_row, int mi_col, PARTITION_TYPE p,
+ BLOCK_SIZE bsize, aom_writer *w) {
+ const int is_partition_point = bsize >= BLOCK_8X8;
+
+ if (!is_partition_point) return;
+
+ const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols;
+ const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (!has_rows && !has_cols) {
+ assert(p == PARTITION_SPLIT);
+ return;
+ }
+
+ if (has_rows && has_cols) {
+ aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx],
+ partition_cdf_length(bsize));
+ } else if (!has_rows && has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ assert(bsize > BLOCK_8X8);
+ aom_cdf_prob cdf[2];
+ partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize);
+ aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
+ } else {
+ assert(has_rows && !has_cols);
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ assert(bsize > BLOCK_8X8);
+ aom_cdf_prob cdf[2];
+ partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize);
+ aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
+ }
+}
+
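+// Recursively writes one superblock (or sub-block): first the loop
+// restoration coefficients for any restoration units anchored in this block,
+// then the partition type, then the contained blocks according to that
+// partition.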
+static AOM_INLINE void write_modes_sb(
+ AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile,
+ aom_writer *const w, const TokenExtra **tok,
+ const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int quarter_step = mi_size_wide[bsize] / 4;
+ int i;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+#if !CONFIG_REALTIME_ONLY
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ int rcol0, rcol1, rrow0, rrow1;
+
+ // Skip some unnecessary work if loop restoration is disabled
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
+ if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+ &rcol0, &rcol1, &rrow0, &rrow1)) {
+ const int rstride = cm->rst_info[plane].horz_units;
+ for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+ for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+ const int runit_idx = rcol + rrow * rstride;
+ loop_restoration_write_sb_coeffs(cm, xd, runit_idx, w, plane,
+ td->counts);
+ }
+ }
+ }
+ }
+#endif
+
+ write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+ switch (partition) {
+ case PARTITION_NONE:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_row + hbs < mi_params->mi_rows)
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_col + hbs < mi_params->mi_cols)
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT:
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs,
+ subsize);
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col,
+ subsize);
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
+ subsize);
+ break;
+ case PARTITION_HORZ_A:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_HORZ_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
+
+ write_modes_b(cpi, td, tile, w, tok, tok_end, this_mi_row, mi_col);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
+
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, this_mi_col);
+ }
+ break;
+ default: assert(0);
+ }
+
+ // update partition context
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+// Populate token pointers appropriately based on token_info.
+static AOM_INLINE void get_token_pointers(const TokenInfo *token_info,
+ const int tile_row, int tile_col,
+ const int sb_row_in_tile,
+ const TokenExtra **tok,
+ const TokenExtra **tok_end) {
+ if (!is_token_info_allocated(token_info)) {
+ *tok = NULL;
+ *tok_end = NULL;
+ return;
+ }
+ *tok = token_info->tplist[tile_row][tile_col][sb_row_in_tile].start;
+ *tok_end =
+ *tok + token_info->tplist[tile_row][tile_col][sb_row_in_tile].count;
+}
+
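+// Writes all mode and coefficient data for a single tile, iterating over
+// superblock rows and resetting the entropy contexts at tile boundaries.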
+static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td,
+ const TileInfo *const tile,
+ aom_writer *const w, int tile_row,
+ int tile_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ const int mi_row_start = tile->mi_row_start;
+ const int mi_row_end = tile->mi_row_end;
+ const int mi_col_start = tile->mi_col_start;
+ const int mi_col_end = tile->mi_col_end;
+ const int num_planes = av1_num_planes(cm);
+
+ av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row);
+ av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd);
+
+ if (cpi->common.delta_q_info.delta_q_present_flag) {
+ xd->current_base_qindex = cpi->common.quant_params.base_qindex;
+ if (cpi->common.delta_q_info.delta_lf_present_flag) {
+ av1_reset_loop_filter_delta(xd, num_planes);
+ }
+ }
+
+ for (int mi_row = mi_row_start; mi_row < mi_row_end;
+ mi_row += cm->seq_params->mib_size) {
+ const int sb_row_in_tile =
+ (mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2;
+ const TokenInfo *token_info = &cpi->token_info;
+ const TokenExtra *tok;
+ const TokenExtra *tok_end;
+ get_token_pointers(token_info, tile_row, tile_col, sb_row_in_tile, &tok,
+ &tok_end);
+
+ av1_zero_left_context(xd);
+
+ for (int mi_col = mi_col_start; mi_col < mi_col_end;
+ mi_col += cm->seq_params->mib_size) {
+ td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
+ write_modes_sb(cpi, td, tile, w, &tok, tok_end, mi_row, mi_col,
+ cm->seq_params->sb_size);
+ }
+ assert(tok == tok_end);
+ }
+}
+
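+// Writes the frame-level loop restoration header: a restoration type for
+// each plane and, when any plane uses restoration, the luma and chroma
+// restoration unit sizes.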
+static AOM_INLINE void encode_restoration_mode(
+ AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+ assert(!cm->features.all_lossless);
+ if (!cm->seq_params->enable_restoration) return;
+ if (cm->features.allow_intrabc) return;
+ const int num_planes = av1_num_planes(cm);
+ int all_none = 1, chroma_none = 1;
+ for (int p = 0; p < num_planes; ++p) {
+ RestorationInfo *rsi = &cm->rst_info[p];
+ if (rsi->frame_restoration_type != RESTORE_NONE) {
+ all_none = 0;
+ chroma_none &= p == 0;
+ }
+ switch (rsi->frame_restoration_type) {
+ case RESTORE_NONE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_WIENER:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_SGRPROJ:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 1);
+ break;
+ case RESTORE_SWITCHABLE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 1);
+ break;
+ default: assert(0);
+ }
+ }
+ if (!all_none) {
+ assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+ cm->seq_params->sb_size == BLOCK_128X128);
+ const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64;
+
+ RestorationInfo *rsi = &cm->rst_info[0];
+
+ assert(rsi->restoration_unit_size >= sb_size);
+ assert(RESTORATION_UNITSIZE_MAX == 256);
+
+ if (sb_size == 64) {
+ aom_wb_write_bit(wb, rsi->restoration_unit_size > 64);
+ }
+ if (rsi->restoration_unit_size > 64) {
+ aom_wb_write_bit(wb, rsi->restoration_unit_size > 128);
+ }
+ }
+
+ if (num_planes > 1) {
+ int s =
+ AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y);
+ if (s && !chroma_none) {
+ aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size !=
+ cm->rst_info[0].restoration_unit_size);
+ assert(cm->rst_info[1].restoration_unit_size ==
+ cm->rst_info[0].restoration_unit_size ||
+ cm->rst_info[1].restoration_unit_size ==
+ (cm->rst_info[0].restoration_unit_size >> s));
+ assert(cm->rst_info[2].restoration_unit_size ==
+ cm->rst_info[1].restoration_unit_size);
+ } else if (!s) {
+ assert(cm->rst_info[1].restoration_unit_size ==
+ cm->rst_info[0].restoration_unit_size);
+ assert(cm->rst_info[2].restoration_unit_size ==
+ cm->rst_info[1].restoration_unit_size);
+ }
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
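+// Writes one Wiener filter. Each tap is coded differentially against the
+// previously signaled filter using a subexponential code; for the reduced
+// chroma window the outermost taps are implicitly zero and are not coded.
+// The reference filter is then updated to the one just written.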
+static AOM_INLINE void write_wiener_filter(int wiener_win,
+ const WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info,
+ aom_writer *wb) {
+ if (wiener_win == WIENER_WIN)
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+ else
+ assert(wiener_info->vfilter[0] == 0 &&
+ wiener_info->vfilter[WIENER_WIN - 1] == 0);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+ if (wiener_win == WIENER_WIN)
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+ else
+ assert(wiener_info->hfilter[0] == 0 &&
+ wiener_info->hfilter[WIENER_WIN - 1] == 0);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+ memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
+}
+
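+// Writes one self-guided projection filter: the parameter set index 'ep'
+// followed by the xqd projection coefficients, coded against the previously
+// signaled values. A coefficient whose corresponding radius is 0 is implied
+// and not coded.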
+static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info,
+ SgrprojInfo *ref_sgrproj_info,
+ aom_writer *wb) {
+ aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS);
+ const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
+
+ if (params->r[0] == 0) {
+ assert(sgrproj_info->xqd[0] == 0);
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ } else if (params->r[1] == 0) {
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ } else {
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ }
+
+ memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
+}
+
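+// Writes the signaling for a single loop restoration unit: a switchable-type
+// symbol or an on/off flag depending on the frame restoration type, followed
+// by the Wiener or self-guided filter parameters when the unit is not
+// RESTORE_NONE.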
+static AOM_INLINE void loop_restoration_write_sb_coeffs(
+ const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
+ aom_writer *const w, int plane, FRAME_COUNTS *counts) {
+ const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx];
+ const RestorationInfo *rsi = cm->rst_info + plane;
+ RestorationType frame_rtype = rsi->frame_restoration_type;
+ assert(frame_rtype != RESTORE_NONE);
+
+ (void)counts;
+ assert(!cm->features.all_lossless);
+
+ const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
+ WienerInfo *ref_wiener_info = &xd->wiener_info[plane];
+ SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane];
+ RestorationType unit_rtype = rui->restoration_type;
+
+ if (frame_rtype == RESTORE_SWITCHABLE) {
+ aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf,
+ RESTORE_SWITCHABLE_TYPES);
+#if CONFIG_ENTROPY_STATS
+ ++counts->switchable_restore[unit_rtype];
+#endif
+ switch (unit_rtype) {
+ case RESTORE_WIENER:
+#if DEBUG_LR_COSTING
+ assert(!memcmp(
+ ref_wiener_info,
+ &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx].wiener_info,
+ sizeof(*ref_wiener_info)));
+#endif
+ write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
+ break;
+ case RESTORE_SGRPROJ:
+#if DEBUG_LR_COSTING
+ assert(!memcmp(&ref_sgrproj_info->xqd,
+ &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx]
+ .sgrproj_info.xqd,
+ sizeof(ref_sgrproj_info->xqd)));
+#endif
+ write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
+ break;
+ default: assert(unit_rtype == RESTORE_NONE); break;
+ }
+ } else if (frame_rtype == RESTORE_WIENER) {
+ aom_write_symbol(w, unit_rtype != RESTORE_NONE,
+ xd->tile_ctx->wiener_restore_cdf, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->wiener_restore[unit_rtype != RESTORE_NONE];
+#endif
+ if (unit_rtype != RESTORE_NONE) {
+#if DEBUG_LR_COSTING
+ assert(
+ !memcmp(ref_wiener_info,
+ &lr_ref_params[RESTORE_WIENER][plane][runit_idx].wiener_info,
+ sizeof(*ref_wiener_info)));
+#endif
+ write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
+ }
+ } else if (frame_rtype == RESTORE_SGRPROJ) {
+ aom_write_symbol(w, unit_rtype != RESTORE_NONE,
+ xd->tile_ctx->sgrproj_restore_cdf, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE];
+#endif
+ if (unit_rtype != RESTORE_NONE) {
+#if DEBUG_LR_COSTING
+ assert(!memcmp(
+ &ref_sgrproj_info->xqd,
+ &lr_ref_params[RESTORE_SGRPROJ][plane][runit_idx].sgrproj_info.xqd,
+ sizeof(ref_sgrproj_info->xqd)));
+#endif
+ write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Only write out the ref delta section if any of the elements
+// will signal a delta.
+static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) {
+ struct loopfilter *lf = &cm->lf;
+ if (!lf->mode_ref_delta_update) {
+ return 0;
+ }
+ const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
+ int8_t last_ref_deltas[REF_FRAMES];
+ int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
+ if (buf == NULL) {
+ av1_set_default_ref_deltas(last_ref_deltas);
+ av1_set_default_mode_deltas(last_mode_deltas);
+ } else {
+ memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES);
+ memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
+ }
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (lf->ref_deltas[i] != last_ref_deltas[i]) {
+ return true;
+ }
+ }
+ for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+ if (lf->mode_deltas[i] != last_mode_deltas[i]) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ assert(!cm->features.coded_lossless);
+ if (cm->features.allow_intrabc) return;
+ const int num_planes = av1_num_planes(cm);
+ struct loopfilter *lf = &cm->lf;
+
+ // Encode the loop filter level and type
+ aom_wb_write_literal(wb, lf->filter_level[0], 6);
+ aom_wb_write_literal(wb, lf->filter_level[1], 6);
+ if (num_planes > 1) {
+ if (lf->filter_level[0] || lf->filter_level[1]) {
+ aom_wb_write_literal(wb, lf->filter_level_u, 6);
+ aom_wb_write_literal(wb, lf->filter_level_v, 6);
+ }
+ }
+ aom_wb_write_literal(wb, lf->sharpness_level, 3);
+
+ aom_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+
+ // Write out loop filter deltas applied at the MB level based on mode or
+ // ref frame (if they are enabled), only if there is information to write.
+ int meaningful = is_mode_ref_delta_meaningful(cm);
+ aom_wb_write_bit(wb, meaningful);
+ if (!meaningful) {
+ return;
+ }
+
+ const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
+ int8_t last_ref_deltas[REF_FRAMES];
+ int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
+ if (buf == NULL) {
+ av1_set_default_ref_deltas(last_ref_deltas);
+ av1_set_default_mode_deltas(last_mode_deltas);
+ } else {
+ memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES);
+ memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
+ }
+ for (int i = 0; i < REF_FRAMES; i++) {
+ const int delta = lf->ref_deltas[i];
+ const int changed = delta != last_ref_deltas[i];
+ aom_wb_write_bit(wb, changed);
+ if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
+ }
+ for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+ const int delta = lf->mode_deltas[i];
+ const int changed = delta != last_mode_deltas[i];
+ aom_wb_write_bit(wb, changed);
+ if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
+ }
+}
+
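+// Writes the CDEF header: the damping value, the number of bits used for
+// per-superblock strength indices, and the table of luma (and, if present,
+// chroma) strengths.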
+static AOM_INLINE void encode_cdef(const AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ assert(!cm->features.coded_lossless);
+ if (!cm->seq_params->enable_cdef) return;
+ if (cm->features.allow_intrabc) return;
+ const int num_planes = av1_num_planes(cm);
+ int i;
+ aom_wb_write_literal(wb, cm->cdef_info.cdef_damping - 3, 2);
+ aom_wb_write_literal(wb, cm->cdef_info.cdef_bits, 2);
+ for (i = 0; i < cm->cdef_info.nb_cdef_strengths; i++) {
+ aom_wb_write_literal(wb, cm->cdef_info.cdef_strengths[i],
+ CDEF_STRENGTH_BITS);
+ if (num_planes > 1)
+ aom_wb_write_literal(wb, cm->cdef_info.cdef_uv_strengths[i],
+ CDEF_STRENGTH_BITS);
+ }
+}
+
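+// Writes a quantizer delta as a presence bit followed, when nonzero, by a
+// 6-bit inverse signed literal.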
+static AOM_INLINE void write_delta_q(struct aom_write_bit_buffer *wb,
+ int delta_q) {
+ if (delta_q != 0) {
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_inv_signed_literal(wb, delta_q, 6);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ }
+}
+
+static AOM_INLINE void encode_quantization(
+ const CommonQuantParams *const quant_params, int num_planes,
+ bool separate_uv_delta_q, struct aom_write_bit_buffer *wb) {
+ aom_wb_write_literal(wb, quant_params->base_qindex, QINDEX_BITS);
+ write_delta_q(wb, quant_params->y_dc_delta_q);
+ if (num_planes > 1) {
+ int diff_uv_delta =
+ (quant_params->u_dc_delta_q != quant_params->v_dc_delta_q) ||
+ (quant_params->u_ac_delta_q != quant_params->v_ac_delta_q);
+ if (separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta);
+ write_delta_q(wb, quant_params->u_dc_delta_q);
+ write_delta_q(wb, quant_params->u_ac_delta_q);
+ if (diff_uv_delta) {
+ write_delta_q(wb, quant_params->v_dc_delta_q);
+ write_delta_q(wb, quant_params->v_ac_delta_q);
+ }
+ }
+ aom_wb_write_bit(wb, quant_params->using_qmatrix);
+ if (quant_params->using_qmatrix) {
+ aom_wb_write_literal(wb, quant_params->qmatrix_level_y, QM_LEVEL_BITS);
+ aom_wb_write_literal(wb, quant_params->qmatrix_level_u, QM_LEVEL_BITS);
+ if (!separate_uv_delta_q)
+ assert(quant_params->qmatrix_level_u == quant_params->qmatrix_level_v);
+ else
+ aom_wb_write_literal(wb, quant_params->qmatrix_level_v, QM_LEVEL_BITS);
+ }
+}
+
+static AOM_INLINE void encode_segmentation(AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ int i, j;
+ struct segmentation *seg = &cm->seg;
+
+ aom_wb_write_bit(wb, seg->enabled);
+ if (!seg->enabled) return;
+
+ // Write update flags
+ if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) {
+ aom_wb_write_bit(wb, seg->update_map);
+ if (seg->update_map) aom_wb_write_bit(wb, seg->temporal_update);
+ aom_wb_write_bit(wb, seg->update_data);
+ }
+
+ // Segmentation data
+ if (seg->update_data) {
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ for (j = 0; j < SEG_LVL_MAX; j++) {
+ const int active = segfeature_active(seg, i, j);
+ aom_wb_write_bit(wb, active);
+ if (active) {
+ const int data_max = av1_seg_feature_data_max(j);
+ const int data_min = -data_max;
+ const int ubits = get_unsigned_bits(data_max);
+ const int data = clamp(get_segdata(seg, i, j), data_min, data_max);
+
+ if (av1_is_segfeature_signed(j)) {
+ aom_wb_write_inv_signed_literal(wb, data, ubits);
+ } else {
+ aom_wb_write_literal(wb, data, ubits);
+ }
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void write_frame_interp_filter(
+ InterpFilter filter, struct aom_write_bit_buffer *wb) {
+ aom_wb_write_bit(wb, filter == SWITCHABLE);
+ if (filter != SWITCHABLE)
+ aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
+}
+
+// Same function as write_uniform, but writing to the uncompressed header wb.
+static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n,
+ int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return;
+ if (v < m) {
+ aom_wb_write_literal(wb, v, l - 1);
+ } else {
+ aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);
+ aom_wb_write_literal(wb, (v - m) & 1, 1);
+ }
+}
+
+static AOM_INLINE void write_tile_info_max_tile(
+ const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+ int width_sb =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+ int height_sb =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+ int size_sb, i;
+ const CommonTileParams *const tiles = &cm->tiles;
+
+ aom_wb_write_bit(wb, tiles->uniform_spacing);
+
+ if (tiles->uniform_spacing) {
+ int ones = tiles->log2_cols - tiles->min_log2_cols;
+ while (ones--) {
+ aom_wb_write_bit(wb, 1);
+ }
+ if (tiles->log2_cols < tiles->max_log2_cols) {
+ aom_wb_write_bit(wb, 0);
+ }
+
+ // rows
+ ones = tiles->log2_rows - tiles->min_log2_rows;
+ while (ones--) {
+ aom_wb_write_bit(wb, 1);
+ }
+ if (tiles->log2_rows < tiles->max_log2_rows) {
+ aom_wb_write_bit(wb, 0);
+ }
+ } else {
+ // Explicit tiles with configurable tile widths and heights
+ // columns
+ for (i = 0; i < tiles->cols; i++) {
+ size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
+ wb_write_uniform(wb, AOMMIN(width_sb, tiles->max_width_sb), size_sb - 1);
+ width_sb -= size_sb;
+ }
+ assert(width_sb == 0);
+
+ // rows
+ for (i = 0; i < tiles->rows; i++) {
+ size_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
+ wb_write_uniform(wb, AOMMIN(height_sb, tiles->max_height_sb),
+ size_sb - 1);
+ height_sb -= size_sb;
+ }
+ assert(height_sb == 0);
+ }
+}
+
+static AOM_INLINE void write_tile_info(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ write_tile_info_max_tile(cm, wb);
+
+ *saved_wb = *wb;
+ if (cm->tiles.rows * cm->tiles.cols > 1) {
+ // tile id used for cdf update
+ aom_wb_write_literal(wb, 0, cm->tiles.log2_cols + cm->tiles.log2_rows);
+ // Number of bytes in tile size - 1
+ aom_wb_write_literal(wb, 3, 2);
+ }
+}
+
+static AOM_INLINE void write_ext_tile_info(
+ const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ // This information is stored as a separate byte.
+ int mod = wb->bit_offset % CHAR_BIT;
+ if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod);
+ assert(aom_wb_is_byte_aligned(wb));
+
+ *saved_wb = *wb;
+ if (cm->tiles.rows * cm->tiles.cols > 1) {
+ // Note that the last item in the uncompressed header is the data
+ // describing tile configuration.
+ // Number of bytes in tile column size - 1
+ aom_wb_write_literal(wb, 0, 2);
+ // Number of bytes in tile size - 1
+ aom_wb_write_literal(wb, 0, 2);
+ }
+}
+
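+// For large-scale tile coding, searches previously written tiles for one
+// whose coded data is byte-identical to the current tile. Returns the row
+// offset of the match (so the tile can be signaled in copy mode), or 0 if no
+// identical tile is found.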
+static INLINE int find_identical_tile(
+ const int tile_row, const int tile_col,
+ TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
+ const MV32 candidate_offset[1] = { { 1, 0 } };
+ const uint8_t *const cur_tile_data =
+ tile_buffers[tile_row][tile_col].data + 4;
+ const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size;
+
+ int i;
+
+ if (tile_row == 0) return 0;
+
+  // TODO(yunqingwang): For now, only the above tile is checked and used.
+  // More candidates, such as the left tile, can be added later.
+ for (i = 0; i < 1; i++) {
+ int row_offset = candidate_offset[0].row;
+ int col_offset = candidate_offset[0].col;
+ int row = tile_row - row_offset;
+ int col = tile_col - col_offset;
+ const uint8_t *tile_data;
+ TileBufferEnc *candidate;
+
+ if (row < 0 || col < 0) continue;
+
+ const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data);
+
+ // Read out tile-copy-mode bit:
+ if ((tile_hdr >> 31) == 1) {
+ // The candidate is a copy tile itself: the offset is stored in bits
+ // 30 through 24 inclusive.
+ row_offset += (tile_hdr >> 24) & 0x7f;
+ row = tile_row - row_offset;
+ }
+
+ candidate = &tile_buffers[row][col];
+
+ if (row_offset >= 128 || candidate->size != cur_tile_size) continue;
+
+ tile_data = candidate->data + 4;
+
+ if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue;
+
+ // Identical tile found
+ assert(row_offset > 0);
+ return row_offset;
+ }
+
+ // No identical tile found
+ return 0;
+}
+
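+// Signals whether the render size differs from the upscaled frame size and,
+// if so, writes the render width and height.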
+static AOM_INLINE void write_render_size(const AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ const int scaling_active = av1_resize_scaled(cm);
+ aom_wb_write_bit(wb, scaling_active);
+ if (scaling_active) {
+ aom_wb_write_literal(wb, cm->render_width - 1, 16);
+ aom_wb_write_literal(wb, cm->render_height - 1, 16);
+ }
+}
+
+static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ const SequenceHeader *const seq_params = cm->seq_params;
+ if (!seq_params->enable_superres) {
+ assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
+ return;
+ }
+
+  // First bit is whether to scale or not.
+ if (cm->superres_scale_denominator == SCALE_NUMERATOR) {
+ aom_wb_write_bit(wb, 0); // no scaling
+ } else {
+ aom_wb_write_bit(wb, 1); // scaling, write scale factor
+ assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN);
+ assert(cm->superres_scale_denominator <
+ SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS));
+ aom_wb_write_literal(
+ wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN,
+ SUPERRES_SCALE_BITS);
+ }
+}
+
+static AOM_INLINE void write_frame_size(const AV1_COMMON *cm,
+ int frame_size_override,
+ struct aom_write_bit_buffer *wb) {
+ const int coded_width = cm->superres_upscaled_width - 1;
+ const int coded_height = cm->superres_upscaled_height - 1;
+
+ if (frame_size_override) {
+ const SequenceHeader *seq_params = cm->seq_params;
+ int num_bits_width = seq_params->num_bits_width;
+ int num_bits_height = seq_params->num_bits_height;
+ aom_wb_write_literal(wb, coded_width, num_bits_width);
+ aom_wb_write_literal(wb, coded_height, num_bits_height);
+ }
+
+ write_superres_scale(cm, wb);
+ write_render_size(cm, wb);
+}
+
+static AOM_INLINE void write_frame_size_with_refs(
+ const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+ int found = 0;
+
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);
+
+ if (cfg != NULL) {
+ found = cm->superres_upscaled_width == cfg->y_crop_width &&
+ cm->superres_upscaled_height == cfg->y_crop_height;
+ found &= cm->render_width == cfg->render_width &&
+ cm->render_height == cfg->render_height;
+ }
+ aom_wb_write_bit(wb, found);
+ if (found) {
+ write_superres_scale(cm, wb);
+ break;
+ }
+ }
+
+ if (!found) {
+ int frame_size_override = 1; // Always equal to 1 in this function
+ write_frame_size(cm, frame_size_override, wb);
+ }
+}
+
+static AOM_INLINE void write_profile(BITSTREAM_PROFILE profile,
+ struct aom_write_bit_buffer *wb) {
+ assert(profile >= PROFILE_0 && profile < MAX_PROFILES);
+ aom_wb_write_literal(wb, profile, PROFILE_BITS);
+}
+
+static AOM_INLINE void write_bitdepth(const SequenceHeader *const seq_params,
+ struct aom_write_bit_buffer *wb) {
+  // Profile 0/1: [0] for 8-bit, [1] for 10-bit
+  // Profile 2:   [0] for 8-bit, [10] for 10-bit, [11] for 12-bit
+ aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ? 0 : 1);
+ if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) {
+ aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ? 0 : 1);
+ }
+}
+
+static AOM_INLINE void write_color_config(
+ const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) {
+ write_bitdepth(seq_params, wb);
+ const int is_monochrome = seq_params->monochrome;
+ // monochrome bit
+ if (seq_params->profile != PROFILE_1)
+ aom_wb_write_bit(wb, is_monochrome);
+ else
+ assert(!is_monochrome);
+ if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) {
+ aom_wb_write_bit(wb, 0); // No color description present
+ } else {
+ aom_wb_write_bit(wb, 1); // Color description present
+ aom_wb_write_literal(wb, seq_params->color_primaries, 8);
+ aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8);
+ aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8);
+ }
+ if (is_monochrome) {
+ // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+ aom_wb_write_bit(wb, seq_params->color_range);
+ return;
+ }
+ if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ assert(seq_params->profile == PROFILE_1 ||
+ (seq_params->profile == PROFILE_2 &&
+ seq_params->bit_depth == AOM_BITS_12));
+ } else {
+ // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+ aom_wb_write_bit(wb, seq_params->color_range);
+ if (seq_params->profile == PROFILE_0) {
+ // 420 only
+ assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1);
+ } else if (seq_params->profile == PROFILE_1) {
+ // 444 only
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ } else if (seq_params->profile == PROFILE_2) {
+ if (seq_params->bit_depth == AOM_BITS_12) {
+ // 420, 444 or 422
+ aom_wb_write_bit(wb, seq_params->subsampling_x);
+ if (seq_params->subsampling_x == 0) {
+ assert(seq_params->subsampling_y == 0 &&
+ "4:4:0 subsampling not allowed in AV1");
+ } else {
+ aom_wb_write_bit(wb, seq_params->subsampling_y);
+ }
+ } else {
+ // 422 only
+ assert(seq_params->subsampling_x == 1 &&
+ seq_params->subsampling_y == 0);
+ }
+ }
+ if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ }
+ if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) {
+ aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2);
+ }
+ }
+ aom_wb_write_bit(wb, seq_params->separate_uv_delta_q);
+}
+
+static AOM_INLINE void write_timing_info_header(
+ const aom_timing_info_t *const timing_info,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_unsigned_literal(wb, timing_info->num_units_in_display_tick, 32);
+ aom_wb_write_unsigned_literal(wb, timing_info->time_scale, 32);
+ aom_wb_write_bit(wb, timing_info->equal_picture_interval);
+ if (timing_info->equal_picture_interval) {
+ aom_wb_write_uvlc(wb, timing_info->num_ticks_per_picture - 1);
+ }
+}
+
+static AOM_INLINE void write_decoder_model_info(
+ const aom_dec_model_info_t *const decoder_model_info,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_literal(
+ wb, decoder_model_info->encoder_decoder_buffer_delay_length - 1, 5);
+ aom_wb_write_unsigned_literal(
+ wb, decoder_model_info->num_units_in_decoding_tick, 32);
+ aom_wb_write_literal(wb, decoder_model_info->buffer_removal_time_length - 1,
+ 5);
+ aom_wb_write_literal(
+ wb, decoder_model_info->frame_presentation_time_length - 1, 5);
+}
+
+static AOM_INLINE void write_dec_model_op_parameters(
+ const aom_dec_model_op_parameters_t *op_params, int buffer_delay_length,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_unsigned_literal(wb, op_params->decoder_buffer_delay,
+ buffer_delay_length);
+ aom_wb_write_unsigned_literal(wb, op_params->encoder_buffer_delay,
+ buffer_delay_length);
+ aom_wb_write_bit(wb, op_params->low_delay_mode_flag);
+}
+
+static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_unsigned_literal(
+ wb, cm->frame_presentation_time,
+ cm->seq_params->decoder_model_info.frame_presentation_time_length);
+}
+
+static AOM_INLINE void write_film_grain_params(
+ const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params;
+ aom_wb_write_bit(wb, pars->apply_grain);
+ if (!pars->apply_grain) return;
+
+ aom_wb_write_literal(wb, pars->random_seed, 16);
+
+ if (cm->current_frame.frame_type == INTER_FRAME)
+ aom_wb_write_bit(wb, pars->update_parameters);
+
+ if (!pars->update_parameters) {
+ int ref_frame, ref_idx;
+ for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) {
+ ref_idx = get_ref_frame_map_idx(cm, ref_frame);
+ assert(ref_idx != INVALID_IDX);
+ const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx];
+ if (buf->film_grain_params_present &&
+ aom_check_grain_params_equiv(pars, &buf->film_grain_params)) {
+ break;
+ }
+ }
+ assert(ref_frame < REF_FRAMES);
+ aom_wb_write_literal(wb, ref_idx, 3);
+ return;
+ }
+
+ // Scaling functions parameters
+ aom_wb_write_literal(wb, pars->num_y_points, 4); // max 14
+ for (int i = 0; i < pars->num_y_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
+ }
+
+ if (!cm->seq_params->monochrome) {
+ aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
+ } else {
+ assert(!pars->chroma_scaling_from_luma);
+ }
+
+ if (cm->seq_params->monochrome || pars->chroma_scaling_from_luma ||
+ ((cm->seq_params->subsampling_x == 1) &&
+ (cm->seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) {
+ assert(pars->num_cb_points == 0 && pars->num_cr_points == 0);
+ } else {
+ aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10
+ for (int i = 0; i < pars->num_cb_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8);
+ }
+
+ aom_wb_write_literal(wb, pars->num_cr_points, 4); // max 10
+ for (int i = 0; i < pars->num_cr_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8);
+ }
+ }
+
+ aom_wb_write_literal(wb, pars->scaling_shift - 8, 2); // 8 + value
+
+ // AR coefficients
+  // Only sent if the corresponding scaling function has
+ // more than 0 points
+
+ aom_wb_write_literal(wb, pars->ar_coeff_lag, 2);
+
+ int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+ int num_pos_chroma = num_pos_luma;
+ if (pars->num_y_points > 0) ++num_pos_chroma;
+
+ if (pars->num_y_points)
+ for (int i = 0; i < num_pos_luma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8);
+
+ if (pars->num_cb_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8);
+
+ if (pars->num_cr_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8);
+
+  aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2);  // 6 + value
+
+ aom_wb_write_literal(wb, pars->grain_scale_shift, 2);
+
+ if (pars->num_cb_points) {
+ aom_wb_write_literal(wb, pars->cb_mult, 8);
+ aom_wb_write_literal(wb, pars->cb_luma_mult, 8);
+ aom_wb_write_literal(wb, pars->cb_offset, 9);
+ }
+
+ if (pars->num_cr_points) {
+ aom_wb_write_literal(wb, pars->cr_mult, 8);
+ aom_wb_write_literal(wb, pars->cr_luma_mult, 8);
+ aom_wb_write_literal(wb, pars->cr_offset, 9);
+ }
+
+ aom_wb_write_bit(wb, pars->overlap_flag);
+
+ aom_wb_write_bit(wb, pars->clip_to_restricted_range);
+}
+
+static AOM_INLINE void write_sb_size(const SequenceHeader *const seq_params,
+ struct aom_write_bit_buffer *wb) {
+ (void)seq_params;
+ (void)wb;
+ assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]);
+ assert(seq_params->mib_size == 1 << seq_params->mib_size_log2);
+ assert(seq_params->sb_size == BLOCK_128X128 ||
+ seq_params->sb_size == BLOCK_64X64);
+ aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0);
+}
+
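+// Writes the core sequence header fields: maximum frame dimensions, frame id
+// signaling, superblock size, and the per-sequence coding tool flags.
+// Profile and color configuration are written separately.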
+static AOM_INLINE void write_sequence_header(
+ const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) {
+ aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4);
+ aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4);
+ aom_wb_write_literal(wb, seq_params->max_frame_width - 1,
+ seq_params->num_bits_width);
+ aom_wb_write_literal(wb, seq_params->max_frame_height - 1,
+ seq_params->num_bits_height);
+
+ if (!seq_params->reduced_still_picture_hdr) {
+ aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag);
+ if (seq_params->frame_id_numbers_present_flag) {
+ // We must always have delta_frame_id_length < frame_id_length,
+ // in order for a frame to be referenced with a unique delta.
+ // Avoid wasting bits by using a coding that enforces this restriction.
+ aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4);
+ aom_wb_write_literal(
+ wb,
+ seq_params->frame_id_length - seq_params->delta_frame_id_length - 1,
+ 3);
+ }
+ }
+
+ write_sb_size(seq_params, wb);
+
+ aom_wb_write_bit(wb, seq_params->enable_filter_intra);
+ aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter);
+
+ if (!seq_params->reduced_still_picture_hdr) {
+ aom_wb_write_bit(wb, seq_params->enable_interintra_compound);
+ aom_wb_write_bit(wb, seq_params->enable_masked_compound);
+ aom_wb_write_bit(wb, seq_params->enable_warped_motion);
+ aom_wb_write_bit(wb, seq_params->enable_dual_filter);
+
+ aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint);
+
+ if (seq_params->order_hint_info.enable_order_hint) {
+ aom_wb_write_bit(wb, seq_params->order_hint_info.enable_dist_wtd_comp);
+ aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs);
+ }
+ if (seq_params->force_screen_content_tools == 2) {
+ aom_wb_write_bit(wb, 1);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, seq_params->force_screen_content_tools);
+ }
+ if (seq_params->force_screen_content_tools > 0) {
+ if (seq_params->force_integer_mv == 2) {
+ aom_wb_write_bit(wb, 1);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, seq_params->force_integer_mv);
+ }
+ } else {
+ assert(seq_params->force_integer_mv == 2);
+ }
+ if (seq_params->order_hint_info.enable_order_hint)
+ aom_wb_write_literal(
+ wb, seq_params->order_hint_info.order_hint_bits_minus_1, 3);
+ }
+
+ aom_wb_write_bit(wb, seq_params->enable_superres);
+ aom_wb_write_bit(wb, seq_params->enable_cdef);
+ aom_wb_write_bit(wb, seq_params->enable_restoration);
+}
+
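+// Writes one global motion model: the transformation type, then the affine
+// and translation parameters, each coded differentially against the
+// reference frame's parameters with a signed subexponential code.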
+static AOM_INLINE void write_global_motion_params(
+ const WarpedMotionParams *params, const WarpedMotionParams *ref_params,
+ struct aom_write_bit_buffer *wb, int allow_hp) {
+ const TransformationType type = params->wmtype;
+
+  // As a workaround for an AV1 spec bug, we avoid choosing TRANSLATION
+  // type models. Check here that we don't accidentally pick one somehow.
+  // See the comments in gm_get_motion_vector() for details on the bug we
+  // are working around here.
+ assert(type != TRANSLATION);
+
+ aom_wb_write_bit(wb, type != IDENTITY);
+ if (type != IDENTITY) {
+ aom_wb_write_bit(wb, type == ROTZOOM);
+ if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION);
+ }
+
+ if (type >= ROTZOOM) {
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ }
+
+ if (type >= AFFINE) {
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+
+ if (type >= TRANSLATION) {
+ const int trans_bits = (type == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ const int trans_prec_diff = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[0] >> trans_prec_diff),
+ (params->wmmat[0] >> trans_prec_diff));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[1] >> trans_prec_diff),
+ (params->wmmat[1] >> trans_prec_diff));
+ }
+}
+
+static AOM_INLINE void write_global_motion(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ int frame;
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ const WarpedMotionParams *ref_params =
+ cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+ : &default_warp_params;
+ write_global_motion_params(&cm->global_motion[frame], ref_params, wb,
+ cm->features.allow_high_precision_mv);
+ // TODO(sarahparker, debargha): The logic in the commented out code below
+ // does not work currently and causes mismatches when resize is on.
+ // Fix it before turning the optimization back on.
+ /*
+ YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame);
+ if (cpi->source->y_crop_width == ref_buf->y_crop_width &&
+ cpi->source->y_crop_height == ref_buf->y_crop_height) {
+ write_global_motion_params(&cm->global_motion[frame],
+ &cm->prev_frame->global_motion[frame], wb,
+ cm->features.allow_high_precision_mv);
+ } else {
+ assert(cm->global_motion[frame].wmtype == IDENTITY &&
+ "Invalid warp type for frames of different resolutions");
+ }
+ */
+ /*
+ printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n",
+ cm->current_frame.frame_number, cm->show_frame, frame,
+ cm->global_motion[frame].wmmat[0],
+ cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2],
+ cm->global_motion[frame].wmmat[3]);
+ */
+ }
+}
+
+static int check_frame_refs_short_signaling(AV1_COMMON *const cm,
+ bool enable_ref_short_signaling) {
+  // In the RTC case, when resolution < 360p and speed >= 9, we turn on
+  // frame_refs_short_signaling if it won't break the decoder.
+ if (enable_ref_short_signaling) {
+ const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ const int base =
+ 1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+
+ const int order_hint_group_cur =
+ cm->current_frame.display_order_hint / base;
+ const int order_hint_group_gld =
+ cm->ref_frame_map[gld_map_idx]->display_order_hint / base;
+ const int relative_dist = cm->current_frame.order_hint -
+ cm->ref_frame_map[gld_map_idx]->order_hint;
+
+    // If the current frame and the GOLDEN frame are in the same order_hint
+    // group and are at most 64 frames apart, return 1.
+ if (order_hint_group_cur == order_hint_group_gld && relative_dist >= 0 &&
+ relative_dist <= 64) {
+ return 1;
+ }
+ return 0;
+ }
+
+ // Check whether all references are distinct frames.
+ const RefCntBuffer *seen_bufs[INTER_REFS_PER_FRAME] = { NULL };
+ int num_refs = 0;
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf != NULL) {
+ int seen = 0;
+ for (int i = 0; i < num_refs; i++) {
+ if (seen_bufs[i] == buf) {
+ seen = 1;
+ break;
+ }
+ }
+ if (!seen) seen_bufs[num_refs++] = buf;
+ }
+ }
+
+ // We only turn on frame_refs_short_signaling when all references are
+ // distinct.
+ if (num_refs < INTER_REFS_PER_FRAME) {
+    // This indicates that more than one reference frame points to the same
+    // reference buffer, i.e., two or more references are duplicates.
+ return 0;
+ }
+
+  // Check whether the encoder-side ref frame choices match those that will
+  // be derived at the decoder side.
+ int remapped_ref_idx_decoder[REF_FRAMES];
+
+ const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+ const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+
+ // Set up the frame refs mapping indexes according to the
+ // frame_refs_short_signaling policy.
+ av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx);
+
+ // We only turn on frame_refs_short_signaling when the encoder side decision
+ // on ref frames is identical to that at the decoder side.
+ int frame_refs_short_signaling = 1;
+ for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) {
+    // Compare the reference buffers selected by the encoder-side and
+    // decoder-side decisions for this reference slot.
+ RefCntBuffer *ref_frame_buf_new = NULL;
+ if (remapped_ref_idx_decoder[ref_idx] != INVALID_IDX) {
+ ref_frame_buf_new = cm->ref_frame_map[remapped_ref_idx_decoder[ref_idx]];
+ }
+ if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != ref_frame_buf_new) {
+ frame_refs_short_signaling = 0;
+ break;
+ }
+ }
+
+#if 0 // For debug
+ printf("\nFrame=%d: \n", cm->current_frame.frame_number);
+ printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling);
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ printf("enc_ref(map_idx=%d)=%d, vs. "
+ "dec_ref(map_idx=%d)=%d\n",
+ get_ref_frame_map_idx(cm, ref_frame), ref_frame,
+ cm->remapped_ref_idx[ref_frame - LAST_FRAME],
+ ref_frame);
+ }
+#endif // 0
+
+ return frame_refs_short_signaling;
+}
+
+// New function based on HLS R18
+static AOM_INLINE void write_uncompressed_header_obu(
+ AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const CommonQuantParams *quant_params = &cm->quant_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ FeatureFlags *const features = &cm->features;
+
+ if (!cpi->sf.rt_sf.enable_ref_short_signaling ||
+ !seq_params->order_hint_info.enable_order_hint ||
+ seq_params->order_hint_info.enable_ref_frame_mvs) {
+ current_frame->frame_refs_short_signaling = 0;
+ } else {
+ current_frame->frame_refs_short_signaling = 1;
+ }
+
+ if (seq_params->still_picture) {
+ assert(cm->show_existing_frame == 0);
+ assert(cm->show_frame == 1);
+ assert(current_frame->frame_type == KEY_FRAME);
+ }
+ if (!seq_params->reduced_still_picture_hdr) {
+ if (encode_show_existing_frame(cm)) {
+ aom_wb_write_bit(wb, 1); // show_existing_frame
+ aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+ if (seq_params->decoder_model_info_present_flag &&
+ seq_params->timing_info.equal_picture_interval == 0) {
+ write_tu_pts_info(cm, wb);
+ }
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
+ int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+ aom_wb_write_literal(wb, display_frame_id, frame_id_len);
+ }
+ return;
+ } else {
+ aom_wb_write_bit(wb, 0); // show_existing_frame
+ }
+
+ aom_wb_write_literal(wb, current_frame->frame_type, 2);
+
+ aom_wb_write_bit(wb, cm->show_frame);
+ if (cm->show_frame) {
+ if (seq_params->decoder_model_info_present_flag &&
+ seq_params->timing_info.equal_picture_interval == 0)
+ write_tu_pts_info(cm, wb);
+ } else {
+ aom_wb_write_bit(wb, cm->showable_frame);
+ }
+ if (frame_is_sframe(cm)) {
+ assert(features->error_resilient_mode);
+ } else if (!(current_frame->frame_type == KEY_FRAME && cm->show_frame)) {
+ aom_wb_write_bit(wb, features->error_resilient_mode);
+ }
+ }
+ aom_wb_write_bit(wb, features->disable_cdf_update);
+
+ if (seq_params->force_screen_content_tools == 2) {
+ aom_wb_write_bit(wb, features->allow_screen_content_tools);
+ } else {
+ assert(features->allow_screen_content_tools ==
+ seq_params->force_screen_content_tools);
+ }
+
+ if (features->allow_screen_content_tools) {
+ if (seq_params->force_integer_mv == 2) {
+ aom_wb_write_bit(wb, features->cur_frame_force_integer_mv);
+ } else {
+ assert(features->cur_frame_force_integer_mv ==
+ seq_params->force_integer_mv);
+ }
+ } else {
+ assert(features->cur_frame_force_integer_mv == 0);
+ }
+
+ int frame_size_override_flag = 0;
+
+ if (seq_params->reduced_still_picture_hdr) {
+ assert(cm->superres_upscaled_width == seq_params->max_frame_width &&
+ cm->superres_upscaled_height == seq_params->max_frame_height);
+ } else {
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
+ aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+ }
+
+ if (cm->superres_upscaled_width > seq_params->max_frame_width ||
+ cm->superres_upscaled_height > seq_params->max_frame_height) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Frame dimensions are larger than the maximum values");
+ }
+
+ frame_size_override_flag =
+ frame_is_sframe(cm)
+ ? 1
+ : (cm->superres_upscaled_width != seq_params->max_frame_width ||
+ cm->superres_upscaled_height != seq_params->max_frame_height);
+ if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag);
+
+ if (seq_params->order_hint_info.enable_order_hint)
+ aom_wb_write_literal(
+ wb, current_frame->order_hint,
+ seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+
+ if (!features->error_resilient_mode && !frame_is_intra_only(cm)) {
+ aom_wb_write_literal(wb, features->primary_ref_frame, PRIMARY_REF_BITS);
+ }
+ }
+
+ if (seq_params->decoder_model_info_present_flag) {
+ aom_wb_write_bit(wb, cpi->ppi->buffer_removal_time_present);
+ if (cpi->ppi->buffer_removal_time_present) {
+ for (int op_num = 0;
+ op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
+ if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
+ if (seq_params->operating_point_idc[op_num] == 0 ||
+ ((seq_params->operating_point_idc[op_num] >>
+ cm->temporal_layer_id) &
+ 0x1 &&
+ (seq_params->operating_point_idc[op_num] >>
+ (cm->spatial_layer_id + 8)) &
+ 0x1)) {
+ aom_wb_write_unsigned_literal(
+ wb, cm->buffer_removal_times[op_num],
+ seq_params->decoder_model_info.buffer_removal_time_length);
+ cm->buffer_removal_times[op_num]++;
+ if (cm->buffer_removal_times[op_num] == 0) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "buffer_removal_time overflowed");
+ }
+ }
+ }
+ }
+ }
+ }
+
+  // Shown keyframes and switch-frames automatically refresh all reference
+  // frames. For all other frame types, we need to write refresh_frame_flags.
+ if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) ||
+ current_frame->frame_type == INTER_FRAME ||
+ current_frame->frame_type == INTRA_ONLY_FRAME)
+ aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES);
+
+ if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) {
+ // Write all ref frame order hints if error_resilient_mode == 1
+ if (features->error_resilient_mode &&
+ seq_params->order_hint_info.enable_order_hint) {
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ aom_wb_write_literal(
+ wb, cm->ref_frame_map[ref_idx]->order_hint,
+ seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+ }
+ }
+ }
+
+ if (current_frame->frame_type == KEY_FRAME) {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ assert(!av1_superres_scaled(cm) || !features->allow_intrabc);
+ if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+ aom_wb_write_bit(wb, features->allow_intrabc);
+ } else {
+ if (current_frame->frame_type == INTRA_ONLY_FRAME) {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ assert(!av1_superres_scaled(cm) || !features->allow_intrabc);
+ if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+ aom_wb_write_bit(wb, features->allow_intrabc);
+ } else if (current_frame->frame_type == INTER_FRAME ||
+ frame_is_sframe(cm)) {
+ MV_REFERENCE_FRAME ref_frame;
+
+ // NOTE: Error resilient mode turns off frame_refs_short_signaling
+ // automatically.
+#define FRAME_REFS_SHORT_SIGNALING 0
+#if FRAME_REFS_SHORT_SIGNALING
+ current_frame->frame_refs_short_signaling =
+ seq_params->order_hint_info.enable_order_hint;
+#endif // FRAME_REFS_SHORT_SIGNALING
+
+ if (current_frame->frame_refs_short_signaling) {
+        // In the RTC case, when cpi->sf.rt_sf.enable_ref_short_signaling is
+        // true, we turn on frame_refs_short_signaling when the current frame
+        // and the golden frame are in the same order_hint group and their
+        // relative distance is <= 64 (so that the stream stays decodable).
+
+        // For other cases, check_frame_refs_short_signaling() also provides
+        // an example encoder-side implementation, where
+        // frame_refs_short_signaling is only turned on when the encoder-side
+        // decision on ref frames is identical to the one derived at the
+        // decoder side.
+
+ current_frame->frame_refs_short_signaling =
+ check_frame_refs_short_signaling(
+ cm, cpi->sf.rt_sf.enable_ref_short_signaling);
+ }
+
+ if (seq_params->order_hint_info.enable_order_hint)
+ aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling);
+
+ if (current_frame->frame_refs_short_signaling) {
+ const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME);
+ aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2);
+
+ const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2);
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX);
+ if (!current_frame->frame_refs_short_signaling)
+ aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame),
+ REF_FRAMES_LOG2);
+ if (seq_params->frame_id_numbers_present_flag) {
+ int i = get_ref_frame_map_idx(cm, ref_frame);
+ int frame_id_len = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ int delta_frame_id_minus_1 =
+ ((cm->current_frame_id - cm->ref_frame_id[i] +
+ (1 << frame_id_len)) %
+ (1 << frame_id_len)) -
+ 1;
+ if (delta_frame_id_minus_1 < 0 ||
+ delta_frame_id_minus_1 >= (1 << diff_len)) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Invalid delta_frame_id_minus_1");
+ }
+ aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len);
+ }
+ }
+
+ if (!features->error_resilient_mode && frame_size_override_flag) {
+ write_frame_size_with_refs(cm, wb);
+ } else {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ }
+
+ if (!features->cur_frame_force_integer_mv)
+ aom_wb_write_bit(wb, features->allow_high_precision_mv);
+ write_frame_interp_filter(features->interp_filter, wb);
+ aom_wb_write_bit(wb, features->switchable_motion_mode);
+ if (frame_might_allow_ref_frame_mvs(cm)) {
+ aom_wb_write_bit(wb, features->allow_ref_frame_mvs);
+ } else {
+ assert(features->allow_ref_frame_mvs == 0);
+ }
+ }
+ }
+
+ const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) &&
+ !(features->disable_cdf_update);
+ if (cm->tiles.large_scale)
+ assert(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
+
+ if (might_bwd_adapt) {
+ aom_wb_write_bit(
+ wb, features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
+ }
+
+ write_tile_info(cm, saved_wb, wb);
+ encode_quantization(quant_params, av1_num_planes(cm),
+ cm->seq_params->separate_uv_delta_q, wb);
+ encode_segmentation(cm, wb);
+
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0);
+ if (quant_params->base_qindex > 0) {
+ aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag);
+ if (delta_q_info->delta_q_present_flag) {
+ aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2);
+ xd->current_base_qindex = quant_params->base_qindex;
+ if (features->allow_intrabc)
+ assert(delta_q_info->delta_lf_present_flag == 0);
+ else
+ aom_wb_write_bit(wb, delta_q_info->delta_lf_present_flag);
+ if (delta_q_info->delta_lf_present_flag) {
+ aom_wb_write_literal(wb, get_msb(delta_q_info->delta_lf_res), 2);
+ aom_wb_write_bit(wb, delta_q_info->delta_lf_multi);
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+ }
+
+ if (features->all_lossless) {
+ assert(!av1_superres_scaled(cm));
+ } else {
+ if (!features->coded_lossless) {
+ encode_loopfilter(cm, wb);
+ encode_cdef(cm, wb);
+ }
+ encode_restoration_mode(cm, wb);
+ }
+
+ // Write TX mode
+ if (features->coded_lossless)
+ assert(features->tx_mode == ONLY_4X4);
+ else
+ aom_wb_write_bit(wb, features->tx_mode == TX_MODE_SELECT);
+
+ if (!frame_is_intra_only(cm)) {
+ const int use_hybrid_pred =
+ current_frame->reference_mode == REFERENCE_MODE_SELECT;
+
+ aom_wb_write_bit(wb, use_hybrid_pred);
+ }
+
+ if (current_frame->skip_mode_info.skip_mode_allowed)
+ aom_wb_write_bit(wb, current_frame->skip_mode_info.skip_mode_flag);
+
+ if (frame_might_allow_warped_motion(cm))
+ aom_wb_write_bit(wb, features->allow_warped_motion);
+ else
+ assert(!features->allow_warped_motion);
+
+ aom_wb_write_bit(wb, features->reduced_tx_set_used);
+
+ if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
+
+ if (seq_params->film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame))
+ write_film_grain_params(cpi, wb);
+
+ if (cm->tiles.large_scale) write_ext_tile_info(cm, saved_wb, wb);
+}
+
+static int choose_size_bytes(uint32_t size, int spare_msbs) {
+ // Choose the number of bytes required to represent size, without
+ // using the 'spare_msbs' number of most significant bits.
+
+ // Make sure we will fit in 4 bytes to start with.
+ if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1;
+
+ // Normalise to 32 bits
+ size <<= spare_msbs;
+
+ if (size >> 24 != 0)
+ return 4;
+ else if (size >> 16 != 0)
+ return 3;
+ else if (size >> 8 != 0)
+ return 2;
+ else
+ return 1;
+}
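+
+// Illustrative sketch (not part of the upstream source): a few worked
+// inputs for choose_size_bytes(). With no spare MSBs the result is simply
+// the minimum number of little-endian bytes needed for the value; with one
+// spare MSB (large-scale tile copy mode) the value is first normalised by
+// shifting left, so values near a byte boundary need an extra byte.
+#if 0
+static void choose_size_bytes_examples(void) {
+  assert(choose_size_bytes(0x45, 0) == 1); // fits in one byte
+  assert(choose_size_bytes(0x1234, 0) == 2); // fits in two bytes
+  assert(choose_size_bytes(0x80, 1) == 2); // 0x80 << 1 = 0x100 needs two
+  assert(choose_size_bytes(0xFFFFFFFF, 1) == -1); // cannot spare the MSB
+}
+#endif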
+
+static AOM_INLINE void mem_put_varsize(uint8_t *const dst, const int sz,
+ const int val) {
+ switch (sz) {
+ case 1: dst[0] = (uint8_t)(val & 0xff); break;
+ case 2: mem_put_le16(dst, val); break;
+ case 3: mem_put_le24(dst, val); break;
+ case 4: mem_put_le32(dst, val); break;
+ default: assert(0 && "Invalid size"); break;
+ }
+}
+
+static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst,
+ const uint32_t data_size, const uint32_t max_tile_size,
+ const uint32_t max_tile_col_size,
+ int *const tile_size_bytes,
+ int *const tile_col_size_bytes) {
+ // Choose the tile size bytes (tsb) and tile column size bytes (tcsb)
+ int tsb;
+ int tcsb;
+
+ if (tiles->large_scale) {
+ // The top bit in the tile size field indicates tile copy mode, so we
+ // have 1 less bit to code the tile size
+ tsb = choose_size_bytes(max_tile_size, 1);
+ tcsb = choose_size_bytes(max_tile_col_size, 0);
+ } else {
+ tsb = choose_size_bytes(max_tile_size, 0);
+ tcsb = 4; // This is ignored
+ (void)max_tile_col_size;
+ }
+
+ assert(tsb > 0);
+ assert(tcsb > 0);
+
+ *tile_size_bytes = tsb;
+ *tile_col_size_bytes = tcsb;
+ if (tsb == 4 && tcsb == 4) return data_size;
+
+ uint32_t wpos = 0;
+ uint32_t rpos = 0;
+
+ if (tiles->large_scale) {
+ int tile_row;
+ int tile_col;
+
+ for (tile_col = 0; tile_col < tiles->cols; tile_col++) {
+ // Every column except the last has a column header
+ if (tile_col < tiles->cols - 1) {
+ uint32_t tile_col_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // Adjust the tile column size by the number of bytes removed
+ // from the tile size fields.
+ tile_col_size -= (4 - tsb) * tiles->rows;
+
+ mem_put_varsize(dst + wpos, tcsb, tile_col_size);
+ wpos += tcsb;
+ }
+
+ for (tile_row = 0; tile_row < tiles->rows; tile_row++) {
+ // Every row, including the last, has a header
+ uint32_t tile_header = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // If this is a copy tile, we need to shift the MSB to the
+ // top bit of the new width, and there is no data to copy.
+ if (tile_header >> 31 != 0) {
+ if (tsb < 4) tile_header >>= 32 - 8 * tsb;
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+ } else {
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+
+ tile_header += AV1_MIN_TILE_SIZE_BYTES;
+ memmove(dst + wpos, dst + rpos, tile_header);
+ rpos += tile_header;
+ wpos += tile_header;
+ }
+ }
+ }
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+ }
+ const int n_tiles = tiles->cols * tiles->rows;
+ int n;
+
+ for (n = 0; n < n_tiles; n++) {
+ int tile_size;
+
+ if (n == n_tiles - 1) {
+ tile_size = data_size - rpos;
+ } else {
+ tile_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+ mem_put_varsize(dst + wpos, tsb, tile_size);
+ tile_size += AV1_MIN_TILE_SIZE_BYTES;
+ wpos += tsb;
+ }
+
+ memmove(dst + wpos, dst + rpos, tile_size);
+
+ rpos += tile_size;
+ wpos += tile_size;
+ }
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+}
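+
+// Illustrative sketch (hypothetical values, not upstream code): remux_tiles()
+// on a 1x2 non-large-scale tile layout. On input, every tile except the last
+// is preceded by a 4-byte little-endian size field; on output that field is
+// shrunk to tile_size_bytes (here 1) and the payloads are slid down with
+// memmove(), so the function returns a smaller total size.
+#if 0
+static void remux_tiles_example(const CommonTileParams *tiles) {
+  // Assumes tiles->cols = 2, tiles->rows = 1, tiles->large_scale = 0.
+  // data_size = 4 (size field) + 3 (tile 0) + 5 (tile 1) = 12 bytes.
+  uint8_t buf[12] = { 2, 0, 0, 0, // tile 0 size minus AV1_MIN_TILE_SIZE_BYTES
+                      0xAA, 0xBB, 0xCC, // tile 0 payload
+                      0x10, 0x20, 0x30, 0x40, 0x50 }; // tile 1 payload
+  int tsb, tcsb;
+  const int out = remux_tiles(tiles, buf, 12, /*max_tile_size=*/5,
+                              /*max_tile_col_size=*/0, &tsb, &tcsb);
+  assert(tsb == 1 && out == 9); // 3 bytes saved on the one coded size field
+}
+#endif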
+
+uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
+ int *frame_header_count, OBU_TYPE obu_type,
+ int obu_extension, uint8_t *const dst) {
+ if (level_params->keep_level_stats &&
+ (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER))
+ ++(*frame_header_count);
+
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ aom_wb_write_literal(&wb, 0, 1); // forbidden bit.
+ aom_wb_write_literal(&wb, (int)obu_type, 4);
+ aom_wb_write_literal(&wb, obu_extension ? 1 : 0, 1);
+ aom_wb_write_literal(&wb, 1, 1); // obu_has_size_field
+ aom_wb_write_literal(&wb, 0, 1); // reserved
+
+ if (obu_extension) {
+ aom_wb_write_literal(&wb, obu_extension & 0xFF, 8);
+ }
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+}
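+
+// Illustrative sketch (not upstream code): the first OBU header byte packs,
+// MSB to LSB, forbidden_bit(1) | obu_type(4) | extension_flag(1) |
+// obu_has_size_field(1) | reserved(1). For OBU_FRAME (type 6) with no
+// extension this is 0b00110010 = 0x32.
+#if 0
+static uint8_t obu_header_byte_example(void) {
+  const int forbidden = 0, ext = 0, has_size = 1, reserved = 0;
+  return (uint8_t)((forbidden << 7) | ((int)OBU_FRAME << 3) | (ext << 2) |
+                   (has_size << 1) | reserved); // == 0x32
+}
+#endif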
+
+int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
+ uint8_t *dest) {
+ const size_t offset = obu_header_size;
+ size_t coded_obu_size = 0;
+ const uint32_t obu_size = (uint32_t)obu_payload_size;
+ assert(obu_size == obu_payload_size);
+
+ if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset,
+ &coded_obu_size) != 0) {
+ return AOM_CODEC_ERROR;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size,
+ uint8_t *data) {
+ const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size);
+ const size_t move_dst_offset = length_field_size + obu_header_size;
+ const size_t move_src_offset = obu_header_size;
+ const size_t move_size = obu_payload_size;
+ memmove(data + move_dst_offset, data + move_src_offset, move_size);
+ return length_field_size;
+}
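+
+// Illustrative sketch (hypothetical buffer, not upstream code): a 300-byte
+// payload needs a 2-byte leb128 length field (0xAC 0x02). av1_obu_memmove()
+// slides the payload up to open that gap after the header, and
+// av1_write_uleb_obu_size() then fills it in.
+#if 0
+static void obu_length_field_example(uint8_t *data) {
+  const size_t obu_header_size = 1, obu_payload_size = 300;
+  assert(aom_uleb_size_in_bytes(obu_payload_size) == 2);
+  const size_t gap = av1_obu_memmove(obu_header_size, obu_payload_size, data);
+  assert(gap == 2);
+  if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+      AOM_CODEC_OK)
+    assert(0);
+  // Layout is now: [header(1)][length(2)][payload(300)].
+}
+#endif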
+
+static AOM_INLINE void add_trailing_bits(struct aom_write_bit_buffer *wb) {
+ if (aom_wb_is_byte_aligned(wb)) {
+ aom_wb_write_literal(wb, 0x80, 8);
+ } else {
+ // assumes that the other bits are already 0s
+ aom_wb_write_bit(wb, 1);
+ }
+}
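+
+// Illustrative sketch (not upstream code): trailing bits are a single 1 bit
+// padded with zeros to the next byte boundary. After three payload bits
+// (e.g. 101) the stop bit yields 0b10110000 = 0xB0 in the final byte.
+#if 0
+static void trailing_bits_example(void) {
+  uint8_t buf[1] = { 0 };
+  struct aom_write_bit_buffer wb = { buf, 0 };
+  aom_wb_write_literal(&wb, 5, 3); // payload bits 101
+  add_trailing_bits(&wb); // writes the stop bit
+  assert(buf[0] == 0xB0);
+}
+#endif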
+
+static AOM_INLINE void write_bitstream_level(AV1_LEVEL seq_level_idx,
+ struct aom_write_bit_buffer *wb) {
+ assert(is_valid_seq_level_idx(seq_level_idx));
+ aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS);
+}
+
+uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
+ uint8_t *const dst) {
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ write_profile(seq_params->profile, &wb);
+
+ // Still picture or not
+ aom_wb_write_bit(&wb, seq_params->still_picture);
+ assert(IMPLIES(!seq_params->still_picture,
+ !seq_params->reduced_still_picture_hdr));
+ // whether to use reduced still picture header
+ aom_wb_write_bit(&wb, seq_params->reduced_still_picture_hdr);
+
+ if (seq_params->reduced_still_picture_hdr) {
+ assert(seq_params->timing_info_present == 0);
+ assert(seq_params->decoder_model_info_present_flag == 0);
+ assert(seq_params->display_model_info_present_flag == 0);
+ write_bitstream_level(seq_params->seq_level_idx[0], &wb);
+ } else {
+ aom_wb_write_bit(
+ &wb, seq_params->timing_info_present); // timing info present flag
+
+ if (seq_params->timing_info_present) {
+ // timing_info
+ write_timing_info_header(&seq_params->timing_info, &wb);
+ aom_wb_write_bit(&wb, seq_params->decoder_model_info_present_flag);
+ if (seq_params->decoder_model_info_present_flag) {
+ write_decoder_model_info(&seq_params->decoder_model_info, &wb);
+ }
+ }
+ aom_wb_write_bit(&wb, seq_params->display_model_info_present_flag);
+ aom_wb_write_literal(&wb, seq_params->operating_points_cnt_minus_1,
+ OP_POINTS_CNT_MINUS_1_BITS);
+ int i;
+ for (i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+ aom_wb_write_literal(&wb, seq_params->operating_point_idc[i],
+ OP_POINTS_IDC_BITS);
+ write_bitstream_level(seq_params->seq_level_idx[i], &wb);
+ if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0)
+ aom_wb_write_bit(&wb, seq_params->tier[i]);
+ if (seq_params->decoder_model_info_present_flag) {
+ aom_wb_write_bit(
+ &wb, seq_params->op_params[i].decoder_model_param_present_flag);
+ if (seq_params->op_params[i].decoder_model_param_present_flag) {
+ write_dec_model_op_parameters(
+ &seq_params->op_params[i],
+ seq_params->decoder_model_info
+ .encoder_decoder_buffer_delay_length,
+ &wb);
+ }
+ }
+ if (seq_params->display_model_info_present_flag) {
+ aom_wb_write_bit(
+ &wb, seq_params->op_params[i].display_model_param_present_flag);
+ if (seq_params->op_params[i].display_model_param_present_flag) {
+ assert(seq_params->op_params[i].initial_display_delay >= 1);
+ assert(seq_params->op_params[i].initial_display_delay <= 10);
+ aom_wb_write_literal(
+ &wb, seq_params->op_params[i].initial_display_delay - 1, 4);
+ }
+ }
+ }
+ }
+ write_sequence_header(seq_params, &wb);
+
+ write_color_config(seq_params, &wb);
+
+ aom_wb_write_bit(&wb, seq_params->film_grain_params_present);
+
+ add_trailing_bits(&wb);
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+}
+
+static uint32_t write_frame_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd,
+ struct aom_write_bit_buffer *saved_wb,
+ uint8_t *const dst,
+ int append_trailing_bits) {
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ write_uncompressed_header_obu(cpi, xd, saved_wb, &wb);
+ if (append_trailing_bits) add_trailing_bits(&wb);
+ return aom_wb_bytes_written(&wb);
+}
+
+static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile,
+ int end_tile, int tiles_log2,
+ int tile_start_and_end_present_flag) {
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ if (!tiles_log2) return size;
+
+ aom_wb_write_bit(&wb, tile_start_and_end_present_flag);
+
+ if (tile_start_and_end_present_flag) {
+ aom_wb_write_literal(&wb, start_tile, tiles_log2);
+ aom_wb_write_literal(&wb, end_tile, tiles_log2);
+ }
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+}
+
+extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+ const char *filename);
+
+typedef struct {
+ uint32_t tg_hdr_size;
+ uint32_t frame_header_size;
+} LargeTileFrameOBU;
+
+// Initialize OBU header for large scale tile case.
+static uint32_t init_large_scale_tile_obu_header(
+ AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb,
+ LargeTileFrameOBU *lst_obu) {
+ AV1LevelParams *const level_params = &cpi->ppi->level_params;
+ CurrentFrame *const current_frame = &cpi->common.current_frame;
+ // For large_scale_tile case, we always have only one tile group, so it can
+ // be written as an OBU_FRAME.
+ const OBU_TYPE obu_type = OBU_FRAME;
+ lst_obu->tg_hdr_size = av1_write_obu_header(
+ level_params, &cpi->frame_header_count, obu_type, 0, *data);
+ *data += lst_obu->tg_hdr_size;
+
+ const uint32_t frame_header_size =
+ write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, saved_wb, *data, 0);
+ *data += frame_header_size;
+ lst_obu->frame_header_size = frame_header_size;
+ // (yunqing) This test ensures the correctness of large scale tile coding.
+ if (cpi->oxcf.tile_cfg.enable_ext_tile_debug) {
+ char fn[20] = "./fh";
+ fn[4] = current_frame->frame_number / 100 + '0';
+ fn[5] = (current_frame->frame_number % 100) / 10 + '0';
+ fn[6] = (current_frame->frame_number % 10) + '0';
+ fn[7] = '\0';
+ av1_print_uncompressed_frame_header(*data - frame_header_size,
+ frame_header_size, fn);
+ }
+ return frame_header_size;
+}
+
+// Write the total buffer size and related information into the OBU header
+// for the large-scale tile case.
+static void write_large_scale_tile_obu_size(
+ const CommonTileParams *const tiles, uint8_t *const dst, uint8_t *data,
+ struct aom_write_bit_buffer *saved_wb, LargeTileFrameOBU *const lst_obu,
+ int have_tiles, uint32_t *total_size, int max_tile_size,
+ int max_tile_col_size) {
+ int tile_size_bytes = 0;
+ int tile_col_size_bytes = 0;
+ if (have_tiles) {
+ *total_size = remux_tiles(
+ tiles, data, *total_size - lst_obu->frame_header_size, max_tile_size,
+ max_tile_col_size, &tile_size_bytes, &tile_col_size_bytes);
+ *total_size += lst_obu->frame_header_size;
+ }
+
+ // In the EXT_TILE case, only one tile group is used. Following the OBU
+ // syntax, write the current tile group size before the tile data
+ // (including the tile column headers). The tile group size does not
+ // include the bytes that store the size itself.
+ *total_size += lst_obu->tg_hdr_size;
+ const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size;
+ const size_t length_field_size =
+ av1_obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst);
+ if (av1_write_uleb_obu_size(lst_obu->tg_hdr_size, obu_payload_size, dst) !=
+ AOM_CODEC_OK)
+ assert(0);
+
+ *total_size += (uint32_t)length_field_size;
+ saved_wb->bit_buffer += length_field_size;
+
+ // Now fill in the gaps in the uncompressed header.
+ if (have_tiles) {
+ assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+ aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2);
+
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+ aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+ }
+}
+
+// Store information on each large scale tile in the OBU header.
+static void write_large_scale_tile_obu(
+ AV1_COMP *const cpi, uint8_t *const dst, LargeTileFrameOBU *const lst_obu,
+ int *const largest_tile_id, uint32_t *total_size, const int have_tiles,
+ unsigned int *const max_tile_size, unsigned int *const max_tile_col_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+
+ TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+ const int tile_cols = tiles->cols;
+ const int tile_rows = tiles->rows;
+ unsigned int tile_size = 0;
+
+ av1_reset_pack_bs_thread_data(&cpi->td);
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile_info;
+ const int is_last_col = (tile_col == tile_cols - 1);
+ const uint32_t col_offset = *total_size;
+
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+ // The last column does not have a column header
+ if (!is_last_col) *total_size += 4;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ const int data_offset = have_tiles ? 4 : 0;
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ aom_writer mode_bc;
+
+ buf->data = dst + *total_size + lst_obu->tg_hdr_size;
+
+ // If CONFIG_EXT_TILE = 1, every tile in the row has a header,
+ // even the last one, unless no tiling is used at all.
+ *total_size += data_offset;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+ mode_bc.allow_update_cdf = !tiles->large_scale;
+ mode_bc.allow_update_cdf =
+ mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
+ aom_start_encode(&mode_bc, buf->data + data_offset);
+ write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col);
+ if (aom_stop_encode(&mode_bc) < 0) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR, "Error writing modes");
+ }
+ tile_size = mode_bc.pos;
+ buf->size = tile_size;
+
+ // Record the maximum tile size we see, so we can compact headers later.
+ if (tile_size > *max_tile_size) {
+ *max_tile_size = tile_size;
+ *largest_tile_id = tile_cols * tile_row + tile_col;
+ }
+
+ if (have_tiles) {
+ // tile header: size of this tile, or copy offset
+ uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES;
+ const int tile_copy_mode =
+ ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256) ? 1
+ : 0;
+
+ // If tile_copy_mode = 1, check whether this tile is a copy tile.
+ // Copy tiles are very unlikely on key frames, so skip the search
+ // there to avoid unnecessary work.
+ if (cm->current_frame.frame_type != KEY_FRAME && tile_copy_mode) {
+ const int identical_tile_offset =
+ find_identical_tile(tile_row, tile_col, tile_buffers);
+
+ // Indicate a copy-tile by setting the most significant bit.
+ // The row-offset to copy from is stored in the highest byte.
+ // remux_tiles will move these around later
+ if (identical_tile_offset > 0) {
+ tile_size = 0;
+ tile_header = identical_tile_offset | 0x80;
+ tile_header <<= 24;
+ }
+ }
+
+ mem_put_le32(buf->data, (MEM_VALUE_T)tile_header);
+ }
+
+ *total_size += tile_size;
+ }
+ if (!is_last_col) {
+ uint32_t col_size = *total_size - col_offset - 4;
+ mem_put_le32(dst + col_offset + lst_obu->tg_hdr_size, col_size);
+
+ // Record the maximum tile column size we see.
+ *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
+ }
+ }
+ av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
+}
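+
+// Illustrative sketch (hypothetical helper, not upstream code): how a reader
+// would interpret the 32-bit tile header written above. The MSB marks a copy
+// tile; the row offset to copy from sits in the high byte, and a copy tile
+// carries no payload.
+#if 0
+static void parse_tile_header_example(uint32_t tile_header) {
+  if (tile_header >> 31) {
+    const int row_offset = (tile_header >> 24) & 0x7F; // rows above to copy
+    (void)row_offset; // no tile payload follows
+  } else {
+    const uint32_t size = tile_header + AV1_MIN_TILE_SIZE_BYTES;
+    (void)size; // a payload of this many bytes follows
+  }
+}
+#endif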
+
+// Packs information in the obu header for large scale tiles.
+static INLINE uint32_t pack_large_scale_tiles_in_tg_obus(
+ AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb, int *const largest_tile_id) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ uint32_t total_size = 0;
+ unsigned int max_tile_size = 0;
+ unsigned int max_tile_col_size = 0;
+ const int have_tiles = tiles->cols * tiles->rows > 1;
+ uint8_t *data = dst;
+
+ LargeTileFrameOBU lst_obu;
+
+ total_size +=
+ init_large_scale_tile_obu_header(cpi, &data, saved_wb, &lst_obu);
+
+ write_large_scale_tile_obu(cpi, dst, &lst_obu, largest_tile_id, &total_size,
+ have_tiles, &max_tile_size, &max_tile_col_size);
+
+ write_large_scale_tile_obu_size(tiles, dst, data, saved_wb, &lst_obu,
+ have_tiles, &total_size, max_tile_size,
+ max_tile_col_size);
+
+ return total_size;
+}
+
+// Writes obu, tile group and uncompressed headers to bitstream.
+void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd,
+ PackBSParams *const pack_bs_params,
+ const int tile_idx) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ int *const curr_tg_hdr_size = &pack_bs_params->curr_tg_hdr_size;
+ const int tg_size =
+ (tiles->rows * tiles->cols + cpi->num_tg - 1) / cpi->num_tg;
+
+ // Write Tile group, frame and OBU header
+ // A new tile group begins at this tile. Write the obu header and
+ // tile group header
+ const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP;
+ *curr_tg_hdr_size = av1_write_obu_header(
+ &cpi->ppi->level_params, &cpi->frame_header_count, obu_type,
+ pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr);
+ pack_bs_params->obu_header_size = *curr_tg_hdr_size;
+
+ if (cpi->num_tg == 1)
+ *curr_tg_hdr_size += write_frame_header_obu(
+ cpi, xd, pack_bs_params->saved_wb,
+ pack_bs_params->tile_data_curr + *curr_tg_hdr_size, 0);
+ *curr_tg_hdr_size += write_tile_group_header(
+ pack_bs_params->tile_data_curr + *curr_tg_hdr_size, tile_idx,
+ AOMMIN(tile_idx + tg_size - 1, tiles->cols * tiles->rows - 1),
+ (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1);
+ *pack_bs_params->total_size += *curr_tg_hdr_size;
+}
+
+// Pack tile data in the bitstream with tile_group, frame
+// and OBU header.
+void av1_pack_tile_info(AV1_COMP *const cpi, ThreadData *const td,
+ PackBSParams *const pack_bs_params) {
+ aom_writer mode_bc;
+ AV1_COMMON *const cm = &cpi->common;
+ int tile_row = pack_bs_params->tile_row;
+ int tile_col = pack_bs_params->tile_col;
+ uint32_t *const total_size = pack_bs_params->total_size;
+ TileInfo tile_info;
+ av1_tile_set_col(&tile_info, cm, tile_col);
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ mode_bc.allow_update_cdf = 1;
+ mode_bc.allow_update_cdf =
+ mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
+
+ unsigned int tile_size;
+
+ const int num_planes = av1_num_planes(cm);
+ av1_reset_loop_restoration(&td->mb.e_mbd, num_planes);
+
+ pack_bs_params->buf.data = pack_bs_params->dst + *total_size;
+
+ // The last tile of the tile group does not have a header.
+ if (!pack_bs_params->is_last_tile_in_tg) *total_size += 4;
+
+ // Pack tile data
+ aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size);
+ write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col);
+ if (aom_stop_encode(&mode_bc) < 0) {
+ aom_internal_error(td->mb.e_mbd.error_info, AOM_CODEC_ERROR,
+ "Error writing modes");
+ }
+ tile_size = mode_bc.pos;
+ assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
+
+ pack_bs_params->buf.size = tile_size;
+
+ // Write tile size
+ if (!pack_bs_params->is_last_tile_in_tg) {
+ // size of this tile
+ mem_put_le32(pack_bs_params->buf.data, tile_size - AV1_MIN_TILE_SIZE_BYTES);
+ }
+}
+
+void av1_write_last_tile_info(
+ AV1_COMP *const cpi, const FrameHeaderInfo *fh_info,
+ struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size,
+ uint8_t *curr_tg_start, uint32_t *const total_size,
+ uint8_t **tile_data_start, int *const largest_tile_id,
+ int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header) {
+ // write current tile group size
+ const uint32_t obu_payload_size =
+ (uint32_t)(*curr_tg_data_size) - obu_header_size;
+ const size_t length_field_size =
+ av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start);
+ if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
+ curr_tg_start) != AOM_CODEC_OK) {
+ assert(0);
+ }
+ *curr_tg_data_size += (int)length_field_size;
+ *total_size += (uint32_t)length_field_size;
+ *tile_data_start += length_field_size;
+ if (cpi->num_tg == 1) {
+ // if this tg is combined with the frame header then update saved
+ // frame header base offset according to length field size
+ saved_wb->bit_buffer += length_field_size;
+ }
+
+ if (!(*is_first_tg) && cpi->common.features.error_resilient_mode) {
+ // Make room for a duplicate Frame Header OBU.
+ memmove(curr_tg_start + fh_info->total_length, curr_tg_start,
+ *curr_tg_data_size);
+
+ // Insert a copy of the Frame Header OBU.
+ memcpy(curr_tg_start, fh_info->frame_header, fh_info->total_length);
+
+ // Force context update tile to be the first tile in error
+ // resilient mode as the duplicate frame headers will have
+ // context_update_tile_id set to 0
+ *largest_tile_id = 0;
+
+ // Rewrite the OBU header to change the OBU type to Redundant Frame
+ // Header.
+ av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count,
+ OBU_REDUNDANT_FRAME_HEADER, obu_extn_header,
+ &curr_tg_start[fh_info->obu_header_byte_offset]);
+
+ *curr_tg_data_size += (int)(fh_info->total_length);
+ *total_size += (uint32_t)(fh_info->total_length);
+ }
+ *is_first_tg = 0;
+}
+
+void av1_reset_pack_bs_thread_data(ThreadData *const td) {
+ td->coefficient_size = 0;
+ td->max_mv_magnitude = 0;
+ av1_zero(td->interp_filter_selected);
+}
+
+void av1_accumulate_pack_bs_thread_data(AV1_COMP *const cpi,
+ ThreadData const *td) {
+ int do_max_mv_magnitude_update = 1;
+ cpi->rc.coefficient_size += td->coefficient_size;
+
+ // Disable max_mv_magnitude update for parallel frames based on update flag.
+ if (!cpi->do_frame_data_update) do_max_mv_magnitude_update = 0;
+
+ if (cpi->sf.mv_sf.auto_mv_step_size && do_max_mv_magnitude_update)
+ cpi->mv_search_params.max_mv_magnitude =
+ AOMMAX(cpi->mv_search_params.max_mv_magnitude, td->max_mv_magnitude);
+
+ for (InterpFilter filter = EIGHTTAP_REGULAR; filter < SWITCHABLE; filter++)
+ cpi->common.cur_frame->interp_filter_selected[filter] +=
+ td->interp_filter_selected[filter];
+}
+
+// Store information related to each default tile in the OBU header.
+static void write_tile_obu(
+ AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+ struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+ const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+ unsigned int *max_tile_size, uint32_t *const obu_header_size,
+ uint8_t **tile_data_start) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int tile_cols = tiles->cols;
+ const int tile_rows = tiles->rows;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cpi->num_tg;
+ const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
+ int tile_count = 0;
+ size_t curr_tg_data_size = 0;
+ uint8_t *tile_data_curr = dst;
+ int new_tg = 1;
+ int is_first_tg = 1;
+
+ av1_reset_pack_bs_thread_data(&cpi->td);
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+
+ int is_last_tile_in_tg = 0;
+ if (new_tg) {
+ tile_data_curr = dst + *total_size;
+ tile_count = 0;
+ }
+ tile_count++;
+
+ if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1))
+ is_last_tile_in_tg = 1;
+
+ xd->tile_ctx = &this_tile->tctx;
+
+ // PackBSParams stores all parameters required to pack tile and header
+ // info.
+ PackBSParams pack_bs_params;
+ pack_bs_params.dst = dst;
+ pack_bs_params.curr_tg_hdr_size = 0;
+ pack_bs_params.is_last_tile_in_tg = is_last_tile_in_tg;
+ pack_bs_params.new_tg = new_tg;
+ pack_bs_params.obu_extn_header = obu_extn_header;
+ pack_bs_params.obu_header_size = 0;
+ pack_bs_params.saved_wb = saved_wb;
+ pack_bs_params.tile_col = tile_col;
+ pack_bs_params.tile_row = tile_row;
+ pack_bs_params.tile_data_curr = tile_data_curr;
+ pack_bs_params.total_size = total_size;
+
+ if (new_tg)
+ av1_write_obu_tg_tile_headers(cpi, xd, &pack_bs_params, tile_idx);
+
+ av1_pack_tile_info(cpi, &cpi->td, &pack_bs_params);
+
+ if (new_tg) {
+ curr_tg_data_size = pack_bs_params.curr_tg_hdr_size;
+ *tile_data_start += pack_bs_params.curr_tg_hdr_size;
+ *obu_header_size = pack_bs_params.obu_header_size;
+ new_tg = 0;
+ }
+ if (is_last_tile_in_tg) new_tg = 1;
+
+ curr_tg_data_size +=
+ (pack_bs_params.buf.size + (is_last_tile_in_tg ? 0 : 4));
+
+ if (pack_bs_params.buf.size > *max_tile_size) {
+ *largest_tile_id = tile_idx;
+ *max_tile_size = (unsigned int)pack_bs_params.buf.size;
+ }
+
+ if (is_last_tile_in_tg)
+ av1_write_last_tile_info(cpi, fh_info, saved_wb, &curr_tg_data_size,
+ tile_data_curr, total_size, tile_data_start,
+ largest_tile_id, &is_first_tg,
+ *obu_header_size, obu_extn_header);
+ *total_size += (uint32_t)pack_bs_params.buf.size;
+ }
+ }
+ av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
+}
+
+// Write the total buffer size and related information into the OBU header
+// for the default tile case.
+static void write_tile_obu_size(AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb,
+ int largest_tile_id, uint32_t *const total_size,
+ unsigned int max_tile_size,
+ uint32_t obu_header_size,
+ uint8_t *tile_data_start) {
+ const CommonTileParams *const tiles = &cpi->common.tiles;
+
+ // Fill in context_update_tile_id, indicating the tile to use for the
+ // CDF update. The encoder currently sets it to the largest tile
+ // (but this is up to the encoder).
+ aom_wb_overwrite_literal(saved_wb, largest_tile_id,
+ (tiles->log2_cols + tiles->log2_rows));
+ // If there is more than one tile group, tile_size_bytes takes the default
+ // value 4 and does not need to be set. For a single tile group it is set
+ // in the section below.
+ if (cpi->num_tg != 1) return;
+ int tile_size_bytes = 4, unused;
+ const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst);
+ const uint32_t tile_data_size = *total_size - tile_data_offset;
+
+ *total_size = remux_tiles(tiles, tile_data_start, tile_data_size,
+ max_tile_size, 0, &tile_size_bytes, &unused);
+ *total_size += tile_data_offset;
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+
+ aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+
+ // Update the OBU length if remux_tiles() reduced the size.
+ uint64_t payload_size;
+ size_t length_field_size;
+ int res =
+ aom_uleb_decode(dst + obu_header_size, *total_size - obu_header_size,
+ &payload_size, &length_field_size);
+ assert(res == 0);
+ (void)res;
+
+ const uint64_t new_payload_size =
+ *total_size - obu_header_size - length_field_size;
+ if (new_payload_size != payload_size) {
+ size_t new_length_field_size;
+ res = aom_uleb_encode(new_payload_size, length_field_size,
+ dst + obu_header_size, &new_length_field_size);
+ assert(res == 0);
+ if (new_length_field_size < length_field_size) {
+ const size_t src_offset = obu_header_size + length_field_size;
+ const size_t dst_offset = obu_header_size + new_length_field_size;
+ memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size);
+ *total_size -= (int)(length_field_size - new_length_field_size);
+ }
+ }
+}
+
+// As per the experiments, single-threaded bitstream packing is better for
+// frames with a smaller bitstream size, because the setup overhead of the
+// multithreaded path exceeds the time required to pack such small
+// bitstreams. This function computes the required number of workers based
+// on the setup-time and job-dispatch-time overheads for the given tiles and
+// available workers.
+static int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles,
+ int avail_workers, bool pack_bs_mt_enabled) {
+ if (!pack_bs_mt_enabled) return 1;
+
+ uint64_t frame_abs_sum_level = 0;
+
+ for (int idx = 0; idx < num_tiles; idx++)
+ frame_abs_sum_level += tile_data[idx].abs_sum_level;
+
+ int ideal_num_workers = 1;
+ const float job_disp_time_const = (float)num_tiles * JOB_DISP_TIME_OH_CONST;
+ float max_sum = 0.0;
+
+ for (int num_workers = avail_workers; num_workers > 1; num_workers--) {
+ const float fas_per_worker_const =
+ ((float)(num_workers - 1) / num_workers) * frame_abs_sum_level;
+ const float setup_time_const = (float)num_workers * SETUP_TIME_OH_CONST;
+ const float this_sum = fas_per_worker_const - setup_time_const -
+ job_disp_time_const / num_workers;
+
+ if (this_sum > max_sum) {
+ max_sum = this_sum;
+ ideal_num_workers = num_workers;
+ }
+ }
+ return ideal_num_workers;
+}
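+
+// Worked form of the heuristic above (a restatement, not upstream text):
+// each candidate worker count w is scored as
+//
+//   score(w) = frame_abs_sum_level * (w - 1) / w   // parallelizable work
+//              - w * SETUP_TIME_OH_CONST           // thread setup cost
+//              - num_tiles * JOB_DISP_TIME_OH_CONST / w // dispatch cost
+//
+// and the w with the largest positive score wins; if no score beats zero,
+// a single worker is used, which is the small-frame case described above.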
+
+static INLINE uint32_t pack_tiles_in_tg_obus(
+ AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header,
+ const FrameHeaderInfo *fh_info, int *const largest_tile_id) {
+ const CommonTileParams *const tiles = &cpi->common.tiles;
+ uint32_t total_size = 0;
+ unsigned int max_tile_size = 0;
+ uint32_t obu_header_size = 0;
+ uint8_t *tile_data_start = dst;
+ const int tile_cols = tiles->cols;
+ const int tile_rows = tiles->rows;
+ const int num_tiles = tile_rows * tile_cols;
+
+ const int num_workers = calc_pack_bs_mt_workers(
+ cpi->tile_data, num_tiles, cpi->mt_info.num_mod_workers[MOD_PACK_BS],
+ cpi->mt_info.pack_bs_mt_enabled);
+
+ if (num_workers > 1) {
+ av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header,
+ fh_info, largest_tile_id, &max_tile_size,
+ &obu_header_size, &tile_data_start, num_workers);
+ } else {
+ write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header,
+ fh_info, largest_tile_id, &max_tile_size, &obu_header_size,
+ &tile_data_start);
+ }
+
+ if (num_tiles > 1)
+ write_tile_obu_size(cpi, dst, saved_wb, *largest_tile_id, &total_size,
+ max_tile_size, obu_header_size, tile_data_start);
+ return total_size;
+}
+
+static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb,
+ uint8_t obu_extension_header,
+ const FrameHeaderInfo *fh_info,
+ int *const largest_tile_id) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ *largest_tile_id = 0;
+
+ // Select the coding strategy (temporal or spatial)
+ if (cm->seg.enabled && cm->seg.update_map) {
+ if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
+ cm->seg.temporal_update = 0;
+ } else {
+ cm->seg.temporal_update = 1;
+ if (cpi->td.rd_counts.seg_tmp_pred_cost[0] <
+ cpi->td.rd_counts.seg_tmp_pred_cost[1])
+ cm->seg.temporal_update = 0;
+ }
+ }
+
+ if (tiles->large_scale)
+ return pack_large_scale_tiles_in_tg_obus(cpi, dst, saved_wb,
+ largest_tile_id);
+
+ return pack_tiles_in_tg_obus(cpi, dst, saved_wb, obu_extension_header,
+ fh_info, largest_tile_id);
+}
+
+static size_t av1_write_metadata_obu(const aom_metadata_t *metadata,
+ uint8_t *const dst) {
+ size_t coded_metadata_size = 0;
+ const uint64_t metadata_type = (uint64_t)metadata->type;
+ if (aom_uleb_encode(metadata_type, sizeof(metadata_type), dst,
+ &coded_metadata_size) != 0) {
+ return 0;
+ }
+ memcpy(dst + coded_metadata_size, metadata->payload, metadata->sz);
+ // Add trailing bits.
+ dst[coded_metadata_size + metadata->sz] = 0x80;
+ return (uint32_t)(coded_metadata_size + metadata->sz + 1);
+}
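+
+// Illustrative sketch (hypothetical metadata, not upstream code): with a
+// type value that leb128-encodes in one byte and a 3-byte payload, the
+// payload written above is 5 bytes: [type(1)][payload(3)][trailing 0x80].
+#if 0
+static void metadata_obu_layout_example(void) {
+  uint8_t dst[8];
+  uint8_t payload[3] = { 'a', 'b', 'c' };
+  aom_metadata_t md = { /*type=*/1, payload, /*sz=*/3, AOM_MIF_ANY_FRAME };
+  assert(av1_write_metadata_obu(&md, dst) == 5);
+  assert(dst[0] == 1 && dst[4] == 0x80);
+}
+#endif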
+
+static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) {
+ if (!cpi->source) return 0;
+ AV1_COMMON *const cm = &cpi->common;
+ aom_metadata_array_t *arr = cpi->source->metadata;
+ if (!arr) return 0;
+ size_t obu_header_size = 0;
+ size_t obu_payload_size = 0;
+ size_t total_bytes_written = 0;
+ size_t length_field_size = 0;
+ for (size_t i = 0; i < arr->sz; i++) {
+ aom_metadata_t *current_metadata = arr->metadata_array[i];
+ if (current_metadata && current_metadata->payload) {
+ if ((cm->current_frame.frame_type == KEY_FRAME &&
+ current_metadata->insert_flag == AOM_MIF_KEY_FRAME) ||
+ (cm->current_frame.frame_type != KEY_FRAME &&
+ current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) ||
+ current_metadata->insert_flag == AOM_MIF_ANY_FRAME) {
+ obu_header_size = av1_write_obu_header(&cpi->ppi->level_params,
+ &cpi->frame_header_count,
+ OBU_METADATA, 0, dst);
+ obu_payload_size =
+ av1_write_metadata_obu(current_metadata, dst + obu_header_size);
+ length_field_size =
+ av1_obu_memmove(obu_header_size, obu_payload_size, dst);
+ if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) ==
+ AOM_CODEC_OK) {
+ const size_t obu_size = obu_header_size + obu_payload_size;
+ dst += obu_size + length_field_size;
+ total_bytes_written += obu_size + length_field_size;
+ } else {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Error writing metadata OBU size");
+ }
+ }
+ }
+ }
+ return total_bytes_written;
+}
+
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+ int *const largest_tile_id) {
+ uint8_t *data = dst;
+ uint32_t data_size;
+ AV1_COMMON *const cm = &cpi->common;
+ AV1LevelParams *const level_params = &cpi->ppi->level_params;
+ uint32_t obu_header_size = 0;
+ uint32_t obu_payload_size = 0;
+ FrameHeaderInfo fh_info = { NULL, 0, 0 };
+ const uint8_t obu_extension_header =
+ cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0;
+
+ // If no non-zero delta_q has been used, reset delta_q_present_flag
+ if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) {
+ cm->delta_q_info.delta_q_present_flag = 0;
+ }
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_reset_write();
+#endif
+
+ cpi->frame_header_count = 0;
+
+ // The TD is now written outside the frame encode loop
+
+ // Write the sequence header OBU at each key frame or intra-only frame,
+ // preceded by its uleb128-encoded size.
+ if (cm->current_frame.frame_type == INTRA_ONLY_FRAME ||
+ cm->current_frame.frame_type == KEY_FRAME) {
+ obu_header_size = av1_write_obu_header(
+ level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, data);
+ obu_payload_size =
+ av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size);
+ const size_t length_field_size =
+ av1_obu_memmove(obu_header_size, obu_payload_size, data);
+ if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ data += obu_header_size + obu_payload_size + length_field_size;
+ }
+
+ // write metadata obus before the frame obu that has the show_frame flag set
+ if (cm->show_frame) data += av1_write_metadata_array(cpi, data);
+
+ const int write_frame_header =
+ (cpi->num_tg > 1 || encode_show_existing_frame(cm));
+ struct aom_write_bit_buffer saved_wb = { NULL, 0 };
+ size_t length_field = 0;
+ if (write_frame_header) {
+ // Write Frame Header OBU.
+ fh_info.frame_header = data;
+ obu_header_size =
+ av1_write_obu_header(level_params, &cpi->frame_header_count,
+ OBU_FRAME_HEADER, obu_extension_header, data);
+ obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb,
+ data + obu_header_size, 1);
+
+ length_field = av1_obu_memmove(obu_header_size, obu_payload_size, data);
+ if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ fh_info.obu_header_byte_offset = 0;
+ fh_info.total_length = obu_header_size + obu_payload_size + length_field;
+ data += fh_info.total_length;
+ }
+
+ if (encode_show_existing_frame(cm)) {
+ data_size = 0;
+ } else {
+ // Since length_field is determined adaptively after frame header
+ // encoding, saved_wb must be adjusted accordingly.
+ if (saved_wb.bit_buffer != NULL) {
+ saved_wb.bit_buffer += length_field;
+ }
+
+ // Each tile group OBU will be preceded by the uleb128-encoded size of
+ // the tile group OBU.
+ data_size = write_tiles_in_tg_obus(
+ cpi, data, &saved_wb, obu_extension_header, &fh_info, largest_tile_id);
+ }
+ data += data_size;
+ *size = data - dst;
+ return AOM_CODEC_OK;
+}
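+
+// Summary sketch (a restatement, not upstream text) of the OBU ordering
+// produced by av1_pack_bitstream() for a key frame with several tile groups:
+//
+//   [OBU_SEQUENCE_HEADER][OBU_METADATA ...][OBU_FRAME_HEADER]
+//   [OBU_TILE_GROUP] ... [OBU_TILE_GROUP]
+//
+// With a single tile group the frame header and tile data are folded into
+// one OBU_FRAME; for show_existing_frame only the frame header is emitted.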
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
new file mode 100644
index 0000000000..12e8a630db
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_BITSTREAM_H_
+#define AOM_AV1_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/level.h"
+#include "aom_dsp/bitwriter.h"
+
+struct aom_write_bit_buffer;
+struct AV1_COMP;
+struct ThreadData;
+
+/*!\cond */
+
+// Stores the location and size of a tile's data in the bitstream. Used to
+// identify identical tiles later.
+typedef struct {
+ uint8_t *data;
+ size_t size;
+} TileBufferEnc;
+
+typedef struct {
+ uint8_t *frame_header;
+ size_t obu_header_byte_offset;
+ size_t total_length;
+} FrameHeaderInfo;
+
+typedef struct {
+ struct aom_write_bit_buffer *saved_wb; // Bit stream buffer writer structure
+ TileBufferEnc buf; // Structure to hold bitstream buffer and size
+ uint32_t *total_size; // Size of the bitstream buffer for the tile in bytes
+ uint8_t *dst; // Base address of tile bitstream buffer
+ uint8_t *tile_data_curr; // Base address of tile-group bitstream buffer
+ size_t tile_buf_size; // Available bitstream buffer for the tile in bytes
+ uint8_t obu_extn_header; // Presence of OBU extension header
+ uint32_t obu_header_size; // Size of the OBU header
+ int curr_tg_hdr_size; // Size of the obu, tg, frame headers
+ int tile_size_mi; // Tile size in mi units
+ int tile_row; // Row index of the tile being packed
+ int tile_col; // Column index of the tile being packed
+ int is_last_tile_in_tg; // Flag to indicate last tile in a tile-group
+ int new_tg; // Flag to indicate starting of a new tile-group
+} PackBSParams;
+
+typedef struct {
+ uint64_t abs_sum_level;
+ uint16_t tile_idx;
+} PackBSTileOrder;
+
+// Pack bitstream data for pack bitstream multi-threading.
+typedef struct {
+#if CONFIG_MULTITHREAD
+ // Mutex lock used while dispatching jobs.
+ pthread_mutex_t *mutex_;
+#endif
+ // Tile order structure of pack bitstream multithreading.
+ PackBSTileOrder pack_bs_tile_order[MAX_TILES];
+
+ // Index of next job to be processed.
+ int next_job_idx;
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool pack_bs_mt_exit;
+} AV1EncPackBSSync;
+
+/*!\endcond */
+
+// Writes only the OBU Sequence Header payload, and returns the size of the
+// payload written to 'dst'. This function does not write the OBU header, the
+// optional extension, or the OBU size to 'dst'.
+uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
+ uint8_t *const dst);
+
+// Writes the OBU header byte, and the OBU header extension byte when
+// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
+uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
+ int *frame_header_count, OBU_TYPE obu_type,
+ int obu_extension, uint8_t *const dst);
+
+int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
+ uint8_t *dest);
+
+// Pack tile data in the bitstream with tile_group, frame
+// and OBU header.
+void av1_pack_tile_info(struct AV1_COMP *const cpi, struct ThreadData *const td,
+ PackBSParams *const pack_bs_params);
+
+void av1_write_last_tile_info(
+ struct AV1_COMP *const cpi, const FrameHeaderInfo *fh_info,
+ struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size,
+ uint8_t *curr_tg_start, uint32_t *const total_size,
+ uint8_t **tile_data_start, int *const largest_tile_id,
+ int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header);
+
+/*!\brief Pack the bitstream for one frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ */
+int av1_pack_bitstream(struct AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+ int *const largest_tile_id);
+
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+ TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w);
+
+void av1_reset_pack_bs_thread_data(struct ThreadData *const td);
+
+void av1_accumulate_pack_bs_thread_data(struct AV1_COMP *const cpi,
+ struct ThreadData const *td);
+
+void av1_write_obu_tg_tile_headers(struct AV1_COMP *const cpi,
+ MACROBLOCKD *const xd,
+ PackBSParams *const pack_bs_params,
+ const int tile_idx);
+
+int av1_neg_interleave(int x, int ref, int max);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_BITSTREAM_H_
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
new file mode 100644
index 0000000000..33d2d8c2a0
--- /dev/null
+++ b/third_party/aom/av1/encoder/block.h
@@ -0,0 +1,1515 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * Declares various structs used to encode the current partition block.
+ */
+#ifndef AOM_AV1_ENCODER_BLOCK_H_
+#define AOM_AV1_ENCODER_BLOCK_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/entropy.h"
+#include "av1/common/enums.h"
+#include "av1/common/mvref_common.h"
+
+#include "av1/encoder/enc_enums.h"
+#include "av1/encoder/mcomp_structs.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/partition_cnn_weights.h"
+#endif
+
+#include "av1/encoder/hash_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//! Minimum linear dimension of a tpl block
+#define MIN_TPL_BSIZE_1D 16
+//! Maximum number of tpl block in a super block
+#define MAX_TPL_BLK_IN_SB (MAX_SB_SIZE / MIN_TPL_BSIZE_1D)
+//! Number of txfm hash records kept for the partition block.
+#define RD_RECORD_BUFFER_LEN 8
+
+/*! Maximum value taken by transform type probabilities */
+#define MAX_TX_TYPE_PROB 1024
+
+//! Compute color sensitivity index for given plane
+#define COLOR_SENS_IDX(plane) ((plane)-1)
+
+//! Enable timer statistics of mode search in non-rd
+#define COLLECT_NONRD_PICK_MODE_STAT 0
+
+/*!\cond */
+#if COLLECT_NONRD_PICK_MODE_STAT
+#include "aom_ports/aom_timer.h"
+
+typedef struct _mode_search_stat_nonrd {
+ int32_t num_blocks[BLOCK_SIZES];
+ int64_t total_block_times[BLOCK_SIZES];
+ int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT];
+ int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t nonskipped_search_times[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t ms_time[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t ifs_time[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t model_rd_time[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t txfm_time[BLOCK_SIZES][MB_MODE_COUNT];
+ struct aom_usec_timer timer1;
+ struct aom_usec_timer timer2;
+ struct aom_usec_timer bsize_timer;
+} mode_search_stat_nonrd;
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+/*!\endcond */
+
+/*! \brief Superblock level encoder info
+ *
+ * SuperblockEnc stores superblock level information used by the encoder for
+ * more efficient encoding. Currently this is mostly used to store TPL data
+ * for the current superblock.
+ */
+typedef struct {
+ //! Minimum partition size for the sb.
+ BLOCK_SIZE min_partition_size;
+ //! Maximum partition size for the sb.
+ BLOCK_SIZE max_partition_size;
+
+ /*****************************************************************************
+ * \name TPL Info
+ *
+ * Information gathered from tpl_model at tpl block precision for the
+ * superblock to speed up the encoding process.
+ ****************************************************************************/
+ /**@{*/
+ //! Number of TPL blocks in this superblock.
+ int tpl_data_count;
+ //! TPL's estimate of inter cost for each tpl block.
+ int64_t tpl_inter_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+ //! TPL's estimate of intra cost for each tpl block.
+ int64_t tpl_intra_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+ //! Motion vectors found by TPL model for each tpl block.
+ int_mv tpl_mv[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB][INTER_REFS_PER_FRAME];
+ //! TPL's stride for the arrays in this struct.
+ int tpl_stride;
+ /**@}*/
+} SuperBlockEnc;
+
+/*! \brief Stores the best performing modes.
+ */
+typedef struct {
+ //! The mbmi used to reconstruct the winner mode.
+ MB_MODE_INFO mbmi;
+ //! Rdstats of the winner mode.
+ RD_STATS rd_cost;
+ //! Rdcost of the winner mode
+ int64_t rd;
+ //! Luma rate of the winner mode.
+ int rate_y;
+ //! Chroma rate of the winner mode.
+ int rate_uv;
+ //! The color map needed to reconstruct palette mode.
+ uint8_t color_index_map[MAX_SB_SQUARE];
+ //! The current winner mode.
+ THR_MODES mode_index;
+} WinnerModeStats;
+
+/*! \brief Each source plane of the current macroblock
+ *
+ * This struct also stores the txfm buffers and quantizer settings.
+ */
+typedef struct macroblock_plane {
+ //! Stores source - pred so the txfm can be computed later
+ int16_t *src_diff;
+ //! Dequantized coefficients
+ tran_low_t *dqcoeff;
+ //! Quantized coefficients
+ tran_low_t *qcoeff;
+ //! Transformed coefficients
+ tran_low_t *coeff;
+ //! Location of the end of qcoeff (end of block).
+ uint16_t *eobs;
+ //! Contexts used to code the transform coefficients.
+ uint8_t *txb_entropy_ctx;
+ //! A buffer containing the source frame.
+ struct buf_2d src;
+
+ /*! \name Quantizer Settings
+ *
+ * \attention These are used/accessed only in the quantization process.
+ * RDO does not and *must not* depend on any of these values.
+ * All values below share the coefficient scale/shift used in TX.
+ */
+ /**@{*/
+ //! Quantization step size used by AV1_XFORM_QUANT_FP.
+ const int16_t *quant_fp_QTX;
+ //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_FP.
+ const int16_t *round_fp_QTX;
+ //! Quantization step size used by AV1_XFORM_QUANT_B.
+ const int16_t *quant_QTX;
+ //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_B.
+ const int16_t *round_QTX;
+ //! Scale factor to shift coefficients toward zero. Only used by QUANT_B.
+ const int16_t *quant_shift_QTX;
+ //! Size of the quantization bin around 0. Only Used by QUANT_B
+ const int16_t *zbin_QTX;
+ //! Dequantizer
+ const int16_t *dequant_QTX;
+ /**@}*/
+} MACROBLOCK_PLANE;
+
+/*! \brief Costs for encoding the coefficients within a level.
+ *
+ * Covers everything including txb_skip, eob, dc_sign,
+ */
+typedef struct {
+ //! Cost to skip txfm for the current txfm block.
+ int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
+ /*! \brief Cost for encoding the base_eob of a level.
+ *
+ * Decoder uses base_eob to derive the base level as base_level := base_eob + 1.
+ */
+ int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
+ /*! \brief Cost for encoding the base level of a coefficient.
+ *
+ * The coded symbol gives min(level, 3) directly; a value of 3 is extended
+ * by the coefficient range symbols (see lps_cost).
+ */
+ int base_cost[SIG_COEF_CONTEXTS][8];
+ /*! \brief Cost for encoding the last non-zero coefficient.
+ *
+ * Eob is derived from eob_extra at the decoder as eob := eob_extra + 1
+ */
+ int eob_extra_cost[EOB_COEF_CONTEXTS][2];
+ //! Cost for encoding the dc_sign
+ int dc_sign_cost[DC_SIGN_CONTEXTS][2];
+ //! Cost for encoding an increment to the coefficient
+ int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1];
+} LV_MAP_COEFF_COST;
+
+/*! \brief Costs for encoding the eob.
+ */
+typedef struct {
+ //! eob_cost.
+ int eob_cost[2][11];
+} LV_MAP_EOB_COST;
+
+/*! \brief Stores the transforms coefficients for the whole superblock.
+ */
+typedef struct {
+ //! The transformed coefficients.
+ tran_low_t *tcoeff[MAX_MB_PLANE];
+ //! Where the transformed coefficients end.
+ uint16_t *eobs[MAX_MB_PLANE];
+ /*! \brief Transform block entropy contexts.
+ *
+ * Each element is used as a bit field.
+ * - Bits 0~3: txb_skip_ctx
+ * - Bits 4~5: dc_sign_ctx.
+ */
+ uint8_t *entropy_ctx[MAX_MB_PLANE];
+} CB_COEFF_BUFFER;
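+
+/* Illustrative sketch (hypothetical helpers, not upstream code): packing and
+ * unpacking the entropy_ctx bit field documented above.
+ */
+#if 0
+static INLINE uint8_t pack_entropy_ctx(int txb_skip_ctx, int dc_sign_ctx) {
+  return (uint8_t)((txb_skip_ctx & 0x0F) | ((dc_sign_ctx & 0x03) << 4));
+}
+static INLINE int get_txb_skip_ctx(uint8_t ctx) { return ctx & 0x0F; }
+static INLINE int get_dc_sign_ctx(uint8_t ctx) { return (ctx >> 4) & 0x03; }
+#endif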
+
+/*! \brief Extended mode info derived from mbmi.
+ */
+typedef struct {
+ // TODO(angiebird): Reduce the buffer size according to sb_type
+ //! The reference mv list for the current block.
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+ //! The weights used to compute the ref mvs.
+ uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+ //! Number of ref mvs in the drl.
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+ //! Global mvs
+ int_mv global_mvs[REF_FRAMES];
+ //! Context used to encode the current mode.
+ int16_t mode_context[MODE_CTX_REF_FRAMES];
+} MB_MODE_INFO_EXT;
+
+/*! \brief Stores best extended mode information at frame level.
+ *
+ * The frame level in here is used in bitstream preparation stage. The
+ * information in \ref MB_MODE_INFO_EXT are copied to this struct to save
+ * memory.
+ */
+typedef struct {
+ //! \copydoc MB_MODE_INFO_EXT::ref_mv_stack
+ CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE];
+ //! \copydoc MB_MODE_INFO_EXT::weight
+ uint16_t weight[USABLE_REF_MV_STACK_SIZE];
+ //! \copydoc MB_MODE_INFO_EXT::ref_mv_count
+ uint8_t ref_mv_count;
+ // TODO(Ravi/Remya): Reduce the buffer size of global_mvs
+ //! \copydoc MB_MODE_INFO_EXT::global_mvs
+ int_mv global_mvs[REF_FRAMES];
+ //! \copydoc MB_MODE_INFO_EXT::mode_context
+ int16_t mode_context;
+ //! Offset of current coding block's coeff buffer relative to the sb.
+ uint16_t cb_offset[PLANE_TYPES];
+} MB_MODE_INFO_EXT_FRAME;
+
+/*! \brief Inter-mode txfm results for a partition block.
+ */
+typedef struct {
+ //! Txfm size used if the current mode is intra mode.
+ TX_SIZE tx_size;
+ //! Txfm sizes used if the current mode is inter mode.
+ TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+ //! Map showing which txfm block skips the txfm process.
+ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ //! Map showing the txfm types for each block.
+ uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ //! Rd_stats for the whole partition block.
+ RD_STATS rd_stats;
+ //! Hash value of the current record.
+ uint32_t hash_value;
+} MB_RD_INFO;
+
+/*! \brief Hash records of the inter-mode transform results
+ *
+ * Hash records of the inter-mode transform results for a whole partition block
+ * based on the residue. Since this operates on the partition block level, this
+ * can give us a whole txfm partition tree.
+ */
+typedef struct {
+ /*! Circular buffer that stores the inter-mode txfm results of a partition
+ * block.
+ */
+ MB_RD_INFO mb_rd_info[RD_RECORD_BUFFER_LEN];
+ //! Index to insert the newest rd record.
+ int index_start;
+ //! Number of info stored in this record.
+ int num;
+ //! Hash function
+ CRC32C crc_calculator;
+} MB_RD_RECORD;
+
+//! Number of compound rd stats
+#define MAX_COMP_RD_STATS 64
+/*! \brief Rdcost stats in compound mode.
+ */
+typedef struct {
+ //! Rate of the compound modes.
+ int32_t rate[COMPOUND_TYPES];
+ //! Distortion of the compound modes.
+ int64_t dist[COMPOUND_TYPES];
+ //! Estimated rate of the compound modes.
+ int32_t model_rate[COMPOUND_TYPES];
+ //! Estimated distortion of the compound modes.
+ int64_t model_dist[COMPOUND_TYPES];
+ //! Rate need to send the mask type.
+ int comp_rs2[COMPOUND_TYPES];
+ //! Motion vector for each predictor.
+ int_mv mv[2];
+ //! Ref frame for each predictor.
+ MV_REFERENCE_FRAME ref_frames[2];
+ //! Current prediction mode.
+ PREDICTION_MODE mode;
+ //! Current interpolation filter.
+ int_interpfilters filter;
+ //! Refmv index in the drl.
+ int ref_mv_idx;
+ //! Whether the predictors are GLOBALMV.
+ int is_global[2];
+ //! Current parameters for interinter mode.
+ INTERINTER_COMPOUND_DATA interinter_comp;
+} COMP_RD_STATS;
+
+/*! \brief Contains buffers used to speed up rdopt for obmc.
+ *
+ * See the comments for calc_target_weighted_pred for details.
+ */
+typedef struct {
+ /*! \brief A new source weighted with the above and left predictors.
+ *
+ * Used to efficiently construct multiple obmc predictors during rdopt.
+ */
+ int32_t *wsrc;
+ /*! \brief A new mask constructed from the original horz/vert mask.
+ *
+ * \copydetails wsrc
+ */
+ int32_t *mask;
+ /*! \brief Prediction from the up predictor.
+ *
+ * Used to build the obmc predictor.
+ */
+ uint8_t *above_pred;
+ /*! \brief Prediction from the up predictor.
+ *
+ * \copydetails above_pred
+ */
+ uint8_t *left_pred;
+} OBMCBuffer;
+
+/*! \brief Contains color maps used in palette mode.
+ */
+typedef struct {
+ //! The best color map found.
+ uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
+ //! A temporary buffer used for k-means clustering.
+ int16_t kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
+} PALETTE_BUFFER;
+
+/*! \brief Contains buffers used by av1_compound_type_rd()
+ *
+ * For sizes and alignment of these arrays, refer to
+ * alloc_compound_type_rd_buffers() function.
+ */
+typedef struct {
+ //! First prediction.
+ uint8_t *pred0;
+ //! Second prediction.
+ uint8_t *pred1;
+ //! Source - first prediction.
+ int16_t *residual1;
+ //! Second prediction - first prediction.
+ int16_t *diff10;
+ //! Backup of the best segmentation mask.
+ uint8_t *tmp_best_mask_buf;
+} CompoundTypeRdBuffers;
+
+/*! \brief Holds some parameters related to partitioning schemes in AV1.
+ */
+// TODO(chiyotsai@google.com): Consolidate this with SIMPLE_MOTION_DATA_TREE
+typedef struct {
+#if !CONFIG_REALTIME_ONLY
+ // The following 4 parameters are used for cnn-based partitioning on intra
+ // frame.
+ /*! \brief Current index on the partition block quad tree.
+ *
+ * Used to index into the cnn buffer for partition decision.
+ */
+ int quad_tree_idx;
+ //! Whether the CNN buffer contains valid output.
+ int cnn_output_valid;
+ //! A buffer used by our segmentation CNN for intra-frame partitioning.
+ float cnn_buffer[CNN_OUT_BUF_SIZE];
+  //! Log of the quantization parameter of the ancestor BLOCK_64X64.
+ float log_q;
+#endif
+
+ /*! \brief Variance of the subblocks in the superblock.
+ *
+ * This is used by rt mode for variance based partitioning.
+   * The indices correspond to the following block sizes:
+ * - 0 - 128x128
+ * - 1-2 - 128x64
+ * - 3-4 - 64x128
+ * - 5-8 - 64x64
+ * - 9-16 - 64x32
+ * - 17-24 - 32x64
+ * - 25-40 - 32x32
+ * - 41-104 - 16x16
+ */
+ uint8_t variance_low[105];
+} PartitionSearchInfo;
+
+/*!\cond */
+enum {
+ /**
+ * Do not prune transform depths.
+ */
+ TX_PRUNE_NONE = 0,
+ /**
+ * Prune largest transform (depth 0) based on NN model.
+ */
+ TX_PRUNE_LARGEST = 1,
+ /**
+ * Prune split transforms (depth>=1) based on NN model.
+ */
+ TX_PRUNE_SPLIT = 2,
+} UENUM1BYTE(TX_PRUNE_TYPE);
+/*!\endcond */
+
+/*! \brief Defines the parameters used to perform txfm search.
+ *
+ * For the most part, this determines how various speed features are used.
+ */
+typedef struct {
+ /*! \brief Whether to limit the intra txfm search type to the default txfm.
+ *
+ * This could either be a result of either sequence parameter or speed
+ * features.
+ */
+ int use_default_intra_tx_type;
+
+  /*! Probability threshold used for conditionally forcing tx type. */
+ int default_inter_tx_type_prob_thresh;
+
+ //! Whether to prune 2d transforms based on 1d transform results.
+ int prune_2d_txfm_mode;
+
+ /*! \brief Variable from \ref WinnerModeParams based on current eval mode.
+ *
+ * See the documentation for \ref WinnerModeParams for more detail.
+ */
+ unsigned int coeff_opt_thresholds[2];
+ /*! \copydoc coeff_opt_thresholds */
+ unsigned int tx_domain_dist_threshold;
+ /*! \copydoc coeff_opt_thresholds */
+ TX_SIZE_SEARCH_METHOD tx_size_search_method;
+ /*! \copydoc coeff_opt_thresholds */
+ unsigned int use_transform_domain_distortion;
+ /*! \copydoc coeff_opt_thresholds */
+ unsigned int skip_txfm_level;
+
+ /*! \brief How to search for the optimal tx_size
+ *
+ * If ONLY_4X4, use TX_4X4; if TX_MODE_LARGEST, use the largest tx_size for
+ * the current partition block; if TX_MODE_SELECT, search through the whole
+ * tree.
+ *
+ * \attention
+   * Although this looks suspiciously similar to a bitstream element, this
+ * tx_mode_search_type is only used internally by the encoder, and is *not*
+ * written to the bitstream. It determines what kind of tx_mode would be
+ * searched. For example, we might set it to TX_MODE_LARGEST to find a good
+ * candidate, then code it as TX_MODE_SELECT.
+ */
+ TX_MODE tx_mode_search_type;
+
+ /*!
+ * Determines whether a block can be predicted as transform skip or DC only
+ * based on residual mean and variance.
+ * Type 0 : No skip block or DC only block prediction
+ * Type 1 : Prediction of skip block based on residual mean and variance
+ * Type 2 : Prediction of skip block or DC only block based on residual mean
+ * and variance
+ */
+ unsigned int predict_dc_level;
+
+ /*!
+ * Whether or not we should use the quantization matrix as weights for PSNR
+ * during RD search.
+ */
+ int use_qm_dist_metric;
+
+ /*!
+ * Keep track of previous mode evaluation stage type. This will be used to
+ * reset mb rd hash record when mode evaluation type changes.
+ */
+ int mode_eval_type;
+
+#if !CONFIG_REALTIME_ONLY
+ //! Indicates the transform depths for which RD evaluation is skipped.
+ TX_PRUNE_TYPE nn_prune_depths_for_intra_tx;
+
+ /*! \brief Indicates if NN model should be invoked to prune transform depths.
+ *
+ * Used to signal whether NN model should be evaluated to prune the R-D
+ * evaluation of specific transform depths.
+ */
+ bool enable_nn_prune_intra_tx_depths;
+#endif
+} TxfmSearchParams;
+
+/*!\cond */
+#define MAX_NUM_8X8_TXBS ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1))
+#define MAX_NUM_16X16_TXBS ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2))
+#define MAX_NUM_32X32_TXBS ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3))
+#define MAX_NUM_64X64_TXBS ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4))
+/*!\endcond */
+
+/*! \brief Stores various encoding/search decisions related to txfm search.
+ *
+ * This struct contains a cache of previous txfm results, and some buffers for
+ * the current txfm decision.
+ */
+typedef struct {
+ //! Whether to skip transform and quantization on a partition block level.
+ uint8_t skip_txfm;
+
+ /*! \brief Whether to skip transform and quantization on a txfm block level.
+ *
+ * Skips transform and quantization on a transform block level inside the
+ * current partition block. Each element of this array is used as a bit-field.
+   * So, for example, if we are skipping on the luma plane, then the last bit
+   * would be set to 1.
+ */
+ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ /*! \brief Transform types inside the partition block
+ *
+ * Keeps a record of what kind of transform to use for each of the transform
+ * block inside the partition block.
+ * \attention The buffer here is *never* directly used. Instead, this just
+ * allocates the memory for MACROBLOCKD::tx_type_map during rdopt on the
+ * partition block. So if we need to save memory, we could move the allocation
+ * to pick_sb_mode instead.
+ */
+ uint8_t tx_type_map_[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ //! Txfm hash records of inter-modes.
+ MB_RD_RECORD *mb_rd_record;
+
+ /*! \brief Number of txb splits.
+ *
+ * Keep track of how many times we've used split tx partition for transform
+ * blocks. Somewhat misleadingly, this parameter doesn't actually keep track
+   * of the count for the current block. Instead, it's a cumulative count
+   * across the whole frame. The main usage is that if txb_split_count is zero,
+   * then
+ * we can signal TX_MODE_LARGEST at frame level.
+ */
+ // TODO(chiyotsai@google.com): Move this to a more appropriate location such
+ // as ThreadData.
+ unsigned int txb_split_count;
+#if CONFIG_SPEED_STATS
+ //! For debugging. Used to check how many txfm searches we are doing.
+ unsigned int tx_search_count;
+#endif // CONFIG_SPEED_STATS
+} TxfmSearchInfo;
+#undef MAX_NUM_8X8_TXBS
+#undef MAX_NUM_16X16_TXBS
+#undef MAX_NUM_32X32_TXBS
+#undef MAX_NUM_64X64_TXBS
+
+/*! \brief Holds the entropy costs for various modes sent to the bitstream.
+ *
+ * \attention This does not include the costs for mv and transformed
+ * coefficients.
+ */
+typedef struct {
+ /*****************************************************************************
+ * \name Partition Costs
+ ****************************************************************************/
+ /**@{*/
+ //! Cost for coding the partition.
+ int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Intra Costs: General
+ ****************************************************************************/
+ /**@{*/
+ //! Luma mode cost for inter frame.
+ int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ //! Luma mode cost for intra frame.
+ int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+ //! Chroma mode cost
+ int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+ //! filter_intra_cost
+ int filter_intra_cost[BLOCK_SIZES_ALL][2];
+ //! filter_intra_mode_cost
+ int filter_intra_mode_cost[FILTER_INTRA_MODES];
+ //! angle_delta_cost
+ int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+
+  //! Rate associated with each alpha codeword.
+ int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Intra Costs: Screen Contents
+ ****************************************************************************/
+ /**@{*/
+ //! intrabc_cost
+ int intrabc_cost[2];
+
+ //! palette_y_size_cost
+ int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ //! palette_uv_size_cost
+ int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ //! palette_y_color_cost
+ int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ //! palette_uv_color_cost
+ int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ //! palette_y_mode_cost
+ int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+ //! palette_uv_mode_cost
+ int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: MV Modes
+ ****************************************************************************/
+ /**@{*/
+ //! skip_mode_cost
+ int skip_mode_cost[SKIP_MODE_CONTEXTS][2];
+ //! newmv_mode_cost
+ int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+ //! zeromv_mode_cost
+ int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2];
+ //! refmv_mode_cost
+ int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+ //! drl_mode_cost0
+ int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Ref Frame Types
+ ****************************************************************************/
+ /**@{*/
+ //! single_ref_cost
+ int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2];
+ //! comp_inter_cost
+ int comp_inter_cost[COMP_INTER_CONTEXTS][2];
+ //! comp_ref_type_cost
+ int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS]
+ [CDF_SIZE(COMP_REFERENCE_TYPES)];
+ //! uni_comp_ref_cost
+ int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
+ [CDF_SIZE(2)];
+ /*! \brief Cost for signaling ref_frame[0] in bidir-comp mode
+ *
+ * Includes LAST_FRAME, LAST2_FRAME, LAST3_FRAME, and GOLDEN_FRAME.
+ */
+ int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2];
+ /*! \brief Cost for signaling ref_frame[1] in bidir-comp mode
+ *
+ * Includes ALTREF_FRAME, ALTREF2_FRAME, and BWDREF_FRAME.
+ */
+ int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Compound Types
+ ****************************************************************************/
+ /**@{*/
+ //! intra_inter_cost
+ int intra_inter_cost[INTRA_INTER_CONTEXTS][2];
+ //! inter_compound_mode_cost
+ int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ //! compound_type_cost
+ int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
+ //! wedge_idx_cost
+ int wedge_idx_cost[BLOCK_SIZES_ALL][16];
+ //! interintra_cost
+ int interintra_cost[BLOCK_SIZE_GROUPS][2];
+ //! wedge_interintra_cost
+ int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
+ //! interintra_mode_cost
+ int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Compound Masks
+ ****************************************************************************/
+ /**@{*/
+ //! comp_idx_cost
+ int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
+ //! comp_group_idx_cost
+ int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Motion Modes/Filters
+ ****************************************************************************/
+ /**@{*/
+ //! motion_mode_cost
+ int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
+ //! motion_mode_cost1
+ int motion_mode_cost1[BLOCK_SIZES_ALL][2];
+ //! switchable_interp_costs
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Txfm Mode Costs
+ ****************************************************************************/
+ /**@{*/
+ //! skip_txfm_cost
+ int skip_txfm_cost[SKIP_CONTEXTS][2];
+ //! tx_size_cost
+ int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+ //! txfm_partition_cost
+ int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2];
+ //! inter_tx_type_costs
+ int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ //! intra_tx_type_costs
+ int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Restoration Mode Costs
+ ****************************************************************************/
+ /**@{*/
+ //! switchable_restore_cost
+ int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+ //! wiener_restore_cost
+ int wiener_restore_cost[2];
+ //! sgrproj_restore_cost
+ int sgrproj_restore_cost[2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Segmentation Mode Costs
+ ****************************************************************************/
+ /**@{*/
+ //! tmp_pred_cost
+ int tmp_pred_cost[SEG_TEMPORAL_PRED_CTXS][2];
+ //! spatial_pred_cost
+ int spatial_pred_cost[SPATIAL_PREDICTION_PROBS][MAX_SEGMENTS];
+ /**@}*/
+} ModeCosts;
+
+/*! \brief Holds mv costs for encoding and motion search.
+ */
+typedef struct {
+ /*****************************************************************************
+ * \name Encoding Costs
+ * Here are the entropy costs needed to encode a given mv.
+   * \ref nmv_cost_alloc and \ref nmv_cost_hp_alloc are two arrays that hold
+   * the memory for the mv costs. But since the motion vectors can be
+   * negative, we shift the pointers to the middle of each array and store the
+   * results in \ref nmv_cost and \ref nmv_cost_hp for easier referencing (so,
+   * for example, nmv_cost[i][v] is valid for v in [-MV_MAX, MV_MAX]). Finally,
+   * \ref mv_cost_stack points to the \ref nmv_cost array with the mv precision
+   * we are currently working with. In essence, only \ref mv_cost_stack is
+   * needed for motion search; the others can be considered private.
+ ****************************************************************************/
+ /**@{*/
+ //! Costs for coding the zero components.
+ int nmv_joint_cost[MV_JOINTS];
+
+  //! Memory for the 1/4-pel motion vector costs.
+  int nmv_cost_alloc[2][MV_VALS];
+  //! Memory for the 1/8-pel motion vector costs.
+  int nmv_cost_hp_alloc[2][MV_VALS];
+ //! Points to the middle of \ref nmv_cost_alloc
+ int *nmv_cost[2];
+ //! Points to the middle of \ref nmv_cost_hp_alloc
+ int *nmv_cost_hp[2];
+  //! Points to the \ref nmv_cost or \ref nmv_cost_hp array in use.
+ int **mv_cost_stack;
+ /**@}*/
+} MvCosts;
+
+/*! \brief Holds mv costs for intrabc.
+ */
+typedef struct {
+ /*! Costs for coding the joint mv. */
+ int joint_mv[MV_JOINTS];
+
+ /*! \brief Cost of transmitting the actual motion vector.
+   * dv_costs_alloc[0][i] is the cost of a motion vector with vertical
+   * component (mv_row) equal to i - MV_MAX. dv_costs_alloc[1][i] is the cost
+   * of a motion vector with horizontal component (mv_col) equal to i - MV_MAX.
+ */
+ int dv_costs_alloc[2][MV_VALS];
+
+ /*! Points to the middle of \ref dv_costs_alloc. */
+ int *dv_costs[2];
+} IntraBCMVCosts;
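+
+/* Illustrative use (not part of libaom): since \ref dv_costs points to the
+ * middle of \ref dv_costs_alloc, the cost of a dv component v in
+ * [-MV_MAX, MV_MAX] is simply dv_costs[0][v] for the row component and
+ * dv_costs[1][v] for the column component. */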
+
+/*! \brief Holds the costs needed to encode the coefficients
+ */
+typedef struct {
+ //! Costs for coding the coefficients.
+ LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
+ //! Costs for coding the eobs.
+ LV_MAP_EOB_COST eob_costs[7][2];
+} CoeffCosts;
+
+/*!\cond */
+// 4: NEAREST, NEW, NEAR, GLOBAL
+#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4)
+/*!\endcond */
+struct inter_modes_info;
+
+/*! \brief Holds the motion samples for warp motion model estimation
+ */
+typedef struct {
+ //! Number of samples.
+ int num;
+ //! Sample locations in current frame.
+ int pts[16];
+  //! Sample locations in the reference frame.
+ int pts_inref[16];
+} WARP_SAMPLE_INFO;
+
+/*!\cond */
+typedef enum {
+ kZeroSad = 0,
+ kVeryLowSad = 1,
+ kLowSad = 2,
+ kMedSad = 3,
+ kHighSad = 4
+} SOURCE_SAD;
+
+typedef struct {
+ //! SAD levels in non-rd path
+ SOURCE_SAD source_sad_nonrd;
+  //! SAD levels in the rd path, for variance-based partitioning qindex
+  //! thresholds.
+ SOURCE_SAD source_sad_rd;
+ int lighting_change;
+ int low_sumdiff;
+} CONTENT_STATE_SB;
+
+// Structure to hold pixel level gradient info.
+typedef struct {
+ uint16_t abs_dx_abs_dy_sum;
+ int8_t hist_bin_idx;
+ bool is_dx_zero;
+} PixelLevelGradientInfo;
+
+// Structure to hold the variance and log(1 + variance) for 4x4 sub-blocks.
+typedef struct {
+ double log_var;
+ int var;
+} Block4x4VarInfo;
+
+#ifndef NDEBUG
+typedef struct SetOffsetsLoc {
+ int mi_row;
+ int mi_col;
+ BLOCK_SIZE bsize;
+} SetOffsetsLoc;
+#endif // NDEBUG
+
+/*!\endcond */
+
+/*! \brief Encoder's parameters related to the current coding block.
+ *
+ * This struct contains most of the information the encoder needs to encode the
+ * current coding block. This includes the src and pred buffer, a copy of the
+ * decoder's view of the current block, and the txfm coefficients. It also
+ * contains various buffers and data used to speed up the encoding process.
+ */
+typedef struct macroblock {
+ /*****************************************************************************
+ * \name Source, Buffers and Decoder
+ ****************************************************************************/
+ /**@{*/
+  /*! \brief Each of the encoding planes.
+   *
+   * An array holding the src buffer for each plane of the current block. It
+ * also contains the txfm and quantized txfm coefficients.
+ */
+ struct macroblock_plane plane[MAX_MB_PLANE];
+
+ /*! \brief Decoder's view of current coding block.
+ *
+ * Contains the encoder's copy of what the decoder sees in the current block.
+ * Most importantly, this struct contains pointers to mbmi that is used in
+ * final bitstream packing.
+ */
+ MACROBLOCKD e_mbd;
+
+ /*! \brief Derived coding information.
+ *
+ * Contains extra information not transmitted in the bitstream but are
+ * derived. For example, this contains the stack of ref_mvs.
+ */
+ MB_MODE_INFO_EXT mbmi_ext;
+
+ /*! \brief Finalized mbmi_ext for the whole frame.
+ *
+ * Contains the finalized info in mbmi_ext that gets used at the frame level
+ * for bitstream packing.
+ */
+ MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame;
+
+ //! Entropy context for the current row.
+ FRAME_CONTEXT *row_ctx;
+ /*! \brief Entropy context for the current tile.
+ *
+ * This context will be used to update color_map_cdf pointer which would be
+   * used during pack bitstream. For the single-thread and tile-multithreading
+   * cases this pointer will be the same as xd->tile_ctx, but in the row-mt case
+ * xd->tile_ctx will point to a temporary context while tile_pb_ctx will point
+ * to the accurate tile context.
+ */
+ FRAME_CONTEXT *tile_pb_ctx;
+
+ /*! \brief Buffer of transformed coefficients
+ *
+ * Points to cb_coef_buff in the AV1_COMP struct, which contains the finalized
+ * coefficients. This is here to conveniently copy the best coefficients to
+ * frame level for bitstream packing. Since CB_COEFF_BUFFER is allocated on a
+ * superblock level, we need to combine it with cb_offset to get the proper
+ * position for the current coding block.
+ */
+ CB_COEFF_BUFFER *cb_coef_buff;
+ //! Offset of current coding block's coeff buffer relative to the sb.
+ uint16_t cb_offset[PLANE_TYPES];
+
+ //! Modified source and masks used for fast OBMC search.
+ OBMCBuffer obmc_buffer;
+ //! Buffer to store the best palette map.
+ PALETTE_BUFFER *palette_buffer;
+ //! Buffer used for compound_type_rd().
+ CompoundTypeRdBuffers comp_rd_buffer;
+ //! Buffer to store convolution during averaging process in compound mode.
+ CONV_BUF_TYPE *tmp_conv_dst;
+
+ /*! \brief Temporary buffer to hold prediction.
+ *
+ * Points to a buffer that is used to hold temporary prediction results. This
+ * is used in two ways:
+ * - This is a temporary buffer used to ping-pong the prediction in
+ * handle_inter_mode.
+   * - xd->tmp_obmc_bufs also points to this buffer, and is used in obmc
+ * prediction.
+ */
+ uint8_t *tmp_pred_bufs[2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Rdopt Costs
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Quantization index for the current partition block.
+ *
+   * This is used as the index to find the quantization parameters for luma and
+ * chroma transformed coefficients.
+ */
+ int qindex;
+
+ /*! \brief Difference between frame-level qindex and current qindex.
+ *
+ * This is used to track whether a non-zero delta for qindex is used at least
+ * once in the current frame.
+ */
+ int delta_qindex;
+
+ /*! \brief Difference between frame-level qindex and qindex used to
+ * compute rdmult (lambda).
+ *
+ * rdmult_delta_qindex is assigned the same as delta_qindex before qp sweep.
+ * During qp sweep, delta_qindex is changed and used to calculate the actual
+ * quant params, while rdmult_delta_qindex remains the same, and is used to
+ * calculate the rdmult in "set_deltaq_rdmult".
+ */
+ int rdmult_delta_qindex;
+
+ /*! \brief Current qindex (before being adjusted by delta_q_res) used to
+ * derive rdmult_delta_qindex.
+ */
+ int rdmult_cur_qindex;
+
+ /*! \brief Rate-distortion multiplier.
+ *
+ * The rd multiplier used to determine the rate-distortion trade-off. This is
+ * roughly proportional to the inverse of q-index for a given frame, but this
+ * can be manipulated for better rate-control. For example, in tune_ssim
+ * mode, this is scaled by a factor related to the variance of the current
+ * block.
+ */
+ int rdmult;
+
+ //! Intra only, per sb rd adjustment.
+ int intra_sb_rdmult_modifier;
+
+ //! Superblock level distortion propagation factor.
+ double rb;
+
+ //! Energy in the current source coding block. Used to calculate \ref rdmult
+ int mb_energy;
+ //! Energy in the current source superblock. Used to calculate \ref rdmult
+ int sb_energy_level;
+
+ //! The rate needed to signal a mode to the bitstream.
+ ModeCosts mode_costs;
+
+ //! The rate needed to encode a new motion vector to the bitstream and some
+ //! multipliers for motion search.
+ MvCosts *mv_costs;
+
+ /*! The rate needed to encode a new motion vector to the bitstream in intrabc
+ * mode.
+ */
+ IntraBCMVCosts *dv_costs;
+
+ //! The rate needed to signal the txfm coefficients to the bitstream.
+ CoeffCosts coeff_costs;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Rate to Distortion Multipliers
+ ****************************************************************************/
+ /**@{*/
+ //! A multiplier that converts mv cost to l2 error.
+ int errorperbit;
+ //! A multiplier that converts mv cost to l1 error.
+ int sadperbit;
+ /**@}*/
+
+ /******************************************************************************
+ * \name Segmentation
+ *****************************************************************************/
+ /**@{*/
+ /*! \brief Skip mode for the segment
+ *
+ * A syntax element of the segmentation mode. In skip_block mode, all mvs are
+   * set to 0 and all txfms are skipped.
+ */
+ int seg_skip_block;
+
+ /*! \brief Number of segment 1 blocks
+   * Actual number of (4x4) blocks to which delta-q was applied,
+ * for segment 1.
+ */
+ int actual_num_seg1_blocks;
+
+ /*!\brief Number of segment 2 blocks
+   * Actual number of (4x4) blocks to which delta-q was applied,
+ * for segment 2.
+ */
+ int actual_num_seg2_blocks;
+
+ /*!\brief Number of zero motion vectors
+ */
+ int cnt_zeromv;
+
+ /*!\brief Flag to force zeromv-skip at superblock level, for nonrd path.
+ *
+ * 0/1 imply zeromv-skip is disabled/enabled. 2 implies that the blocks
+ * in the superblock may be marked as zeromv-skip at block level.
+ */
+ int force_zeromv_skip_for_sb;
+
+ /*!\brief Flag to force zeromv-skip at block level, for nonrd path.
+ */
+ int force_zeromv_skip_for_blk;
+
+ /*! \brief Previous segment id for which qmatrices were updated.
+   * This is used to bypass setting of qmatrices if there is no change in
+   * qindex.
+ */
+ int prev_segment_id;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Superblock
+ ****************************************************************************/
+ /**@{*/
+ //! Information on a whole superblock level.
+ // TODO(chiyotsai@google.com): Refactor this out of macroblock
+ SuperBlockEnc sb_enc;
+
+ /*! \brief Characteristics of the current superblock.
+ *
+ * Characteristics like whether the block has high sad, low sad, etc. This is
+ * only used by av1 realtime mode.
+ */
+ CONTENT_STATE_SB content_state_sb;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Reference Frame Search
+ ****************************************************************************/
+ /**@{*/
+  /*! \brief SAD of the prediction given the predicted mv, for each ref frame.
+ *
+ * This is used to measure how viable a reference frame is.
+ */
+ int pred_mv_sad[REF_FRAMES];
+ /*! \brief The minimum of \ref pred_mv_sad.
+ *
+ * Index 0 stores the minimum \ref pred_mv_sad across past reference frames.
+ * Index 1 stores the minimum \ref pred_mv_sad across future reference frames.
+ */
+ int best_pred_mv_sad[2];
+ //! The sad of the 1st mv ref (nearest).
+ int pred_mv0_sad[REF_FRAMES];
+ //! The sad of the 2nd mv ref (near).
+ int pred_mv1_sad[REF_FRAMES];
+
+ /*! \brief Disables certain ref frame pruning based on tpl.
+ *
+ * Determines whether a given ref frame is "good" based on data from the TPL
+   * model. If so, this stops selective_ref_frame from pruning the given ref
+ * frame at block level.
+ */
+ uint8_t tpl_keep_ref_frame[REF_FRAMES];
+
+ /*! \brief Warp motion samples buffer.
+ *
+ * Store the motion samples used for warp motion.
+ */
+ WARP_SAMPLE_INFO warp_sample_info[REF_FRAMES];
+
+ /*! \brief Reference frames picked by the square subblocks in a superblock.
+ *
+ * Keeps track of ref frames that are selected by square partition blocks
+ * within a superblock, in MI resolution. They can be used to prune ref frames
+ * for rectangular blocks.
+ */
+ int picked_ref_frames_mask[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ /*! \brief Prune ref frames in real-time mode.
+ *
+ * Determines whether to prune reference frames in real-time mode. For the
+   * most part, this is the same as
+ * cpi->sf.rt_sf.nonrd_prune_ref_frame_search, but this can be selectively
+ * turned off if the only frame available is GOLDEN_FRAME.
+ */
+ int nonrd_prune_ref_frame_search;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Partition Search
+ ****************************************************************************/
+ /**@{*/
+ //! Stores some partition-search related buffers.
+ PartitionSearchInfo part_search_info;
+
+ /*! \brief Whether to disable some features to force a mode in current block.
+ *
+   * In some cases, our speed features can be overly aggressive and prune every
+   * mode searched in the superblock. When this happens, we set
+ * must_find_valid_partition to 1 to reduce the number of speed features, and
+ * recode the superblock again.
+ */
+ int must_find_valid_partition;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Prediction Mode Search
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Inter skip mode.
+ *
+ * Skip mode tries to use the closest forward and backward references for
+ * inter prediction. Skip here means to skip transmitting the reference
+ * frames, not to be confused with skip_txfm.
+ */
+ int skip_mode;
+
+ /*! \brief Factors used for rd-thresholding.
+ *
+   * Determines an rd threshold used to decide whether to continue searching
+   * the current mode. If the current best rd is already <= threshold, we skip
+ * the current mode.
+ */
+ int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+
+ /*! \brief Tracks the winner modes in the current coding block.
+ *
+ * Winner mode is a two-pass strategy to find the best prediction mode. In the
+ * first pass, we search the prediction modes with a limited set of txfm
+ * options, and keep the top modes. These modes are called the winner modes.
+ * In the second pass, we retry the winner modes with more thorough txfm
+ * options.
+ */
+ WinnerModeStats *winner_mode_stats;
+ //! Tracks how many winner modes there are.
+ int winner_mode_count;
+
+ /*! \brief The model used for rd-estimation to avoid txfm
+ *
+ * These are for inter_mode_rd_model_estimation, which is another two pass
+ * approach. In this speed feature, we collect data in the first couple frames
+ * to build an rd model to estimate the rdcost of a prediction model based on
+ * the residue error. Once enough data is collected, this speed feature uses
+ * the estimated rdcost to find the most performant prediction mode. Then we
+   * follow up with a second pass to find the best transform for the mode.
+   * Determines whether to use the reduced-complexity transform block search
+   * model to select prediction modes, or the full-complexity model to select
+   * the transform kernel.
+ */
+ TXFM_RD_MODEL rd_model;
+
+ /*! \brief Stores the inter mode information needed to build an rd model.
+ *
+ * These are for inter_mode_rd_model_estimation, which is another two pass
+ * approach. In this speed feature, we collect data in the first couple frames
+ * to build an rd model to estimate the rdcost of a prediction model based on
+ * the residue error. Once enough data is collected, this speed feature uses
+ * the estimated rdcost to find the most performant prediction mode. Then we
+   * follow up with a second pass to find the best transform for the mode.
+ */
+ // TODO(any): try to consolidate this speed feature with winner mode
+ // processing.
+ struct inter_modes_info *inter_modes_info;
+
+ //! How to blend the compound predictions.
+ uint8_t compound_idx;
+
+  //! A cache of compound type search results so they can be reused later.
+ COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS];
+ //! The idx for the latest compound mode in the cache \ref comp_rd_stats.
+ int comp_rd_stats_idx;
+
+ /*! \brief Whether to recompute the luma prediction.
+ *
+ * In interpolation search, we can usually skip recalculating the luma
+ * prediction because it is already calculated by a previous predictor. This
+ * flag signifies that some modes might have been skipped, so we need to
+ * rebuild the prediction.
+ */
+ int recalc_luma_mc_data;
+
+ /*! \brief Data structure to speed up intrabc search.
+ *
+ * Contains the hash table, hash function, and buffer used for intrabc.
+ */
+ IntraBCHashInfo intrabc_hash_info;
+
+ /*! \brief Whether to reuse the mode stored in mb_mode_cache. */
+ int use_mb_mode_cache;
+ /*! \brief The mode to reuse during \ref av1_rd_pick_intra_mode_sb and
+ * \ref av1_rd_pick_inter_mode. */
+ const MB_MODE_INFO *mb_mode_cache;
+ /*! \brief Pointer to the buffer which caches gradient information.
+ *
+ * Pointer to the array of structures to store gradient information of each
+   * pixel in a superblock. The buffer consists of MAX_SB_SQUARE pixel-level
+ * structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+ */
+ PixelLevelGradientInfo *pixel_gradient_info;
+ /*! \brief Flags indicating the availability of cached gradient info. */
+ bool is_sb_gradient_cached[PLANE_TYPES];
+
+ /*! \brief Flag to reuse predicted samples of inter block. */
+ bool reuse_inter_pred;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name MV Search
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Context used to determine the initial step size in motion search.
+ *
+   * This context is defined as the \f$l_\infty\f$ norm of the best ref_mvs for
+ * each frame.
+ */
+ unsigned int max_mv_context[REF_FRAMES];
+
+ /*! \brief Limit for the range of motion vectors.
+ *
+ * These define limits to motion vector components to prevent them from
+ * extending outside the UMV borders
+ */
+ FullMvLimits mv_limits;
+
+ /*! \brief Buffer for storing the search site config.
+ *
+ * When resize mode or super resolution mode is on, the stride of the
+ * reference frame does not always match what's specified in \ref
+   * MotionVectorSearchParams::search_site_cfg. When this happens, we update the
+   * search_site_cfg buffer here and use it for motion search.
+ */
+ search_site_config search_site_cfg_buf[NUM_DISTINCT_SEARCH_METHODS];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Txfm Search
+ ****************************************************************************/
+ /**@{*/
+  /*! \brief Parameters that control how the txfm search is done.
+ *
+ * Stores various txfm search related parameters such as txfm_type, txfm_size,
+ * trellis eob search, etc.
+ */
+ TxfmSearchParams txfm_search_params;
+
+ /*! \brief Results of the txfm searches that have been done.
+ *
+ * Caches old txfm search results and keeps the current txfm decisions to
+ * facilitate rdopt.
+ */
+ TxfmSearchInfo txfm_search_info;
+
+  /*! \brief Whether there is strong color activity.
+ *
+ * Used in REALTIME coding mode to enhance the visual quality at the boundary
+ * of moving color objects.
+ */
+ uint8_t color_sensitivity_sb[MAX_MB_PLANE - 1];
+ //! Color sensitivity flag for the superblock for golden reference.
+ uint8_t color_sensitivity_sb_g[MAX_MB_PLANE - 1];
+ //! Color sensitivity flag for the superblock for altref reference.
+ uint8_t color_sensitivity_sb_alt[MAX_MB_PLANE - 1];
+ //! Color sensitivity flag for the coding block.
+ uint8_t color_sensitivity[MAX_MB_PLANE - 1];
+ //! Coding block distortion value for uv/color, minimum over the inter modes.
+ int64_t min_dist_inter_uv;
+
+ //! The buffer used by search_tx_type() to swap dqcoeff in macroblockd_plane
+ // so we can keep dqcoeff of the best tx_type.
+ tran_low_t *dqcoeff_buf;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Misc
+ ****************************************************************************/
+ /**@{*/
+ //! Variance of the source frame.
+ unsigned int source_variance;
+ //! Flag to indicate coding block is zero sad.
+ int block_is_zero_sad;
+ //! Flag to indicate superblock ME in variance partition is determined to be
+ // good/reliable, and so the superblock MV will be tested in the
+ // nonrd_pickmode. This is only used for LAST_FRAME.
+ int sb_me_partition;
+ //! Flag to indicate to test the superblock MV for the coding block in the
+ // nonrd_pickmode.
+ int sb_me_block;
+ //! Motion vector from superblock MV derived from int_pro_motion() in
+ // the variance_partitioning.
+ int_mv sb_me_mv;
+ //! SSE of the current predictor.
+ unsigned int pred_sse[REF_FRAMES];
+ //! Prediction for ML based partition.
+#if CONFIG_RT_ML_PARTITIONING
+ DECLARE_ALIGNED(16, uint8_t, est_pred[128 * 128]);
+#endif
+ /**@}*/
+
+ /*! \brief NONE partition evaluated for merge.
+ *
+   * In the variance-based partitioning scheme, NONE & SPLIT partitions are
+   * evaluated to check whether SPLIT can be merged into NONE. This flag
+   * signifies that the partition was evaluated in this scheme.
+ */
+ int try_merge_partition;
+
+ /*! \brief Pointer to buffer which caches sub-block variances in a superblock.
+ *
+ * Pointer to the array of structures to store source variance information of
+ * each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to
+ * store source variance and log of source variance of each 4x4 sub-block.
+ */
+ Block4x4VarInfo *src_var_info_of_4x4_sub_blocks;
+#ifndef NDEBUG
+ /*! \brief A hash to make sure av1_set_offsets is called */
+ SetOffsetsLoc last_set_offsets_loc;
+#endif // NDEBUG
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ mode_search_stat_nonrd ms_stat_nonrd;
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+
+ /*!\brief Number of pixels in current thread that choose palette mode in the
+   * fast encoding stage for screen content tool determination.
+ */
+ int palette_pixels;
+
+ /*!\brief Pointer to the structure which stores the statistics used by
+ * sb-level multi-pass encoding.
+ */
+ struct SB_FIRST_PASS_STATS *sb_stats_cache;
+
+ /*!\brief Pointer to the structure which stores the statistics used by
+ * first-pass when superblock is searched twice consecutively.
+ */
+ struct SB_FIRST_PASS_STATS *sb_fp_stats;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+ /*!\brief Pointer to RD_STATS structure to be used in
+ * av1_rd_partition_search().
+ */
+ RD_STATS *rdcost;
+#endif // CONFIG_PARTITION_SEARCH_ORDER
+} MACROBLOCK;
+#undef SINGLE_REF_MODES
+
+/*!\cond */
+// Zeroes out 'n_stats' elements in the array x->winner_mode_stats.
+// For 'color_index_map', only the portion covered by the block size is
+// zeroed, not the whole array.
+static INLINE void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats,
+ WinnerModeStats *stats) {
+ // When winner mode stats are not required, the memory allocation is avoided
+ // for x->winner_mode_stats. The stats pointer will be NULL in such cases.
+ if (stats == NULL) return;
+
+ const int block_height = block_size_high[bsize];
+ const int block_width = block_size_wide[bsize];
+ for (int i = 0; i < n_stats; ++i) {
+ WinnerModeStats *const stat = &stats[i];
+ memset(&stat->mbmi, 0, sizeof(stat->mbmi));
+ memset(&stat->rd_cost, 0, sizeof(stat->rd_cost));
+ memset(&stat->rd, 0, sizeof(stat->rd));
+ memset(&stat->rate_y, 0, sizeof(stat->rate_y));
+ memset(&stat->rate_uv, 0, sizeof(stat->rate_uv));
+ // Do not reset the whole array as it is CPU intensive.
+ memset(&stat->color_index_map, 0,
+ block_width * block_height * sizeof(stat->color_index_map[0]));
+ memset(&stat->mode_index, 0, sizeof(stat->mode_index));
+ }
+}
+
+static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
+ static const char LUT[BLOCK_SIZES_ALL] = {
+ 0, // BLOCK_4X4
+ 1, // BLOCK_4X8
+ 1, // BLOCK_8X4
+ 0, // BLOCK_8X8
+ 1, // BLOCK_8X16
+ 1, // BLOCK_16X8
+ 0, // BLOCK_16X16
+ 1, // BLOCK_16X32
+ 1, // BLOCK_32X16
+ 0, // BLOCK_32X32
+ 1, // BLOCK_32X64
+ 1, // BLOCK_64X32
+ 0, // BLOCK_64X64
+ 0, // BLOCK_64X128
+ 0, // BLOCK_128X64
+ 0, // BLOCK_128X128
+ 1, // BLOCK_4X16
+ 1, // BLOCK_16X4
+ 1, // BLOCK_8X32
+ 1, // BLOCK_32X8
+ 1, // BLOCK_16X64
+ 1, // BLOCK_64X16
+ };
+
+ return LUT[bsize];
+}
+
+static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ return is_rect_tx_allowed_bsize(mbmi->bsize) &&
+ !xd->lossless[mbmi->segment_id];
+}
+
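+// Returns how many times the largest rectangular txfm allowed for 'bsize'
+// must be split (via sub_tx_size_map) to reach 'tx_size'. For example, for
+// BLOCK_16X16 the depth is 0 for TX_16X16, 1 for TX_8X8 and 2 for TX_4X4.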
+static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) {
+ TX_SIZE ctx_size = max_txsize_rect_lookup[bsize];
+ int depth = 0;
+ while (tx_size != ctx_size) {
+ depth++;
+ ctx_size = sub_tx_size_map[ctx_size];
+ assert(depth <= MAX_TX_DEPTH);
+ }
+ return depth;
+}
+
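+// Each entry of txb_skip is a bit-field: bit 'plane' (0 = Y, 1 = U, 2 = V)
+// records whether the txfm of that plane is skipped for the given block. In
+// debug builds, bits 4..6 are per-plane flags used to detect reads of planes
+// that were never set.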
+static INLINE void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx,
+ int skip) {
+ if (skip)
+ txb_skip[blk_idx] |= 1UL << plane;
+ else
+ txb_skip[blk_idx] &= ~(1UL << plane);
+#ifndef NDEBUG
+ // Set chroma planes to uninitialized states when luma is set to check if
+ // it will be set later
+ if (plane == 0) {
+ txb_skip[blk_idx] |= 1UL << (1 + 4);
+ txb_skip[blk_idx] |= 1UL << (2 + 4);
+ }
+
+ // Clear the initialization checking bit
+ txb_skip[blk_idx] &= ~(1UL << (plane + 4));
+#endif
+}
+
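+// For example, set_blk_skip(txb_skip, /*plane=*/0, idx, 1) marks the luma
+// txfm of block 'idx' as skipped, after which is_blk_skip(txb_skip, 0, idx)
+// returns 1.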
+static INLINE int is_blk_skip(uint8_t *txb_skip, int plane, int blk_idx) {
+#ifndef NDEBUG
+ // Check if this is initialized
+ assert(!(txb_skip[blk_idx] & (1UL << (plane + 4))));
+
+  // Only the bits within the 0x77 mask are ever set on purpose, so any bit
+  // within 0x88 indicates garbage data.
+ assert((txb_skip[blk_idx] & 0x88) == 0);
+#endif
+ return (txb_skip[blk_idx] >> plane) & 1;
+}
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_BLOCK_H_
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
new file mode 100644
index 0000000000..6ad2ddaf25
--- /dev/null
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+static int horizontal_filter(const uint8_t *s) {
+ return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
+}
+
+static int vertical_filter(const uint8_t *s, int p) {
+ return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6;
+}
+
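+// Integer approximation of the population variance, E[x^2] - (E[x])^2,
+// computed with truncating integer division.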
+static int variance(int sum, int sum_squared, int size) {
+ return sum_squared / size - (sum / size) * (sum / size);
+}
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+// p0 p1 p2 p3
+// q0 q1 q2 q3
+// block edge ->
+// r0 r1 r2 r3
+// s0 s1 s2 s3
+
+// blockiness = p0*-2+q0*6+r0*-6+s0*2 +
+// p1*-2+q1*6+r1*-6+s1*2 +
+// p2*-2+q2*6+r2*-6+s2*2 +
+// p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(abs(blockiness from reconstructed buffer) -
+//                                abs(blockiness from source buffer), 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calc
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2)/4 - ((q0 + q1 + q2 + q3) / 4)^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2)/4 - ((r0 + r1 + r2 + r3) / 4)^2
+// The returned blockiness is the scaled value
+// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ;
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, s += sp, r += rp) {
+ s_blockiness += horizontal_filter(s);
+ r_blockiness += horizontal_filter(r);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-1];
+ sum_sq_1 += s[-1] * s[-1];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// Calculate a blockiness level for a horizontal block edge
+// same as above.
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, ++s, ++r) {
+ s_blockiness += vertical_filter(s, sp);
+ r_blockiness += vertical_filter(r, rp);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-sp];
+ sum_sq_1 += s[-sp] * s[-sp];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// This function returns the blockiness for the entire frame currently by
+// looking at all borders in steps of 4.
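+//
+// Illustrative call (the frame names are assumed, not from this file),
+// comparing a source frame with its reconstruction:
+//   double b = av1_get_blockiness(src->y_buffer, src->y_stride,
+//                                 rec->y_buffer, rec->y_stride,
+//                                 width, height);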
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch, int width,
+ int height) {
+ double blockiness = 0;
+ int i, j;
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4) {
+ if (i > 0 && i < height && j > 0 && j < width) {
+ blockiness +=
+ blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4);
+ blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j,
+ img2_pitch, 4);
+ }
+ }
+ }
+ blockiness /= width * height / 16;
+ return blockiness;
+}
diff --git a/third_party/aom/av1/encoder/cnn.c b/third_party/aom/av1/encoder/cnn.c
new file mode 100644
index 0000000000..598b362753
--- /dev/null
+++ b/third_party/aom/av1/encoder/cnn.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
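+// Clamps index 'a' into the valid range [0, hi - 1]; used to implement
+// replicate padding.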
+#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
+
+typedef struct {
+ const float **input;
+ int in_width;
+ int in_height;
+ int in_stride;
+ const CNN_LAYER_CONFIG *layer_config;
+ float **output;
+ int out_stride;
+ int start_idx;
+ int th_step;
+} CONVOLVE_OPS;
+
+static INLINE float softsign(float x) { return x / (fabsf(x) + 1.0f); }
+
+static INLINE float relu(float x) { return (x < 0) ? 0 : x; }
+
+typedef struct {
+ int allocsize;
+ int channels;
+ int width, height, stride;
+ float *buf[CNN_MAX_CHANNELS];
+} TENSOR;
+
+static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); }
+
+static void free_tensor(TENSOR *tensor) {
+ if (tensor->allocsize) {
+ aom_free(tensor->buf[0]);
+ tensor->buf[0] = NULL;
+ tensor->allocsize = 0;
+ }
+}
+
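+// (Re)allocates a tensor as a single contiguous buffer of
+// channels * width * height floats, where buf[c] is a view into buf[0] at
+// offset c * width * height. The allocation only ever grows; a smaller
+// request reuses the existing buffer.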
+static bool realloc_tensor(TENSOR *tensor, int channels, int width,
+ int height) {
+ const int newallocsize = channels * width * height;
+ if (tensor->allocsize < newallocsize) {
+ free_tensor(tensor);
+ tensor->buf[0] =
+ (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
+ if (!tensor->buf[0]) return false;
+ tensor->allocsize = newallocsize;
+ }
+ tensor->width = width;
+ tensor->height = height;
+ tensor->stride = width;
+ tensor->channels = channels;
+ for (int c = 1; c < channels; ++c)
+ tensor->buf[c] = &tensor->buf[0][c * width * height];
+ return true;
+}
+
+static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
+ TENSOR *dst) {
+ assert(src->width == dst->width);
+ assert(src->height == dst->height);
+ assert(copy_channels <= src->channels);
+ if (src->stride == dst->width && dst->stride == dst->width) {
+ for (int c = 0; c < copy_channels; ++c) {
+ memcpy(dst->buf[dst_offset + c], src->buf[c],
+ sizeof(*dst->buf[0]) * src->width * src->height);
+ }
+ } else {
+ for (int c = 0; c < copy_channels; ++c) {
+ for (int r = 0; r < dst->height; ++r) {
+ memcpy(&dst->buf[dst_offset + c][r * dst->stride],
+ &src->buf[c][r * src->stride],
+ dst->width * sizeof(*dst->buf[c]));
+ }
+ }
+ }
+}
+
+static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
+ int channels, int width, int height, int stride) {
+ tensor->allocsize = 0;
+ tensor->channels = channels;
+ tensor->width = width;
+ tensor->height = height;
+ tensor->stride = stride;
+ if (buf) {
+ for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c];
+ } else {
+ for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL;
+ }
+}
+
+static void swap_tensor(TENSOR *t1, TENSOR *t2) {
+ TENSOR t = *t1;
+ *t1 = *t2;
+ *t2 = t;
+}
+
+// The concatenated tensor goes into dst, with the channels of the original
+// dst first, followed by the channels of src.
+static bool concat_tensor(const TENSOR *src, TENSOR *dst) {
+ assert(src->width == dst->width);
+ assert(src->height == dst->height);
+
+ const int dst_channels = dst->channels;
+ const int channels = dst->channels + src->channels;
+ const int newallocsize = channels * dst->width * dst->height;
+ if (dst->allocsize < newallocsize) {
+ TENSOR t;
+ init_tensor(&t);
+ // allocate new buffers and copy first the dst channels
+ if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false;
+ copy_tensor(dst, dst->channels, 0, &t);
+ // Swap the tensors and free the old buffers
+ swap_tensor(dst, &t);
+ free_tensor(&t);
+ }
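+  // Re-stitch the per-channel views: 'channels' has grown even if the
+  // existing allocation was already large enough.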
+ for (int c = 1; c < channels; ++c)
+ dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
+ // Copy the channels in src after the first dst_channels channels.
+ copy_tensor(src, src->channels, dst_channels, dst);
+ return true;
+}
+
+int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
+ return (t1->width == t2->width && t1->height == t2->height);
+}
+
+int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
+ return (t1->channels == t2->channels && t1->width == t2->width &&
+ t1->height == t2->height);
+}
+
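+// Computes the output dimensions of a single layer. Worked example
+// (illustrative): with in_width = 16, skip_width = 4, filter_width = 5 and
+// deconvolve off, PADDING_SAME_* gives out_width = (16 + 4 - 1) / 4 = 4,
+// while PADDING_VALID gives out_width = (16 - 5 + 4) / 4 = 3.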
+void av1_find_cnn_layer_output_size(int in_width, int in_height,
+ const CNN_LAYER_CONFIG *layer_config,
+ int *out_width, int *out_height) {
+ assert(layer_config->skip_width > 0);
+ assert(layer_config->skip_height > 0);
+ if (!layer_config->deconvolve) {
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ case PADDING_SAME_REPLICATE:
+ *out_width = (in_width + layer_config->skip_width - 1) /
+ layer_config->skip_width;
+ *out_height = (in_height + layer_config->skip_height - 1) /
+ layer_config->skip_height;
+ break;
+ case PADDING_VALID:
+ *out_width =
+ (in_width - layer_config->filter_width + layer_config->skip_width) /
+ layer_config->skip_width;
+ *out_height = (in_height - layer_config->filter_height +
+ layer_config->skip_height) /
+ layer_config->skip_height;
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+ } else {
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ case PADDING_SAME_REPLICATE:
+ *out_width = in_width * layer_config->skip_width;
+ *out_height = in_height * layer_config->skip_height;
+ break;
+ case PADDING_VALID:
+ *out_width = (in_width - 1) * layer_config->skip_width +
+ layer_config->filter_width;
+ *out_height = (in_height - 1) * layer_config->skip_height +
+ layer_config->filter_height;
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+ }
+}
+
+void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
+ int channels_per_branch[]) {
+ int branch = layer_config->branch;
+ const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
+ if (layer_config->branch_copy_type == BRANCH_INPUT) {
+ channels_per_branch[b] = layer_config->in_channels;
+ } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+ channels_per_branch[b] = layer_config->out_channels;
+ } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
+ channels_per_branch[b] = layer_config->out_channels;
+ for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
+ if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
+ assert(channels_per_branch[c] > 0);
+ channels_per_branch[b] += channels_per_branch[c];
+ }
+ }
+ }
+ }
+ }
+ channels_per_branch[branch] = layer_config->out_channels;
+ for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
+ if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
+ assert(channels_per_branch[c] > 0);
+ channels_per_branch[branch] += channels_per_branch[c];
+ }
+ }
+}
+
+#if CONFIG_DEBUG
+static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
+ const int num_layers = cnn_config->num_layers;
+ const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;
+
+ for (int idx = 0; idx < num_layers; idx++) {
+ if (layer_configs[idx].output_num != -1) {
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif
+
+void av1_find_cnn_output_size(int in_width, int in_height,
+ const CNN_CONFIG *cnn_config, int *out_width,
+ int *out_height, int *out_channels) {
+ int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
+ int i_width[CNN_MAX_BRANCHES] = { 0 };
+ int i_height[CNN_MAX_BRANCHES] = { 0 };
+ i_width[0] = in_width + cnn_config->ext_width * 2;
+ i_height[0] = in_height + cnn_config->ext_height * 2;
+
+#if CONFIG_DEBUG
+ assert(cnn_has_at_least_one_output(cnn_config));
+#endif
+
+ for (int i = 0; i < cnn_config->num_layers; ++i) {
+ const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
+ const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+ const int branch = layer_config->branch;
+ int o_width = 0, o_height = 0;
+
+ if (layer_config->branch_copy_type == BRANCH_INPUT) {
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
+ assert(i_width[branch] > 0 && i_height[branch] > 0);
+ i_width[b] = i_width[branch];
+ i_height[b] = i_height[branch];
+ }
+ }
+ }
+
+ av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
+ layer_config, &o_width, &o_height);
+ i_width[branch] = o_width;
+ i_height[branch] = o_height;
+
+ if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
+ i_width[b] = o_width;
+ i_height[b] = o_height;
+ }
+ }
+ }
+
+ find_cnn_out_channels(layer_config, channels_per_branch);
+
+ const int output_num = layer_config->output_num;
+ if (output_num != -1) { // Current layer is an output layer
+ out_width[output_num] = o_width;
+ out_height[output_num] = o_height;
+ out_channels[output_num] = channels_per_branch[layer_config->branch];
+ }
+ }
+}
+
+static INLINE int get_start_shift_convolve(int width, int filt_width,
+ int stride) {
+ const int mod = (width % stride);
+ const int filt_off = (filt_width - 1) / 2;
+ const int dif = (mod ? mod - 1 : stride - 1);
+ return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
+}
+
+void av1_cnn_add_c(float **output, int channels, int width, int height,
+ int stride, const float **add) {
+ for (int c = 0; c < channels; ++c) {
+ for (int i = 0; i < height; ++i)
+ for (int j = 0; j < width; ++j)
+ output[c][i * stride + j] += add[c][i * stride + j];
+ }
+}
+
+void av1_cnn_activate_c(float **output, int channels, int width, int height,
+ int stride, ACTIVATION layer_activation) {
+ if (layer_activation == RELU) {
+ for (int c = 0; c < channels; ++c) {
+ for (int i = 0; i < height; ++i)
+ for (int j = 0; j < width; ++j)
+ output[c][i * stride + j] = relu(output[c][i * stride + j]);
+ }
+ } else if (layer_activation == SOFTSIGN) {
+ for (int c = 0; c < channels; ++c) {
+ for (int i = 0; i < height; ++i)
+ for (int j = 0; j < width; ++j)
+ output[c][i * stride + j] = softsign(output[c][i * stride + j]);
+ }
+ } else if (layer_activation == SIGMOID) {
+ assert(0 && "Sigmoid has not been supported in CNN."); // TO DO
+ } else if (layer_activation != NONE) {
+ assert(0 && "Unknown activation type");
+ }
+}
+
+static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
+ const CNN_LAYER_CONFIG *layer_config,
+ int branch, TENSOR branch_output[]) {
+ const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
+      // Copy the layer's active tensor to the output tensor of branch b if b
+      // is set in the mask. That output then becomes the input of the first
+      // layer of branch b (the current layer is not the branch's first layer).
+ int copy_channels = branch_config->channels_to_copy > 0
+ ? branch_config->channels_to_copy
+ : layer_active_tensor->channels;
+ if (!realloc_tensor(&branch_output[b], copy_channels,
+ layer_active_tensor->width,
+ layer_active_tensor->height)) {
+ return false;
+ }
+ copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
+ }
+ }
+ return true;
+}
+
+// CNNConvolve for the case where maxpool is 1, skip_width or skip_height is
+// greater than 1, and the padding is PADDING_SAME_ZERO: the convolution is
+// evaluated at every position inside each skip_width x skip_height window and
+// the maximum result is kept.
+static void convolve_maxpool_padding_zero(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ const int cstep, const int filter_width_half,
+ const int filter_height_half) {
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
+ for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
+ for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+ ++hh) {
+ for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+ ++ww) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int ii = hh + l - filter_height_half;
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int jj = ww + m - filter_width_half;
+ if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+ continue;
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ const float a = sum;
+ if (h == hh && w == ww)
+ output[i][u * out_stride + v] = a;
+ else
+ output[i][u * out_stride + v] =
+ AOMMAX(output[i][u * out_stride + v], a);
+ }
+ }
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 1, either skip_width or skip_height
+// greater than 1, and padding equal to PADDING_SAME_REPLICATE.
+static void convolve_maxpool_padding_replicate(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ const int cstep, const int filter_width_half,
+ const int filter_height_half) {
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
+ for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
+ for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+ ++hh) {
+ for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+ ++ww) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int ii =
+ CLAMPINDEX(hh + l - filter_height_half, in_height);
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int jj =
+ CLAMPINDEX(ww + m - filter_width_half, in_width);
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ const float a = sum;
+ if (h == hh && w == ww)
+ output[i][u * out_stride + v] = a;
+ else
+ output[i][u * out_stride + v] =
+ AOMMAX(output[i][u * out_stride + v], a);
+ }
+ }
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 1, either skip_width or skip_height
+// greater than 1, and padding equal to PADDING_VALID.
+static void convolve_maxpool_padding_valid(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ const int cstep) {
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
+ h += layer_config->skip_height, ++u) {
+ for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
+ w += layer_config->skip_width, ++v) {
+ for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+ ++hh) {
+ for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+ ++ww) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int ii = hh + l;
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int jj = ww + m;
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ const float a = sum;
+ if (h == hh && w == ww)
+ output[i][u * out_stride + v] = a;
+ else
+ output[i][u * out_stride + v] =
+ AOMMAX(output[i][u * out_stride + v], a);
+ }
+ }
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 0, with filter_height and
+// filter_width equal to 1.
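+// A 1x1 filter reduces the convolution to a per-pixel matrix multiply across
+// channels. Note that, unlike the other variants, multithreading here splits
+// the work over output columns: start_idx and step index pixels, not
+// channels.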
+static void convolve_element_wise(const float **input, int in_width,
+ int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config,
+ float **output, int out_stride, int start_idx,
+ int step) {
+ const int start_h = get_start_shift_convolve(
+ in_height, layer_config->filter_height, layer_config->skip_height);
+ const int start_w =
+ get_start_shift_convolve(in_width, layer_config->filter_width,
+ layer_config->skip_width) +
+ start_idx * layer_config->skip_width;
+ const int out_w_step = AOMMAX(step, 1);
+ const int in_w_step = layer_config->skip_width * out_w_step;
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int h = start_h, u = 0; h < in_height;
+ h += layer_config->skip_height, ++u) {
+ const int in_h = h * in_stride;
+ const int out_h = u * out_stride + start_idx;
+ for (int w = start_w, out_index = out_h; w < in_width;
+ w += in_w_step, out_index += out_w_step) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ sum += layer_config->weights[k * layer_config->out_channels + i] *
+ input[k][in_h + w];
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 0 and padding equal to
+// PADDING_SAME_ZERO.
+static void convolve_no_maxpool_padding_zero(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int filter_width_half,
+ const int filter_height_half, const int ii_shift, const int jj_shift,
+ const int channel_step) {
+ const int start_h = get_start_shift_convolve(
+ in_height, layer_config->filter_height, layer_config->skip_height);
+ const int start_w = get_start_shift_convolve(
+ in_width, layer_config->filter_width, layer_config->skip_width);
+ const int end_ii_shift = filter_height_half + 1;
+ const int end_jj_shift = filter_width_half + 1;
+  // *_filter_margin stores the number of filter pixels along a dimension
+  // that fall in the padded region outside the image (i.e. the intersection
+  // of the filter with the complement of the image in the extended image).
+ const int top_filter_margin = layer_config->filter_width * ii_shift;
+ const int right_filter_margin = end_jj_shift - in_width;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ for (int h = start_h, u = 0; h < in_height;
+ h += layer_config->skip_height, ++u) {
+ const int out_h = u * out_stride;
+ const int top_cstep =
+ AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
+ cstep +
+ i;
+ const int start_ii = AOMMAX(0, h - ii_shift);
+ const int end_ii = AOMMIN(in_height, h + end_ii_shift);
+ for (int w = start_w, out_index = out_h; w < in_width;
+ w += layer_config->skip_width, ++out_index) {
+ const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
+ const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
+ const int start_jj = AOMMAX(0, w - jj_shift);
+ const int end_jj = AOMMIN(in_width, w + end_jj_shift);
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + top_cstep;
+ for (int ii = start_ii; ii < end_ii; ++ii) {
+ off += left_cstep;
+ for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
+ sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
+ }
+ off += right_cstep;
+ }
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 0 and padding equal to
+// PADDING_SAME_REPLICATE.
+static void convolve_no_maxpool_padding_replicate(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int ii_shift, const int jj_shift,
+ const int channel_step) {
+ // h and w are shifted to an offset coordinate system to reduce in-loop
+ // computation.
+ const int start_h =
+ get_start_shift_convolve(in_height, layer_config->filter_height,
+ layer_config->skip_height) -
+ ii_shift;
+ const int start_w =
+ get_start_shift_convolve(in_width, layer_config->filter_width,
+ layer_config->skip_width) -
+ jj_shift;
+ const int end_h = in_height - ii_shift;
+ const int end_w = in_width - jj_shift;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ for (int h = start_h, u = 0; h < end_h;
+ h += layer_config->skip_height, ++u) {
+ const int out_h = u * out_stride;
+ const int upper_ii_index = layer_config->filter_height + h;
+ for (int w = start_w, out_index = out_h; w < end_w;
+ w += layer_config->skip_width, ++out_index) {
+ const int upper_jj_index = layer_config->filter_width + w;
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int ii = h; ii < upper_ii_index; ++ii) {
+ const int clamped_ii = CLAMPINDEX(ii, in_height);
+ for (int jj = w; jj < upper_jj_index; ++jj) {
+ const int clamped_jj = CLAMPINDEX(jj, in_width);
+ assert(clamped_ii >= 0 && clamped_ii < in_height &&
+ clamped_jj >= 0 && clamped_jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][clamped_ii * in_stride + clamped_jj];
+ off += cstep;
+ }
+ }
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 0 and padding equal to
+// PADDING_VALID.
+void av1_cnn_convolve_no_maxpool_padding_valid_c(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+ int start_idx, int cstep, int channel_step) {
+ assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
+ !layer_config->maxpool);
+ assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
+ assert(layer_config->pad == PADDING_VALID);
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
+ h += layer_config->skip_height, ++u) {
+ const int out_h = u * out_stride;
+ const int upper_ii_index = layer_config->filter_height + h;
+ for (int w = 0, out_index = out_h;
+ w < in_width - layer_config->filter_width + 1;
+ w += layer_config->skip_width, ++out_index) {
+ const int upper_jj_index = layer_config->filter_width + w;
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int ii = h; ii < upper_ii_index; ++ii) {
+ for (int jj = w; jj < upper_jj_index; ++jj) {
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
+ off += cstep;
+ }
+ }
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+static void av1_cnn_convolve(const float **input, int in_width, int in_height,
+ int in_stride,
+ const CNN_LAYER_CONFIG *layer_config,
+ float **output, int out_stride, int start_idx,
+ int step) {
+ assert(!layer_config->deconvolve);
+ const int cstep = layer_config->in_channels * layer_config->out_channels;
+ const int filter_height_half = layer_config->filter_height >> 1;
+ const int filter_width_half = layer_config->filter_width >> 1;
+ const int channel_step = AOMMAX(step, 1);
+
+ if (layer_config->maxpool &&
+ (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
+ layer_config, output, out_stride, cstep,
+ filter_width_half, filter_height_half);
+ break;
+ case PADDING_SAME_REPLICATE:
+ convolve_maxpool_padding_replicate(
+ input, in_width, in_height, in_stride, layer_config, output,
+ out_stride, cstep, filter_width_half, filter_height_half);
+ break;
+ case PADDING_VALID:
+ convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
+ layer_config, output, out_stride, cstep);
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+ } else {
+ // Results in element-wise matrix multiplication.
+ if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
+ convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
+ output, out_stride, start_idx, step);
+ return;
+ }
+ const int ii_shift =
+ filter_height_half - (layer_config->filter_height - 1) % 2;
+ const int jj_shift =
+ filter_width_half - (layer_config->filter_width - 1) % 2;
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ convolve_no_maxpool_padding_zero(
+ input, in_width, in_height, in_stride, layer_config, output,
+ out_stride, start_idx, cstep, filter_width_half, filter_height_half,
+ ii_shift, jj_shift, channel_step);
+ break;
+ case PADDING_SAME_REPLICATE:
+ convolve_no_maxpool_padding_replicate(
+ input, in_width, in_height, in_stride, layer_config, output,
+ out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
+ break;
+ case PADDING_VALID:
+ av1_cnn_convolve_no_maxpool_padding_valid(
+ input, in_width, in_height, in_stride, layer_config, output,
+ out_stride, start_idx, cstep, channel_step);
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+ }
+}
+
+static int convolve_layer(void *arg1, void *arg2) {
+ const CONVOLVE_OPS *convolve_ops = arg1;
+ (void)arg2;
+ av1_cnn_convolve(
+ convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
+ convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
+ convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
+ return 1;
+}
+
+static void convolve_layer_mt(const float **input, int in_width, int in_height,
+ int in_stride,
+ const CNN_LAYER_CONFIG *layer_config,
+ const CNN_THREAD_DATA *thread_data,
+ float **output, int out_stride) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ const int num_workers = thread_data->num_workers;
+ assert(thread_data->workers);
+
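+  // Work is interleaved across workers: worker th handles output channels
+  // (or columns, for 1x1 filters) th, th + num_workers, th + 2 * num_workers,
+  // and so on.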
+ CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
+ for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
+ AVxWorker *const worker = &thread_data->workers[th];
+ winterface->reset(worker);
+
+ CONVOLVE_OPS convolve_op = { input, in_width, in_height,
+ in_stride, layer_config, output,
+ out_stride, th, num_workers };
+ convolve_ops[th] = convolve_op;
+ worker->hook = convolve_layer;
+ worker->data1 = &(convolve_ops[th]);
+ worker->data2 = NULL;
+
+ // Start convolving.
+ if (th == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait until all workers have finished.
+ for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
+ winterface->sync(&thread_data->workers[th]);
+ }
+}
+
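+// Start offset of the deconvolution filter, e.g. filt_width = 5, stride = 2
+// gives AOMMAX(5 - 2, 0) / 2 = 1.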
+static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
+ const int dif = AOMMAX(filt_width - stride, 0);
+ return dif / 2;
+}
+
+void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
+ int stride, const float *gamma, const float *beta,
+ const float *mean, const float *std) {
+  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
+ for (int ch = 0; ch < channels; ch++) {
+ const float ch_gamma = gamma[ch];
+ const float ch_beta = beta[ch];
+ const float ch_mean = mean[ch];
+ const float ch_std = std[ch];
+ float *image_row = image[ch];
+
+ for (int row = 0; row < height; row++) {
+ for (int col = 0; col < width; col++) {
+ image_row[col] =
+ ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
+ }
+ image_row += stride;
+ }
+ }
+}
+
+void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
+ int in_stride, const CNN_LAYER_CONFIG *layer_config,
+ float **output, int out_stride) {
+ assert(layer_config->deconvolve);
+
+ const int cstep = layer_config->in_channels * layer_config->out_channels;
+
+ int out_width = 0;
+ int out_height = 0;
+ av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
+ &out_height);
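+  // Transposed convolution: each output pixel (u, v) accumulates only the
+  // filter taps whose back-projected positions land on the integer input
+  // grid, i.e. where the shifted coordinates h and w are divisible by
+  // skip_height and skip_width respectively.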
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int u = 0; u < out_height; ++u) {
+ for (int v = 0; v < out_width; ++v) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int h =
+ u - l +
+ get_start_shift_deconvolve(layer_config->filter_height,
+ layer_config->skip_height);
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int w =
+ v - m +
+ get_start_shift_deconvolve(layer_config->filter_width,
+ layer_config->skip_width);
+ if ((h % layer_config->skip_height) != 0 ||
+ (w % layer_config->skip_width) != 0)
+ continue;
+ const int ii = h / layer_config->skip_height;
+ const int jj = w / layer_config->skip_width;
+ if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+ continue;
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ output[i][u * out_stride + v] = sum;
+ }
+ }
+ }
+ break;
+ case PADDING_SAME_REPLICATE:
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int u = 0; u < out_height; ++u) {
+ for (int v = 0; v < out_width; ++v) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int h =
+ u - l +
+ get_start_shift_deconvolve(layer_config->filter_height,
+ layer_config->skip_height);
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int w =
+ v - m +
+ get_start_shift_deconvolve(layer_config->filter_width,
+ layer_config->skip_width);
+ if ((h % layer_config->skip_height) != 0 ||
+ (w % layer_config->skip_width) != 0)
+ continue;
+ const int ii =
+ CLAMPINDEX(h / layer_config->skip_height, in_height);
+ const int jj =
+ CLAMPINDEX(w / layer_config->skip_width, in_width);
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ output[i][u * out_stride + v] = sum;
+ }
+ }
+ }
+ break;
+ case PADDING_VALID:
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int u = 0; u < out_height; ++u) {
+ for (int v = 0; v < out_width; ++v) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int h = u - l;
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int w = v - m;
+ if ((h % layer_config->skip_height) != 0 ||
+ (w % layer_config->skip_width) != 0)
+ continue;
+ const int ii = h / layer_config->skip_height;
+ const int jj = w / layer_config->skip_width;
+ if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+ continue;
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ output[i][u * out_stride + v] = sum;
+ }
+ }
+ }
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+}
+
+bool av1_cnn_predict_c(const float **input, int in_width, int in_height,
+ int in_stride, const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ CNN_MULTI_OUT *output_struct) {
+ bool success = false;
+ TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
+ TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };
+
+ float **output[CNN_MAX_BRANCHES];
+ const int *out_chs = output_struct->output_channels;
+ output[0] = output_struct->output_buffer;
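+  // output_buffer packs one channel-plane pointer per channel for all
+  // outputs back to back, so each output's pointer array starts right after
+  // the previous output's channels.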
+ for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
+ output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
+ }
+
+ int i_width = in_width;
+ int i_height = in_height;
+ int o_width = 0, o_height = 0;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ init_tensor(&tensor1[b]);
+ init_tensor(&tensor2[b]);
+ }
+
+ const int *out_stride = output_struct->output_strides;
+ for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
+ const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
+ const int branch = layer_config->branch;
+ const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+
+ // Allocate input tensor
+ if (layer == 0) { // First layer
+ assert(branch == 0); // First layer must be primary branch
+ assign_tensor(&tensor1[branch], (float **)input,
+ layer_config->in_channels, in_width, in_height, in_stride);
+ } else { // Non-first layer
+ // Swap tensor1 and tensor2
+ swap_tensor(&tensor1[branch], &tensor2[branch]);
+
+ i_width = tensor1[branch].width;
+ i_height = tensor1[branch].height;
+ }
+
+ // Allocate output tensor
+ av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
+ &o_height);
+ const int output_num = layer_config->output_num;
+ if (output_num == -1) { // Non-output layer
+ if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
+ o_height)) {
+ goto Error;
+ }
+ } else { // Output layer
+ free_tensor(&tensor2[branch]);
+ assign_tensor(&tensor2[branch], output[output_num],
+ layer_config->out_channels, o_width, o_height,
+ out_stride[output_num]);
+ }
+
+    // If we are combining branches, make sure that the branch to combine
+ // is different from the current branch.
+ assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
+ !(branch_config->branches_to_combine & (1 << branch))));
+
+ if (layer_config->branch_copy_type == BRANCH_INPUT) {
+ if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config,
+ branch, tensor2)) {
+ goto Error;
+ }
+ }
+ // Check consistency of input and output channels
+ assert(tensor1[branch].channels == layer_config->in_channels);
+ assert(tensor2[branch].channels == layer_config->out_channels);
+
+ // Convolve/Deconvolve
+ if (!cnn_config->layer_config[layer].deconvolve) {
+ if (thread_data->num_workers > 1) {
+ convolve_layer_mt((const float **)tensor1[branch].buf,
+ tensor1[branch].width, tensor1[branch].height,
+ tensor1[branch].stride, layer_config, thread_data,
+ tensor2[branch].buf, tensor2[branch].stride);
+ } else {
+ av1_cnn_convolve((const float **)tensor1[branch].buf,
+ tensor1[branch].width, tensor1[branch].height,
+ tensor1[branch].stride, layer_config,
+ tensor2[branch].buf, tensor2[branch].stride, 0, 1);
+ }
+ } else {
+ av1_cnn_deconvolve((const float **)tensor1[branch].buf,
+ tensor1[branch].width, tensor1[branch].height,
+ tensor1[branch].stride, layer_config,
+ tensor2[branch].buf, tensor2[branch].stride);
+ }
+
+ if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+ if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
+ branch, tensor2)) {
+ goto Error;
+ }
+ }
+
+ // Add tensors from other branches if needed
+ if (layer_config->branch_combine_type == BRANCH_ADD) {
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
+ av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
+ tensor2[branch].width, tensor2[branch].height,
+ tensor2[branch].stride, (const float **)tensor2[b].buf);
+ }
+ }
+ }
+
+ // Non-linearity
+ av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
+ tensor2[branch].width, tensor2[branch].height,
+ tensor2[branch].stride, layer_config->activation);
+
+ if (layer_config->bn_params.bn_gamma) {
+ av1_cnn_batchnorm(
+ tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
+ tensor2[branch].height, tensor2[branch].stride,
+ layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
+ layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
+ }
+
+ // Concatenate tensors
+ if (layer_config->branch_combine_type == BRANCH_CAT) {
+ if (output_num == -1) { // Non-output layer
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+ assert(tensor2[b].channels > 0);
+ if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error;
+ }
+ }
+ } else { // Output layer
+ const int existing_channels = tensor2[branch].channels;
+ int num_chs = existing_channels;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+ // Needed only to assign the new channel buffers
+ num_chs += tensor2[b].channels;
+ }
+ }
+ assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
+ o_height, out_stride[output_num]);
+
+ num_chs = existing_channels;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+            // Copy the channels of branch b into the concatenated output.
+ copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
+ &tensor2[branch]);
+ num_chs += tensor2[b].channels;
+ }
+ }
+ }
+ }
+
+ if (layer_config->branch_copy_type == BRANCH_COMBINED) {
+ if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
+ branch, tensor2)) {
+ goto Error;
+ }
+ }
+ }
+
+ success = true;
+Error:
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ free_tensor(&tensor1[b]);
+ free_tensor(&tensor2[b]);
+ }
+ return success;
+}
+
+// Assumes the output has already been properly allocated.
+// Assumes all input image buffers have the same resolution and stride.
+bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+ int stride, const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ CNN_MULTI_OUT *output) {
+ const float max_val = 255.0;
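+  // Input pixels are scaled by 1 / max_val into [0, 1] floats before being
+  // fed to the network.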
+
+ const int in_width = width + 2 * cnn_config->ext_width;
+ const int in_height = height + 2 * cnn_config->ext_height;
+ const int in_channels = cnn_config->layer_config[0].in_channels;
+ float *inputs[CNN_MAX_CHANNELS];
+ float *input_ =
+ (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
+ if (!input_) return false;
+ const int in_stride = in_width;
+
+ for (int c = 0; c < in_channels; ++c) {
+ inputs[c] = input_ + c * in_stride * in_height;
+ float *input =
+ inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+ if (cnn_config->strict_bounds) {
+ for (int i = 0; i < height; ++i)
+ for (int j = 0; j < width; ++j)
+ input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+ // extend left and right
+ for (int i = 0; i < height; ++i) {
+ for (int j = -cnn_config->ext_width; j < 0; ++j)
+ input[i * in_stride + j] = input[i * in_stride];
+ for (int j = width; j < width + cnn_config->ext_width; ++j)
+ input[i * in_stride + j] = input[i * in_stride + width - 1];
+ }
+ // extend top and bottom
+ for (int i = -cnn_config->ext_height; i < 0; ++i)
+ memcpy(&input[i * in_stride - cnn_config->ext_width],
+ &input[-cnn_config->ext_width], in_width * sizeof(*input));
+ for (int i = height; i < height + cnn_config->ext_height; ++i)
+ memcpy(&input[i * in_stride - cnn_config->ext_width],
+ &input[(height - 1) * in_stride - cnn_config->ext_width],
+ in_width * sizeof(*input));
+ } else {
+ for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+ ++i)
+ for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+ ++j)
+ input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+ }
+ }
+ bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+ in_stride, cnn_config, thread_data, output);
+
+ aom_free(input_);
+ return success;
+}
+
+// Assumes the output has already been properly allocated.
+// Assumes all input image buffers have the same resolution and stride.
+bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+ int stride,
+ const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ int bit_depth,
+ CNN_MULTI_OUT *output) {
+ const float max_val = (float)((1 << bit_depth) - 1);
+
+ const int in_width = width + 2 * cnn_config->ext_width;
+ const int in_height = height + 2 * cnn_config->ext_height;
+ const int in_channels = cnn_config->layer_config[0].in_channels;
+ float *inputs[CNN_MAX_CHANNELS];
+ float *input_ =
+ (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
+ if (!input_) return false;
+ const int in_stride = in_width;
+
+ for (int c = 0; c < in_channels; ++c) {
+ inputs[c] = input_ + c * in_stride * in_height;
+ float *input =
+ inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+ if (cnn_config->strict_bounds) {
+ for (int i = 0; i < height; ++i)
+ for (int j = 0; j < width; ++j)
+ input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+ // extend left and right
+ for (int i = 0; i < height; ++i) {
+ for (int j = -cnn_config->ext_width; j < 0; ++j)
+ input[i * in_stride + j] = input[i * in_stride];
+ for (int j = width; j < width + cnn_config->ext_width; ++j)
+ input[i * in_stride + j] = input[i * in_stride + width - 1];
+ }
+ // extend top and bottom
+ for (int i = -cnn_config->ext_height; i < 0; ++i)
+ memcpy(&input[i * in_stride - cnn_config->ext_width],
+ &input[-cnn_config->ext_width], in_width * sizeof(*input));
+ for (int i = height; i < height + cnn_config->ext_height; ++i)
+ memcpy(&input[i * in_stride - cnn_config->ext_width],
+ &input[(height - 1) * in_stride - cnn_config->ext_width],
+ in_width * sizeof(*input));
+ } else {
+ for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+ ++i)
+ for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+ ++j)
+ input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+ }
+ }
+
+ bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+ in_stride, cnn_config, thread_data, output);
+
+ aom_free(input_);
+ return success;
+}
diff --git a/third_party/aom/av1/encoder/cnn.h b/third_party/aom/av1/encoder/cnn.h
new file mode 100644
index 0000000000..df6401f73f
--- /dev/null
+++ b/third_party/aom/av1/encoder/cnn.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CNN_H_
+#define AOM_AV1_ENCODER_CNN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <math.h>
+#include <stdbool.h>
+
+#include "aom_util/aom_thread.h"
+#include "config/av1_rtcd.h"
+
+struct AV1Common;
+
+#define CNN_MAX_HIDDEN_LAYERS 64
+#define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1)
+#define CNN_MAX_CHANNELS 256
+#define CNN_MAX_BRANCHES 4
+#define CNN_MAX_THREADS 32
+
+#define NO_BRANCH_CONFIG \
+ { 0, 0, 0 }
+#define NO_BN_PARAMS \
+ { NULL, NULL, NULL, NULL }
+
+enum {
+ PADDING_SAME_ZERO, // tensorflow's SAME padding with pixels outside
+ // the image area assumed to be 0 (default)
+ PADDING_SAME_REPLICATE, // tensorflow's SAME padding with pixels outside
+ // the image area replicated from closest edge
+ PADDING_VALID // tensorflow's VALID padding
+} UENUM1BYTE(PADDING_TYPE);
+
+// Times when input tensor may be copied to branches given in input_to_branches.
+// BRANCH_NO_COPY: doesn't copy any tensor.
+// BRANCH_INPUT: copies the input tensor to branches.
+// BRANCH_OUTPUT: copies the convolved tensor to branches.
+// BRANCH_COMBINED: copies the combined (after convolving and branch combining)
+//     tensor. If no combination happens at this layer, then this option
+//     has the same effect as BRANCH_OUTPUT.
+enum {
+ BRANCH_NO_COPY,
+ BRANCH_INPUT,
+ BRANCH_OUTPUT,
+ BRANCH_COMBINED
+} UENUM1BYTE(BRANCH_COPY);
+
+// Types of combining branches with output of current layer:
+// BRANCH_NOC: no branch combining
+// BRANCH_ADD: Add previously stored branch tensor to output of layer
+// BRANCH_CAT: Concatenate branch tensor to output of layer
+enum { BRANCH_NOC, BRANCH_ADD, BRANCH_CAT } UENUM1BYTE(BRANCH_COMBINE);
+
+// The parameters used to scale each channel in batch
+// normalization. The processing is done on a per-channel basis,
+// e.g. bn_mean[c] is the mean for all pixels in channel c. This
+// is always applied after activation. The output is given by
+// out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c], where
+// norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c].
+// Here we assume that the effect of variance_epsilon is already
+// taken into account when bn_std is calculated. The pointers
+// need to be either all null or all valid. If all null, then
+// batchnorm is disabled; otherwise batchnorm is applied.
+struct CNN_BATCHNORM_PARAMS {
+ const float *bn_gamma;
+ const float *bn_beta;
+ const float *bn_mean;
+ const float *bn_std;
+};
+
+struct CNN_BRANCH_CONFIG {
+ int input_to_branches; // If nonzero, copy the active tensor to the current
+ // layer and store for future use in branches
+ // specified in the field as a binary mask. For
+ // example, if input_to_branch = 0x06, it means the
+ // input tensor to the current branch is copied to
+ // branches 1 and 2 (where 0 represents the primary
+ // branch). One restriction is that the mask
+ // cannot indicate copying to the current branch.
+ // If greater than 0, only copies the channels up
+ // to the given index.
+ int channels_to_copy; // Within the layer, input a copy of active
+ // tensor to branches given in input_to_branches.
+  int branches_to_combine; // mask of branches to combine with the output of
+                           // the current layer, if
+                           // branch_combine_type != BRANCH_NOC.
+                           // For example, if branches_to_combine = 0x0A,
+                           // it means that branches 1 and 3 are combined
+                           // with the current branch.
+};
+
+struct CNN_LAYER_CONFIG {
+ int in_channels;
+ int filter_width;
+ int filter_height;
+ int out_channels;
+ int skip_width;
+ int skip_height;
+  int maxpool; // whether to use maxpool or not (only effective when
+               // skip_width or skip_height is > 1)
+  const float *weights; // array of length filter_height x filter_width x
+                        // in_channels x out_channels, where the innermost
+                        // scan is out_channels and the outermost scan is
+ // filter_height.
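+                        // Equivalently, the weight for input channel k,
+                        // output channel i and filter tap (l, m) is at
+                        // linear index
+                        // ((l * filter_width + m) * in_channels + k) *
+                        //     out_channels + i.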
+ const float *bias; // array of length out_channels
+ PADDING_TYPE pad; // padding type
+ ACTIVATION activation; // the activation function to use after convolution
+ int deconvolve; // whether this is a deconvolution layer.
+ // 0: If skip_width or skip_height are > 1, then we
+ // reduce resolution
+ // 1: If skip_width or skip_height are > 1, then we
+ // increase resolution
+ int branch; // branch index in [0, CNN_MAX_BRANCHES - 1], where
+ // 0 refers to the primary branch.
+ BRANCH_COPY branch_copy_type;
+ BRANCH_COMBINE branch_combine_type;
+ struct CNN_BRANCH_CONFIG branch_config;
+ struct CNN_BATCHNORM_PARAMS
+ bn_params; // A struct that contains the parameters
+ // used for batch normalization.
+ int output_num; // The output buffer idx to which the layer output is
+ // written. Set to -1 to disable writing it to the output. In
+ // the case that branch_combine_type is BRANCH_CAT, all
+ // concatenated channels will be written to output. In the
+ // case of BRANCH_ADD, the output will be the result of
+ // summation.
+};
+
+struct CNN_CONFIG {
+ int num_layers; // number of CNN layers ( = number of hidden layers + 1)
+ int is_residue; // whether the output activation is a residue
+ int ext_width, ext_height; // extension horizontally and vertically
+ int strict_bounds; // whether the input bounds are strict or not.
+ // If strict, the extension area is filled by
+ // replication; if not strict, image data is
+ // assumed available beyond the bounds.
+ CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS];
+};
+
+struct CNN_THREAD_DATA {
+ int num_workers;
+ AVxWorker *workers;
+};
+
+struct CNN_MULTI_OUT {
+ int num_outputs;
+ const int *output_channels;
+ const int *output_strides;
+ float **output_buffer;
+};
+
+// Function to return size of output
+void av1_find_cnn_output_size(int in_width, int in_height,
+ const CNN_CONFIG *cnn_config, int *out_width,
+ int *out_height, int *out_channels);
+
+// Function to return output width and output height of given layer.
+void av1_find_cnn_layer_output_size(int in_width, int in_height,
+ const CNN_LAYER_CONFIG *layer_config,
+ int *out_width, int *out_height);
+
+// Prediction functions from a set of input image buffers. These functions
+// support CNNs with multiple outputs.
+bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+ int stride, const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ struct CNN_MULTI_OUT *output);
+bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+ int stride,
+ const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ int bit_depth, CNN_MULTI_OUT *output);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_CNN_H_
diff --git a/third_party/aom/av1/encoder/compound_type.c b/third_party/aom/av1/encoder/compound_type.c
new file mode 100644
index 0000000000..3b0ee88241
--- /dev/null
+++ b/third_party/aom/av1/encoder/compound_type.c
@@ -0,0 +1,1678 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tx_search.h"
+
+typedef int64_t (*pick_interinter_mask_type)(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0, const uint8_t *const p1,
+ const int16_t *const residual1, const int16_t *const diff10,
+ uint64_t *best_sse);
+
+// Checks if characteristics of search match
+static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const COMP_RD_STATS *st,
+ const MB_MODE_INFO *const mi,
+ int32_t *comp_rate, int64_t *comp_dist,
+ int32_t *comp_model_rate,
+ int64_t *comp_model_dist, int *comp_rs2) {
+ // TODO(ranjit): Ensure that compound type search use regular filter always
+ // and check if following check can be removed
+ // Check if interp filter matches with previous case
+ if (st->filter.as_int != mi->interp_filters.as_int) return 0;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ // Match MV and reference indices
+ for (int i = 0; i < 2; ++i) {
+ if ((st->ref_frames[i] != mi->ref_frame[i]) ||
+ (st->mv[i].as_int != mi->mv[i].as_int)) {
+ return 0;
+ }
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]];
+ if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
+ }
+
+ int reuse_data[COMPOUND_TYPES] = { 1, 1, 0, 0 };
+ // For compound wedge, reuse data if newmv search is disabled when NEWMV is
+ // present or if NEWMV is not present in either of the directions
+ if ((!have_newmv_in_inter_mode(mi->mode) &&
+ !have_newmv_in_inter_mode(st->mode)) ||
+ (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search))
+ reuse_data[COMPOUND_WEDGE] = 1;
+ // For compound diffwtd, reuse data if fast search is enabled (no newmv search
+ // when NEWMV is present) or if NEWMV is not present in either of the
+ // directions
+ if (cpi->sf.inter_sf.enable_fast_compound_mode_search ||
+ (!have_newmv_in_inter_mode(mi->mode) &&
+ !have_newmv_in_inter_mode(st->mode)))
+ reuse_data[COMPOUND_DIFFWTD] = 1;
+
+ // Store the stats for the different compound types
+ for (int comp_type = COMPOUND_AVERAGE; comp_type < COMPOUND_TYPES;
+ comp_type++) {
+ if (reuse_data[comp_type]) {
+ comp_rate[comp_type] = st->rate[comp_type];
+ comp_dist[comp_type] = st->dist[comp_type];
+ comp_model_rate[comp_type] = st->model_rate[comp_type];
+ comp_model_dist[comp_type] = st->model_dist[comp_type];
+ comp_rs2[comp_type] = st->comp_rs2[comp_type];
+ }
+ }
+ return 1;
+}
+
+// Checks if similar compound type search case is accounted earlier
+// If found, returns relevant rd data
+static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
+ const MACROBLOCK *x,
+ const MB_MODE_INFO *const mbmi,
+ int32_t *comp_rate, int64_t *comp_dist,
+ int32_t *comp_model_rate,
+ int64_t *comp_model_dist, int *comp_rs2,
+ int *match_index) {
+ for (int j = 0; j < x->comp_rd_stats_idx; ++j) {
+ if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate,
+ comp_dist, comp_model_rate, comp_model_dist,
+ comp_rs2)) {
+ *match_index = j;
+ return 1;
+ }
+ }
+ return 0; // no match result found
+}
+
+static INLINE bool enable_wedge_search(
+ MACROBLOCK *const x, const unsigned int disable_wedge_var_thresh) {
+  // Enable wedge search if the source variance is above the threshold.
+ return x->source_variance > disable_wedge_var_thresh;
+}
+
+static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
+ const AV1_COMP *const cpi) {
+ return enable_wedge_search(
+ x, cpi->sf.inter_sf.disable_interinter_wedge_var_thresh) &&
+ cpi->oxcf.comp_type_cfg.enable_interinter_wedge;
+}
+
+static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
+ const AV1_COMP *const cpi) {
+ return enable_wedge_search(
+ x, cpi->sf.inter_sf.disable_interintra_wedge_var_thresh) &&
+ cpi->oxcf.comp_type_cfg.enable_interintra_wedge;
+}
+
+static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+ const BLOCK_SIZE bsize, const uint8_t *pred0,
+ int stride0, const uint8_t *pred1,
+ int stride1) {
+ static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+ // 64x128, 128x64, 128x128
+ BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+ // 4X16, 16X4, 8X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+ // 32X8, 16X64, 64X16
+ BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+ };
+ const struct macroblock_plane *const p = &x->plane[0];
+ const uint8_t *src = p->src.buf;
+ int src_stride = p->src.stride;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int bw_by2 = bw >> 1;
+ const int bh_by2 = bh >> 1;
+ uint32_t esq[2][2];
+ int64_t tl, br;
+
+ const BLOCK_SIZE f_index = split_qtr[bsize];
+ assert(f_index != BLOCK_INVALID);
+
+ if (is_cur_buf_hbd(&x->e_mbd)) {
+ pred0 = CONVERT_TO_BYTEPTR(pred0);
+ pred1 = CONVERT_TO_BYTEPTR(pred1);
+ }
+
+  // Residual variance computation over relevant quadrants in order to
+ // find TL + BR, TL = sum(1st,2nd,3rd) quadrants of (pred0 - pred1),
+ // BR = sum(2nd,3rd,4th) quadrants of (pred1 - pred0)
+ // The 2nd and 3rd quadrants cancel out in TL + BR
+ // Hence TL + BR = 1st quadrant of (pred0-pred1) + 4th of (pred1-pred0)
+ // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants)
+ // for all codebooks; experiment with other quadrant combinations for
+ // 0, 90 and 135 degrees also.
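+  // A positive tl + br indicates pred1 fits the source better in the
+  // top-left and pred0 in the bottom-right; the returned sign selects that
+  // wedge orientation.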
+ cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+ cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+ pred0 + bh_by2 * stride0 + bw_by2, stride0,
+ &esq[0][1]);
+ cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                               pred1 + bh_by2 * stride1 + bw_by2, stride1,
+                               &esq[1][1]);
+
+ tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]);
+ br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]);
+ return (tl + br > 0);
+}
+
+// Choose the best wedge index and sign
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0,
+ const int16_t *const residual1,
+ const int16_t *const diff10,
+ int8_t *const best_wedge_sign,
+ int8_t *const best_wedge_index, uint64_t *best_sse) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ assert(N >= 64);
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int8_t wedge_index;
+ int8_t wedge_sign;
+ const int8_t wedge_types = get_wedge_types_lookup(bsize);
+ const uint8_t *mask;
+ uint64_t sse;
+ const int hbd = is_cur_buf_hbd(xd);
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+
+ DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p0), bw);
+ } else {
+ aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+ }
+#else
+ (void)hbd;
+ aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+#endif
+
+ int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+ (int64_t)aom_sum_squares_i16(residual1, N)) *
+ (1 << WEDGE_WEIGHT_BITS) / 2;
+ int16_t *ds = residual0;
+
+ av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
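+  // ds[i] now holds residual0[i]^2 - residual1[i]^2 (ds aliases residual0);
+  // av1_wedge_sign_from_residuals compares the mask-weighted sum of these
+  // deltas against sign_limit to pick the sign with the smaller masked SSE.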
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+
+ wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+
+ rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index];
+ rd = RDCOST(x->rdmult, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ *best_wedge_sign = wedge_sign;
+ best_rd = rd;
+ *best_sse = sse;
+ }
+ }
+
+ return best_rd -
+ RDCOST(x->rdmult,
+ x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+// Choose the best wedge index for the specified sign
+static int64_t pick_wedge_fixed_sign(
+ const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int16_t *const residual1,
+ const int16_t *const diff10, const int8_t wedge_sign,
+ int8_t *const best_wedge_index, uint64_t *best_sse) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ assert(N >= 64);
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int8_t wedge_index;
+ const int8_t wedge_types = get_wedge_types_lookup(bsize);
+ const uint8_t *mask;
+ uint64_t sse;
+ const int hbd = is_cur_buf_hbd(xd);
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index];
+ rd = RDCOST(x->rdmult, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ best_rd = rd;
+ *best_sse = sse;
+ }
+ }
+ return best_rd -
+ RDCOST(x->rdmult,
+ x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+static int64_t pick_interinter_wedge(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0, const uint8_t *const p1,
+ const int16_t *const residual1, const int16_t *const diff10,
+ uint64_t *best_sse) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int bw = block_size_wide[bsize];
+
+ int64_t rd;
+ int8_t wedge_index = -1;
+ int8_t wedge_sign = 0;
+
+ assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+ assert(cpi->common.seq_params->enable_masked_compound);
+
+ if (cpi->sf.inter_sf.fast_wedge_sign_estimate) {
+ wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
+ rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign,
+ &wedge_index, best_sse);
+ } else {
+ rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign,
+ &wedge_index, best_sse);
+ }
+
+ mbmi->interinter_comp.wedge_sign = wedge_sign;
+ mbmi->interinter_comp.wedge_index = wedge_index;
+ return rd;
+}
+
+static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1,
+ const int16_t *const residual1,
+ const int16_t *const diff10,
+ uint64_t *best_sse) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = 1 << num_pels_log2_lookup[bsize];
+ int rate;
+ int64_t dist;
+ DIFFWTD_MASK_TYPE cur_mask_type;
+ int64_t best_rd = INT64_MAX;
+ DIFFWTD_MASK_TYPE best_mask_type = 0;
+ const int hbd = is_cur_buf_hbd(xd);
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+ DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+ uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
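+  // DIFFWTD_38 is built directly into xd->seg_mask and DIFFWTD_38_INV into
+  // the local buffer, so the local mask is copied back only if the inverse
+  // type wins.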
+ // try each mask type and its inverse
+ for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
+ // build mask and inverse
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (hbd)
+ av1_build_compound_diffwtd_mask_highbd(
+ tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+ CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
+ else
+ av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
+ p0, bw, p1, bw, bh, bw);
+#else
+ (void)hbd;
+ av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, p0,
+ bw, p1, bw, bh, bw);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ // compute rd for mask
+ uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10,
+ tmp_mask[cur_mask_type], N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ const int64_t rd0 = RDCOST(x->rdmult, rate, dist);
+
+ if (rd0 < best_rd) {
+ best_mask_type = cur_mask_type;
+ best_rd = rd0;
+ *best_sse = sse;
+ }
+ }
+ mbmi->interinter_comp.mask_type = best_mask_type;
+ if (best_mask_type == DIFFWTD_38_INV) {
+ memcpy(xd->seg_mask, seg_mask, N * 2);
+ }
+ return best_rd;
+}
+
+static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(av1_is_wedge_used(bsize));
+ assert(cpi->common.seq_params->enable_interintra_compound);
+
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1
+ DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw);
+ aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw);
+ } else {
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
+ }
+#else
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
+#endif
+ int8_t wedge_index = -1;
+ uint64_t sse;
+ int64_t rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0,
+ &wedge_index, &sse);
+
+ mbmi->interintra_wedge_index = wedge_index;
+ return rd;
+}
+
+static AOM_INLINE void get_inter_predictors_masked_compound(
+ MACROBLOCK *x, const BLOCK_SIZE bsize, uint8_t **preds0, uint8_t **preds1,
+ int16_t *residual1, int16_t *diff10, int *strides) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ // get inter predictors to use for masked compound modes
+ av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, preds0,
+ strides);
+ av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, preds1,
+ strides);
+ const struct buf_2d *const src = &x->plane[0].src;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(*preds1), bw);
+ aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
+ bw, CONVERT_TO_BYTEPTR(*preds0), bw);
+ } else {
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1,
+ bw);
+ aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+ }
+#else
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, bw);
+ aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+#endif
+}
+
+// Computes the rd cost for the given interintra mode and updates the best
+static INLINE void compute_best_interintra_mode(
+ const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd,
+ MACROBLOCK *const x, const int *const interintra_mode_cost,
+ const BUFFER_SET *orig_dst, uint8_t *intrapred, const uint8_t *tmp_buf,
+ INTERINTRA_MODE *best_interintra_mode, int64_t *best_interintra_rd,
+ INTERINTRA_MODE interintra_mode, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int rate;
+ uint8_t skip_txfm_sb;
+ int64_t dist, skip_sse_sb;
+ const int bw = block_size_wide[bsize];
+ mbmi->interintra_mode = interintra_mode;
+ int rmode = interintra_mode_cost[interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](cpi, bsize, x, xd, 0, 0, &rate, &dist,
+ &skip_txfm_sb, &skip_sse_sb, NULL,
+ NULL, NULL);
+ int64_t rd = RDCOST(x->rdmult, rate + rmode, dist);
+ if (rd < *best_interintra_rd) {
+ *best_interintra_rd = rd;
+ *best_interintra_mode = mbmi->interintra_mode;
+ }
+}
+
+static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
+ MACROBLOCK *x, int64_t ref_best_rd,
+ RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if (ref_best_rd < 0) return INT64_MAX;
+ av1_subtract_plane(x, bs, 0);
+ const int64_t rd = av1_estimate_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs,
+ max_txsize_rect_lookup[bs]);
+ if (rd != INT64_MAX) {
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ if (rd_stats->skip_txfm) {
+ const int s1 = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ rd_stats->rate = s1;
+ } else {
+ const int s0 = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ rd_stats->rate += s0;
+ }
+ }
+ return rd;
+}
+
+// Computes the rd_threshold for smooth interintra rd search.
+static AOM_INLINE int64_t compute_rd_thresh(MACROBLOCK *const x,
+ int total_mode_rate,
+ int64_t ref_best_rd) {
+ const int64_t rd_thresh = get_rd_thresh_from_best_rd(
+ ref_best_rd, (1 << INTER_INTRA_RD_THRESH_SHIFT),
+ INTER_INTRA_RD_THRESH_SCALE);
+ const int64_t mode_rd = RDCOST(x->rdmult, total_mode_rate, 0);
+ return (rd_thresh - mode_rd);
+}
+
+// Computes the best wedge interintra mode
+static AOM_INLINE int64_t compute_best_wedge_interintra(
+ const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd,
+ MACROBLOCK *const x, const int *const interintra_mode_cost,
+ const BUFFER_SET *orig_dst, uint8_t *intrapred_, uint8_t *tmp_buf_,
+ int *best_mode, int *best_wedge_index, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bw = block_size_wide[bsize];
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ int64_t best_total_rd = INT64_MAX;
+ uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+ for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) {
+ mbmi->interintra_mode = mode;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ const int rate_overhead =
+ interintra_mode_cost[mode] +
+ x->mode_costs.wedge_idx_cost[bsize][mbmi->interintra_wedge_index];
+ const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0);
+ if (total_rd < best_total_rd) {
+ best_total_rd = total_rd;
+ best_interintra_rd_wedge = rd;
+ *best_mode = mbmi->interintra_mode;
+ *best_wedge_index = mbmi->interintra_wedge_index;
+ }
+ }
+ return best_interintra_rd_wedge;
+}
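+
+// Note: the value returned above is the raw wedge rd of the winning mode,
+// without the mode/wedge signaling overhead; the overhead is folded into
+// total_rd only to select the winner.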
+
+static int handle_smooth_inter_intra_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ MB_MODE_INFO *mbmi, int64_t ref_best_rd, int *rate_mv,
+ INTERINTRA_MODE *best_interintra_mode, int64_t *best_rd,
+ int *best_mode_rate, const BUFFER_SET *orig_dst, uint8_t *tmp_buf,
+ uint8_t *intrapred, HandleInterModeArgs *args) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const interintra_mode_cost =
+ mode_costs->interintra_mode_cost[size_group_lookup[bsize]];
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bw = block_size_wide[bsize];
+
+ mbmi->use_wedge_interintra = 0;
+
+ if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 ||
+ *best_interintra_mode == INTERINTRA_MODES) {
+ int64_t best_interintra_rd = INT64_MAX;
+ for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+ ++cur_mode) {
+ if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
+ cpi->sf.intra_sf.disable_smooth_intra) &&
+ cur_mode == II_SMOOTH_PRED)
+ continue;
+ compute_best_interintra_mode(
+ cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, tmp_buf,
+ best_interintra_mode, &best_interintra_rd, cur_mode, bsize);
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode;
+ }
+ assert(IMPLIES(!cpi->oxcf.comp_type_cfg.enable_smooth_interintra,
+ *best_interintra_mode != II_SMOOTH_PRED));
+ // Recompute prediction if required
+ bool interintra_mode_reuse = cpi->sf.inter_sf.reuse_inter_intra_mode ||
+ *best_interintra_mode != INTERINTRA_MODES;
+ if (interintra_mode_reuse || *best_interintra_mode != INTERINTRA_MODES - 1) {
+ mbmi->interintra_mode = *best_interintra_mode;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+
+ // Compute rd cost for best smooth_interintra
+ RD_STATS rd_stats;
+ const int is_wedge_used = av1_is_wedge_used(bsize);
+ const int rmode =
+ interintra_mode_cost[*best_interintra_mode] +
+ (is_wedge_used ? mode_costs->wedge_interintra_cost[bsize][0] : 0);
+ const int total_mode_rate = rmode + *rate_mv;
+ const int64_t rd_thresh = compute_rd_thresh(x, total_mode_rate, ref_best_rd);
+ int64_t rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist);
+ } else {
+ return IGNORE_MODE;
+ }
+ *best_rd = rd;
+ *best_mode_rate = rmode;
+  // Return early if the best rd is not good enough: the mode is kept only if
+  // best_rd, scaled up by INTER_INTRA_RD_THRESH_SCALE /
+  // 2^INTER_INTRA_RD_THRESH_SHIFT, still fits within ref_best_rd.
+ if (ref_best_rd < INT64_MAX &&
+ (*best_rd >> INTER_INTRA_RD_THRESH_SHIFT) * INTER_INTRA_RD_THRESH_SCALE >
+ ref_best_rd) {
+ return IGNORE_MODE;
+ }
+ return 0;
+}
+
+static int handle_wedge_inter_intra_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ MB_MODE_INFO *mbmi, int *rate_mv, INTERINTRA_MODE *best_interintra_mode,
+ int64_t *best_rd, const BUFFER_SET *orig_dst, uint8_t *tmp_buf_,
+ uint8_t *tmp_buf, uint8_t *intrapred_, uint8_t *intrapred,
+ HandleInterModeArgs *args, int *tmp_rate_mv, int *rate_overhead,
+ int_mv *tmp_mv, int64_t best_rd_no_wedge) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const interintra_mode_cost =
+ mode_costs->interintra_mode_cost[size_group_lookup[bsize]];
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bw = block_size_wide[bsize];
+ const int try_smooth_interintra =
+ cpi->oxcf.comp_type_cfg.enable_smooth_interintra;
+
+ mbmi->use_wedge_interintra = 1;
+
+ if (!cpi->sf.inter_sf.fast_interintra_wedge_search) {
+ // Exhaustive search of all wedge and mode combinations.
+ int best_mode = 0;
+ int best_wedge_index = 0;
+ *best_rd = compute_best_wedge_interintra(
+ cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_, tmp_buf_,
+ &best_mode, &best_wedge_index, bsize);
+ mbmi->interintra_mode = best_mode;
+ mbmi->interintra_wedge_index = best_wedge_index;
+ if (best_mode != INTERINTRA_MODES - 1) {
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ }
+ } else if (!try_smooth_interintra) {
+ if (*best_interintra_mode == INTERINTRA_MODES) {
+ mbmi->interintra_mode = INTERINTRA_MODES - 1;
+ *best_interintra_mode = INTERINTRA_MODES - 1;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ // Pick wedge mask based on INTERINTRA_MODES - 1
+ *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ // Find the best interintra mode for the chosen wedge mask
+ for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+ ++cur_mode) {
+ compute_best_interintra_mode(
+ cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred,
+ tmp_buf, best_interintra_mode, best_rd, cur_mode, bsize);
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode;
+ mbmi->interintra_mode = *best_interintra_mode;
+
+ // Recompute prediction if required
+ if (*best_interintra_mode != INTERINTRA_MODES - 1) {
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ }
+ } else {
+ // Pick wedge mask for the best interintra mode (reused)
+ mbmi->interintra_mode = *best_interintra_mode;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ }
+ } else {
+ // Pick wedge mask for the best interintra mode from smooth_interintra
+ *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ }
+
+ *rate_overhead =
+ interintra_mode_cost[mbmi->interintra_mode] +
+ mode_costs->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] +
+ mode_costs->wedge_interintra_cost[bsize][1];
+ *best_rd += RDCOST(x->rdmult, *rate_overhead + *rate_mv, 0);
+
+ int64_t rd = INT64_MAX;
+ const int_mv mv0 = mbmi->mv[0];
+ // Refine motion vector for NEWMV case.
+ if (have_newmv_in_inter_mode(mbmi->mode)) {
+ int rate_sum;
+ uint8_t skip_txfm_sb;
+ int64_t dist_sum, skip_sse_sb;
+    // Get the negated wedge mask.
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize);
+ av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv->as_mv, intrapred,
+ mask, bw, tmp_rate_mv, 0);
+ if (mbmi->mv[0].as_int != tmp_mv->as_int) {
+ mbmi->mv[0].as_int = tmp_mv->as_int;
+ // Set ref_frame[1] to NONE_FRAME temporarily so that the intra
+ // predictor is not calculated again in av1_enc_build_inter_predictor().
+ mbmi->ref_frame[1] = NONE_FRAME;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf,
+ xd->plane[AOM_PLANE_Y].dst.stride, intrapred, bw);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb,
+ &skip_sse_sb, NULL, NULL, NULL);
+ rd =
+ RDCOST(x->rdmult, *tmp_rate_mv + *rate_overhead + rate_sum, dist_sum);
+ }
+ }
+ if (rd >= *best_rd) {
+ tmp_mv->as_int = mv0.as_int;
+ *tmp_rate_mv = *rate_mv;
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ // Evaluate closer to true rd
+ RD_STATS rd_stats;
+ const int64_t mode_rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv, 0);
+ const int64_t tmp_rd_thresh = best_rd_no_wedge - mode_rd;
+ rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv + rd_stats.rate,
+ rd_stats.dist);
+ } else {
+ if (*best_rd == INT64_MAX) return IGNORE_MODE;
+ }
+ *best_rd = rd;
+ return 0;
+}
+
+int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+ HandleInterModeArgs *args, int64_t ref_best_rd,
+ int *rate_mv, int *tmp_rate2,
+ const BUFFER_SET *orig_dst) {
+ const int try_smooth_interintra =
+ cpi->oxcf.comp_type_cfg.enable_smooth_interintra;
+
+ const int is_wedge_used = av1_is_wedge_used(bsize);
+ const int try_wedge_interintra =
+ is_wedge_used && enable_wedge_interintra_search(x, cpi);
+
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int bw = block_size_wide[bsize];
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
+ uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Single reference inter prediction
+ mbmi->ref_frame[1] = NONE_FRAME;
+ xd->plane[0].dst.buf = tmp_buf;
+ xd->plane[0].dst.stride = bw;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ const int num_planes = av1_num_planes(cm);
+
+ // Restore the buffers for intra prediction
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ INTERINTRA_MODE best_interintra_mode =
+ args->inter_intra_mode[mbmi->ref_frame[0]];
+
+ // Compute smooth_interintra
+ int64_t best_interintra_rd_nowedge = INT64_MAX;
+ int best_mode_rate = INT_MAX;
+ if (try_smooth_interintra) {
+ int ret = handle_smooth_inter_intra_mode(
+ cpi, x, bsize, mbmi, ref_best_rd, rate_mv, &best_interintra_mode,
+ &best_interintra_rd_nowedge, &best_mode_rate, orig_dst, tmp_buf,
+ intrapred, args);
+ if (ret == IGNORE_MODE) {
+ return IGNORE_MODE;
+ }
+ }
+
+ // Compute wedge interintra
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ const int_mv mv0 = mbmi->mv[0];
+ int_mv tmp_mv = mv0;
+ int tmp_rate_mv = 0;
+ int rate_overhead = 0;
+ if (try_wedge_interintra) {
+ int ret = handle_wedge_inter_intra_mode(
+ cpi, x, bsize, mbmi, rate_mv, &best_interintra_mode,
+ &best_interintra_rd_wedge, orig_dst, tmp_buf_, tmp_buf, intrapred_,
+ intrapred, args, &tmp_rate_mv, &rate_overhead, &tmp_mv,
+ best_interintra_rd_nowedge);
+ if (ret == IGNORE_MODE) {
+ return IGNORE_MODE;
+ }
+ }
+
+ if (best_interintra_rd_nowedge == INT64_MAX &&
+ best_interintra_rd_wedge == INT64_MAX) {
+ return IGNORE_MODE;
+ }
+ if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ *tmp_rate2 += tmp_rate_mv - *rate_mv;
+ *rate_mv = tmp_rate_mv;
+ best_mode_rate = rate_overhead;
+ } else if (try_smooth_interintra && try_wedge_interintra) {
+    // If smooth was best but its values were overwritten while evaluating
+    // the wedge mode, recompute the smooth prediction.
+ mbmi->use_wedge_interintra = 0;
+ mbmi->interintra_mode = best_interintra_mode;
+ mbmi->mv[0].as_int = mv0.as_int;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+ *tmp_rate2 += best_mode_rate;
+
+ if (num_planes > 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_U, num_planes - 1);
+ }
+ return 0;
+}
+
+// Computes the valid compound_types to be evaluated
+static INLINE int compute_valid_comp_types(MACROBLOCK *x,
+ const AV1_COMP *const cpi,
+ BLOCK_SIZE bsize,
+ int masked_compound_used,
+ int mode_search_mask,
+ COMPOUND_TYPE *valid_comp_types) {
+ const AV1_COMMON *cm = &cpi->common;
+ int valid_type_count = 0;
+ int comp_type, valid_check;
+ int8_t enable_masked_type[MASKED_COMPOUND_TYPES] = { 0, 0 };
+
+ const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE));
+ const int try_distwtd_comp =
+ ((mode_search_mask & (1 << COMPOUND_DISTWTD)) &&
+ cm->seq_params->order_hint_info.enable_dist_wtd_comp == 1 &&
+ cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+
+ // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases
+ for (comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
+ comp_type++) {
+ valid_check =
+ (comp_type == COMPOUND_AVERAGE) ? try_average_comp : try_distwtd_comp;
+ if (valid_check && is_interinter_compound_used(comp_type, bsize))
+ valid_comp_types[valid_type_count++] = comp_type;
+ }
+ // Check if COMPOUND_WEDGE and COMPOUND_DIFFWTD are valid cases
+ if (masked_compound_used) {
+ // enable_masked_type[0] corresponds to COMPOUND_WEDGE
+ // enable_masked_type[1] corresponds to COMPOUND_DIFFWTD
+ enable_masked_type[0] = enable_wedge_interinter_search(x, cpi);
+ enable_masked_type[1] = cpi->oxcf.comp_type_cfg.enable_diff_wtd_comp;
+ for (comp_type = COMPOUND_WEDGE; comp_type <= COMPOUND_DIFFWTD;
+ comp_type++) {
+ if ((mode_search_mask & (1 << comp_type)) &&
+ is_interinter_compound_used(comp_type, bsize) &&
+ enable_masked_type[comp_type - COMPOUND_WEDGE])
+ valid_comp_types[valid_type_count++] = comp_type;
+ }
+ }
+ return valid_type_count;
+}
+
+// Calculates the mode signaling cost for each compound type.
+static INLINE void calc_masked_type_cost(
+ const ModeCosts *mode_costs, BLOCK_SIZE bsize, int comp_group_idx_ctx,
+ int comp_index_ctx, int masked_compound_used, int *masked_type_cost) {
+ av1_zero_array(masked_type_cost, COMPOUND_TYPES);
+ // Account for group index cost when wedge and/or diffwtd prediction are
+ // enabled
+ if (masked_compound_used) {
+ // Compound group index of average and distwtd is 0
+ // Compound group index of wedge and diffwtd is 1
+ masked_type_cost[COMPOUND_AVERAGE] +=
+ mode_costs->comp_group_idx_cost[comp_group_idx_ctx][0];
+ masked_type_cost[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_AVERAGE];
+ masked_type_cost[COMPOUND_WEDGE] +=
+ mode_costs->comp_group_idx_cost[comp_group_idx_ctx][1];
+ masked_type_cost[COMPOUND_DIFFWTD] += masked_type_cost[COMPOUND_WEDGE];
+ }
+
+ // Compute the cost to signal compound index/type
+ masked_type_cost[COMPOUND_AVERAGE] +=
+ mode_costs->comp_idx_cost[comp_index_ctx][1];
+ masked_type_cost[COMPOUND_DISTWTD] +=
+ mode_costs->comp_idx_cost[comp_index_ctx][0];
+ masked_type_cost[COMPOUND_WEDGE] += mode_costs->compound_type_cost[bsize][0];
+ masked_type_cost[COMPOUND_DIFFWTD] +=
+ mode_costs->compound_type_cost[bsize][1];
+}
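+
+// For reference, with masked compounds enabled the sums above reduce to:
+//   AVERAGE: comp_group_idx_cost[ctx][0] + comp_idx_cost[ctx][1]
+//   DISTWTD: comp_group_idx_cost[ctx][0] + comp_idx_cost[ctx][0]
+//   WEDGE:   comp_group_idx_cost[ctx][1] + compound_type_cost[bsize][0]
+//   DIFFWTD: comp_group_idx_cost[ctx][1] + compound_type_cost[bsize][1]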
+
+// Updates mbmi structure with the relevant compound type info
+static INLINE void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi,
+ COMPOUND_TYPE cur_type) {
+ mbmi->interinter_comp.type = cur_type;
+ mbmi->comp_group_idx = (cur_type >= COMPOUND_WEDGE);
+ mbmi->compound_idx = (cur_type != COMPOUND_DISTWTD);
+}
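+
+// The assignments above imply: AVERAGE -> (comp_group_idx 0, compound_idx 1),
+// DISTWTD -> (0, 0), and WEDGE/DIFFWTD -> (1, 1), following the enum order
+// COMPOUND_AVERAGE < COMPOUND_DISTWTD < COMPOUND_WEDGE < COMPOUND_DIFFWTD.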
+
+// When a match is found, populates the compound type data, calculates the
+// rd cost using the stored stats, and updates the mbmi appropriately.
+static INLINE int populate_reuse_comp_type_data(
+ const MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BEST_COMP_TYPE_STATS *best_type_stats, int_mv *cur_mv, int32_t *comp_rate,
+ int64_t *comp_dist, int *comp_rs2, int *rate_mv, int64_t *rd,
+ int match_index) {
+ const int winner_comp_type =
+ x->comp_rd_stats[match_index].interinter_comp.type;
+ if (comp_rate[winner_comp_type] == INT_MAX)
+ return best_type_stats->best_compmode_interinter_cost;
+ update_mbmi_for_compound_type(mbmi, winner_comp_type);
+ mbmi->interinter_comp = x->comp_rd_stats[match_index].interinter_comp;
+ *rd = RDCOST(
+ x->rdmult,
+ comp_rs2[winner_comp_type] + *rate_mv + comp_rate[winner_comp_type],
+ comp_dist[winner_comp_type]);
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ return comp_rs2[winner_comp_type];
+}
+
+// Updates rd cost and relevant compound type data for the best compound type
+static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd,
+ BEST_COMP_TYPE_STATS *best_type_stats,
+ int64_t best_rd_cur,
+ int64_t comp_model_rd_cur, int rs2) {
+ *rd = best_rd_cur;
+ best_type_stats->comp_best_model_rd = comp_model_rd_cur;
+ best_type_stats->best_compound_data = mbmi->interinter_comp;
+ best_type_stats->best_compmode_interinter_cost = rs2;
+}
+
+// Updates best_mv for masked compound types
+static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi,
+ int_mv *best_mv, int *best_tmp_rate_mv,
+ int tmp_rate_mv) {
+ *best_tmp_rate_mv = tmp_rate_mv;
+ best_mv[0].as_int = mbmi->mv[0].as_int;
+ best_mv[1].as_int = mbmi->mv[1].as_int;
+}
+
+static INLINE void save_comp_rd_search_stat(
+ MACROBLOCK *x, const MB_MODE_INFO *const mbmi, const int32_t *comp_rate,
+ const int64_t *comp_dist, const int32_t *comp_model_rate,
+ const int64_t *comp_model_dist, const int_mv *cur_mv, const int *comp_rs2) {
+ const int offset = x->comp_rd_stats_idx;
+ if (offset < MAX_COMP_RD_STATS) {
+ COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset;
+ memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate));
+ memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist));
+ memcpy(rd_stats->model_rate, comp_model_rate, sizeof(rd_stats->model_rate));
+ memcpy(rd_stats->model_dist, comp_model_dist, sizeof(rd_stats->model_dist));
+ memcpy(rd_stats->comp_rs2, comp_rs2, sizeof(rd_stats->comp_rs2));
+ memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv));
+ memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames));
+ rd_stats->mode = mbmi->mode;
+ rd_stats->filter = mbmi->interp_filters;
+ rd_stats->ref_mv_idx = mbmi->ref_mv_idx;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ for (int i = 0; i < 2; ++i) {
+ const WarpedMotionParams *const wm =
+ &xd->global_motion[mbmi->ref_frame[i]];
+ rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype);
+ }
+ memcpy(&rd_stats->interinter_comp, &mbmi->interinter_comp,
+ sizeof(rd_stats->interinter_comp));
+ ++x->comp_rd_stats_idx;
+ }
+}
+
+static INLINE int get_interinter_compound_mask_rate(
+ const ModeCosts *const mode_costs, const MB_MODE_INFO *const mbmi) {
+ const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+ // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD
+ if (compound_type == COMPOUND_WEDGE) {
+ return av1_is_wedge_used(mbmi->bsize)
+ ? av1_cost_literal(1) +
+ mode_costs
+ ->wedge_idx_cost[mbmi->bsize]
+ [mbmi->interinter_comp.wedge_index]
+ : 0;
+ } else {
+ assert(compound_type == COMPOUND_DIFFWTD);
+ return av1_cost_literal(1);
+ }
+}
+
+// Backs up the rate, distortion and model rd values for future reuse.
+static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate,
+ int64_t *comp_dist, int32_t *comp_model_rate,
+ int64_t *comp_model_dist, int rate_sum,
+ int64_t dist_sum, RD_STATS *rd_stats,
+ int *comp_rs2, int rs2) {
+ comp_rate[cur_type] = rd_stats->rate;
+ comp_dist[cur_type] = rd_stats->dist;
+ comp_model_rate[cur_type] = rate_sum;
+ comp_model_dist[cur_type] = dist_sum;
+ comp_rs2[cur_type] = rs2;
+}
+
+static INLINE int save_mask_search_results(const PREDICTION_MODE this_mode,
+                                           const int reuse_level) {
+  return reuse_level || (this_mode == NEW_NEWMV);
+}
+
+static INLINE int prune_mode_by_skip_rd(const AV1_COMP *const cpi,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ const BLOCK_SIZE bsize,
+ int64_t ref_skip_rd, int mode_rate) {
+ int eval_txfm = 1;
+ const int txfm_rd_gate_level =
+ get_txfm_rd_gate_level(cpi->common.seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
+ // Check if the mode is good enough based on skip rd
+ if (txfm_rd_gate_level) {
+ int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
+ int64_t skip_rd = RDCOST(x->rdmult, mode_rate, (sse_y << 4));
+ eval_txfm =
+ check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, txfm_rd_gate_level, 1);
+ }
+ return eval_txfm;
+}
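+
+// Note: the (sse_y << 4) term above scales the pixel-domain sse by 16, which
+// is presumably the same distortion scale at which ref_skip_rd was formed, so
+// the two skip rds are directly comparable.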
+
+static int64_t masked_compound_type_rd(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
+ int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+ uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
+ int mode_rate, int64_t rd_thresh, int *calc_pred_masked_compound,
+ int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate,
+ int64_t *comp_model_dist, const int64_t comp_best_model_rd,
+ int64_t *const comp_model_rd_cur, int *comp_rs2, int64_t ref_skip_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int64_t best_rd_cur = INT64_MAX;
+ int64_t rd = INT64_MAX;
+ const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+ // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD
+ assert(compound_type == COMPOUND_WEDGE || compound_type == COMPOUND_DIFFWTD);
+ int rate_sum;
+ uint8_t tmp_skip_txfm_sb;
+ int64_t dist_sum, tmp_skip_sse_sb;
+ pick_interinter_mask_type pick_interinter_mask[2] = { pick_interinter_wedge,
+ pick_interinter_seg };
+
+ // TODO(any): Save pred and mask calculation as well into records. However
+ // this may increase memory requirements as compound segment mask needs to be
+ // stored in each record.
+ if (*calc_pred_masked_compound) {
+ get_inter_predictors_masked_compound(x, bsize, preds0, preds1, residual1,
+ diff10, strides);
+ *calc_pred_masked_compound = 0;
+ }
+ if (compound_type == COMPOUND_WEDGE) {
+ unsigned int sse;
+ if (is_cur_buf_hbd(xd))
+ (void)cpi->ppi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
+ CONVERT_TO_BYTEPTR(*preds1), *strides,
+ &sse);
+ else
+ (void)cpi->ppi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides,
+ &sse);
+ const unsigned int mse =
+ ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
+ // If two predictors are very similar, skip wedge compound mode search
+ if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+ }
+ // Function pointer to pick the appropriate mask
+ // compound_type == COMPOUND_WEDGE, calls pick_interinter_wedge()
+ // compound_type == COMPOUND_DIFFWTD, calls pick_interinter_seg()
+ uint64_t cur_sse = UINT64_MAX;
+ best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
+ cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse);
+ *rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+ best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
+ assert(cur_sse != UINT64_MAX);
+ int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4));
+
+  // Although the true rate_mv might differ after motion search, this mode is
+  // unlikely to be the best considering the transform rd cost and other mode
+  // overhead costs.
+ int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+ if (mode_rd > rd_thresh) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+
+ // Check if the mode is good enough based on skip rd
+ // TODO(nithya): Handle wedge_newmv_search if extending for lower speed
+ // setting
+ const int txfm_rd_gate_level =
+ get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
+ if (txfm_rd_gate_level) {
+ int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur,
+ txfm_rd_gate_level, 1);
+ if (!eval_txfm) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+ }
+
+  // Compute the cost if no matching record is found; otherwise, reuse data.
+ if (comp_rate[compound_type] == INT_MAX) {
+ // Check whether new MV search for wedge is to be done
+ int wedge_newmv_search =
+ have_newmv_in_inter_mode(this_mode) &&
+ (compound_type == COMPOUND_WEDGE) &&
+ (!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search);
+
+ // Search for new MV if needed and build predictor
+ if (wedge_newmv_search) {
+ *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ } else {
+ *out_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+ preds1, strides);
+ }
+ // Get the RD cost from model RD
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb,
+ &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+ *comp_model_rd_cur = rd;
+    // Revert to the pre-search MV if the refined MV gives a worse rd cost.
+ if (wedge_newmv_search) {
+ if (rd >= best_rd_cur) {
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ *out_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+ strides, preds1, strides);
+ *comp_model_rd_cur = best_rd_cur;
+ }
+ }
+ if (cpi->sf.inter_sf.prune_comp_type_by_model_rd &&
+ (*comp_model_rd_cur > comp_best_model_rd) &&
+ comp_best_model_rd != INT64_MAX) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+ // Compute RD cost for the current type
+ RD_STATS rd_stats;
+ const int64_t tmp_mode_rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0);
+ const int64_t tmp_rd_thresh = rd_thresh - tmp_mode_rd;
+ rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd =
+ RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist);
+ // Backup rate and distortion for future reuse
+ backup_stats(compound_type, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, rate_sum, dist_sum, &rd_stats, comp_rs2,
+ *rs2);
+ }
+ } else {
+ // Reuse data as matching record is found
+ assert(comp_dist[compound_type] != INT64_MAX);
+    // When disable_interinter_wedge_newmv_search is set, motion refinement
+    // is disabled, so the stored rate and distortion can be reused here as
+    // well.
+ assert(IMPLIES((have_newmv_in_inter_mode(this_mode) &&
+ (compound_type == COMPOUND_WEDGE)),
+ cpi->sf.inter_sf.disable_interinter_wedge_newmv_search));
+ assert(mbmi->mv[0].as_int == cur_mv[0].as_int);
+ assert(mbmi->mv[1].as_int == cur_mv[1].as_int);
+ *out_rate_mv = rate_mv;
+ // Calculate RD cost based on stored stats
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type],
+ comp_dist[compound_type]);
+ // Recalculate model rdcost with the updated rate
+ *comp_model_rd_cur =
+ RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_model_rate[compound_type],
+ comp_model_dist[compound_type]);
+ }
+ return rd;
+}
+
+// Scaling factors used to gate the wedge/compound segment search based on
+// the best approximate rd so far, indexed by prune_comp_type_by_comp_avg.
+static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
+static int comp_type_rd_threshold_div[3] = { 3, 16, 16 };
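+// For example, at prune_comp_type_by_comp_avg level 1 the masked types are
+// evaluated only when (*rd / 16) * 11 < ref_best_rd; level 0 applies the
+// looser gate (*rd / 3) * 1, and level 2 tightens it to (*rd / 16) * 12.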
+
+int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ HandleInterModeArgs *args, BLOCK_SIZE bsize,
+ int_mv *cur_mv, int mode_search_mask,
+ int masked_compound_used, const BUFFER_SET *orig_dst,
+ const BUFFER_SET *tmp_dst,
+ const CompoundTypeRdBuffers *buffers, int *rate_mv,
+ int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t ref_skip_rd, int *is_luma_interp_done,
+ int64_t rd_thresh) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ int ref_frame = av1_ref_frame_type(mbmi->ref_frame);
+ const int bw = block_size_wide[bsize];
+ int rs2;
+ int_mv best_mv[2];
+ int best_tmp_rate_mv = *rate_mv;
+ BEST_COMP_TYPE_STATS best_type_stats;
+ // Initializing BEST_COMP_TYPE_STATS
+ best_type_stats.best_compound_data.type = COMPOUND_AVERAGE;
+ best_type_stats.best_compmode_interinter_cost = 0;
+ best_type_stats.comp_best_model_rd = INT64_MAX;
+
+ uint8_t *preds0[1] = { buffers->pred0 };
+ uint8_t *preds1[1] = { buffers->pred1 };
+ int strides[1] = { bw };
+ int tmp_rate_mv;
+ COMPOUND_TYPE cur_type;
+ // Local array to store the mask cost for different compound types
+ int masked_type_cost[COMPOUND_TYPES];
+
+ int calc_pred_masked_compound = 1;
+ int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX };
+ int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+ int comp_rs2[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+ int32_t comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX,
+ INT_MAX };
+ int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX };
+ int match_index = 0;
+ const int match_found =
+ find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, comp_rs2, &match_index);
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ *rd = INT64_MAX;
+
+ // Local array to store the valid compound types to be evaluated in the core
+ // loop
+ COMPOUND_TYPE valid_comp_types[COMPOUND_TYPES] = {
+ COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD
+ };
+ int valid_type_count = 0;
+  // compute_valid_comp_types() returns the number of valid compound types
+  // to be evaluated and populates them in the local array valid_comp_types[].
+ valid_type_count = compute_valid_comp_types(
+ x, cpi, bsize, masked_compound_used, mode_search_mask, valid_comp_types);
+
+ // The following context indices are independent of compound type
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+
+ // Populates masked_type_cost local array for the 4 compound types
+ calc_masked_type_cost(&x->mode_costs, bsize, comp_group_idx_ctx,
+ comp_index_ctx, masked_compound_used, masked_type_cost);
+
+ int64_t comp_model_rd_cur = INT64_MAX;
+ int64_t best_rd_cur = ref_best_rd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // If the match is found, calculate the rd cost using the
+ // stored stats and update the mbmi appropriately.
+ if (match_found && cpi->sf.inter_sf.reuse_compound_type_decision) {
+ return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv,
+ comp_rate, comp_dist, comp_rs2,
+ rate_mv, rd, match_index);
+ }
+
+ // If COMPOUND_AVERAGE is not valid, use the spare buffer
+ if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+
+ // Loop over valid compound types
+ for (int i = 0; i < valid_type_count; i++) {
+ cur_type = valid_comp_types[i];
+
+ if (args->cmp_mode[ref_frame] == COMPOUND_AVERAGE) {
+ if (cur_type == COMPOUND_WEDGE) continue;
+ }
+
+ comp_model_rd_cur = INT64_MAX;
+ tmp_rate_mv = *rate_mv;
+ best_rd_cur = INT64_MAX;
+ ref_best_rd = AOMMIN(ref_best_rd, *rd);
+ update_mbmi_for_compound_type(mbmi, cur_type);
+ rs2 = masked_type_cost[cur_type];
+
+ int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd >= ref_best_rd) continue;
+
+    // Derive the flags that indicate whether the MV refinement process is to
+    // be skipped.
+ const int enable_fast_compound_mode_search =
+ cpi->sf.inter_sf.enable_fast_compound_mode_search;
+ const bool skip_mv_refinement_for_avg_distwtd =
+ enable_fast_compound_mode_search == 3 ||
+ (enable_fast_compound_mode_search == 2 && (this_mode != NEW_NEWMV));
+ const bool skip_mv_refinement_for_diffwtd =
+ (!enable_fast_compound_mode_search && cur_type == COMPOUND_DIFFWTD);
+
+ // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD
+ if (cur_type < COMPOUND_WEDGE) {
+ if (skip_mv_refinement_for_avg_distwtd) {
+ int rate_sum;
+ uint8_t tmp_skip_txfm_sb;
+ int64_t dist_sum, tmp_skip_sse_sb;
+
+        // Compute the cost if no matching record is found, else reuse data.
+ if (comp_rate[cur_type] == INT_MAX) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+ // Compute RD cost for the current type
+ RD_STATS est_rd_stats;
+ const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd;
+ int64_t est_rd = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ // Evaluate further if skip rd is low enough
+ if (eval_txfm) {
+ est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh,
+ &est_rd_stats);
+ }
+ if (est_rd != INT64_MAX) {
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ comp_model_rd_cur =
+ RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+ // Backup rate and distortion for future reuse
+ backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, rate_sum, dist_sum, &est_rd_stats,
+ comp_rs2, rs2);
+ }
+ } else {
+ // Calculate RD cost based on stored stats
+ assert(comp_dist[cur_type] != INT64_MAX);
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type],
+ comp_dist[cur_type]);
+ // Recalculate model rdcost with the updated rate
+ comp_model_rd_cur =
+ RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type],
+ comp_model_dist[cur_type]);
+ }
+ } else {
+ tmp_rate_mv = *rate_mv;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ InterPredParams inter_pred_params;
+ av1_dist_wtd_comp_weight_assign(
+ &cpi->common, mbmi, &inter_pred_params.conv_params.fwd_offset,
+ &inter_pred_params.conv_params.bck_offset,
+ &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1);
+ int mask_value = inter_pred_params.conv_params.fwd_offset * 4;
+ memset(xd->seg_mask, mask_value,
+ sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ RD_STATS est_rd_stats;
+ estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats);
+
+ best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ }
+
+      // Use the spare buffer for the following compound type evaluations.
+ if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+ } else if (cur_type == COMPOUND_WEDGE) {
+ int best_mask_index = 0;
+ int best_wedge_sign = 0;
+ int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] };
+ int best_rs2 = 0;
+ int best_rate_mv = *rate_mv;
+ int wedge_mask_size = get_wedge_types_lookup(bsize);
+ int need_mask_search = args->wedge_index == -1;
+ int wedge_newmv_search =
+ have_newmv_in_inter_mode(this_mode) &&
+ !cpi->sf.inter_sf.disable_interinter_wedge_newmv_search;
+
+ if (need_mask_search && !wedge_newmv_search) {
+        // Shortcut: build the single-reference predictors once instead of
+        // rebuilding them for every wedge mask.
+ av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0,
+ preds0, strides);
+ av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1,
+ preds1, strides);
+ }
+
+ for (int wedge_mask = 0; wedge_mask < wedge_mask_size && need_mask_search;
+ ++wedge_mask) {
+ for (int wedge_sign = 0; wedge_sign < 2; ++wedge_sign) {
+ tmp_rate_mv = *rate_mv;
+ mbmi->interinter_comp.wedge_index = wedge_mask;
+ mbmi->interinter_comp.wedge_sign = wedge_sign;
+ rs2 = masked_type_cost[cur_type];
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd >= ref_best_rd / 2) continue;
+
+ if (wedge_newmv_search) {
+ tmp_rate_mv = av1_interinter_compound_motion_search(
+ cpi, x, cur_mv, bsize, this_mode);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst,
+ bsize, AOM_PLANE_Y, AOM_PLANE_Y);
+ } else {
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+ strides, preds1, strides);
+ }
+
+ RD_STATS est_rd_stats;
+ int64_t this_rd_cur = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ this_rd_cur = estimate_yrd_for_sb(
+ cpi, bsize, x, AOMMIN(best_rd_cur, ref_best_rd), &est_rd_stats);
+ }
+ if (this_rd_cur < INT64_MAX) {
+ this_rd_cur =
+ RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ if (this_rd_cur < best_rd_cur) {
+ best_mask_index = wedge_mask;
+ best_wedge_sign = wedge_sign;
+ best_rd_cur = this_rd_cur;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ best_rate_mv = tmp_rate_mv;
+ best_rs2 = rs2;
+ }
+ }
+        // Consider the asymmetric partitions for an oblique angle only if
+        // the corresponding symmetric partition is the best so far.
+        // Note: For horizontal and vertical types, both symmetric and
+        // asymmetric partitions are always considered.
+ if (cpi->sf.inter_sf.enable_fast_wedge_mask_search) {
+ // The first 4 entries in wedge_codebook_16_heqw/hltw/hgtw[16]
+ // correspond to symmetric partitions of the 4 oblique angles, the
+ // next 4 entries correspond to the vertical/horizontal
+ // symmetric/asymmetric partitions and the last 8 entries correspond
+ // to the asymmetric partitions of oblique types.
+ const int idx_before_asym_oblique = 7;
+ const int last_oblique_sym_idx = 3;
+ if (wedge_mask == idx_before_asym_oblique) {
+ if (best_mask_index > last_oblique_sym_idx) {
+ break;
+ } else {
+            // Map the best symmetric oblique mask to one less than its first
+            // asymmetric mask index; the loop increment then lands on the
+            // asymmetric pair. (sym -> asym indices:)
+            // WEDGE_OBLIQUE27: sym - 0, asym - 8, 9
+            // WEDGE_OBLIQUE63: sym - 1, asym - 12, 13
+            // WEDGE_OBLIQUE117: sym - 2, asym - 14, 15
+            // WEDGE_OBLIQUE153: sym - 3, asym - 10, 11
+ const int asym_mask_idx[4] = { 7, 11, 13, 9 };
+ wedge_mask = asym_mask_idx[best_mask_index];
+ wedge_mask_size = wedge_mask + 3;
+ }
+ }
+ }
+ }
+
+ if (need_mask_search) {
+ if (save_mask_search_results(
+ this_mode, cpi->sf.inter_sf.reuse_mask_search_results)) {
+ args->wedge_index = best_mask_index;
+ args->wedge_sign = best_wedge_sign;
+ }
+ } else {
+ mbmi->interinter_comp.wedge_index = args->wedge_index;
+ mbmi->interinter_comp.wedge_sign = args->wedge_sign;
+ rs2 = masked_type_cost[cur_type];
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ if (wedge_newmv_search) {
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+
+ best_mask_index = args->wedge_index;
+ best_wedge_sign = args->wedge_sign;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ best_rate_mv = tmp_rate_mv;
+ best_rs2 = masked_type_cost[cur_type];
+ best_rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ best_rs2 + *rate_mv);
+ if (eval_txfm) {
+ RD_STATS est_rd_stats;
+ estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats);
+ best_rd_cur =
+ RDCOST(x->rdmult, best_rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ }
+
+ mbmi->interinter_comp.wedge_index = best_mask_index;
+ mbmi->interinter_comp.wedge_sign = best_wedge_sign;
+ mbmi->mv[0] = tmp_mv[0];
+ mbmi->mv[1] = tmp_mv[1];
+ tmp_rate_mv = best_rate_mv;
+ rs2 = best_rs2;
+ } else if (skip_mv_refinement_for_diffwtd) {
+ int_mv tmp_mv[2];
+ int best_mask_index = 0;
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ int need_mask_search = args->diffwtd_index == -1;
+
+ for (int mask_index = 0; mask_index < 2 && need_mask_search;
+ ++mask_index) {
+ tmp_rate_mv = *rate_mv;
+ mbmi->interinter_comp.mask_type = mask_index;
+ if (have_newmv_in_inter_mode(this_mode)) {
+          // Hard-coded seg mask seed values for the two DIFFWTD mask types
+          // (38, and 26 = 64 - 38).
+ int mask_value = mask_index == 0 ? 38 : 26;
+ memset(xd->seg_mask, mask_value,
+ sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ RD_STATS est_rd_stats;
+ int64_t this_rd_cur = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ this_rd_cur =
+ estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+ }
+ if (this_rd_cur < INT64_MAX) {
+ this_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+
+ if (this_rd_cur < best_rd_cur) {
+ best_rd_cur = this_rd_cur;
+ best_mask_index = mbmi->interinter_comp.mask_type;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ }
+ }
+
+ if (need_mask_search) {
+ if (save_mask_search_results(this_mode, 0))
+ args->diffwtd_index = best_mask_index;
+ } else {
+ mbmi->interinter_comp.mask_type = args->diffwtd_index;
+ rs2 = masked_type_cost[cur_type];
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ int mask_value = mbmi->interinter_comp.mask_type == 0 ? 38 : 26;
+ memset(xd->seg_mask, mask_value,
+ sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+
+ if (have_newmv_in_inter_mode(this_mode)) {
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+ best_mask_index = mbmi->interinter_comp.mask_type;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ RD_STATS est_rd_stats;
+ int64_t this_rd_cur = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ this_rd_cur =
+ estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+ }
+ if (this_rd_cur < INT64_MAX) {
+ best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ }
+
+ mbmi->interinter_comp.mask_type = best_mask_index;
+ mbmi->mv[0] = tmp_mv[0];
+ mbmi->mv[1] = tmp_mv[1];
+ } else {
+ // Handle masked compound types
+ bool eval_masked_comp_type = true;
+ if (*rd != INT64_MAX) {
+ // Factors to control gating of compound type selection based on best
+ // approximate rd so far
+ const int max_comp_type_rd_threshold_mul =
+ comp_type_rd_threshold_mul[cpi->sf.inter_sf
+ .prune_comp_type_by_comp_avg];
+ const int max_comp_type_rd_threshold_div =
+ comp_type_rd_threshold_div[cpi->sf.inter_sf
+ .prune_comp_type_by_comp_avg];
+        // Evaluate COMPOUND_WEDGE / COMPOUND_DIFFWTD only if the approximate
+        // cost is within the threshold.
+ const int64_t approx_rd = ((*rd / max_comp_type_rd_threshold_div) *
+ max_comp_type_rd_threshold_mul);
+ if (approx_rd >= ref_best_rd) eval_masked_comp_type = false;
+ }
+
+ if (eval_masked_comp_type) {
+ const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh);
+ best_rd_cur = masked_compound_type_rd(
+ cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
+ &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
+ strides, rd_stats->rate, tmp_rd_thresh, &calc_pred_masked_compound,
+ comp_rate, comp_dist, comp_model_rate, comp_model_dist,
+ best_type_stats.comp_best_model_rd, &comp_model_rd_cur, comp_rs2,
+ ref_skip_rd);
+ }
+ }
+
+ // Update stats for best compound type
+ if (best_rd_cur < *rd) {
+ update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
+ comp_model_rd_cur, rs2);
+ if (have_newmv_in_inter_mode(this_mode))
+ update_mask_best_mv(mbmi, best_mv, &best_tmp_rate_mv, tmp_rate_mv);
+ }
+    // Reset to the original MVs for the next iteration.
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+
+ mbmi->comp_group_idx =
+ (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
+ mbmi->compound_idx =
+ !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD);
+ mbmi->interinter_comp = best_type_stats.best_compound_data;
+
+ if (have_newmv_in_inter_mode(this_mode)) {
+ mbmi->mv[0].as_int = best_mv[0].as_int;
+ mbmi->mv[1].as_int = best_mv[1].as_int;
+ rd_stats->rate += best_tmp_rate_mv - *rate_mv;
+ *rate_mv = best_tmp_rate_mv;
+ }
+
+ if (this_mode == NEW_NEWMV)
+ args->cmp_mode[ref_frame] = mbmi->interinter_comp.type;
+
+ restore_dst_buf(xd, *orig_dst, 1);
+ if (!match_found)
+ save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, cur_mv, comp_rs2);
+ return best_type_stats.best_compmode_interinter_cost;
+}
diff --git a/third_party/aom/av1/encoder/compound_type.h b/third_party/aom/av1/encoder/compound_type.h
new file mode 100644
index 0000000000..a028a35093
--- /dev/null
+++ b/third_party/aom/av1/encoder/compound_type.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_COMPOUND_TYPE_H_
+#define AOM_AV1_ENCODER_COMPOUND_TYPE_H_
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/interp_search.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Structure to store the stats related to the best compound type.
+typedef struct {
+ INTERINTER_COMPOUND_DATA best_compound_data;
+ int64_t comp_best_model_rd;
+ int best_compmode_interinter_cost;
+} BEST_COMP_TYPE_STATS;
+
+#define IGNORE_MODE -1
+// Searches for the best inter-intra mode. Returns IGNORE_MODE if no good mode
+// is found, 0 otherwise.
+int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+ HandleInterModeArgs *args, int64_t ref_best_rd,
+ int *rate_mv, int *tmp_rate2,
+ const BUFFER_SET *orig_dst);
+
+int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ HandleInterModeArgs *args, BLOCK_SIZE bsize,
+ int_mv *cur_mv, int mode_search_mask,
+ int masked_compound_used, const BUFFER_SET *orig_dst,
+ const BUFFER_SET *tmp_dst,
+ const CompoundTypeRdBuffers *buffers, int *rate_mv,
+ int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t ref_skip_rd, int *is_luma_interp_done,
+ int64_t rd_thresh);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_COMPOUND_TYPE_H_
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
new file mode 100644
index 0000000000..aafe55d2d0
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+#include <assert.h>
+
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+ PICK_MODE_CONTEXT *src_ctx) {
+ dst_ctx->mic = src_ctx->mic;
+ dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best;
+
+ dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk;
+ dst_ctx->skippable = src_ctx->skippable;
+#if CONFIG_INTERNAL_STATS
+ dst_ctx->best_mode_index = src_ctx->best_mode_index;
+#endif // CONFIG_INTERNAL_STATS
+
+ memcpy(dst_ctx->blk_skip, src_ctx->blk_skip,
+ sizeof(uint8_t) * src_ctx->num_4x4_blk);
+ av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map,
+ src_ctx->num_4x4_blk);
+
+ dst_ctx->rd_stats = src_ctx->rd_stats;
+ dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
+}
+
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+ PC_TREE_SHARED_BUFFERS *shared_bufs,
+ struct aom_internal_error_info *error) {
+ const int num_planes = seq_params->monochrome ? 1 : MAX_MB_PLANE;
+ const int max_sb_square_y = 1 << num_pels_log2_lookup[seq_params->sb_size];
+ const int max_sb_square_uv = max_sb_square_y >> (seq_params->subsampling_x +
+ seq_params->subsampling_y);
+ for (int i = 0; i < num_planes; i++) {
+ const int max_num_pix =
+ (i == AOM_PLANE_Y) ? max_sb_square_y : max_sb_square_uv;
+ AOM_CHECK_MEM_ERROR(error, shared_bufs->coeff_buf[i],
+ aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+ AOM_CHECK_MEM_ERROR(error, shared_bufs->qcoeff_buf[i],
+ aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+ AOM_CHECK_MEM_ERROR(error, shared_bufs->dqcoeff_buf[i],
+ aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+ }
+}
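+
+// For example, with a 128x128 superblock (num_pels_log2 = 14) and 4:2:0
+// subsampling, each luma buffer holds 1 << 14 = 16384 tran_low_t values and
+// each chroma buffer holds 16384 >> 2 = 4096.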
+
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs) {
+ for (int i = 0; i < 3; i++) {
+ aom_free(shared_bufs->coeff_buf[i]);
+ aom_free(shared_bufs->qcoeff_buf[i]);
+ aom_free(shared_bufs->dqcoeff_buf[i]);
+ shared_bufs->coeff_buf[i] = NULL;
+ shared_bufs->qcoeff_buf[i] = NULL;
+ shared_bufs->dqcoeff_buf[i] = NULL;
+ }
+}
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi,
+ BLOCK_SIZE bsize,
+ PC_TREE_SHARED_BUFFERS *shared_bufs) {
+ PICK_MODE_CONTEXT *volatile ctx = NULL;
+ const AV1_COMMON *const cm = &cpi->common;
+ struct aom_internal_error_info error;
+
+ if (setjmp(error.jmp)) {
+ av1_free_pmc(ctx, av1_num_planes(cm));
+ return NULL;
+ }
+ error.setjmp = 1;
+
+ AOM_CHECK_MEM_ERROR(&error, ctx, aom_calloc(1, sizeof(*ctx)));
+ ctx->rd_mode_is_ready = 0;
+
+ const int num_planes = av1_num_planes(cm);
+ const int num_pix = block_size_wide[bsize] * block_size_high[bsize];
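+  // num_pix / 16: statistics such as blk_skip and tx_type_map are tracked at
+  // 4x4 granularity.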
+ const int num_blk = num_pix / 16;
+
+ AOM_CHECK_MEM_ERROR(&error, ctx->blk_skip,
+ aom_calloc(num_blk, sizeof(*ctx->blk_skip)));
+ AOM_CHECK_MEM_ERROR(&error, ctx->tx_type_map,
+ aom_calloc(num_blk, sizeof(*ctx->tx_type_map)));
+ ctx->num_4x4_blk = num_blk;
+
+ for (int i = 0; i < num_planes; ++i) {
+ ctx->coeff[i] = shared_bufs->coeff_buf[i];
+ ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i];
+ ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i];
+ AOM_CHECK_MEM_ERROR(&error, ctx->eobs[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
+ AOM_CHECK_MEM_ERROR(
+ &error, ctx->txb_entropy_ctx[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
+ }
+
+ if (num_pix <= MAX_PALETTE_SQUARE) {
+ for (int i = 0; i < 2; ++i) {
+ if (cm->features.allow_screen_content_tools) {
+ AOM_CHECK_MEM_ERROR(
+ &error, ctx->color_index_map[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+ } else {
+ ctx->color_index_map[i] = NULL;
+ }
+ }
+ }
+
+ av1_invalid_rd_stats(&ctx->rd_stats);
+
+ return ctx;
+}
+
+void av1_reset_pmc(PICK_MODE_CONTEXT *ctx) {
+ av1_zero_array(ctx->blk_skip, ctx->num_4x4_blk);
+ av1_zero_array(ctx->tx_type_map, ctx->num_4x4_blk);
+ av1_invalid_rd_stats(&ctx->rd_stats);
+}
+
+void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes) {
+ if (ctx == NULL) return;
+
+ aom_free(ctx->blk_skip);
+ ctx->blk_skip = NULL;
+ aom_free(ctx->tx_type_map);
+ for (int i = 0; i < num_planes; ++i) {
+ ctx->coeff[i] = NULL;
+ ctx->qcoeff[i] = NULL;
+ ctx->dqcoeff[i] = NULL;
+ aom_free(ctx->eobs[i]);
+ ctx->eobs[i] = NULL;
+ aom_free(ctx->txb_entropy_ctx[i]);
+ ctx->txb_entropy_ctx[i] = NULL;
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ if (ctx->color_index_map[i]) {
+ aom_free(ctx->color_index_map[i]);
+ ctx->color_index_map[i] = NULL;
+ }
+ }
+
+ aom_free(ctx);
+}
+
+PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize) {
+ PC_TREE *pc_tree = aom_calloc(1, sizeof(*pc_tree));
+ if (pc_tree == NULL) return NULL;
+
+ pc_tree->partitioning = PARTITION_NONE;
+ pc_tree->block_size = bsize;
+
+ return pc_tree;
+}
+
+#define FREE_PMC_NODE(CTX) \
+ do { \
+ av1_free_pmc(CTX, num_planes); \
+ CTX = NULL; \
+ } while (0)
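+
+// Note: FREE_PMC_NODE expands as a single statement via the do/while (0)
+// idiom and relies on the enclosing function providing num_planes.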
+
+void av1_free_pc_tree_recursive(PC_TREE *pc_tree, int num_planes, int keep_best,
+ int keep_none,
+ PARTITION_SEARCH_TYPE partition_search_type) {
+ if (pc_tree == NULL) return;
+
+ // Avoid freeing of extended partitions as they are not supported when
+ // partition_search_type is VAR_BASED_PARTITION.
+ if (partition_search_type == VAR_BASED_PARTITION && !keep_best &&
+ !keep_none) {
+ FREE_PMC_NODE(pc_tree->none);
+
+ for (int i = 0; i < 2; ++i) {
+ FREE_PMC_NODE(pc_tree->horizontal[i]);
+ FREE_PMC_NODE(pc_tree->vertical[i]);
+ }
+
+#if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+ for (int i = 0; i < 3; ++i) {
+ assert(pc_tree->horizontala[i] == NULL);
+ assert(pc_tree->horizontalb[i] == NULL);
+ assert(pc_tree->verticala[i] == NULL);
+ assert(pc_tree->verticalb[i] == NULL);
+ }
+ for (int i = 0; i < 4; ++i) {
+ assert(pc_tree->horizontal4[i] == NULL);
+ assert(pc_tree->vertical4[i] == NULL);
+ }
+#endif
+
+ for (int i = 0; i < 4; ++i) {
+ if (pc_tree->split[i] != NULL) {
+ av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0,
+ partition_search_type);
+ pc_tree->split[i] = NULL;
+ }
+ }
+ aom_free(pc_tree);
+ return;
+ }
+
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+
+ if (!keep_none && (!keep_best || (partition != PARTITION_NONE)))
+ FREE_PMC_NODE(pc_tree->none);
+
+ for (int i = 0; i < 2; ++i) {
+ if (!keep_best || (partition != PARTITION_HORZ))
+ FREE_PMC_NODE(pc_tree->horizontal[i]);
+ if (!keep_best || (partition != PARTITION_VERT))
+ FREE_PMC_NODE(pc_tree->vertical[i]);
+ }
+#if !CONFIG_REALTIME_ONLY
+ for (int i = 0; i < 3; ++i) {
+ if (!keep_best || (partition != PARTITION_HORZ_A))
+ FREE_PMC_NODE(pc_tree->horizontala[i]);
+ if (!keep_best || (partition != PARTITION_HORZ_B))
+ FREE_PMC_NODE(pc_tree->horizontalb[i]);
+ if (!keep_best || (partition != PARTITION_VERT_A))
+ FREE_PMC_NODE(pc_tree->verticala[i]);
+ if (!keep_best || (partition != PARTITION_VERT_B))
+ FREE_PMC_NODE(pc_tree->verticalb[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ if (!keep_best || (partition != PARTITION_HORZ_4))
+ FREE_PMC_NODE(pc_tree->horizontal4[i]);
+ if (!keep_best || (partition != PARTITION_VERT_4))
+ FREE_PMC_NODE(pc_tree->vertical4[i]);
+ }
+#endif
+ if (!keep_best || (partition != PARTITION_SPLIT)) {
+ for (int i = 0; i < 4; ++i) {
+ if (pc_tree->split[i] != NULL) {
+ av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0,
+ partition_search_type);
+ pc_tree->split[i] = NULL;
+ }
+ }
+ }
+
+ if (!keep_best && !keep_none) aom_free(pc_tree);
+}
+
+int av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) {
+ // The structure 'sms_tree' is used to store the simple motion search data for
+ // partition pruning in inter frames. Hence, the memory allocations and
+ // initializations related to it are avoided for allintra encoding mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max == 0) return 0;
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int stat_generation_stage = is_stat_generation_stage(cpi);
+ const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+ const int tree_nodes =
+ av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+ int sms_tree_index = 0;
+ SIMPLE_MOTION_DATA_TREE *this_sms;
+ int square_index = 1;
+ int nodes;
+
+ aom_free(td->sms_tree);
+ td->sms_tree =
+ (SIMPLE_MOTION_DATA_TREE *)aom_calloc(tree_nodes, sizeof(*td->sms_tree));
+ if (!td->sms_tree) return -1;
+ this_sms = &td->sms_tree[0];
+
+ if (!stat_generation_stage) {
+ const int leaf_factor = is_sb_size_128 ? 4 : 1;
+ const int leaf_nodes = 256 * leaf_factor;
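+    // For a 128x128 superblock the walk below consumes 1024 leaves plus
+    // 256 + 64 + 16 + 4 + 1 internal nodes (1365 in total); for 64x64, 256
+    // leaves and 341 nodes, assuming av1_get_pc_tree_nodes() allocated at
+    // least that many entries.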
+
+ // Sets up all the leaf nodes in the tree.
+ for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) {
+ SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+ tree->block_size = square[0];
+ }
+
+ // Each node has 4 leaf nodes, fill each block_size level of the tree
+ // from leafs to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (int i = 0; i < nodes; ++i) {
+ SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+ tree->block_size = square[square_index];
+ for (int j = 0; j < 4; j++) tree->split[j] = this_sms++;
+ ++sms_tree_index;
+ }
+ ++square_index;
+ }
+ } else {
+ // Allocation for firstpass/LAP stage
+ // TODO(Mufaddal): refactor square_index to use a common block_size macro
+ // from firstpass.c
+ SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+ square_index = 2;
+ tree->block_size = square[square_index];
+ }
+
+ // Set up the root node for the largest superblock size
+ td->sms_root = &td->sms_tree[tree_nodes - 1];
+ return 0;
+}
+
+void av1_free_sms_tree(ThreadData *td) {
+ aom_free(td->sms_tree);
+ td->sms_tree = NULL;
+}
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
new file mode 100644
index 0000000000..0be7ccbb54
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_
+#define AOM_AV1_ENCODER_CONTEXT_TREE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/speed_features.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_PRIMARY;
+struct AV1_COMP;
+struct AV1Common;
+struct ThreadData;
+
+typedef struct {
+ tran_low_t *coeff_buf[MAX_MB_PLANE];
+ tran_low_t *qcoeff_buf[MAX_MB_PLANE];
+ tran_low_t *dqcoeff_buf[MAX_MB_PLANE];
+} PC_TREE_SHARED_BUFFERS;
+
+// Structure to hold a snapshot of the coding context during mode picking.
+typedef struct PICK_MODE_CONTEXT {
+ MB_MODE_INFO mic;
+ MB_MODE_INFO_EXT_FRAME mbmi_ext_best;
+ uint8_t *color_index_map[2];
+ uint8_t *blk_skip;
+
+ tran_low_t *coeff[MAX_MB_PLANE];
+ tran_low_t *qcoeff[MAX_MB_PLANE];
+ tran_low_t *dqcoeff[MAX_MB_PLANE];
+ uint16_t *eobs[MAX_MB_PLANE];
+ uint8_t *txb_entropy_ctx[MAX_MB_PLANE];
+ uint8_t *tx_type_map;
+
+ int num_4x4_blk;
+  // For the current partition, skippable is set to 1 only if the
+  // coefficients of all Y, U, and V transform blocks are quantized to 0.
+ int skippable;
+#if CONFIG_INTERNAL_STATS
+ THR_MODES best_mode_index;
+#endif // CONFIG_INTERNAL_STATS
+ RD_STATS rd_stats;
+
+ int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has
+ // been made.
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ int64_t newmv_sse;
+ int64_t zeromv_sse;
+ int64_t zeromv_lastref_sse;
+ PREDICTION_MODE best_sse_inter_mode;
+ int_mv best_sse_mv;
+ MV_REFERENCE_FRAME best_reference_frame;
+ MV_REFERENCE_FRAME best_zeromv_reference_frame;
+ int sb_skip_denoising;
+#endif
+} PICK_MODE_CONTEXT;
+
+typedef struct PC_TREE {
+ PARTITION_TYPE partitioning;
+ BLOCK_SIZE block_size;
+ PICK_MODE_CONTEXT *none;
+ PICK_MODE_CONTEXT *horizontal[2];
+ PICK_MODE_CONTEXT *vertical[2];
+#if !CONFIG_REALTIME_ONLY
+ PICK_MODE_CONTEXT *horizontala[3];
+ PICK_MODE_CONTEXT *horizontalb[3];
+ PICK_MODE_CONTEXT *verticala[3];
+ PICK_MODE_CONTEXT *verticalb[3];
+ PICK_MODE_CONTEXT *horizontal4[4];
+ PICK_MODE_CONTEXT *vertical4[4];
+#endif
+ struct PC_TREE *split[4];
+ int index;
+} PC_TREE;
+
+typedef struct SIMPLE_MOTION_DATA_TREE {
+ BLOCK_SIZE block_size;
+ PARTITION_TYPE partitioning;
+ struct SIMPLE_MOTION_DATA_TREE *split[4];
+
+  // Simple motion search features
+ FULLPEL_MV start_mvs[REF_FRAMES];
+ unsigned int sms_none_feat[2];
+ unsigned int sms_rect_feat[8];
+ int sms_none_valid;
+ int sms_rect_valid;
+} SIMPLE_MOTION_DATA_TREE;
+
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+ PC_TREE_SHARED_BUFFERS *shared_bufs,
+ struct aom_internal_error_info *error);
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs);
+
+PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize);
+void av1_free_pc_tree_recursive(PC_TREE *tree, int num_planes, int keep_best,
+ int keep_none,
+ PARTITION_SEARCH_TYPE partition_search_type);
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi,
+ BLOCK_SIZE bsize,
+ PC_TREE_SHARED_BUFFERS *shared_bufs);
+void av1_reset_pmc(PICK_MODE_CONTEXT *ctx);
+void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes);
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+ PICK_MODE_CONTEXT *src_ctx);
+
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
+ BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
+};
+
+static AOM_INLINE int av1_get_pc_tree_nodes(const int is_sb_size_128,
+ int stat_generation_stage) {
+ const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
+ const int tree_nodes =
+ stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+ return tree_nodes;
+}
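+
+// Worked through: 256 + 64 + 16 + 4 + 1 = 341 nodes for a 64x64 superblock,
+// 1024 + 341 = 1365 when 128x128 superblocks add one more leaf level, and a
+// single node during stat generation, where no partition search is run.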
+
+// Returns 0 on success, -1 on memory allocation failure.
+int av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
+void av1_free_sms_tree(struct ThreadData *td);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_
diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c
new file mode 100644
index 0000000000..323e2aed58
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
+
+// Entry i - 128 is round(-log2(i / 256.0) * (1 << AV1_PROB_COST_SHIFT)) for
+// i = 128..255.
+const uint16_t av1_prob_cost[128] = {
+ 512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+ 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361,
+ 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294,
+ 289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232,
+ 228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175,
+ 171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122,
+ 119, 115, 112, 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73,
+ 70, 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, 26,
+ 23, 20, 18, 15, 12, 9, 6, 3,
+};
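+
+/* Hypothetical standalone generator for the table above (illustration only,
+ * never compiled into the encoder; it would also need <math.h>): entry
+ * i - 128 is the cost of a symbol whose 8-bit probability is i / 256, in
+ * 1/512-bit units since AV1_PROB_COST_SHIFT == 9. */
+#if 0
+static uint16_t prob_cost_entry(int i) {
+  assert(i >= 128 && i < 256);
+  return (uint16_t)lround(-log2(i / 256.0) * (1 << AV1_PROB_COST_SHIFT));
+}
+#endif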
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+ const int *inv_map) {
+  aom_cdf_prob prev_cdf = 0;
+  for (int i = 0;; ++i) {
+ aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf;
+ p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15;
+ prev_cdf = AOM_ICDF(cdf[i]);
+
+ if (inv_map)
+ costs[inv_map[i]] = av1_cost_symbol(p15);
+ else
+ costs[i] = av1_cost_symbol(p15);
+
+ // Stop once we reach the end of the CDF
+ if (cdf[i] == AOM_ICDF(CDF_PROB_TOP)) break;
+ }
+}
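+
+/* Minimal usage sketch (illustration only), assuming libaom's inverted-CDF
+ * convention: the stored value is AOM_ICDF(x) == 32768 - x and the array
+ * terminates at AOM_ICDF(CDF_PROB_TOP) == 0. For a binary symbol with
+ * P(0) = 24576 / 32768 = 3/4: */
+#if 0
+static void cost_tokens_example(int costs[2]) {
+  const aom_cdf_prob cdf[2] = { AOM_ICDF(24576), AOM_ICDF(CDF_PROB_TOP) };
+  av1_cost_tokens_from_cdf(costs, cdf, /*inv_map=*/NULL);
+  // costs[0] == 212 (~0.41 bits << 9), costs[1] == 1024 (2 bits << 9).
+}
+#endif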
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
new file mode 100644
index 0000000000..be0241a820
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_COST_H_
+#define AOM_AV1_ENCODER_COST_H_
+
+#include "aom_dsp/prob.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint16_t av1_prob_cost[128];
+
+// The factor to scale from cost in bits to cost in av1_prob_cost units.
+#define AV1_PROB_COST_SHIFT 9
+
+// Cost of coding an n-bit literal, using 128 (i.e. 50%) probability for
+// each bit.
+#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT))
+
+// Calculate the cost of a symbol with probability p15 / 2^15
+static INLINE int av1_cost_symbol(aom_cdf_prob p15) {
+  // p15 can fall outside the range [1, CDF_PROB_TOP - 1]. Clamp it so that
+  // the cost calculation below works correctly; otherwise, if p15 ==
+  // CDF_PROB_TOP, shift would be -1 and "p15 << shift" would be wrong.
+ p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1);
+ assert(0 < p15 && p15 < CDF_PROB_TOP);
+ const int shift = CDF_PROB_BITS - 1 - get_msb(p15);
+ const int prob = get_prob(p15 << shift, CDF_PROB_TOP);
+ assert(prob >= 128);
+ return av1_prob_cost[prob - 128] + av1_cost_literal(shift);
+}
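+
+// Worked example: for p15 = 8192 (probability 1/4), get_msb(8192) == 13, so
+// shift == 1; prob == get_prob(16384, 32768) == 128; the result is
+// av1_prob_cost[0] + av1_cost_literal(1) == 512 + 512 == 1024, i.e. exactly
+// 2 bits in 1/512-bit units.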
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+ const int *inv_map);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_COST_H_
diff --git a/third_party/aom/av1/encoder/deltaq4_model.c b/third_party/aom/av1/encoder/deltaq4_model.c
new file mode 100644
index 0000000000..60a7e6d2cf
--- /dev/null
+++ b/third_party/aom/av1/encoder/deltaq4_model.c
@@ -0,0 +1,7776 @@
+/* Embedded file: model.tflite */
+const int av1_deltaq4_model_fsize = 101032;
+const unsigned char av1_deltaq4_model_file[101032] = {
+ 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x14, 0x00, 0x20, 0x00, 0x1c,
+ 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00,
+ 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00,
+ 0x00, 0xc0, 0x00, 0x00, 0x00, 0xc0, 0x7e, 0x01, 0x00, 0xd0, 0x7e, 0x01, 0x00,
+ 0x24, 0x8a, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04,
+ 0x00, 0x00, 0x00, 0x6a, 0x80, 0xfe, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x1c, 0x00,
+ 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x72,
+ 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb4, 0xff, 0xff, 0xff, 0x14,
+ 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x65,
+ 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x04, 0x00, 0x00, 0x00, 0xca, 0x81, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+ 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74, 0x69,
+ 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00, 0x17, 0x00,
+ 0x00, 0x00, 0xfc, 0x7d, 0x01, 0x00, 0xf4, 0x7d, 0x01, 0x00, 0xdc, 0x7d, 0x01,
+ 0x00, 0x84, 0x7d, 0x01, 0x00, 0xf4, 0x7c, 0x01, 0x00, 0xa4, 0x7c, 0x01, 0x00,
+ 0x74, 0x7c, 0x01, 0x00, 0x5c, 0x7c, 0x01, 0x00, 0x4c, 0x5c, 0x00, 0x00, 0xbc,
+ 0x5b, 0x00, 0x00, 0x8c, 0x5a, 0x00, 0x00, 0x7c, 0x48, 0x00, 0x00, 0x6c, 0x00,
+ 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00,
+ 0x00, 0x4c, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00,
+ 0x34, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04,
+ 0x00, 0x00, 0x00, 0x7e, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x77, 0xfe, 0xff, 0x54, 0x77, 0xfe, 0xff,
+ 0x58, 0x77, 0xfe, 0xff, 0x5c, 0x77, 0xfe, 0xff, 0x60, 0x77, 0xfe, 0xff, 0x64,
+ 0x77, 0xfe, 0xff, 0x68, 0x77, 0xfe, 0xff, 0x6c, 0x77, 0xfe, 0xff, 0x70, 0x77,
+ 0xfe, 0xff, 0xbe, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00,
+ 0x00, 0x3e, 0x84, 0xfc, 0x3b, 0xef, 0x95, 0x2f, 0xbd, 0xd3, 0x21, 0x96, 0xbd,
+ 0x11, 0x9a, 0xc6, 0x3d, 0xd9, 0x7e, 0x0c, 0xbe, 0xcb, 0xd2, 0x8c, 0xbb, 0x60,
+ 0xf5, 0x92, 0xbd, 0x70, 0xce, 0x9e, 0x3d, 0x26, 0x67, 0xc4, 0x3d, 0x9b, 0x2a,
+ 0x8b, 0x3b, 0x3b, 0xdd, 0x2a, 0xbd, 0xf9, 0x09, 0x8a, 0xbd, 0x1b, 0xae, 0xd7,
+ 0x3c, 0xbf, 0x39, 0x87, 0xbd, 0x4c, 0x9e, 0xe2, 0x3d, 0x50, 0x9c, 0xe7, 0xbd,
+ 0x1e, 0x58, 0x57, 0x3d, 0x38, 0x8c, 0x58, 0xbd, 0x48, 0x9f, 0x4a, 0x3d, 0xcb,
+ 0x1c, 0x93, 0xbd, 0xeb, 0xb8, 0x5a, 0xbc, 0x63, 0x04, 0x4b, 0xbd, 0x9b, 0x76,
+ 0xa8, 0x3d, 0x20, 0xb4, 0x69, 0x3d, 0xee, 0xcc, 0xe5, 0x3a, 0x4f, 0x40, 0x02,
+ 0x3e, 0x21, 0x2e, 0x03, 0x3e, 0x25, 0x77, 0x99, 0xbd, 0xf5, 0xa1, 0xd0, 0x3c,
+ 0xc5, 0x15, 0xeb, 0x3c, 0x58, 0xb5, 0xb7, 0x3c, 0x80, 0x63, 0x33, 0xbd, 0xc9,
+ 0x66, 0x63, 0xbd, 0xf6, 0xef, 0xb8, 0xbd, 0xd7, 0xbf, 0x9f, 0x3b, 0x93, 0x68,
+ 0x35, 0x3d, 0x60, 0xfc, 0xf3, 0xbd, 0xed, 0xd9, 0x35, 0xbd, 0x57, 0xef, 0x8a,
+ 0x3d, 0x31, 0x97, 0xa4, 0x3d, 0x8e, 0x55, 0xe2, 0x3d, 0x27, 0xa5, 0xe9, 0x3d,
+ 0x36, 0x26, 0x67, 0xbc, 0xeb, 0xd1, 0x9e, 0xbd, 0xc7, 0xcd, 0x37, 0x3d, 0x31,
+ 0xfc, 0xce, 0x3d, 0x5e, 0xe3, 0x96, 0xbd, 0xeb, 0x24, 0x4d, 0x3c, 0xe6, 0x00,
+ 0xe2, 0xbd, 0x9b, 0x00, 0x17, 0xbd, 0xee, 0x9f, 0xc4, 0xbd, 0x6a, 0xcd, 0xba,
+ 0xbc, 0x2c, 0x2b, 0x97, 0xbd, 0x8a, 0x02, 0x68, 0xbc, 0xc3, 0x46, 0x9f, 0xbd,
+ 0x85, 0x3d, 0xc2, 0x3d, 0xbc, 0x16, 0x22, 0x3c, 0xf1, 0xca, 0xdf, 0x3d, 0xaf,
+ 0xef, 0xbc, 0x3c, 0x4c, 0xde, 0xe8, 0xbd, 0x5c, 0x5a, 0xc9, 0xbb, 0x35, 0xe5,
+ 0xc1, 0x3d, 0x14, 0xc7, 0xba, 0xbc, 0x05, 0xfb, 0x1d, 0x3d, 0x61, 0x23, 0xb7,
+ 0xbb, 0x17, 0x50, 0xb0, 0xbd, 0x14, 0x5b, 0xf4, 0xbd, 0xb1, 0x4d, 0x40, 0x3d,
+ 0x7e, 0x3d, 0xd8, 0x3d, 0x35, 0x2e, 0x90, 0x3d, 0x93, 0xcd, 0x0d, 0xbe, 0x8d,
+ 0x60, 0x70, 0x3d, 0x4a, 0x7c, 0xf2, 0x3c, 0x07, 0x2a, 0x7f, 0x3d, 0x2c, 0xab,
+ 0xd8, 0x3d, 0xb3, 0x1f, 0x1d, 0xbd, 0x44, 0x69, 0xf7, 0x3c, 0x71, 0xfd, 0x5e,
+ 0x3c, 0xc8, 0x14, 0x28, 0x3d, 0x71, 0x2e, 0x0c, 0x3b, 0x7f, 0xa3, 0xb5, 0x3d,
+ 0x55, 0x5c, 0x07, 0x3e, 0x0f, 0xf0, 0x3b, 0x3c, 0xd9, 0xc2, 0xbd, 0xbc, 0x71,
+ 0xaa, 0xc5, 0xbb, 0xa3, 0x86, 0xc7, 0x3d, 0xcf, 0x37, 0x95, 0xbd, 0x09, 0x63,
+ 0xc3, 0x3d, 0x0c, 0x01, 0x4e, 0xbd, 0xf1, 0xf9, 0x8d, 0x3d, 0xe2, 0x98, 0x45,
+ 0x3d, 0x76, 0xbc, 0x3b, 0x3d, 0x2a, 0xa2, 0x47, 0x3d, 0x8c, 0x1d, 0xae, 0xbd,
+ 0x5f, 0x35, 0x8c, 0xbd, 0x17, 0xeb, 0x05, 0x3d, 0x75, 0x62, 0xdb, 0xbd, 0x37,
+ 0xf8, 0xea, 0x3d, 0xf8, 0xa6, 0x6c, 0xbd, 0x8a, 0x86, 0x03, 0x3d, 0x67, 0x6c,
+ 0x8d, 0xbd, 0x58, 0xaf, 0xc5, 0xbd, 0x36, 0x51, 0x14, 0xbe, 0x60, 0xac, 0xe3,
+ 0x3d, 0x86, 0x4f, 0xf4, 0x3c, 0xf6, 0xa3, 0x29, 0x3d, 0xc3, 0x1d, 0x9a, 0x3c,
+ 0x44, 0xdc, 0x0e, 0xbc, 0x6b, 0x97, 0x8f, 0x3c, 0xc9, 0x3d, 0x88, 0xbc, 0x74,
+ 0x90, 0x9d, 0x3d, 0x0f, 0x02, 0xec, 0xbd, 0x12, 0xec, 0xb2, 0x3d, 0x6c, 0x32,
+ 0x31, 0x3d, 0x0b, 0x84, 0x35, 0x3d, 0xfc, 0xc2, 0x3c, 0x3d, 0x59, 0xdf, 0x16,
+ 0x3d, 0x8e, 0x29, 0xee, 0x3d, 0x83, 0xc3, 0xb7, 0xbd, 0x66, 0xbd, 0x84, 0xbd,
+ 0xb7, 0x49, 0x1b, 0x3d, 0x3f, 0xc1, 0x4a, 0x3d, 0x1a, 0x7d, 0xdf, 0x3d, 0xee,
+ 0x12, 0xb1, 0x3c, 0x29, 0x47, 0xe6, 0xbd, 0xd6, 0x04, 0xd6, 0x3d, 0xc2, 0x31,
+ 0x6f, 0xbd, 0xb0, 0x2c, 0x3e, 0xbd, 0x20, 0xd8, 0x43, 0xbd, 0x2d, 0x0c, 0x26,
+ 0xbd, 0x23, 0x47, 0x06, 0xbe, 0xb9, 0xd2, 0xb9, 0xbd, 0x7b, 0xef, 0xc8, 0x3d,
+ 0x23, 0x06, 0x06, 0x3d, 0x65, 0xc6, 0x45, 0xbd, 0x20, 0xc9, 0x24, 0xbc, 0xf7,
+ 0x2b, 0xf5, 0x3d, 0x41, 0x91, 0x15, 0xbd, 0x90, 0xbe, 0x0f, 0x3d, 0xe8, 0x94,
+ 0x8c, 0xbd, 0xdf, 0x96, 0x72, 0x3c, 0x8d, 0xb4, 0xed, 0x3d, 0x33, 0xf0, 0xb3,
+ 0xbd, 0x60, 0x49, 0xbc, 0xbd, 0x32, 0xf2, 0xd5, 0x3d, 0x3e, 0x3e, 0x6b, 0xbd,
+ 0xb4, 0x31, 0x09, 0x3e, 0xc6, 0x40, 0xfb, 0xbc, 0x75, 0x1a, 0x88, 0xbd, 0xbf,
+ 0x13, 0xb2, 0xbd, 0xe3, 0x78, 0xc4, 0xba, 0x68, 0xfc, 0x10, 0x3e, 0x27, 0x4c,
+ 0xf5, 0x3c, 0xfc, 0x68, 0x27, 0x3d, 0xb2, 0x2c, 0xe0, 0x3c, 0x6e, 0x4f, 0x9a,
+ 0xbb, 0xbb, 0x9f, 0xa1, 0xbd, 0x91, 0x7b, 0x9a, 0xbc, 0x17, 0x21, 0x52, 0xba,
+ 0x39, 0x8e, 0x4c, 0xbd, 0x03, 0xf5, 0xe5, 0x3d, 0x3a, 0x22, 0xcd, 0xbd, 0x90,
+ 0x1c, 0x78, 0xbd, 0x3f, 0xb1, 0x8d, 0xbd, 0xfc, 0x77, 0x25, 0xbe, 0x48, 0x9a,
+ 0xfd, 0x3c, 0xca, 0x6a, 0xa2, 0x3d, 0x45, 0xd6, 0x7a, 0xbd, 0xce, 0x9d, 0xbf,
+ 0x3d, 0x94, 0x1c, 0xbe, 0xbd, 0xcc, 0xc4, 0x83, 0xbc, 0xe9, 0xc7, 0xf3, 0xbc,
+ 0xdc, 0x31, 0x19, 0x39, 0x3a, 0x36, 0xea, 0x3d, 0x40, 0xa6, 0x72, 0xbd, 0x66,
+ 0xeb, 0x85, 0xb9, 0x68, 0xa0, 0x97, 0xbd, 0xa7, 0xeb, 0xa9, 0x3c, 0x4d, 0x79,
+ 0xf9, 0x3c, 0x55, 0x67, 0xb2, 0x3c, 0x80, 0x2a, 0x8f, 0xbd, 0xd5, 0x70, 0x17,
+ 0x3b, 0x41, 0xfb, 0xed, 0xbd, 0xae, 0xfe, 0x0e, 0xbd, 0x6d, 0x06, 0xd6, 0xbc,
+ 0x90, 0xc9, 0xd1, 0x3d, 0xb4, 0x6c, 0x19, 0x3b, 0xa3, 0x4f, 0x11, 0x3c, 0xb1,
+ 0x71, 0xc1, 0xbd, 0xcc, 0x5b, 0x20, 0xbc, 0x7a, 0xb5, 0xe9, 0x3d, 0x6f, 0x8c,
+ 0x95, 0x3d, 0x10, 0x56, 0x79, 0xbd, 0x45, 0x06, 0x69, 0x3c, 0xe4, 0x89, 0x9f,
+ 0xbd, 0xad, 0x43, 0x82, 0xbd, 0x7a, 0x1f, 0xbd, 0xbd, 0xbb, 0x25, 0x9b, 0x3c,
+ 0x27, 0xdc, 0x0f, 0xbe, 0x42, 0x7b, 0xe1, 0x3d, 0xaa, 0xd9, 0xcb, 0xbd, 0xa4,
+ 0xdf, 0x0e, 0x3e, 0xdd, 0x57, 0xbe, 0xbd, 0xf0, 0xb7, 0x87, 0xbd, 0xbb, 0x8a,
+ 0x73, 0xbd, 0x20, 0x8b, 0xb5, 0x3c, 0xb3, 0xac, 0x57, 0xbd, 0x4a, 0x5c, 0x68,
+ 0x3d, 0x46, 0xc5, 0x6e, 0x3b, 0x44, 0xd8, 0x22, 0xbd, 0xc8, 0x88, 0x93, 0xbd,
+ 0x71, 0x42, 0xd3, 0xbc, 0x80, 0x60, 0xf6, 0xbc, 0xe0, 0xb7, 0x04, 0x3d, 0xcb,
+ 0x28, 0xf7, 0xbd, 0xfd, 0x2e, 0x9d, 0xbd, 0xd8, 0x81, 0x5b, 0x3d, 0x90, 0x88,
+ 0x06, 0xbd, 0xb1, 0x2d, 0x8b, 0xbc, 0x74, 0x4d, 0x80, 0xbd, 0x1b, 0xce, 0x54,
+ 0x3d, 0xd3, 0xea, 0x89, 0xbd, 0x7a, 0x0a, 0xc6, 0x3c, 0x8b, 0x33, 0xa2, 0x3d,
+ 0x68, 0xe5, 0x8b, 0x3d, 0xcf, 0x19, 0x63, 0xbd, 0x50, 0x05, 0xc1, 0xbd, 0x2b,
+ 0x1f, 0xc4, 0xbc, 0x9f, 0xed, 0xaf, 0xbd, 0xc6, 0x72, 0x07, 0xbb, 0xc1, 0x58,
+ 0xa2, 0x3d, 0xf6, 0x27, 0x43, 0xbc, 0xa1, 0x5b, 0x36, 0x3d, 0x6b, 0x6b, 0x20,
+ 0x3d, 0x03, 0xb0, 0xfb, 0xbd, 0xf9, 0xf7, 0x9b, 0xbd, 0x9a, 0xbf, 0x92, 0x3d,
+ 0xa2, 0x0c, 0x5c, 0x3d, 0xd2, 0xc2, 0x73, 0xbd, 0x5c, 0xd3, 0xac, 0x3d, 0x9f,
+ 0x28, 0xa6, 0x3d, 0x23, 0xf4, 0x46, 0xbd, 0xf5, 0xfe, 0x6b, 0x3d, 0x2d, 0x03,
+ 0x56, 0x3d, 0x0c, 0x21, 0xe8, 0x3c, 0x6f, 0xdb, 0xe5, 0xbd, 0xd4, 0x8c, 0xe3,
+ 0xbd, 0xdf, 0x9d, 0x62, 0x3d, 0x38, 0xa0, 0xd1, 0xbd, 0x67, 0x9e, 0x8d, 0xbc,
+ 0xab, 0x78, 0x46, 0x3d, 0xf8, 0x88, 0x8e, 0xbc, 0x5a, 0x87, 0xd3, 0xbd, 0x40,
+ 0xba, 0xab, 0xbd, 0x45, 0xf8, 0x9a, 0x3d, 0x77, 0x60, 0x49, 0xbd, 0xa5, 0x29,
+ 0x98, 0xbc, 0xf9, 0xa7, 0x6b, 0x3d, 0xf8, 0x57, 0x1b, 0x3e, 0xf9, 0x7f, 0xcb,
+ 0x3d, 0xc8, 0x38, 0x3f, 0xbb, 0x0e, 0x77, 0xd9, 0x3d, 0xa9, 0x8f, 0xca, 0x3d,
+ 0x78, 0xbc, 0x92, 0x3d, 0xde, 0xe4, 0x31, 0xbc, 0x7f, 0x35, 0xec, 0x3d, 0x0b,
+ 0x98, 0x5c, 0x3d, 0x3a, 0x86, 0xa0, 0x3d, 0x9d, 0xb7, 0xad, 0xbd, 0x42, 0x3c,
+ 0xc2, 0xbc, 0x26, 0x4b, 0x7b, 0x3d, 0xbe, 0x8b, 0x0a, 0xb9, 0x28, 0x3e, 0xc5,
+ 0x3d, 0xef, 0xac, 0xbb, 0xbd, 0xb3, 0xcc, 0x69, 0xbd, 0xb9, 0xff, 0x07, 0x3d,
+ 0x30, 0xf6, 0x26, 0x3d, 0xa9, 0x18, 0xe6, 0x3d, 0x85, 0x72, 0xdb, 0xbd, 0xda,
+ 0x6e, 0xa1, 0x3d, 0x3b, 0x16, 0xf7, 0x3c, 0xb1, 0x3d, 0x96, 0xbd, 0xd9, 0x88,
+ 0xeb, 0x3b, 0x52, 0x76, 0x9a, 0xbd, 0xb9, 0x81, 0x1a, 0xbd, 0x81, 0x94, 0x96,
+ 0xbc, 0xd4, 0x4b, 0xe8, 0x3d, 0x0f, 0x6c, 0xe4, 0xbc, 0xc0, 0xbd, 0xab, 0x3c,
+ 0x1b, 0xdd, 0x76, 0x3c, 0x98, 0x18, 0xae, 0xbd, 0xfb, 0x1a, 0x6f, 0xbd, 0x72,
+ 0x50, 0x83, 0xbd, 0x46, 0x0b, 0x12, 0xbc, 0x64, 0x93, 0xf2, 0x3d, 0x1f, 0xad,
+ 0x71, 0x3b, 0xcf, 0x26, 0x77, 0xbd, 0x8b, 0x31, 0x2d, 0xbd, 0x0d, 0xb7, 0x54,
+ 0x3b, 0x5b, 0x00, 0xc4, 0x3d, 0x57, 0x4c, 0x58, 0x3d, 0x11, 0x4c, 0x15, 0x3d,
+ 0x1a, 0xfc, 0xa2, 0xbc, 0xf2, 0xed, 0xea, 0x3d, 0x9e, 0xad, 0xf7, 0xbd, 0x47,
+ 0x8d, 0x41, 0x3d, 0xce, 0xc5, 0x96, 0xbb, 0x2a, 0x72, 0xa0, 0xbd, 0x93, 0x27,
+ 0x9a, 0xbd, 0x3f, 0xcb, 0xef, 0xbb, 0xb5, 0xa5, 0x1e, 0x3d, 0xd6, 0x2a, 0xfd,
+ 0xbc, 0xf5, 0xe0, 0xd4, 0xbc, 0xa1, 0x7d, 0x9d, 0x3d, 0xbb, 0x60, 0x22, 0xbd,
+ 0x32, 0x15, 0x16, 0x3e, 0x80, 0x77, 0xb7, 0xbc, 0xba, 0x1c, 0xa4, 0xbd, 0x45,
+ 0xb7, 0x0b, 0xbd, 0x6a, 0x33, 0x9a, 0x3d, 0xfc, 0x27, 0xab, 0xbc, 0x10, 0xcd,
+ 0x2c, 0x3e, 0xb3, 0xf1, 0xa5, 0x3d, 0x03, 0xf7, 0xa3, 0x3c, 0x25, 0x0c, 0xe1,
+ 0x3c, 0xc4, 0x82, 0xaa, 0xbd, 0x3a, 0x4a, 0x15, 0x3c, 0x5c, 0x56, 0x9e, 0x3d,
+ 0x96, 0x52, 0xee, 0x3d, 0x67, 0xf7, 0x96, 0x3d, 0x3e, 0xb0, 0xd6, 0xbd, 0x6e,
+ 0xbd, 0x8e, 0xbd, 0x16, 0xb3, 0x85, 0x3d, 0x84, 0xca, 0x6e, 0xbd, 0x0f, 0xfc,
+ 0x40, 0x3d, 0x2d, 0xe0, 0xdc, 0x3d, 0xc1, 0xa1, 0xde, 0x39, 0x30, 0x79, 0xe7,
+ 0x3d, 0x0a, 0xab, 0xba, 0x3d, 0x35, 0x57, 0xc7, 0xbd, 0x7e, 0x38, 0xa1, 0x3d,
+ 0xe3, 0x25, 0x60, 0x3d, 0x47, 0xbd, 0x56, 0x3d, 0x62, 0xcf, 0xf6, 0x3d, 0xad,
+ 0x06, 0xd5, 0xbd, 0x41, 0xda, 0xe8, 0x3a, 0x81, 0xcb, 0xbb, 0x3d, 0xce, 0x38,
+ 0x4c, 0xbc, 0x17, 0xc0, 0x88, 0xbd, 0x12, 0x25, 0xd7, 0xbd, 0x3b, 0xf5, 0x9b,
+ 0xbd, 0x4e, 0xa0, 0xb1, 0xbc, 0xa1, 0x8c, 0x9c, 0x3d, 0xc5, 0x2f, 0xb3, 0x3d,
+ 0xe0, 0xc2, 0x08, 0x3e, 0x0b, 0xcc, 0x2f, 0x3d, 0x87, 0x3f, 0x1d, 0x3e, 0x76,
+ 0xcd, 0xc3, 0xbd, 0x4f, 0x1d, 0xd4, 0xbd, 0x65, 0x6f, 0x00, 0x3e, 0x95, 0x4f,
+ 0x9a, 0x3d, 0xa2, 0x66, 0x28, 0xbd, 0xaf, 0x81, 0x90, 0x3d, 0x16, 0x50, 0xde,
+ 0x3b, 0x65, 0xec, 0xe3, 0xbd, 0x47, 0x6c, 0x34, 0xbc, 0xae, 0xe8, 0xe5, 0xbd,
+ 0x5b, 0x7c, 0xa6, 0xbb, 0x1d, 0x4d, 0x8d, 0xbc, 0xb1, 0x7a, 0x1d, 0x3e, 0xbf,
+ 0x37, 0xe6, 0xbc, 0x7b, 0x0c, 0x70, 0x3d, 0x09, 0x57, 0xe2, 0x3d, 0x10, 0x4a,
+ 0x35, 0xbc, 0x5d, 0x58, 0xf5, 0xbc, 0xb9, 0x89, 0xa1, 0x3d, 0x6a, 0xb2, 0x68,
+ 0xbd, 0xf4, 0xf6, 0x03, 0x3e, 0xf1, 0xc6, 0x3a, 0xbd, 0xf5, 0x3b, 0xe2, 0x3d,
+ 0x3a, 0xd2, 0x4a, 0x3d, 0xe7, 0xb8, 0x9e, 0xbd, 0x18, 0xe7, 0xd9, 0x3c, 0x1d,
+ 0x95, 0x8e, 0x3d, 0xde, 0x6f, 0x9e, 0xbc, 0xae, 0x7d, 0x0f, 0x3e, 0xb0, 0xf3,
+ 0x04, 0x3d, 0xe0, 0xdc, 0x6b, 0x3d, 0x02, 0x2c, 0xee, 0xbd, 0x7c, 0xb2, 0x9f,
+ 0xbd, 0xae, 0x94, 0xc3, 0x3c, 0x82, 0xba, 0xab, 0x3d, 0x07, 0x80, 0xde, 0x3c,
+ 0x75, 0xec, 0xb3, 0xbd, 0x34, 0x42, 0x74, 0xbd, 0x44, 0xce, 0x7a, 0x3d, 0x21,
+ 0xac, 0x28, 0xbe, 0xb1, 0xbb, 0x14, 0xbd, 0xe2, 0xe1, 0xdb, 0x3c, 0x41, 0x82,
+ 0xc7, 0x3d, 0x3e, 0x0f, 0x9c, 0xbd, 0x92, 0x4e, 0x97, 0x3d, 0x69, 0x45, 0xf2,
+ 0x3d, 0xc3, 0x86, 0xc4, 0xbb, 0x57, 0x0f, 0xb1, 0x3d, 0x8c, 0xa7, 0xc6, 0x3d,
+ 0x27, 0xe2, 0xf3, 0xbc, 0xdd, 0x31, 0x44, 0xbd, 0x94, 0x2c, 0x29, 0xbc, 0xe6,
+ 0xeb, 0xd1, 0xbd, 0x74, 0xf9, 0x02, 0x3d, 0x43, 0x51, 0x92, 0xbd, 0x38, 0xb8,
+ 0x72, 0x3d, 0x73, 0xd3, 0x89, 0xbc, 0x06, 0x13, 0xdb, 0x3d, 0x75, 0xc5, 0xb2,
+ 0x3b, 0x9a, 0xe9, 0x95, 0xbc, 0xd2, 0x6a, 0x05, 0x3e, 0x65, 0xc5, 0xa3, 0x3d,
+ 0x59, 0x09, 0x72, 0xbd, 0x93, 0x0e, 0x85, 0xbc, 0x0d, 0x55, 0x6b, 0xbd, 0x55,
+ 0x64, 0x16, 0xbd, 0x50, 0x04, 0x9f, 0x3d, 0x93, 0x37, 0x14, 0xbd, 0xe9, 0x24,
+ 0x58, 0x3d, 0x04, 0x8e, 0xe9, 0xbd, 0xe4, 0x6e, 0x2b, 0xbd, 0x43, 0xbc, 0xba,
+ 0xbd, 0x80, 0xa1, 0xc3, 0xbd, 0x32, 0x81, 0xf5, 0xbd, 0x94, 0x5a, 0x10, 0x3d,
+ 0xfb, 0x5d, 0x27, 0x3c, 0xd7, 0x26, 0xc5, 0x3d, 0xf5, 0xc3, 0x4b, 0x3d, 0x32,
+ 0xca, 0xdc, 0x3d, 0xb2, 0xe8, 0x35, 0xbc, 0xb2, 0x47, 0xb9, 0xbd, 0xfa, 0x59,
+ 0x29, 0xbe, 0xab, 0x6f, 0x0a, 0x3e, 0x81, 0xa5, 0x10, 0xbd, 0x73, 0x96, 0x99,
+ 0xbd, 0x39, 0x77, 0x23, 0xbc, 0xa8, 0x50, 0xf8, 0xbd, 0x4c, 0x1d, 0xdd, 0xbd,
+ 0xf8, 0xf5, 0xb9, 0xbd, 0x65, 0x4e, 0x12, 0x3e, 0xc0, 0xa1, 0x7a, 0xbd, 0x16,
+ 0x33, 0x27, 0x3d, 0xc4, 0xc6, 0x31, 0x3b, 0x0e, 0xcd, 0x48, 0xbd, 0xd2, 0x7f,
+ 0xb4, 0xbd, 0x2c, 0x3a, 0x8b, 0x3c, 0x6f, 0x43, 0x59, 0x3d, 0x4e, 0x8a, 0x52,
+ 0x3d, 0x91, 0x68, 0xc4, 0x3d, 0xa2, 0x78, 0x16, 0xbd, 0xe5, 0x2c, 0x60, 0x3d,
+ 0x7f, 0x73, 0x8f, 0x3d, 0x9f, 0x70, 0x09, 0xbe, 0xf2, 0xf2, 0x05, 0x3c, 0x1e,
+ 0x58, 0x98, 0x3d, 0xec, 0xfc, 0x03, 0x3e, 0x88, 0xbf, 0x56, 0xbd, 0x2b, 0xc8,
+ 0x99, 0xbd, 0x9e, 0x13, 0x9a, 0xbc, 0x4f, 0x72, 0xca, 0xbd, 0x79, 0x6e, 0xef,
+ 0x3d, 0x87, 0xc3, 0x80, 0xbc, 0xe7, 0xef, 0x05, 0x3d, 0xc7, 0x99, 0x0a, 0x3d,
+ 0x17, 0x7c, 0x56, 0x3d, 0x01, 0xab, 0xd3, 0xbd, 0x48, 0x8b, 0xa2, 0xbd, 0x06,
+ 0xad, 0xcc, 0xbc, 0xf0, 0xf5, 0x6d, 0xbd, 0x6a, 0x67, 0x0c, 0xbe, 0x7e, 0x2e,
+ 0x6e, 0x3d, 0x53, 0x50, 0x29, 0xbd, 0x8c, 0x40, 0xb3, 0x3d, 0x5c, 0x9a, 0x0f,
+ 0xbd, 0xe9, 0x4e, 0x0a, 0x3e, 0x4d, 0x05, 0xac, 0x3d, 0xf9, 0x1a, 0x8e, 0x3d,
+ 0x0d, 0x69, 0xa6, 0xbd, 0x88, 0x94, 0x60, 0x3d, 0x48, 0x2a, 0x8a, 0xbb, 0x5a,
+ 0x5d, 0x39, 0x3d, 0x88, 0x56, 0xc8, 0x3c, 0xb8, 0x91, 0x93, 0x3a, 0x64, 0x69,
+ 0x8b, 0x3d, 0x4b, 0x48, 0x43, 0xbd, 0xb8, 0x91, 0xa7, 0xbd, 0x92, 0x96, 0xe5,
+ 0x3d, 0x4c, 0x62, 0xd6, 0x3d, 0xa6, 0x7a, 0x88, 0xbd, 0x6c, 0xdb, 0xc6, 0x3d,
+ 0x1c, 0x4d, 0xab, 0x3d, 0xe0, 0x1d, 0x57, 0x3c, 0x2a, 0xa3, 0x0c, 0x3d, 0xac,
+ 0xff, 0xe8, 0xbb, 0x12, 0x86, 0x89, 0xbd, 0xc6, 0x68, 0xd3, 0xbd, 0xe7, 0xb0,
+ 0xa6, 0xbc, 0x3c, 0xd2, 0xfa, 0xbb, 0xf2, 0xd6, 0xda, 0xbd, 0x80, 0x95, 0xc5,
+ 0xbd, 0x0a, 0x19, 0x93, 0xbd, 0x94, 0xc1, 0xe4, 0xbd, 0xdd, 0x20, 0x18, 0x3e,
+ 0xb3, 0x48, 0xba, 0xbd, 0xdd, 0x6b, 0x86, 0xbd, 0x3d, 0xbc, 0xb1, 0xbd, 0xbe,
+ 0xc1, 0x7f, 0xbc, 0xfc, 0x54, 0x83, 0x3d, 0xb5, 0x4e, 0x1e, 0xbd, 0x5f, 0x54,
+ 0xc3, 0x3c, 0xe4, 0x2e, 0x0a, 0x3e, 0xc9, 0x05, 0x05, 0x3d, 0xc7, 0x8d, 0x2c,
+ 0xbc, 0x37, 0x21, 0xc2, 0xbc, 0xea, 0x7e, 0x96, 0x3d, 0x64, 0x7a, 0xca, 0x3d,
+ 0xcb, 0xcf, 0xc8, 0x3b, 0x5a, 0xd4, 0x00, 0xbe, 0x5f, 0x49, 0xd0, 0x3d, 0xbe,
+ 0x56, 0x15, 0x3e, 0x3f, 0x1d, 0x9e, 0xbd, 0xd4, 0x91, 0xa9, 0x3d, 0xf1, 0xea,
+ 0x4b, 0xbb, 0x78, 0x4a, 0xa5, 0x3c, 0xc2, 0x9b, 0xac, 0xbd, 0x8c, 0xd3, 0x94,
+ 0xbd, 0xb1, 0x52, 0x94, 0xbd, 0x55, 0xdd, 0x0d, 0xbe, 0x93, 0x2e, 0xa1, 0x3d,
+ 0x31, 0x1e, 0xe0, 0x3c, 0xaf, 0xba, 0x6c, 0x3d, 0x8e, 0xec, 0x8f, 0xbd, 0x38,
+ 0x79, 0xd2, 0xbc, 0x21, 0x7e, 0x9d, 0x3d, 0xbb, 0x21, 0xeb, 0x3d, 0x6e, 0x68,
+ 0xec, 0x3d, 0xc2, 0xf4, 0xb6, 0xbd, 0x80, 0xe2, 0x91, 0xbc, 0x45, 0xa5, 0x8f,
+ 0xbb, 0xf8, 0xb2, 0xc7, 0xbd, 0xe4, 0x47, 0x3a, 0xbd, 0xa2, 0x4f, 0xe9, 0xbd,
+ 0xcc, 0x37, 0x53, 0x3c, 0x51, 0x03, 0x4f, 0x3d, 0x35, 0xa2, 0xfa, 0x3d, 0xea,
+ 0x64, 0x7b, 0xbc, 0xbf, 0x49, 0xfb, 0x3d, 0x3d, 0x8e, 0x7b, 0x3b, 0x9c, 0x4b,
+ 0x35, 0xbd, 0x62, 0xf1, 0x10, 0xbe, 0xac, 0xd2, 0xd8, 0xbd, 0x80, 0x00, 0x9d,
+ 0x3d, 0xcc, 0x19, 0xaf, 0xbc, 0x97, 0x73, 0xdb, 0x3d, 0x6d, 0xb6, 0xf3, 0x3d,
+ 0x19, 0xe7, 0x7a, 0xbd, 0xcf, 0xba, 0xc6, 0x3c, 0x77, 0xfc, 0x23, 0x3d, 0xd6,
+ 0xfe, 0x3f, 0x3d, 0x73, 0xf2, 0xdb, 0xbd, 0x3d, 0x21, 0x95, 0xbb, 0x58, 0xb8,
+ 0x86, 0xbd, 0x01, 0x3c, 0x6f, 0x3d, 0xaf, 0x2e, 0x3e, 0xbd, 0x7b, 0x6d, 0x73,
+ 0xbd, 0x33, 0xe2, 0x5f, 0xbc, 0x64, 0x5f, 0xdb, 0xbd, 0x31, 0xf5, 0xb6, 0xbd,
+ 0xfc, 0x90, 0xd4, 0xbd, 0x25, 0xd8, 0xc4, 0xbd, 0x38, 0xdf, 0xb9, 0x3d, 0x89,
+ 0x14, 0x8b, 0x3d, 0x8d, 0x05, 0x2c, 0xbd, 0x20, 0xb8, 0xa3, 0xbc, 0xaf, 0x68,
+ 0x12, 0x3d, 0xce, 0x53, 0xb0, 0xbd, 0xca, 0x8a, 0x95, 0x3d, 0x11, 0x84, 0x8a,
+ 0x3d, 0x6d, 0xbd, 0x67, 0xbb, 0xe8, 0xd5, 0x76, 0xbc, 0xac, 0xc8, 0xfb, 0xbd,
+ 0xa9, 0x8b, 0xa4, 0xbb, 0x3e, 0x3a, 0xba, 0x3d, 0xe2, 0xa5, 0x50, 0x3d, 0xf0,
+ 0x4d, 0x81, 0x3b, 0x96, 0x79, 0x31, 0xbd, 0x87, 0xaf, 0xe5, 0x3a, 0x27, 0xb7,
+ 0xa5, 0x3d, 0xd4, 0x71, 0xb5, 0xbd, 0x95, 0x06, 0xd1, 0xbd, 0x82, 0x3d, 0x1c,
+ 0xbc, 0xdc, 0xe4, 0x6e, 0x3d, 0x21, 0xcf, 0x80, 0xbc, 0xbe, 0xc7, 0xb7, 0xbc,
+ 0x21, 0x87, 0x3c, 0x3d, 0x11, 0x3a, 0x67, 0xbd, 0xa5, 0xd3, 0xe8, 0xbd, 0x9a,
+ 0xb7, 0xc2, 0x3d, 0x2e, 0xa7, 0x86, 0xbc, 0xbe, 0x03, 0x26, 0xbc, 0x5e, 0x12,
+ 0x08, 0xbe, 0x1d, 0xd9, 0xf8, 0xbd, 0xf3, 0x79, 0xe4, 0xbd, 0x38, 0xaa, 0x04,
+ 0x3e, 0x98, 0x40, 0xa7, 0x3d, 0xfa, 0xd9, 0xce, 0xbd, 0x08, 0x73, 0x16, 0xb9,
+ 0xd6, 0x47, 0x2c, 0x3d, 0x08, 0xb5, 0x8b, 0xbd, 0x04, 0x66, 0x70, 0x3c, 0x9f,
+ 0xe6, 0xe4, 0xbd, 0x7f, 0xcd, 0xa5, 0x3b, 0x5b, 0x92, 0x8b, 0xbd, 0x29, 0x55,
+ 0x19, 0xbd, 0x79, 0x98, 0x26, 0x3d, 0x32, 0x3d, 0xc3, 0xb9, 0x29, 0x8a, 0x05,
+ 0xbe, 0xe8, 0x61, 0x92, 0x3d, 0x4f, 0x64, 0xa9, 0x3d, 0x00, 0x9a, 0xa0, 0xbd,
+ 0x34, 0xcc, 0xd8, 0x3c, 0xcd, 0x8a, 0xaf, 0x3d, 0x69, 0xc6, 0x5c, 0x3c, 0xe0,
+ 0x76, 0xd3, 0x3d, 0x49, 0x6a, 0x79, 0x3b, 0x33, 0x10, 0xbd, 0x3c, 0xe9, 0x47,
+ 0x2a, 0xbd, 0x7f, 0xb4, 0x3e, 0xbb, 0x80, 0xd2, 0x18, 0xbe, 0xf3, 0x5c, 0x90,
+ 0xbd, 0x0b, 0x88, 0xaf, 0xbd, 0x24, 0x0c, 0x94, 0xbd, 0xfd, 0xa9, 0xa1, 0xbd,
+ 0x40, 0xc9, 0x82, 0xbd, 0x24, 0x56, 0xa0, 0x3c, 0xa0, 0x3e, 0x09, 0x3e, 0x30,
+ 0x93, 0xc7, 0x3d, 0x03, 0xa3, 0x0c, 0x3c, 0x88, 0xdc, 0x96, 0x3d, 0xac, 0x34,
+ 0xc7, 0xbd, 0x64, 0xb0, 0xe5, 0x3d, 0x61, 0x56, 0xc8, 0x3d, 0x08, 0x55, 0x99,
+ 0x3d, 0xb5, 0xa9, 0x56, 0xbd, 0xfb, 0x4f, 0x95, 0xbd, 0xe9, 0xeb, 0x55, 0x3d,
+ 0xbf, 0x4c, 0xdf, 0xbd, 0xbf, 0x4a, 0x12, 0xbb, 0x93, 0x9d, 0x65, 0xbd, 0x26,
+ 0xd0, 0xce, 0x3d, 0x89, 0x19, 0x64, 0xbd, 0x91, 0x3d, 0x3f, 0x3d, 0x23, 0x3a,
+ 0x3b, 0xbd, 0xc8, 0x9d, 0x20, 0xbc, 0xa1, 0x2c, 0xff, 0xbb, 0x8c, 0x39, 0xb2,
+ 0x3b, 0xf3, 0xbe, 0x86, 0x3d, 0xa3, 0xfa, 0xcc, 0xbd, 0x3d, 0x3c, 0x07, 0xbe,
+ 0xd4, 0xb4, 0xa7, 0xbd, 0x94, 0xfc, 0x71, 0x3d, 0x8b, 0xe6, 0x2e, 0x3d, 0x94,
+ 0x30, 0x41, 0xbd, 0xb3, 0x63, 0x18, 0x3d, 0xbf, 0x35, 0x3c, 0xbb, 0x4c, 0xaa,
+ 0xd9, 0xbd, 0x20, 0x83, 0xa1, 0x3d, 0xdb, 0xca, 0x49, 0x3c, 0x1d, 0xbb, 0xac,
+ 0xbb, 0x3c, 0xea, 0x1c, 0xbc, 0x5b, 0xc3, 0xd1, 0x3d, 0x15, 0xd3, 0xc9, 0xbd,
+ 0xb9, 0x30, 0x12, 0xbb, 0xe3, 0x34, 0xde, 0xbd, 0xa0, 0x31, 0xeb, 0xbd, 0xc2,
+ 0x64, 0xe2, 0x3d, 0xb2, 0xfd, 0xf4, 0xbd, 0x45, 0xa5, 0xbe, 0x3c, 0xa1, 0x40,
+ 0x56, 0xbd, 0x52, 0x01, 0xed, 0x3d, 0xd0, 0x6b, 0xfc, 0xbd, 0xef, 0x73, 0xb2,
+ 0xbd, 0x03, 0xa0, 0xcd, 0xbd, 0x24, 0x69, 0xbe, 0x3c, 0x76, 0xcd, 0x9e, 0x3d,
+ 0xbe, 0xcb, 0x3b, 0x3d, 0x55, 0x49, 0x4e, 0xbd, 0x99, 0xe9, 0xd5, 0xbc, 0x9c,
+ 0x73, 0x88, 0x3c, 0x9a, 0x64, 0x75, 0xbd, 0x53, 0x89, 0xb2, 0xbd, 0x73, 0xa4,
+ 0xb9, 0x3d, 0xa8, 0x68, 0xf3, 0xbd, 0x2a, 0xf3, 0x89, 0xbd, 0x8d, 0x63, 0x85,
+ 0x3c, 0xbb, 0x72, 0x63, 0x3d, 0x29, 0x8a, 0xe8, 0xbd, 0x87, 0x03, 0xab, 0x3d,
+ 0xbf, 0x88, 0x44, 0xbd, 0x74, 0x28, 0xae, 0xbd, 0xf7, 0xe8, 0x87, 0xbd, 0x16,
+ 0x46, 0x04, 0xbd, 0x87, 0xf6, 0xcf, 0xbd, 0x8b, 0x67, 0x44, 0xbd, 0xac, 0xd4,
+ 0xa5, 0xbd, 0xed, 0x0b, 0xf2, 0xbd, 0x20, 0x9e, 0xf5, 0xbd, 0xc1, 0xbd, 0x70,
+ 0x3d, 0xae, 0xfe, 0x77, 0x3d, 0x27, 0x07, 0x82, 0xbd, 0xbe, 0x56, 0x19, 0xbd,
+ 0xae, 0x94, 0xc9, 0xbd, 0x7a, 0x52, 0xc6, 0xbd, 0x4e, 0x64, 0x4d, 0x3c, 0xf7,
+ 0xe4, 0x18, 0x3d, 0xef, 0x06, 0xa4, 0xbd, 0x8c, 0xad, 0xa8, 0xbd, 0xab, 0xcc,
+ 0x62, 0xbc, 0x4a, 0x7c, 0x09, 0xba, 0x01, 0x0d, 0x2b, 0xbd, 0x3d, 0x77, 0xb6,
+ 0x3b, 0xd3, 0x48, 0xc8, 0x3d, 0x89, 0xcf, 0x05, 0x3e, 0xdb, 0x48, 0x92, 0x3d,
+ 0x1e, 0xa5, 0xc9, 0x3c, 0xc7, 0xad, 0x74, 0x3d, 0x66, 0x26, 0x4e, 0xbd, 0x8f,
+ 0x4c, 0x85, 0x3d, 0xe2, 0x14, 0xe3, 0x3d, 0xad, 0x90, 0x2b, 0xbd, 0xcd, 0x7c,
+ 0xf4, 0x3d, 0xe6, 0xae, 0x98, 0x3c, 0xa6, 0x86, 0x66, 0x3c, 0x18, 0x11, 0x1f,
+ 0xbc, 0xb8, 0xe5, 0xa3, 0xbc, 0xea, 0xd7, 0x47, 0xbd, 0x39, 0x8a, 0xbb, 0x3d,
+ 0x1c, 0x27, 0x4c, 0xba, 0x50, 0x9a, 0x4b, 0xbd, 0xda, 0x55, 0x5c, 0xbd, 0xa7,
+ 0xd6, 0xb4, 0x3d, 0x40, 0x3f, 0xa0, 0xbd, 0x26, 0xa7, 0xba, 0xbd, 0x4c, 0xc0,
+ 0x5c, 0x3d, 0x5c, 0xe1, 0x96, 0x3d, 0x50, 0xd9, 0x36, 0xbb, 0x8b, 0xf8, 0x7e,
+ 0xbb, 0xb4, 0x9c, 0xf0, 0x3d, 0x88, 0xf4, 0xa8, 0xbd, 0x92, 0x72, 0x0e, 0xbd,
+ 0x18, 0xc1, 0xa0, 0x3c, 0x78, 0x3f, 0xc6, 0xbd, 0xfa, 0xec, 0xe8, 0xbd, 0xa4,
+ 0xbc, 0x3d, 0xbd, 0x47, 0x9d, 0xc6, 0xbc, 0x8e, 0x10, 0x4b, 0x3d, 0x18, 0x89,
+ 0x51, 0xbd, 0x26, 0xd5, 0x9b, 0xbd, 0xb9, 0xbb, 0x0a, 0xbe, 0xa7, 0x0f, 0x8f,
+ 0x3d, 0x62, 0x63, 0x4b, 0xbb, 0xfe, 0x46, 0x56, 0xbd, 0x64, 0xcc, 0xbb, 0x3d,
+ 0x85, 0x17, 0x52, 0x3d, 0x08, 0xa8, 0x0e, 0x3d, 0x75, 0xdc, 0x4c, 0xbd, 0xf9,
+ 0xc3, 0x92, 0x3d, 0xe0, 0x13, 0x84, 0x3d, 0xa1, 0x30, 0xe8, 0xbd, 0x2d, 0x2b,
+ 0xd0, 0xbd, 0x68, 0x62, 0x91, 0xbc, 0x32, 0xd7, 0xd3, 0xbb, 0xac, 0xd6, 0xdb,
+ 0x3d, 0x0d, 0x70, 0xe9, 0xbd, 0xed, 0xea, 0x69, 0x3d, 0xa4, 0xa3, 0x99, 0x3d,
+ 0x60, 0xa0, 0xcd, 0xbd, 0xd8, 0x9b, 0x20, 0x3c, 0x29, 0x39, 0xaf, 0x3d, 0xd3,
+ 0x2d, 0x2e, 0x3d, 0x10, 0xd7, 0x60, 0x3d, 0x2b, 0x82, 0xb1, 0xbd, 0x3d, 0x6b,
+ 0x94, 0xbd, 0x73, 0xa6, 0x24, 0x3d, 0x33, 0x6b, 0xf9, 0xbd, 0x94, 0xe1, 0xac,
+ 0x3d, 0xdf, 0x2c, 0x77, 0x3d, 0x82, 0x66, 0xa0, 0x3c, 0x9d, 0x7c, 0xd1, 0xbd,
+ 0x67, 0x66, 0x39, 0x3d, 0x1b, 0xb4, 0x5e, 0x3d, 0x0a, 0x50, 0x7f, 0x3d, 0x1a,
+ 0x08, 0x6c, 0x3d, 0x6c, 0x55, 0xac, 0xbd, 0x27, 0x4d, 0x04, 0xbc, 0x28, 0x6e,
+ 0x54, 0x3c, 0x8d, 0x2e, 0x95, 0xbd, 0x56, 0x25, 0xd5, 0x3a, 0x8d, 0xf8, 0xde,
+ 0xbd, 0x53, 0xd6, 0xe0, 0x3c, 0x09, 0xfc, 0x3f, 0x3d, 0x95, 0x29, 0xbe, 0xba,
+ 0x9b, 0x98, 0xa6, 0x3d, 0xfd, 0xd1, 0xe1, 0x3d, 0x00, 0x2a, 0x04, 0xbe, 0x06,
+ 0x73, 0x8b, 0xbd, 0x1e, 0x77, 0xcd, 0x3d, 0xf3, 0x47, 0x01, 0xbe, 0x41, 0x8d,
+ 0xd2, 0xbc, 0x98, 0xba, 0x02, 0xbe, 0x14, 0x4e, 0x84, 0xbc, 0x7b, 0xee, 0xc1,
+ 0x3d, 0x5c, 0x1f, 0x5f, 0xbd, 0x66, 0x1e, 0xd4, 0xbd, 0xa7, 0x18, 0x51, 0x3d,
+ 0xaa, 0xbb, 0x7f, 0x3b, 0x9a, 0x15, 0x33, 0x3d, 0xcd, 0x6b, 0x8d, 0x3d, 0x9c,
+ 0x73, 0x6d, 0xbd, 0x76, 0x3e, 0x54, 0x3c, 0x3d, 0x4f, 0xe4, 0x3d, 0x89, 0xaf,
+ 0xf9, 0x3d, 0x0f, 0x5f, 0x8b, 0xbd, 0x5d, 0xcc, 0x9c, 0xbd, 0x8b, 0x08, 0xf1,
+ 0xbd, 0xe3, 0xc3, 0x04, 0xbd, 0x5f, 0x0b, 0xf8, 0x3d, 0x4f, 0xd8, 0xaf, 0x3d,
+ 0x2f, 0xff, 0x3e, 0x3d, 0x07, 0xf0, 0x5f, 0xbb, 0xcd, 0x6b, 0xbd, 0xbd, 0x0a,
+ 0x80, 0xee, 0x3d, 0x58, 0xa2, 0xbd, 0x3c, 0xa6, 0x43, 0xf9, 0xbc, 0x7e, 0x76,
+ 0xbb, 0x3d, 0x0b, 0x75, 0x11, 0xb9, 0x7c, 0x78, 0x46, 0x3d, 0xe9, 0xf0, 0x73,
+ 0x3d, 0x6d, 0x01, 0x50, 0xbc, 0x6f, 0x55, 0x80, 0x3d, 0x88, 0x5d, 0xd4, 0xbc,
+ 0x20, 0x61, 0x94, 0xbd, 0xbd, 0x32, 0xa3, 0x3c, 0x91, 0x29, 0xb3, 0xbd, 0x7a,
+ 0x60, 0x62, 0xbc, 0xd8, 0x67, 0x99, 0xbb, 0xea, 0xd6, 0x4a, 0xbd, 0xb2, 0xb3,
+ 0x14, 0xbd, 0x15, 0x9f, 0xf6, 0x3d, 0xc4, 0x35, 0xbe, 0xbd, 0xc6, 0x0b, 0x63,
+ 0x3d, 0x43, 0x76, 0x43, 0xbd, 0x4f, 0x5e, 0x18, 0xbc, 0x6b, 0xac, 0xb1, 0x3d,
+ 0x4e, 0xca, 0xd8, 0xbd, 0x2f, 0xef, 0xc3, 0x3d, 0x96, 0xc3, 0x48, 0x3c, 0x1c,
+ 0x73, 0x17, 0x3d, 0x56, 0x34, 0xfb, 0x3c, 0x25, 0xa7, 0xb2, 0x3d, 0x29, 0x5e,
+ 0xac, 0x3d, 0xdd, 0x3b, 0x80, 0x3d, 0x5a, 0xec, 0x37, 0x3c, 0xdc, 0xf9, 0x92,
+ 0x3b, 0x66, 0x0b, 0xc6, 0xbd, 0x75, 0x09, 0xfc, 0xbc, 0x55, 0xd9, 0xea, 0xbd,
+ 0x01, 0xed, 0x7a, 0x3c, 0x90, 0x7d, 0x5e, 0xbd, 0xb8, 0x38, 0xc9, 0x3d, 0xb8,
+ 0x23, 0xa6, 0x3d, 0xb8, 0x83, 0x01, 0x3e, 0xe8, 0x22, 0xda, 0x3c, 0x66, 0xf5,
+ 0x92, 0x3d, 0x82, 0xe0, 0x87, 0x3c, 0x6f, 0xa1, 0x6e, 0x3d, 0x27, 0xca, 0xaf,
+ 0x3c, 0x7f, 0x68, 0xd6, 0xbd, 0x38, 0x98, 0x93, 0x3d, 0x4d, 0xdc, 0x5e, 0x3d,
+ 0xc8, 0xb8, 0xb2, 0x3d, 0xab, 0xeb, 0x8a, 0xbb, 0x39, 0x48, 0xbb, 0xbd, 0x17,
+ 0xe6, 0x0f, 0x3d, 0x57, 0x79, 0xea, 0xbc, 0xb2, 0x5e, 0xdb, 0x3d, 0x0c, 0x19,
+ 0xc7, 0xbd, 0xeb, 0x33, 0x2b, 0x3d, 0x4b, 0x15, 0xf6, 0x3d, 0x96, 0x9b, 0xa1,
+ 0xbc, 0x5c, 0xc8, 0x03, 0xbd, 0x88, 0x56, 0x21, 0x3e, 0x85, 0x0c, 0xa5, 0x3c,
+ 0x85, 0xcb, 0xf4, 0xbd, 0x61, 0x03, 0x4d, 0x3c, 0xf1, 0xf4, 0x8c, 0xbd, 0x7b,
+ 0x39, 0x34, 0x3b, 0xf4, 0xa2, 0x47, 0xbc, 0x10, 0x2d, 0xfc, 0xbd, 0xe8, 0xdd,
+ 0xe6, 0x3c, 0xa5, 0x7c, 0x85, 0x3c, 0x3f, 0xcd, 0xeb, 0xbc, 0x42, 0x94, 0xba,
+ 0xbd, 0x50, 0x23, 0xe3, 0xbd, 0x92, 0xf6, 0xa7, 0xbd, 0x5c, 0x36, 0xd0, 0xbd,
+ 0x27, 0x9e, 0x18, 0x3e, 0x33, 0x9a, 0xe8, 0xbc, 0x80, 0x3a, 0x5d, 0x3d, 0xd0,
+ 0xdc, 0x9c, 0xbd, 0xa3, 0x93, 0x51, 0xbd, 0x36, 0xab, 0x7a, 0x3d, 0x74, 0x9c,
+ 0x63, 0x3d, 0x1c, 0x19, 0x9b, 0xbd, 0xa6, 0x10, 0xb4, 0xbd, 0xf4, 0x80, 0xb4,
+ 0xbc, 0xd3, 0x9c, 0xd2, 0xbc, 0x6d, 0x1b, 0x68, 0xbd, 0x31, 0x6a, 0xfd, 0xbd,
+ 0xdc, 0xa4, 0x82, 0xbd, 0xa7, 0xe7, 0x37, 0xbd, 0x5c, 0xd1, 0x07, 0xbd, 0x4e,
+ 0x82, 0x15, 0xbc, 0x31, 0x43, 0x16, 0x3e, 0xe2, 0xf3, 0x1e, 0x3e, 0x62, 0x22,
+ 0x14, 0x3e, 0x27, 0x65, 0x0d, 0x39, 0xaa, 0x9e, 0x8f, 0x3d, 0xdd, 0x59, 0x4c,
+ 0x3c, 0x4a, 0xc5, 0xc5, 0xbd, 0x4a, 0xa5, 0xc7, 0x3b, 0xb9, 0x73, 0xcc, 0x3d,
+ 0x10, 0x62, 0x5c, 0x3c, 0x87, 0xd8, 0xb2, 0xbd, 0x15, 0x50, 0xf8, 0x3d, 0xd7,
+ 0x7f, 0x91, 0xbd, 0xf4, 0x07, 0xfb, 0x3c, 0x93, 0x09, 0xae, 0xbc, 0x54, 0x19,
+ 0x76, 0x3a, 0x42, 0x4f, 0xbe, 0xbc, 0x6a, 0xef, 0xee, 0x3d, 0x98, 0x97, 0xb7,
+ 0x3d, 0x33, 0x07, 0x3c, 0xbd, 0xe0, 0xc2, 0x46, 0x3c, 0x33, 0x5f, 0x80, 0x3c,
+ 0x4d, 0x5e, 0xff, 0xbc, 0x4e, 0x02, 0xe8, 0xbc, 0x1f, 0x5b, 0xcd, 0xbc, 0x2d,
+ 0x41, 0x8a, 0x3d, 0x2d, 0xeb, 0x5e, 0xbd, 0xff, 0x53, 0xb0, 0x3d, 0x7c, 0x37,
+ 0xb0, 0x3c, 0x0b, 0xc9, 0x87, 0xbd, 0x32, 0xd1, 0xe6, 0xbb, 0xc0, 0x2f, 0xcf,
+ 0x3d, 0x42, 0x5e, 0xb5, 0x3d, 0xd4, 0xbf, 0x36, 0xbd, 0x26, 0xd8, 0xf1, 0xbd,
+ 0xf3, 0x8b, 0xc2, 0x3d, 0x1d, 0xd9, 0xe7, 0xbb, 0xab, 0xf9, 0x16, 0x3d, 0x13,
+ 0x82, 0x93, 0x3d, 0x5e, 0xab, 0xbc, 0xbd, 0x57, 0xf5, 0x2f, 0x3c, 0x86, 0x19,
+ 0x96, 0x3c, 0x17, 0xb1, 0x3e, 0x3d, 0xcd, 0xfd, 0x72, 0xbd, 0xae, 0x8d, 0xbf,
+ 0x3c, 0x5e, 0x94, 0x5c, 0x3d, 0x16, 0x67, 0x88, 0x3d, 0xf1, 0xcb, 0x43, 0xbd,
+ 0xc5, 0x5e, 0x6b, 0xbd, 0xa0, 0xc2, 0xdb, 0x3d, 0x94, 0x36, 0x11, 0xbd, 0x26,
+ 0xb6, 0xb2, 0xbd, 0xe6, 0x9d, 0x93, 0xbd, 0x66, 0x04, 0x5e, 0xbd, 0xed, 0xfe,
+ 0xaf, 0xbb, 0xbc, 0x70, 0x50, 0x3d, 0x0a, 0xeb, 0xd0, 0xbd, 0x3d, 0x06, 0xb5,
+ 0x3d, 0xa7, 0x77, 0x31, 0xbd, 0x5f, 0x4b, 0xa6, 0xbd, 0x9b, 0x0f, 0x96, 0xbc,
+ 0x7e, 0x02, 0xd4, 0xbc, 0x39, 0x52, 0xc4, 0xbd, 0xc3, 0x4e, 0x09, 0x3e, 0x5c,
+ 0xc9, 0x48, 0x3d, 0xa4, 0x28, 0x36, 0xbd, 0xe3, 0xa7, 0x31, 0x3b, 0xdd, 0x29,
+ 0xf4, 0x3d, 0x30, 0x52, 0x76, 0x3d, 0x10, 0xa8, 0x27, 0x3c, 0x0c, 0x16, 0x56,
+ 0x3d, 0x84, 0xd6, 0x1a, 0xbd, 0x34, 0xea, 0xaa, 0x3c, 0x8b, 0xaa, 0x50, 0xbc,
+ 0x02, 0x56, 0xc2, 0x3c, 0xee, 0x61, 0xe8, 0xbd, 0xf2, 0xaa, 0xb0, 0x3d, 0x22,
+ 0xd5, 0x23, 0x3e, 0x2d, 0x7d, 0x62, 0xbd, 0x8a, 0x95, 0x6d, 0xbc, 0x6a, 0xaf,
+ 0xb4, 0xbb, 0x34, 0x65, 0xad, 0x3d, 0x14, 0xff, 0xda, 0xbd, 0x43, 0xdc, 0x04,
+ 0xbd, 0x26, 0xed, 0xa8, 0xbd, 0x97, 0xc7, 0xc3, 0x3d, 0x76, 0x2d, 0xd3, 0xbc,
+ 0xe1, 0xc3, 0xbd, 0xbd, 0x75, 0x52, 0xca, 0x3c, 0x84, 0xfa, 0x13, 0x3c, 0x2e,
+ 0xea, 0x00, 0xbd, 0xb9, 0xbc, 0xcf, 0x3d, 0xcb, 0x67, 0x65, 0xbd, 0xda, 0x95,
+ 0xac, 0xbd, 0x51, 0x71, 0xed, 0x3c, 0xaf, 0xe1, 0x2c, 0xbd, 0xbf, 0x09, 0x2c,
+ 0xba, 0xd1, 0xdc, 0xab, 0xbd, 0x60, 0xab, 0x71, 0xbc, 0x10, 0xa2, 0x2b, 0xbd,
+ 0xb7, 0xba, 0x8f, 0xbd, 0x5e, 0x4b, 0x18, 0x3d, 0x4f, 0x72, 0xa6, 0xbc, 0xbb,
+ 0x54, 0xc5, 0x3d, 0x2a, 0x54, 0xeb, 0xbd, 0x5b, 0x2e, 0x67, 0xbd, 0xc0, 0xd2,
+ 0x61, 0x3b, 0x30, 0x8d, 0x34, 0x3d, 0xaa, 0x2e, 0xfe, 0xbc, 0x37, 0xa2, 0x7b,
+ 0xbd, 0xb0, 0x0d, 0x7c, 0xbd, 0x05, 0x3f, 0x39, 0x3d, 0x52, 0xfc, 0xb2, 0x3d,
+ 0xe8, 0x4a, 0xe6, 0xbd, 0x49, 0x3f, 0xd0, 0x3c, 0x1d, 0x43, 0x1a, 0xbd, 0x52,
+ 0xcc, 0xc7, 0x3d, 0x6a, 0x3f, 0x72, 0x3b, 0x47, 0x6e, 0xdb, 0xbd, 0x6b, 0x97,
+ 0xc2, 0xbd, 0xa0, 0x78, 0xe5, 0xbc, 0x01, 0xb0, 0xd8, 0xbc, 0xd0, 0x9f, 0x9f,
+ 0xbc, 0x51, 0x99, 0x79, 0x3d, 0xf1, 0xd4, 0x1d, 0x3b, 0xe6, 0x19, 0x78, 0x3c,
+ 0xb0, 0x8a, 0x8e, 0xbd, 0x90, 0xfc, 0xc9, 0x3d, 0x91, 0xe7, 0x85, 0x3d, 0xdd,
+ 0xe2, 0x09, 0x3d, 0xb6, 0xf7, 0x5a, 0xbd, 0x26, 0xe8, 0xdc, 0xbd, 0x42, 0xca,
+ 0x18, 0xbd, 0x2a, 0x1d, 0xb4, 0xbd, 0x83, 0x0b, 0xf1, 0x3a, 0xbd, 0x7b, 0x15,
+ 0x3c, 0xf1, 0x7b, 0xa6, 0xbd, 0x55, 0xe4, 0x4d, 0xbd, 0xed, 0x07, 0xf8, 0xbc,
+ 0xf3, 0x73, 0xa0, 0x3d, 0x75, 0x8a, 0xc5, 0xbd, 0x44, 0x2f, 0x7f, 0x3d, 0x35,
+ 0x6c, 0x87, 0x3c, 0x61, 0x2c, 0x4b, 0xbc, 0x67, 0xde, 0x7d, 0xbd, 0x17, 0xaf,
+ 0xe9, 0x3c, 0xaa, 0xd5, 0x0c, 0x3d, 0x98, 0xf5, 0xd8, 0xbc, 0x86, 0xa5, 0x2c,
+ 0xbb, 0xad, 0x8e, 0x43, 0x3d, 0xd2, 0x59, 0xbd, 0xbd, 0x94, 0xc9, 0x69, 0xbd,
+ 0x15, 0xa0, 0x81, 0x3d, 0x18, 0x49, 0x1e, 0x3d, 0xe7, 0xd7, 0xb5, 0xbd, 0x1f,
+ 0x20, 0x10, 0xbd, 0xb0, 0x8b, 0xe0, 0xbd, 0xe0, 0x7c, 0x46, 0x3d, 0x1f, 0xc6,
+ 0x5c, 0xbd, 0xbc, 0xc1, 0x1b, 0x3d, 0xc1, 0x1c, 0xc5, 0xbd, 0xf3, 0x52, 0x48,
+ 0xbb, 0x39, 0x79, 0x86, 0x3d, 0x72, 0xbd, 0x36, 0x3c, 0xa5, 0xd7, 0x95, 0xbd,
+ 0x73, 0xe0, 0x13, 0x3c, 0xe4, 0x9a, 0x50, 0xbd, 0x90, 0x58, 0x93, 0xbd, 0x3d,
+ 0x9e, 0xac, 0x3d, 0x57, 0x08, 0xbb, 0x3d, 0x4e, 0xaf, 0x84, 0xbd, 0xdc, 0x16,
+ 0xbc, 0xbd, 0x51, 0x1a, 0xbf, 0x3d, 0x62, 0x61, 0x97, 0x3d, 0x7a, 0xeb, 0x45,
+ 0x3d, 0xa1, 0x27, 0xe7, 0x3d, 0x20, 0xcb, 0x45, 0xbd, 0xc3, 0x36, 0xda, 0x3d,
+ 0xa2, 0x88, 0x48, 0x3d, 0x7c, 0x0d, 0x0d, 0x3b, 0x00, 0xa8, 0xaf, 0xbd, 0xda,
+ 0x09, 0x51, 0xbd, 0xbd, 0xb3, 0x99, 0xbc, 0x6e, 0x40, 0x6a, 0xbd, 0x31, 0xdb,
+ 0x71, 0x3c, 0x14, 0x0e, 0x0b, 0xbd, 0xe8, 0x4f, 0xae, 0xbd, 0xbb, 0xf3, 0xd4,
+ 0x3d, 0xad, 0xdb, 0x8d, 0x3c, 0x72, 0x12, 0x66, 0xbd, 0x1f, 0xea, 0x98, 0xbd,
+ 0xf7, 0xd0, 0x68, 0x3d, 0x47, 0x27, 0x13, 0x3d, 0xe9, 0x9d, 0xa2, 0xbd, 0x01,
+ 0x07, 0xa9, 0x3d, 0x81, 0xa9, 0xa2, 0x3c, 0x54, 0x75, 0xb5, 0xbc, 0xbc, 0x9f,
+ 0x8e, 0x3c, 0xdd, 0x55, 0x8c, 0x3c, 0xf6, 0x8f, 0xdc, 0x3d, 0x63, 0x45, 0xe7,
+ 0x3c, 0xc2, 0x06, 0x48, 0x3c, 0x63, 0x7a, 0xe9, 0xbd, 0xb0, 0x14, 0x3f, 0x3d,
+ 0x1b, 0x99, 0xe4, 0xbd, 0x0d, 0xa5, 0x89, 0x3d, 0x5d, 0x1e, 0xc4, 0xbd, 0x9b,
+ 0x12, 0x8e, 0x3d, 0x47, 0xa7, 0xb6, 0xbc, 0xc7, 0x3f, 0xf3, 0xbd, 0x82, 0x32,
+ 0x8f, 0xbd, 0xed, 0x11, 0xbe, 0x3d, 0xe4, 0x1e, 0xc6, 0xbc, 0x9d, 0x73, 0xee,
+ 0xbd, 0xce, 0x18, 0xe3, 0xbd, 0x3f, 0x2c, 0x90, 0xbd, 0xc6, 0x82, 0xad, 0x3d,
+ 0xa4, 0x9e, 0xf1, 0xbd, 0x6e, 0x4f, 0xe7, 0x3d, 0x63, 0x8b, 0x28, 0xbd, 0x0a,
+ 0x66, 0x80, 0xbd, 0xa0, 0xa5, 0x84, 0xbd, 0xb0, 0xce, 0xbb, 0xbd, 0x72, 0xba,
+ 0xa1, 0xbd, 0x42, 0x55, 0xa6, 0xbd, 0x36, 0x00, 0xce, 0x3d, 0x11, 0x44, 0xbc,
+ 0x3b, 0xb4, 0x63, 0xa9, 0x3d, 0x07, 0x61, 0x9b, 0x3d, 0x50, 0xb7, 0xb3, 0xbd,
+ 0xe1, 0xcc, 0x74, 0xbd, 0xa1, 0x8e, 0x6c, 0x3d, 0xa6, 0x54, 0xb6, 0xbd, 0xce,
+ 0xde, 0xb4, 0x3c, 0x29, 0xd3, 0x31, 0xbc, 0x74, 0x1c, 0x78, 0xbd, 0xa7, 0xa4,
+ 0x25, 0xbb, 0x01, 0xe0, 0x85, 0x3d, 0x67, 0xc7, 0xbd, 0xbc, 0xae, 0xdb, 0x3a,
+ 0xbd, 0xaa, 0x9c, 0xdd, 0xbd, 0x7a, 0x65, 0xaa, 0xbc, 0x11, 0x1d, 0x53, 0xbd,
+ 0xc0, 0xf8, 0x3a, 0xbd, 0x50, 0xd4, 0x84, 0xbc, 0x3b, 0x49, 0x7f, 0xbd, 0x44,
+ 0x79, 0xde, 0x3d, 0xb9, 0x83, 0xfb, 0x3d, 0x12, 0x34, 0x8d, 0xbd, 0x0a, 0x31,
+ 0xf0, 0x3c, 0x16, 0x71, 0x4e, 0xbd, 0xc4, 0x6a, 0x5f, 0x3d, 0x5a, 0xbe, 0x7e,
+ 0x3d, 0xca, 0x56, 0xe7, 0xbc, 0xe7, 0xa1, 0xb8, 0xbd, 0xf7, 0xac, 0x17, 0x3d,
+ 0xf1, 0x7c, 0x83, 0xbd, 0xe4, 0x5f, 0xec, 0xbd, 0x18, 0x92, 0xa9, 0xbb, 0x71,
+ 0x9a, 0x3d, 0xbd, 0xd1, 0x18, 0x20, 0xbd, 0x94, 0xfa, 0xbd, 0x3d, 0x2f, 0x1f,
+ 0x85, 0xbd, 0xc1, 0xc3, 0xa3, 0x3d, 0x36, 0xdb, 0x96, 0x3d, 0xa5, 0xae, 0x4e,
+ 0xbc, 0xaa, 0x11, 0x9c, 0xbd, 0x44, 0xa2, 0x95, 0x3d, 0xe7, 0x39, 0x73, 0x3b,
+ 0x1d, 0x57, 0x86, 0xbd, 0x14, 0x17, 0xa7, 0xbd, 0xaf, 0xc3, 0x09, 0xbd, 0x2f,
+ 0x90, 0x20, 0xbd, 0x08, 0x91, 0x9c, 0x3c, 0x88, 0x0c, 0xd1, 0x3d, 0x56, 0x99,
+ 0x9d, 0xbd, 0xb3, 0x75, 0xb2, 0x3d, 0xa1, 0x04, 0x59, 0xbb, 0x44, 0x0a, 0x6f,
+ 0x3b, 0x5a, 0x42, 0xce, 0xbd, 0x1b, 0x3b, 0x91, 0x3d, 0x14, 0xb8, 0xdf, 0xbd,
+ 0x85, 0x51, 0x8c, 0xbc, 0xa7, 0xd5, 0x5f, 0x3d, 0xe7, 0x88, 0x61, 0xbd, 0x97,
+ 0x11, 0xd9, 0x39, 0x5c, 0x0b, 0x6d, 0xbd, 0xe4, 0xe3, 0xb1, 0xbd, 0xeb, 0xfe,
+ 0xeb, 0xbd, 0xd3, 0x37, 0x66, 0x3c, 0x4b, 0x72, 0x49, 0xbd, 0x12, 0x06, 0xbf,
+ 0x3b, 0x12, 0x40, 0x77, 0x3d, 0x7c, 0x9d, 0x92, 0x3d, 0xb2, 0xcd, 0xad, 0x3d,
+ 0xb2, 0xe3, 0x65, 0x3d, 0x91, 0x55, 0xbd, 0x3c, 0x31, 0x00, 0xc0, 0xbd, 0xc9,
+ 0x3b, 0x46, 0x3d, 0x51, 0xd9, 0xa6, 0x3d, 0xb9, 0xcb, 0xaf, 0xbd, 0xf8, 0x85,
+ 0xd4, 0xbd, 0x47, 0x6f, 0xf2, 0xbd, 0x70, 0xd4, 0x13, 0x3d, 0x2c, 0x38, 0x55,
+ 0x3d, 0x61, 0x11, 0xd7, 0x3d, 0x62, 0x90, 0xed, 0xbc, 0xd0, 0x71, 0x79, 0xbd,
+ 0xc5, 0xc9, 0x87, 0xbd, 0x6d, 0x23, 0x96, 0xbc, 0xc1, 0x06, 0x9b, 0xbd, 0xc8,
+ 0x2d, 0xfc, 0xbc, 0x79, 0x8d, 0xb8, 0xbd, 0xb3, 0x32, 0xca, 0xbc, 0x17, 0x71,
+ 0xd3, 0xbd, 0x51, 0x07, 0xc6, 0xbc, 0x59, 0x04, 0x49, 0x3d, 0x15, 0x14, 0x8a,
+ 0xbd, 0xd0, 0xae, 0xa4, 0xbd, 0x4c, 0x5f, 0xdd, 0x3d, 0xb5, 0x52, 0xbc, 0x3b,
+ 0x4d, 0xca, 0x3f, 0xbd, 0x85, 0x21, 0xb0, 0xbd, 0x9e, 0x8b, 0xc3, 0xbd, 0x51,
+ 0xd9, 0xa8, 0x3d, 0x53, 0x49, 0xd1, 0x3c, 0x35, 0x6f, 0xe3, 0xbd, 0x7f, 0xe2,
+ 0x9e, 0xbd, 0x42, 0xd8, 0x14, 0xbd, 0x00, 0x6f, 0x19, 0x3d, 0xe1, 0x4e, 0x53,
+ 0x3d, 0xda, 0xc8, 0x66, 0xbd, 0xf1, 0x51, 0xea, 0xbd, 0x8a, 0x7f, 0xbb, 0x3d,
+ 0xa6, 0x85, 0x10, 0xbd, 0x4e, 0xcc, 0xd7, 0x3d, 0x8b, 0x94, 0xad, 0xbd, 0xaa,
+ 0x92, 0x92, 0xbc, 0xdb, 0xcd, 0x3a, 0x3d, 0x43, 0x71, 0x99, 0x3d, 0xa0, 0xeb,
+ 0xe1, 0x3d, 0xbe, 0x5e, 0xe3, 0x3c, 0x43, 0x28, 0x98, 0xbd, 0x04, 0x2b, 0x96,
+ 0xbd, 0xc6, 0x1a, 0x21, 0xbb, 0xce, 0xba, 0xd3, 0xbd, 0x57, 0xee, 0x04, 0x3d,
+ 0x87, 0xf6, 0x8a, 0xbb, 0xda, 0x72, 0x99, 0x3d, 0xcb, 0x2f, 0x8a, 0x3d, 0x1f,
+ 0x20, 0xb5, 0xbd, 0xbe, 0x1f, 0x1e, 0xbd, 0x17, 0x5e, 0x84, 0xbd, 0xfd, 0xce,
+ 0xb2, 0xbd, 0xfc, 0xcc, 0x74, 0x3d, 0x66, 0x53, 0xca, 0x3c, 0x35, 0x5e, 0x9e,
+ 0x3d, 0x6c, 0x9b, 0xb4, 0x3d, 0x08, 0xbd, 0x90, 0x3d, 0x45, 0xc0, 0xc1, 0xbd,
+ 0x83, 0x2c, 0xd3, 0xbc, 0x85, 0xa9, 0x81, 0xbc, 0xa4, 0x47, 0xbc, 0x3d, 0xc2,
+ 0xc6, 0x91, 0xbb, 0x45, 0xf7, 0x51, 0x3d, 0x7c, 0x74, 0x32, 0x3d, 0x64, 0x6d,
+ 0x67, 0xbd, 0xaf, 0x34, 0x37, 0x3d, 0xea, 0xb0, 0x95, 0xbd, 0xe6, 0x42, 0x22,
+ 0x3d, 0xe4, 0x2b, 0xf9, 0xbd, 0x27, 0x85, 0x8c, 0xbc, 0x57, 0x16, 0xd4, 0x3d,
+ 0x0d, 0x41, 0xb9, 0xbc, 0xde, 0xf7, 0xb3, 0xbc, 0xb1, 0x86, 0x5a, 0x3d, 0x16,
+ 0x06, 0x99, 0x3d, 0x36, 0x5c, 0xf2, 0x3d, 0x96, 0x49, 0xfc, 0xbd, 0xd0, 0xda,
+ 0x0b, 0xbd, 0x74, 0x35, 0xfd, 0x3d, 0x3c, 0x9d, 0x12, 0xbd, 0x88, 0xae, 0xc0,
+ 0xbd, 0xd6, 0xe7, 0x5e, 0x3d, 0x31, 0x3f, 0xba, 0xbd, 0x0a, 0x05, 0xb9, 0xbd,
+ 0x8d, 0xe3, 0x35, 0xbd, 0x83, 0xd0, 0x26, 0xbd, 0x04, 0xba, 0x97, 0xbc, 0x46,
+ 0x99, 0xbf, 0xbd, 0xa1, 0x44, 0x75, 0x3b, 0xb8, 0x9b, 0x07, 0x3e, 0x32, 0xe6,
+ 0xd5, 0xbd, 0xc0, 0x9f, 0xf3, 0x3d, 0x7f, 0x4f, 0x36, 0xbc, 0x42, 0xda, 0xe3,
+ 0x3d, 0x3b, 0xb2, 0x5c, 0x3c, 0x97, 0x30, 0xd7, 0x3d, 0x51, 0xe8, 0xea, 0xbc,
+ 0x6e, 0x73, 0x4d, 0x3d, 0x2f, 0x77, 0xb5, 0x3b, 0x0b, 0x79, 0xc1, 0x3c, 0x2f,
+ 0xd9, 0x8c, 0xbd, 0x0e, 0x78, 0xbf, 0xbd, 0x3c, 0xec, 0x84, 0x3d, 0x59, 0xa9,
+ 0xaa, 0xbd, 0x35, 0xdc, 0xe4, 0xbd, 0x91, 0xcf, 0x2e, 0x3d, 0x3c, 0x17, 0x0d,
+ 0xbc, 0x10, 0xd0, 0xf9, 0x3d, 0xab, 0xca, 0xf9, 0xbd, 0x4b, 0xd7, 0x9b, 0x3d,
+ 0xd0, 0x10, 0xc9, 0xbd, 0x11, 0x82, 0x05, 0x3e, 0xd0, 0x14, 0x21, 0xbd, 0x6d,
+ 0x61, 0x99, 0xbd, 0xae, 0x85, 0x7a, 0xbd, 0x67, 0xc0, 0x86, 0xbb, 0x1e, 0xd0,
+ 0xbf, 0x3d, 0x92, 0x46, 0xf8, 0xbc, 0x0d, 0xad, 0xa1, 0x3c, 0xea, 0x8d, 0xd0,
+ 0x3c, 0x61, 0x10, 0x49, 0x3c, 0x8a, 0x7e, 0xe9, 0xbc, 0x31, 0x95, 0xdf, 0xb9,
+ 0xb5, 0x03, 0x0d, 0x3d, 0x0b, 0xf5, 0xd9, 0xbb, 0xba, 0x95, 0x8f, 0xbd, 0x7c,
+ 0x81, 0xde, 0xbd, 0xfc, 0x64, 0xcb, 0x3d, 0x0e, 0x80, 0x2c, 0x3d, 0x64, 0xa8,
+ 0x0b, 0x3d, 0x58, 0xd7, 0xcc, 0xbc, 0x06, 0x10, 0x81, 0x3d, 0xd6, 0x24, 0x2f,
+ 0xbe, 0x2f, 0x77, 0x4e, 0xbd, 0x53, 0x72, 0x1a, 0xbd, 0xc1, 0x05, 0x6e, 0x3d,
+ 0x0b, 0x99, 0x8e, 0xbd, 0x30, 0x10, 0x04, 0xbd, 0xc3, 0x1c, 0x00, 0xbd, 0xf1,
+ 0x16, 0xba, 0xbd, 0x00, 0x43, 0x03, 0xbc, 0xb8, 0x2d, 0xf4, 0x3c, 0x18, 0x18,
+ 0x4d, 0x3d, 0x70, 0x7c, 0x99, 0xb9, 0x49, 0xef, 0xd2, 0xbc, 0x8a, 0xa4, 0x11,
+ 0x3d, 0xe4, 0x8b, 0x5b, 0xbc, 0x16, 0xc1, 0x8c, 0xb9, 0x71, 0xa4, 0x37, 0x3d,
+ 0xb2, 0xa4, 0xb0, 0x3c, 0x79, 0x6c, 0x8a, 0x3d, 0xb6, 0x86, 0x96, 0x3c, 0x06,
+ 0xd1, 0x58, 0xbd, 0xae, 0x40, 0x92, 0xbc, 0x4c, 0x63, 0xa7, 0x3d, 0xac, 0x67,
+ 0xb4, 0xbd, 0x5b, 0xda, 0x17, 0xbd, 0xeb, 0xfc, 0x09, 0x3d, 0x44, 0x95, 0x68,
+ 0x3c, 0x03, 0xee, 0xd7, 0x3d, 0x57, 0x9f, 0xc2, 0x3d, 0x9c, 0xa6, 0xe7, 0x3b,
+ 0xff, 0x8e, 0xcd, 0xbc, 0x22, 0x41, 0xf7, 0x3c, 0x19, 0xe0, 0x1d, 0xbd, 0xae,
+ 0xcc, 0xe2, 0x3b, 0x70, 0xb1, 0x9f, 0x3d, 0xd8, 0x1d, 0xb7, 0x3d, 0xa1, 0xde,
+ 0x4d, 0x3c, 0x12, 0xb6, 0x08, 0x3e, 0x1d, 0x9c, 0xbf, 0x3d, 0xd8, 0x48, 0x4a,
+ 0xbb, 0x07, 0xd1, 0x5e, 0xbd, 0xd3, 0x82, 0xb1, 0x3d, 0x82, 0xef, 0x8d, 0x3d,
+ 0x40, 0x79, 0xe5, 0xbc, 0x3f, 0x85, 0x8b, 0x3d, 0x6a, 0xa3, 0xa7, 0xbd, 0xed,
+ 0xd4, 0xaf, 0xbd, 0x15, 0xf2, 0x96, 0xbd, 0x16, 0x8b, 0xf2, 0xbc, 0xdc, 0x5f,
+ 0xc8, 0xbd, 0xef, 0x46, 0xb3, 0xbd, 0x41, 0x7a, 0x8c, 0xbd, 0x24, 0xfe, 0x62,
+ 0xbd, 0xdf, 0xab, 0x89, 0xbb, 0xa9, 0x9c, 0xd6, 0x3d, 0xf5, 0xc0, 0x2c, 0x3d,
+ 0x20, 0x81, 0xef, 0x3d, 0x1d, 0x1f, 0xd8, 0x3d, 0xe3, 0xea, 0xb7, 0xbc, 0xe5,
+ 0x98, 0xb7, 0x3d, 0x97, 0x67, 0x48, 0x3d, 0x42, 0x5e, 0x10, 0xbe, 0x52, 0xdd,
+ 0xb2, 0xbd, 0x79, 0x0f, 0x60, 0x3d, 0x7e, 0xc5, 0x1c, 0x3d, 0x9b, 0x47, 0x8a,
+ 0xbd, 0xfe, 0x5a, 0x90, 0xba, 0xb3, 0x60, 0x7e, 0xbd, 0x59, 0x16, 0x7e, 0xbd,
+ 0xb6, 0xb7, 0x01, 0x3d, 0x0d, 0x3c, 0xed, 0xbc, 0x0d, 0x44, 0x3c, 0xbb, 0x77,
+ 0x3f, 0xf6, 0xbc, 0x74, 0x91, 0xb9, 0x3d, 0x15, 0xa6, 0x38, 0xbd, 0x6f, 0xa1,
+ 0x39, 0x3d, 0xc8, 0x2e, 0xd8, 0x3d, 0x70, 0xf9, 0x7c, 0xbc, 0x17, 0x9c, 0xa5,
+ 0x3a, 0xfd, 0x15, 0x0a, 0x3d, 0x55, 0x8c, 0xa7, 0x3d, 0xff, 0x06, 0x22, 0xbd,
+ 0x2d, 0x31, 0x15, 0xbe, 0x70, 0x92, 0x92, 0xbd, 0x29, 0x8a, 0x0d, 0x3b, 0x6b,
+ 0xca, 0x3d, 0xbd, 0xf2, 0xe1, 0x28, 0xbc, 0x36, 0x7a, 0x44, 0xbc, 0xea, 0x62,
+ 0xd9, 0x3a, 0xd2, 0xdd, 0x9e, 0xbc, 0xda, 0xce, 0x16, 0xbe, 0x79, 0x5e, 0x97,
+ 0x3b, 0x26, 0x34, 0x38, 0xbd, 0x77, 0x5d, 0x97, 0x3c, 0xc6, 0xcb, 0x84, 0xbd,
+ 0xed, 0xa4, 0xda, 0x3d, 0xd2, 0x4f, 0x6d, 0xbc, 0x35, 0x16, 0xdc, 0xbd, 0xea,
+ 0xfb, 0x08, 0xbe, 0x84, 0xea, 0x1e, 0xbd, 0x0e, 0x3a, 0x60, 0xb8, 0x4f, 0x4b,
+ 0x0a, 0xbe, 0xfe, 0x33, 0x87, 0x3d, 0x63, 0x5e, 0x8d, 0x3d, 0x68, 0x29, 0x17,
+ 0x3e, 0xa5, 0x25, 0x8f, 0xbc, 0x0a, 0x09, 0x78, 0xbd, 0x43, 0x98, 0x6d, 0xbd,
+ 0x98, 0xa8, 0xa0, 0xbd, 0x7c, 0xa3, 0x13, 0x3d, 0xd4, 0xb8, 0x6d, 0xbc, 0x20,
+ 0x1f, 0xc5, 0xbc, 0x06, 0xb5, 0x16, 0x3e, 0xcd, 0x4d, 0x90, 0xbd, 0xb8, 0xcc,
+ 0xd4, 0x3d, 0xbd, 0xe9, 0xd1, 0xbd, 0x90, 0x68, 0xcf, 0x3d, 0xa7, 0xc6, 0x08,
+ 0xbe, 0x1c, 0xe5, 0x5c, 0xbd, 0x6e, 0x56, 0xa6, 0x3d, 0x74, 0x4f, 0xa5, 0x3d,
+ 0x96, 0x2b, 0x5a, 0x3d, 0xbe, 0xc6, 0x9b, 0xbd, 0x94, 0x33, 0x18, 0x3d, 0x57,
+ 0x1a, 0x6b, 0xbd, 0xd7, 0x3d, 0x03, 0xbe, 0x6a, 0x36, 0x65, 0xbd, 0x13, 0x36,
+ 0xbf, 0x3d, 0x82, 0x9a, 0x0a, 0x3d, 0x3c, 0x1d, 0xca, 0xbd, 0x0c, 0x40, 0x0e,
+ 0xbe, 0x3f, 0x94, 0xae, 0xbd, 0x1f, 0x7e, 0x89, 0x3d, 0xe3, 0xbf, 0x30, 0xbe,
+ 0x7a, 0x48, 0x23, 0x3a, 0xe5, 0x0e, 0x5d, 0x3d, 0x91, 0xd3, 0xf2, 0x3d, 0xb6,
+ 0xef, 0x4a, 0xbd, 0xd4, 0xb3, 0x08, 0xbe, 0xa9, 0xba, 0xac, 0x3d, 0x31, 0x40,
+ 0x86, 0x3d, 0xc2, 0xc7, 0x04, 0xbe, 0x7c, 0x3b, 0xdb, 0x3d, 0x11, 0x25, 0x04,
+ 0xbd, 0x3f, 0x5d, 0xf3, 0xbc, 0xc2, 0x3f, 0xfb, 0x3c, 0x12, 0xac, 0xf4, 0xbd,
+ 0xa7, 0xc4, 0x32, 0x3c, 0xc9, 0xea, 0xe3, 0x3c, 0x7d, 0xda, 0x36, 0x3c, 0x43,
+ 0x55, 0x09, 0x3e, 0x5f, 0xd8, 0x22, 0xbd, 0x33, 0xf5, 0x29, 0x3e, 0xb8, 0x23,
+ 0x8a, 0xbc, 0xfb, 0x3f, 0x52, 0xbe, 0xec, 0x1c, 0x79, 0x3d, 0x09, 0x9e, 0x24,
+ 0xbd, 0x5b, 0x3c, 0xd3, 0xbd, 0x9f, 0x0b, 0x1f, 0x3e, 0x1f, 0xa2, 0xfc, 0xbd,
+ 0x3b, 0x42, 0x9b, 0x3b, 0x0a, 0xae, 0xc4, 0xbc, 0x8b, 0xc8, 0xa7, 0x3d, 0x88,
+ 0xaa, 0x9b, 0xbd, 0xaa, 0x37, 0xb6, 0x3d, 0x0d, 0x6a, 0x15, 0x3d, 0x47, 0xa8,
+ 0x87, 0x3d, 0x53, 0xb1, 0xe3, 0x3d, 0xf7, 0x63, 0x0e, 0x3c, 0x37, 0x70, 0x8e,
+ 0xbc, 0xc5, 0x5c, 0x32, 0xbe, 0x72, 0x7a, 0xd5, 0x3d, 0xcb, 0xac, 0xc7, 0xbd,
+ 0x6f, 0xf1, 0x3a, 0xbd, 0x74, 0x40, 0x99, 0x3d, 0x35, 0x16, 0x88, 0xbc, 0xb4,
+ 0x80, 0x14, 0x3e, 0x0b, 0x98, 0xd9, 0x3c, 0xa7, 0x98, 0x17, 0xbc, 0x6e, 0xd0,
+ 0x60, 0xbb, 0xd9, 0xc2, 0x8f, 0x3d, 0xea, 0x37, 0xe1, 0xbd, 0x00, 0x42, 0xfd,
+ 0x3d, 0xde, 0xb0, 0x3a, 0x3d, 0x4f, 0xe2, 0x50, 0x3c, 0x76, 0x9f, 0x42, 0xbd,
+ 0x73, 0x18, 0x4e, 0xbe, 0x9b, 0xfd, 0x69, 0xbd, 0x69, 0xb2, 0x88, 0xbc, 0x6a,
+ 0x13, 0x3e, 0xbd, 0x29, 0xf0, 0x0c, 0x3c, 0x1f, 0x81, 0x18, 0x3d, 0x03, 0x2e,
+ 0x0c, 0x3e, 0xff, 0xf1, 0x4a, 0xbc, 0xb7, 0x9c, 0x14, 0xbe, 0xd5, 0x52, 0xce,
+ 0xbd, 0xf6, 0x45, 0xf0, 0x3d, 0x8d, 0xc8, 0x55, 0xbd, 0x8f, 0xf0, 0x88, 0x3d,
+ 0x8c, 0x8f, 0x20, 0xbd, 0x38, 0x7c, 0x4d, 0x3e, 0x6d, 0xba, 0x95, 0xbd, 0xdc,
+ 0x7b, 0x0d, 0xbe, 0x3d, 0xbf, 0x2d, 0x3c, 0xee, 0xf6, 0xcb, 0x3c, 0x42, 0x85,
+ 0x2e, 0x3d, 0x43, 0x4c, 0xb3, 0x3d, 0xe6, 0x70, 0x91, 0xbd, 0x58, 0x98, 0xfd,
+ 0x3d, 0x70, 0x75, 0x52, 0xbd, 0xb7, 0x44, 0x34, 0xbe, 0x62, 0x65, 0xdc, 0xbd,
+ 0xb8, 0xc7, 0x83, 0x3c, 0x0d, 0x0a, 0xaa, 0xbd, 0x09, 0xcb, 0x92, 0x3c, 0xbd,
+ 0x5d, 0xc7, 0xb9, 0x3a, 0x4e, 0xa6, 0xbd, 0xd8, 0xfb, 0xa6, 0xbd, 0xcd, 0xfc,
+ 0x72, 0xbe, 0x12, 0xdc, 0x4d, 0xbd, 0x0a, 0x7c, 0x5d, 0x3d, 0x8c, 0xce, 0x7a,
+ 0x3d, 0xe8, 0x3d, 0x83, 0xbd, 0x0d, 0x6c, 0x9e, 0x3d, 0x14, 0xb3, 0x3c, 0x3d,
+ 0x05, 0x0e, 0xdf, 0x3d, 0xf7, 0x27, 0xb7, 0xbd, 0xa3, 0x18, 0x08, 0x3d, 0x54,
+ 0xdb, 0x6a, 0x3c, 0x93, 0x1a, 0x80, 0xbd, 0xf9, 0x13, 0x05, 0x3e, 0xd9, 0x61,
+ 0x87, 0x3d, 0x08, 0xa5, 0x9b, 0xbd, 0x70, 0x5d, 0xc9, 0xbc, 0x9b, 0x99, 0x94,
+ 0xbd, 0xc5, 0x6e, 0xd4, 0xbd, 0xc8, 0x60, 0xad, 0x3d, 0x29, 0x62, 0x05, 0xbd,
+ 0x83, 0xd8, 0xc1, 0xbd, 0xa2, 0x72, 0xf1, 0x3d, 0x57, 0x3f, 0x2e, 0xbb, 0xb8,
+ 0x1a, 0xcf, 0xbc, 0xc3, 0xda, 0x96, 0xbd, 0xd3, 0xbc, 0x81, 0xbd, 0xca, 0x52,
+ 0xa1, 0xbb, 0xe8, 0xaf, 0x6a, 0x3d, 0x49, 0xaa, 0xf8, 0x3c, 0x5f, 0x2a, 0x9a,
+ 0xbd, 0xcb, 0x12, 0x6b, 0xbd, 0xc9, 0x4a, 0x8f, 0xbc, 0xce, 0x3c, 0xfd, 0x3d,
+ 0x71, 0x17, 0xed, 0x3d, 0x54, 0x40, 0xea, 0xbd, 0xcb, 0x7f, 0x2d, 0xbd, 0x2c,
+ 0x13, 0x86, 0x3d, 0xcd, 0x8c, 0x44, 0xbd, 0xe4, 0x65, 0xa6, 0xbb, 0x06, 0x81,
+ 0x04, 0x3d, 0x64, 0x45, 0x8e, 0x3d, 0xef, 0x80, 0x22, 0xbd, 0x35, 0x90, 0xaa,
+ 0xbd, 0x02, 0xb6, 0x48, 0x3d, 0x76, 0xba, 0x39, 0x3d, 0xf3, 0xce, 0x66, 0xbd,
+ 0x3f, 0x8e, 0xf1, 0xbd, 0x2a, 0x81, 0x0e, 0xbd, 0x82, 0x05, 0x0b, 0x3e, 0x7b,
+ 0xdb, 0x2f, 0x3d, 0x86, 0xe3, 0xba, 0x3d, 0xac, 0x47, 0x17, 0x3e, 0xcb, 0x96,
+ 0x8f, 0x3c, 0x3b, 0x58, 0xe7, 0xbd, 0x38, 0x64, 0x46, 0xbe, 0x9e, 0x73, 0x88,
+ 0xbd, 0x0f, 0xf0, 0x8e, 0xbd, 0xc1, 0x4c, 0x00, 0xbd, 0x70, 0xbb, 0x54, 0xbd,
+ 0x74, 0x55, 0x20, 0x3b, 0x1f, 0x22, 0x8d, 0x3d, 0xc9, 0x1d, 0xce, 0x3c, 0xad,
+ 0x53, 0x3f, 0x3d, 0x7e, 0xd8, 0xb2, 0x3d, 0x9e, 0xc0, 0xf5, 0x3d, 0x79, 0x01,
+ 0x32, 0xbd, 0x49, 0x13, 0x2e, 0x3d, 0xff, 0x7a, 0xce, 0x3d, 0xb5, 0xbc, 0x46,
+ 0x3d, 0x43, 0xa5, 0xc8, 0xbd, 0xf2, 0x4d, 0xd3, 0x3b, 0x78, 0x3e, 0x39, 0x3d,
+ 0x2c, 0x01, 0xc7, 0xbd, 0x5d, 0x5b, 0x8d, 0xbd, 0xb1, 0x3b, 0xa3, 0xbd, 0x1f,
+ 0x70, 0x6e, 0x3c, 0x62, 0x07, 0x58, 0xbd, 0x29, 0xd9, 0xc8, 0xba, 0x13, 0xa6,
+ 0xd3, 0xbd, 0xc1, 0x45, 0xbf, 0xbc, 0x3e, 0x9f, 0xea, 0xbc, 0x7c, 0x4d, 0xcc,
+ 0x3d, 0x6c, 0x0c, 0x2e, 0xbd, 0xcf, 0xa0, 0x9a, 0x3b, 0x83, 0x9e, 0xfa, 0xbd,
+ 0x77, 0x21, 0xaa, 0x3d, 0xcf, 0x18, 0xf5, 0xbd, 0xfe, 0x30, 0x79, 0x3d, 0x24,
+ 0x33, 0x4d, 0x3d, 0xf7, 0x5f, 0x54, 0x3d, 0xda, 0x9d, 0xc9, 0xbd, 0x28, 0x08,
+ 0x16, 0x3d, 0x53, 0x5a, 0xf6, 0xbc, 0xa5, 0x86, 0x84, 0xbd, 0x91, 0x39, 0xc5,
+ 0xbc, 0x54, 0x2b, 0xda, 0xbd, 0x49, 0x34, 0xae, 0xbd, 0x9d, 0xad, 0x3a, 0xbd,
+ 0x43, 0x59, 0xf1, 0x3d, 0x5c, 0xef, 0x06, 0x3e, 0xc7, 0xe0, 0x32, 0x3d, 0x43,
+ 0xb3, 0x87, 0x3d, 0x12, 0x6c, 0x02, 0xbe, 0x9c, 0xdc, 0x02, 0x3e, 0x22, 0xcc,
+ 0x1b, 0xbe, 0x46, 0x37, 0xe8, 0x3d, 0xf0, 0x11, 0x3b, 0xbd, 0x0d, 0x62, 0x51,
+ 0x3d, 0x8b, 0x64, 0x2f, 0x3d, 0x57, 0x97, 0x5e, 0x3d, 0x53, 0xdd, 0xd6, 0x3c,
+ 0x00, 0xf5, 0xfb, 0xbc, 0x6f, 0x83, 0xea, 0x3b, 0xec, 0x88, 0x20, 0xbb, 0xe5,
+ 0x7f, 0xe6, 0x3d, 0xe6, 0xc4, 0xb5, 0x3d, 0x05, 0x76, 0x0f, 0xbe, 0x4a, 0x2f,
+ 0x61, 0xbd, 0xa0, 0x69, 0xe2, 0x3d, 0xab, 0xc9, 0xb4, 0x3d, 0xeb, 0xd7, 0x88,
+ 0xbc, 0x8f, 0x65, 0xfb, 0xbd, 0xc5, 0xca, 0x93, 0xbc, 0x1f, 0xe5, 0xa9, 0x3d,
+ 0x0b, 0x34, 0x06, 0x3e, 0xbd, 0x9e, 0xe1, 0x3d, 0x58, 0x9d, 0xec, 0xbd, 0x60,
+ 0x28, 0xe3, 0xbc, 0x62, 0x2e, 0x85, 0x3d, 0xec, 0x10, 0xb6, 0x3d, 0xd4, 0x0e,
+ 0x55, 0x3d, 0x6a, 0xd9, 0x22, 0xbd, 0xa4, 0x2c, 0xb0, 0xbd, 0x8f, 0x8c, 0x8b,
+ 0x3d, 0x05, 0xa0, 0xbb, 0x3d, 0x7b, 0xf7, 0xc0, 0x3d, 0xca, 0x2f, 0x90, 0xbc,
+ 0x07, 0x79, 0xe3, 0xbd, 0x8b, 0x7d, 0x83, 0xbd, 0xfe, 0x8a, 0x93, 0xbc, 0xc0,
+ 0xe9, 0xd0, 0x3d, 0xfb, 0x88, 0x76, 0xbc, 0x2d, 0x4b, 0x99, 0x3c, 0x69, 0x04,
+ 0xd3, 0x3c, 0xb6, 0xd2, 0x88, 0x3d, 0xeb, 0xe2, 0x71, 0xbd, 0xa8, 0xb5, 0x98,
+ 0x3d, 0x08, 0x79, 0xea, 0xbd, 0x7c, 0x53, 0x03, 0xbd, 0xb1, 0xda, 0xf9, 0xbd,
+ 0xf1, 0x53, 0x83, 0xbc, 0xa0, 0xb3, 0x49, 0xbd, 0x7c, 0x79, 0x07, 0x3c, 0x68,
+ 0x60, 0x21, 0x3c, 0xb1, 0x1f, 0x38, 0x3d, 0x5d, 0x0c, 0x4e, 0x3d, 0x36, 0x83,
+ 0x62, 0x3c, 0x87, 0x96, 0x22, 0xbd, 0xd2, 0x3a, 0x09, 0x3c, 0xa2, 0x6e, 0x7a,
+ 0xbd, 0x54, 0xc7, 0x31, 0xbc, 0x3a, 0x58, 0x1e, 0xbd, 0x51, 0x31, 0x94, 0x3d,
+ 0x28, 0x85, 0xde, 0xbc, 0x52, 0x0e, 0xce, 0xbd, 0x79, 0x6a, 0xfb, 0xbd, 0x0f,
+ 0x76, 0x14, 0xbd, 0xb4, 0xf0, 0xb3, 0x3c, 0x30, 0x4e, 0xab, 0xbd, 0xbc, 0x21,
+ 0x2a, 0x3d, 0xa7, 0x29, 0x93, 0x3d, 0x05, 0x5e, 0x79, 0x3c, 0xc0, 0xdc, 0x93,
+ 0xbd, 0x8c, 0x46, 0xd3, 0x3d, 0x6d, 0xef, 0x21, 0x3d, 0xcd, 0x62, 0xe5, 0x3d,
+ 0xf2, 0x5f, 0xbc, 0xbd, 0xec, 0xb5, 0x6e, 0x3d, 0x8f, 0xdd, 0xd1, 0x3c, 0xb6,
+ 0x13, 0x93, 0xbd, 0x1e, 0x1d, 0x0a, 0x3e, 0xfe, 0x00, 0x0a, 0x3d, 0xfe, 0xea,
+ 0x70, 0x3c, 0x1e, 0x69, 0x94, 0xbd, 0x54, 0x92, 0xdf, 0x3d, 0x8d, 0xc4, 0xe3,
+ 0xbd, 0xa8, 0x26, 0xc1, 0x3d, 0x90, 0x69, 0x97, 0x3d, 0x5f, 0xf7, 0x21, 0x3e,
+ 0xd8, 0xf4, 0x13, 0x3d, 0x8e, 0x0f, 0x2a, 0x3d, 0x1a, 0xf3, 0xe8, 0x3d, 0xb1,
+ 0x70, 0x75, 0xbd, 0x3d, 0x10, 0x87, 0x3d, 0xf2, 0x55, 0x8f, 0xbd, 0x7f, 0x15,
+ 0x07, 0xbe, 0xe0, 0x3c, 0xba, 0x3d, 0x6d, 0x1f, 0xc2, 0xbc, 0xd6, 0xbf, 0x2c,
+ 0xbd, 0x01, 0x4c, 0x87, 0x3c, 0xd8, 0xe5, 0x93, 0x3d, 0x6e, 0x5a, 0x12, 0x3d,
+ 0xff, 0x3a, 0xd1, 0x3d, 0xfa, 0x05, 0x0a, 0x3d, 0x5a, 0xce, 0xa3, 0xbc, 0xc5,
+ 0x2b, 0xd8, 0x3d, 0x98, 0xb3, 0xce, 0xbd, 0x6b, 0x72, 0x90, 0x3d, 0xa7, 0x35,
+ 0xbb, 0xbd, 0xe2, 0xcb, 0xae, 0xbc, 0x8e, 0xe3, 0x74, 0x3d, 0xcd, 0x32, 0xcf,
+ 0xbd, 0x76, 0x8d, 0x1d, 0x3d, 0x27, 0xc5, 0x0c, 0xbe, 0x27, 0x7e, 0x6c, 0xbd,
+ 0x54, 0xf1, 0xdb, 0x3d, 0x39, 0x03, 0xed, 0xbc, 0xd7, 0x4b, 0xe1, 0x3a, 0x19,
+ 0x67, 0x90, 0x3d, 0xf5, 0x03, 0x89, 0x3d, 0x31, 0x9d, 0xd4, 0x3a, 0x06, 0x9d,
+ 0x05, 0x3e, 0xde, 0xaf, 0x63, 0xbd, 0xed, 0xfe, 0x54, 0x3c, 0xdd, 0x40, 0xc5,
+ 0xbd, 0xf5, 0x54, 0x0d, 0xbc, 0x3e, 0xaa, 0xcd, 0x3c, 0x08, 0x18, 0xbf, 0xbd,
+ 0x79, 0x2e, 0x90, 0xbd, 0x15, 0xe3, 0x8a, 0x3d, 0x7b, 0x54, 0x7c, 0xbd, 0x85,
+ 0x07, 0xd0, 0x3d, 0xfb, 0x39, 0x01, 0xbd, 0x12, 0x57, 0xf0, 0xbd, 0x56, 0x7c,
+ 0x8d, 0xbd, 0xae, 0x9e, 0xaf, 0x3c, 0x90, 0xc3, 0x85, 0x3d, 0x9c, 0x00, 0x88,
+ 0x3d, 0x1f, 0x9a, 0x8f, 0xbd, 0x80, 0xef, 0xc4, 0xb9, 0x60, 0xba, 0x5b, 0xbd,
+ 0x05, 0x25, 0xd8, 0x3c, 0x76, 0x60, 0x6d, 0x3d, 0xc5, 0xf0, 0xe1, 0x3c, 0x0d,
+ 0x00, 0xf7, 0x3d, 0x57, 0xb7, 0x24, 0x3d, 0x2c, 0x11, 0x06, 0xbe, 0x48, 0x15,
+ 0x5b, 0xbd, 0x0c, 0x67, 0x22, 0xbd, 0xc9, 0x10, 0x07, 0x3c, 0x69, 0x42, 0xbb,
+ 0xbd, 0x5b, 0x32, 0xb8, 0xbd, 0x62, 0x5e, 0x35, 0xbd, 0xfc, 0xe1, 0x22, 0xbd,
+ 0xff, 0xb3, 0x51, 0xbd, 0x6e, 0x4d, 0x2d, 0x3c, 0xfb, 0xca, 0xc5, 0xbd, 0x15,
+ 0x16, 0x32, 0x3d, 0x50, 0xff, 0xbe, 0xbd, 0xf7, 0x84, 0x5e, 0xbb, 0x27, 0xa2,
+ 0x17, 0x3c, 0x83, 0x85, 0xda, 0xbd, 0xd3, 0x8f, 0xd8, 0x3d, 0x19, 0xd4, 0x9d,
+ 0xbd, 0x05, 0x56, 0xbd, 0x3b, 0x80, 0x5c, 0x8d, 0xbd, 0x02, 0x07, 0x01, 0x3e,
+ 0x46, 0x0a, 0xd0, 0x3c, 0x28, 0x0a, 0x74, 0x3d, 0x45, 0xd8, 0x9c, 0x3d, 0x51,
+ 0x8c, 0xe1, 0x3d, 0x94, 0x9d, 0x44, 0xbc, 0x1a, 0xfd, 0x6d, 0x3d, 0x6a, 0xa7,
+ 0x00, 0x3e, 0x03, 0xb0, 0xa5, 0xbd, 0x84, 0xb6, 0x94, 0x3c, 0x6e, 0x1b, 0xd2,
+ 0xbd, 0xff, 0xcf, 0xbd, 0xbd, 0x7f, 0x7c, 0x6c, 0xbd, 0xa0, 0xb0, 0x4a, 0xbd,
+ 0x8c, 0xfc, 0xca, 0xbc, 0xf4, 0xa1, 0x81, 0xbd, 0x22, 0xad, 0xe2, 0x3c, 0xfa,
+ 0x91, 0xaf, 0x3d, 0xf4, 0x2e, 0x19, 0xbd, 0x0b, 0x57, 0x71, 0xbc, 0x21, 0xca,
+ 0x8d, 0x3c, 0xee, 0x8c, 0x2b, 0x3a, 0x46, 0x1a, 0xc1, 0xbb, 0x51, 0xbe, 0x2c,
+ 0xbd, 0xc0, 0x3f, 0x40, 0x3d, 0xb2, 0xbb, 0x96, 0x3d, 0x88, 0x43, 0x23, 0xbe,
+ 0x26, 0xd9, 0xe8, 0xbd, 0xf7, 0xfc, 0x9d, 0xbd, 0x4e, 0xf6, 0xd3, 0xbc, 0x2a,
+ 0xda, 0xba, 0xbd, 0xe1, 0x21, 0xe1, 0x3d, 0x81, 0xea, 0x2e, 0xbd, 0xde, 0xaa,
+ 0xd2, 0xbb, 0xde, 0x20, 0xbe, 0x3d, 0x15, 0x2f, 0x44, 0x3d, 0x37, 0x58, 0x6e,
+ 0xbd, 0xcd, 0x34, 0x4c, 0xbb, 0x8d, 0xad, 0x08, 0xbc, 0xd9, 0xe2, 0x21, 0x3d,
+ 0xfe, 0x8b, 0xab, 0x3d, 0xa2, 0x7f, 0x47, 0xbd, 0xad, 0xbe, 0xe3, 0xbc, 0x5f,
+ 0x5d, 0x20, 0x3d, 0xa7, 0xa7, 0x19, 0xbe, 0x27, 0x1b, 0x8a, 0xbd, 0x2e, 0xcf,
+ 0x4d, 0x3d, 0x68, 0x43, 0xb0, 0x3d, 0x54, 0xe8, 0xec, 0x3b, 0x5f, 0x47, 0x57,
+ 0xbd, 0xde, 0x1b, 0xc4, 0x3d, 0xd2, 0x08, 0xfa, 0xbb, 0x23, 0x97, 0xe5, 0x3d,
+ 0xb3, 0x70, 0x6b, 0x3d, 0x33, 0x68, 0x2a, 0xbc, 0xbb, 0xc7, 0xb5, 0xbd, 0x31,
+ 0xe2, 0xcd, 0xbd, 0xe3, 0x77, 0x44, 0x3d, 0xb1, 0xf5, 0x60, 0x3d, 0x03, 0x24,
+ 0xf7, 0xbd, 0x6c, 0x04, 0xb0, 0x3c, 0xba, 0x53, 0xa9, 0xbd, 0xcb, 0x94, 0x03,
+ 0xbe, 0x19, 0x25, 0xfc, 0xbb, 0x8d, 0xaf, 0xe5, 0x3d, 0x95, 0xec, 0xa3, 0x3d,
+ 0xca, 0x8d, 0xcb, 0xbd, 0x71, 0x02, 0xee, 0x3c, 0x31, 0x55, 0xdf, 0xbd, 0x85,
+ 0xd6, 0x69, 0x3d, 0xa1, 0xd8, 0x1d, 0x3d, 0xd6, 0x60, 0x12, 0xbb, 0x46, 0x47,
+ 0x46, 0x3d, 0x75, 0xf9, 0x97, 0x3d, 0x4c, 0xd5, 0x87, 0x3d, 0xc4, 0x77, 0xb7,
+ 0x3c, 0x0a, 0xd5, 0x08, 0x3d, 0x7f, 0x4d, 0x74, 0xbd, 0xdd, 0x0e, 0x07, 0xbe,
+ 0x0d, 0xb1, 0x51, 0xbb, 0x95, 0xf0, 0xa7, 0x3d, 0x8d, 0xdc, 0xe7, 0xbd, 0x11,
+ 0x22, 0xd1, 0x3d, 0x81, 0xad, 0x8c, 0x3d, 0x51, 0x36, 0x1e, 0x3d, 0xe3, 0x75,
+ 0x01, 0x3e, 0xa1, 0xd1, 0x9a, 0x3d, 0x4f, 0xd4, 0xc4, 0x3d, 0x50, 0x2a, 0x61,
+ 0x3c, 0x9a, 0xd5, 0xbd, 0xbd, 0x37, 0xd1, 0xd5, 0x3c, 0xd5, 0x83, 0x8e, 0x3d,
+ 0xbd, 0x05, 0xb6, 0xbb, 0x52, 0x6b, 0x66, 0x3d, 0x25, 0xcb, 0x0c, 0xbe, 0x3a,
+ 0xff, 0xd3, 0xbd, 0xaf, 0xdc, 0xb3, 0xbd, 0xde, 0xdf, 0x06, 0x3d, 0x91, 0x0f,
+ 0xc8, 0xbd, 0x62, 0xa1, 0x8f, 0xbc, 0x1c, 0x36, 0x40, 0x3c, 0x7d, 0x4f, 0xfa,
+ 0x3d, 0x99, 0x76, 0xd5, 0x3d, 0xc3, 0x21, 0x5c, 0xbb, 0x61, 0x54, 0x52, 0xbc,
+ 0xc4, 0x07, 0x9b, 0xbd, 0xb3, 0x00, 0x44, 0xbc, 0xbe, 0x1b, 0x06, 0xbd, 0x35,
+ 0x4c, 0x5d, 0x3d, 0x6b, 0x45, 0x17, 0xbd, 0x10, 0xd6, 0xe5, 0xbd, 0x40, 0x57,
+ 0x83, 0x3d, 0x62, 0xd1, 0x64, 0xbd, 0x79, 0x90, 0xbd, 0xbc, 0xce, 0xf0, 0x07,
+ 0x3e, 0xc0, 0xbd, 0xaf, 0x3d, 0x88, 0xe1, 0x84, 0xbd, 0xf0, 0xdb, 0x4c, 0x3d,
+ 0x17, 0x35, 0x02, 0x3b, 0x30, 0x1c, 0xed, 0xbd, 0x4f, 0xfc, 0xda, 0x3d, 0x92,
+ 0x80, 0x87, 0xbc, 0x02, 0x74, 0x1a, 0xbe, 0xdc, 0xb1, 0xb3, 0xbd, 0x6c, 0x01,
+ 0xc0, 0xbc, 0x8f, 0x2d, 0x8c, 0x3d, 0xf5, 0x96, 0xc0, 0xbd, 0x77, 0xbc, 0x7f,
+ 0xbd, 0x8a, 0x64, 0xf1, 0x3c, 0xb7, 0x6c, 0xb4, 0xbd, 0x1c, 0x6f, 0x84, 0x3d,
+ 0xa1, 0xd5, 0xc0, 0xbd, 0xbf, 0x63, 0xd4, 0x3d, 0xd6, 0xd7, 0xe7, 0x3d, 0x89,
+ 0x1e, 0x64, 0x3c, 0xf3, 0x81, 0xbe, 0xbd, 0xb3, 0x57, 0xe9, 0xbd, 0x84, 0x5e,
+ 0x9a, 0x3d, 0x77, 0x22, 0x01, 0xbe, 0x53, 0xa3, 0xb8, 0xbd, 0xc0, 0x62, 0xff,
+ 0x3b, 0x9a, 0xfb, 0xbd, 0x3d, 0x13, 0x1a, 0xeb, 0x3b, 0x3b, 0x96, 0x78, 0x3d,
+ 0xfc, 0xc6, 0x93, 0x3d, 0xfc, 0x33, 0x92, 0x3d, 0xcc, 0xc1, 0x62, 0xbd, 0x63,
+ 0x7c, 0x77, 0xbd, 0x69, 0x92, 0x05, 0xbd, 0xbd, 0xee, 0xb8, 0x3a, 0xa2, 0x9d,
+ 0x0e, 0xbe, 0xf3, 0xba, 0xed, 0xbd, 0x2f, 0x6a, 0xaa, 0x3d, 0x77, 0x4a, 0xc6,
+ 0x3d, 0x4f, 0xe7, 0xa8, 0x3d, 0x1e, 0x3f, 0xbb, 0xbd, 0xae, 0x6c, 0xb8, 0xbc,
+ 0x75, 0xf1, 0x6d, 0xbd, 0xc1, 0x5d, 0x11, 0xbe, 0x2b, 0xe2, 0x4f, 0xbd, 0x54,
+ 0x21, 0xf6, 0x3b, 0x5c, 0xe2, 0x96, 0x3c, 0xbe, 0xe8, 0x2e, 0x3d, 0x38, 0x39,
+ 0x93, 0x3c, 0xc3, 0x50, 0xbc, 0x3d, 0x67, 0x1d, 0xc4, 0x3d, 0xe6, 0x29, 0x56,
+ 0xbc, 0x4d, 0x70, 0x4d, 0x3c, 0xd2, 0xca, 0xc4, 0xbd, 0xa1, 0x30, 0x3b, 0xbd,
+ 0x97, 0x9b, 0xb5, 0xbd, 0x65, 0x99, 0x9b, 0xbd, 0xb5, 0x65, 0xb7, 0xbd, 0x51,
+ 0xe1, 0x9a, 0xbd, 0x2f, 0x56, 0x4a, 0xbb, 0x9c, 0x68, 0x98, 0xbd, 0x36, 0x75,
+ 0x73, 0xbd, 0x19, 0xe1, 0x83, 0xbd, 0x37, 0x69, 0xee, 0x3d, 0xe7, 0xd1, 0xad,
+ 0xbd, 0x3b, 0x29, 0x95, 0xbd, 0xcd, 0x10, 0x75, 0x3d, 0xb4, 0x82, 0xc2, 0xbc,
+ 0x72, 0xd7, 0x91, 0x3d, 0xc8, 0x77, 0x49, 0xbd, 0x96, 0x67, 0x4d, 0xbd, 0xc5,
+ 0x75, 0x98, 0xbd, 0x96, 0x67, 0xcc, 0x3d, 0xba, 0x7a, 0x1e, 0xbe, 0x30, 0x3a,
+ 0x02, 0x3d, 0xc1, 0xf8, 0x78, 0x3d, 0x46, 0xfc, 0xc1, 0x3d, 0x99, 0x3c, 0xc5,
+ 0xbd, 0xbc, 0x69, 0x39, 0x3d, 0x7f, 0x95, 0xf0, 0x3b, 0x50, 0x78, 0x57, 0xbd,
+ 0xfa, 0xf7, 0xa9, 0xbc, 0xb2, 0xae, 0x2b, 0x3c, 0x22, 0x75, 0x0d, 0x3e, 0x63,
+ 0xaa, 0x03, 0x3d, 0xfa, 0x00, 0xd7, 0x3d, 0xc3, 0xcb, 0x60, 0x3c, 0xab, 0xf2,
+ 0x61, 0x3c, 0x1b, 0x9a, 0x38, 0xbd, 0x1a, 0x33, 0xef, 0xbd, 0x9e, 0x11, 0xc5,
+ 0x3d, 0xf5, 0xb1, 0x99, 0xbc, 0x65, 0xee, 0x5e, 0xbc, 0xde, 0x02, 0xe8, 0xbd,
+ 0xef, 0x87, 0x58, 0x3d, 0x0e, 0x01, 0xcf, 0x3d, 0x51, 0xf7, 0xcb, 0xbc, 0x9e,
+ 0x48, 0x50, 0xbd, 0xd2, 0xc8, 0x88, 0xbc, 0x56, 0x0a, 0x18, 0x3e, 0x49, 0xa6,
+ 0xce, 0xbd, 0x9d, 0x8d, 0xf4, 0x3d, 0xd9, 0x71, 0x7e, 0x3d, 0x49, 0xcb, 0x67,
+ 0x3d, 0x3d, 0x4f, 0xdb, 0x3c, 0x8c, 0x3b, 0xaa, 0xbd, 0xce, 0xc4, 0x1f, 0x3d,
+ 0xda, 0x94, 0xaa, 0x3c, 0x4c, 0xae, 0x89, 0x3d, 0xac, 0x7e, 0x8d, 0x3d, 0xff,
+ 0xfe, 0xf7, 0x3d, 0x89, 0xba, 0xbd, 0xbd, 0x98, 0xc1, 0x5c, 0x3d, 0x9a, 0xcf,
+ 0x1b, 0xba, 0xdb, 0x22, 0xf3, 0x3d, 0x3a, 0xa6, 0x58, 0xbd, 0x6b, 0x7d, 0x2b,
+ 0x3d, 0x22, 0x6f, 0xa2, 0xbd, 0x95, 0xf3, 0x07, 0x3e, 0x14, 0xfb, 0x7a, 0x3d,
+ 0xda, 0x56, 0x40, 0xbd, 0x85, 0xe7, 0xcf, 0xbd, 0x7f, 0x4c, 0xb8, 0x3c, 0xf0,
+ 0x6d, 0xc1, 0xbd, 0xb1, 0x01, 0xbd, 0x3d, 0xb4, 0xc0, 0xc0, 0xbd, 0x4f, 0x5f,
+ 0xca, 0xbd, 0x4e, 0x96, 0xe1, 0x3d, 0x92, 0x0a, 0xa6, 0x3d, 0xd6, 0xd9, 0xb7,
+ 0x3d, 0x8b, 0x52, 0xa8, 0x3d, 0xa9, 0xe6, 0xb4, 0xbc, 0x16, 0x49, 0xc0, 0x3b,
+ 0xed, 0x64, 0xd1, 0x3d, 0xf1, 0xaf, 0x20, 0xbc, 0x8f, 0x44, 0xd9, 0x3b, 0xc0,
+ 0x7a, 0xb4, 0x3d, 0x31, 0xb6, 0x15, 0xbe, 0x82, 0x8e, 0x62, 0xbd, 0xb3, 0x93,
+ 0x1e, 0xbd, 0xae, 0x33, 0x8c, 0xbd, 0x82, 0xf3, 0xa6, 0x3c, 0xd2, 0x41, 0xb2,
+ 0xbc, 0x58, 0x37, 0xce, 0x3d, 0xb9, 0xd2, 0xce, 0x3d, 0x99, 0x90, 0x69, 0x3d,
+ 0xc3, 0x4b, 0xc8, 0x3d, 0xba, 0xfa, 0xcb, 0x3d, 0xee, 0x4a, 0xfe, 0xbc, 0x24,
+ 0xc5, 0x3c, 0xbd, 0x5a, 0x95, 0xb3, 0xbd, 0xb1, 0xc0, 0x1f, 0xbd, 0x61, 0x53,
+ 0xb4, 0x3c, 0x2e, 0x79, 0xc7, 0xbd, 0xd6, 0x70, 0x9d, 0xbd, 0x9d, 0xe7, 0x16,
+ 0x3d, 0x4f, 0xe9, 0xa9, 0xbc, 0x7d, 0xbb, 0x7c, 0xbd, 0xf0, 0xdf, 0xe9, 0xbc,
+ 0x66, 0xc4, 0x3f, 0xbd, 0xfc, 0xd3, 0x20, 0xbd, 0xd3, 0x4f, 0x36, 0xbd, 0x72,
+ 0x8d, 0xec, 0x3d, 0x79, 0xbc, 0xaa, 0x3d, 0x69, 0x95, 0xe7, 0x3d, 0x46, 0xb6,
+ 0xcc, 0xbc, 0xdd, 0x97, 0x70, 0xbd, 0x96, 0x31, 0x0c, 0xbe, 0x48, 0x86, 0xeb,
+ 0x3d, 0x74, 0xf6, 0xa3, 0x3c, 0xe8, 0x26, 0xa1, 0x3d, 0xe3, 0xdd, 0x70, 0xbd,
+ 0xcf, 0xbd, 0x02, 0x3c, 0x13, 0x3e, 0xbc, 0xbd, 0x69, 0xad, 0x05, 0xbd, 0xc0,
+ 0xad, 0x53, 0x3c, 0xb6, 0x7c, 0xb2, 0xbd, 0x27, 0xc3, 0xfd, 0xbc, 0x5f, 0x42,
+ 0xc5, 0x3d, 0x2f, 0x17, 0xd6, 0x3d, 0xb2, 0x68, 0xda, 0xbd, 0x95, 0xe5, 0x4f,
+ 0x3c, 0xae, 0x99, 0xe4, 0x3d, 0x8f, 0x5c, 0xde, 0xbd, 0xf1, 0x87, 0x02, 0xbb,
+ 0x17, 0x17, 0x7a, 0x3d, 0x75, 0x72, 0x1f, 0x3d, 0x70, 0x34, 0xa4, 0xbd, 0x43,
+ 0x2a, 0xb2, 0x3d, 0xd9, 0x5a, 0xc7, 0x3d, 0xa5, 0x58, 0xc6, 0x3d, 0xa3, 0xb8,
+ 0x76, 0xbd, 0x5b, 0xf5, 0x27, 0x3c, 0x58, 0xfa, 0x60, 0x3c, 0xcc, 0x2e, 0xd4,
+ 0x3d, 0x71, 0xc3, 0x54, 0x3c, 0x75, 0xe3, 0x6b, 0x3d, 0x29, 0xf3, 0x9a, 0x3d,
+ 0x9d, 0x62, 0x8b, 0xbd, 0xcd, 0xa8, 0x9f, 0xbd, 0xee, 0xaa, 0xbf, 0x3c, 0xd7,
+ 0xe4, 0x20, 0xbd, 0x9f, 0x2c, 0xa4, 0x3c, 0x3a, 0x5e, 0x76, 0xbd, 0x9b, 0xcb,
+ 0x07, 0x3e, 0x3e, 0x33, 0x34, 0x3d, 0x69, 0x57, 0x26, 0x3c, 0xf5, 0x54, 0xef,
+ 0xbd, 0xf5, 0x3d, 0xe9, 0xbd, 0x8e, 0xed, 0x2b, 0x3d, 0x86, 0xf8, 0xb2, 0x3c,
+ 0xb2, 0x7f, 0x45, 0x3d, 0xe1, 0x4f, 0xbd, 0x3c, 0xa7, 0xc8, 0x91, 0xbd, 0xea,
+ 0x4c, 0xc5, 0x3d, 0x7a, 0x60, 0x7c, 0x3d, 0xce, 0x3e, 0xb6, 0x3d, 0xc3, 0x22,
+ 0x52, 0xbd, 0xbf, 0x54, 0xd3, 0xbc, 0xc7, 0xe0, 0xe1, 0xbd, 0x08, 0x86, 0xc8,
+ 0x3c, 0x98, 0x6c, 0xc3, 0xbd, 0xe6, 0xe1, 0x25, 0xbd, 0xdb, 0x07, 0x53, 0xbb,
+ 0xbd, 0x04, 0x5f, 0xbd, 0x12, 0xfd, 0xe6, 0xbd, 0x2d, 0x0f, 0xe8, 0x3d, 0x9e,
+ 0x08, 0x47, 0x3d, 0x93, 0xc8, 0xdc, 0xbd, 0x97, 0x91, 0xc9, 0xbd, 0xbd, 0x45,
+ 0x88, 0xbd, 0x45, 0x8e, 0x0b, 0xbe, 0x8f, 0xb7, 0xd1, 0xbd, 0x9b, 0x3c, 0xc2,
+ 0x3c, 0x04, 0xc5, 0xda, 0xba, 0xce, 0x19, 0x9a, 0x3d, 0xaf, 0xee, 0x25, 0x3e,
+ 0xdf, 0x56, 0x48, 0xbd, 0x9d, 0x42, 0x02, 0x3e, 0x2c, 0x6a, 0xef, 0x3c, 0x25,
+ 0x99, 0x07, 0x3c, 0x74, 0xa1, 0xca, 0x3c, 0xae, 0x08, 0x9e, 0x3c, 0xe5, 0xec,
+ 0x25, 0xbd, 0x63, 0x8f, 0xd5, 0x3d, 0xf3, 0x4a, 0xc5, 0xbc, 0xab, 0x02, 0x53,
+ 0xbd, 0x3e, 0xec, 0x5e, 0x3d, 0xea, 0xf2, 0x8f, 0x3d, 0xb9, 0xa3, 0x91, 0xbd,
+ 0xa9, 0x34, 0x93, 0xbd, 0xd4, 0x95, 0x78, 0x3d, 0x84, 0x2b, 0x04, 0x3e, 0xe7,
+ 0x61, 0x87, 0x3d, 0x41, 0x40, 0xe9, 0x3d, 0x3f, 0xea, 0xdc, 0xbc, 0xc9, 0xfd,
+ 0xa4, 0x3d, 0xf6, 0xd5, 0x69, 0x3d, 0xa5, 0x93, 0x99, 0xbb, 0x21, 0x84, 0x76,
+ 0x3d, 0xaa, 0xf2, 0x52, 0x3d, 0xbb, 0x3d, 0x9f, 0xbd, 0xd3, 0xd6, 0x6c, 0x3d,
+ 0xe6, 0xb2, 0xcc, 0xbc, 0x18, 0x3b, 0x30, 0x3d, 0x25, 0xcf, 0xc5, 0xbc, 0xe0,
+ 0xfd, 0xb4, 0x3c, 0x5c, 0x92, 0x6b, 0x3d, 0xa8, 0x01, 0x17, 0x3d, 0xf6, 0xed,
+ 0xa2, 0xbd, 0x42, 0x7b, 0xec, 0x3d, 0x8e, 0x87, 0xd7, 0x3d, 0xfa, 0x30, 0xb7,
+ 0x3d, 0x54, 0x66, 0x38, 0xbd, 0x68, 0xb5, 0xa9, 0xbd, 0x30, 0x1e, 0x7d, 0x3d,
+ 0x93, 0xf4, 0xd5, 0xbc, 0x69, 0x6a, 0x98, 0xbd, 0x8f, 0x2b, 0x4f, 0xbd, 0xd3,
+ 0x99, 0x9a, 0xbd, 0x9b, 0x72, 0xfe, 0xbc, 0xaf, 0xc3, 0xad, 0xbd, 0xe2, 0xdf,
+ 0xde, 0x3c, 0xdc, 0x3e, 0xd3, 0x3d, 0x46, 0xb7, 0x92, 0xbd, 0x22, 0xd0, 0x21,
+ 0xbd, 0x7a, 0x5e, 0xae, 0x3c, 0xb6, 0x91, 0xa4, 0x3d, 0xba, 0xda, 0x8f, 0xbc,
+ 0xad, 0xb4, 0x18, 0x3b, 0xb1, 0x16, 0x9c, 0xbd, 0x2f, 0xf7, 0x89, 0xbd, 0x89,
+ 0x33, 0xba, 0xbd, 0x03, 0x89, 0x61, 0xbd, 0xa8, 0x17, 0x50, 0xbd, 0xf5, 0xfe,
+ 0x1a, 0x3d, 0xd2, 0x25, 0x02, 0x3d, 0xbb, 0xc9, 0x67, 0xbd, 0xc8, 0x32, 0xe0,
+ 0x3d, 0x8e, 0xb2, 0x9e, 0xbd, 0x57, 0x57, 0x2a, 0xbc, 0xb4, 0xc4, 0x76, 0x3d,
+ 0xfd, 0x46, 0x11, 0x3b, 0x38, 0x45, 0xe8, 0x3a, 0x90, 0x49, 0xc6, 0xbd, 0xc3,
+ 0x50, 0x0b, 0xbe, 0x19, 0xca, 0xd9, 0x3d, 0x17, 0x4d, 0xe0, 0x3d, 0x68, 0x36,
+ 0x3f, 0xbc, 0x3a, 0x6e, 0xda, 0xbd, 0x50, 0xd8, 0xde, 0x3d, 0x6f, 0x09, 0x29,
+ 0xbe, 0x9d, 0x50, 0x03, 0xbd, 0x9a, 0x25, 0xf6, 0xbd, 0x43, 0xa2, 0xbc, 0x3d,
+ 0x9a, 0x55, 0xa5, 0x3d, 0xa9, 0x0d, 0x2f, 0xbd, 0x5c, 0x8e, 0x22, 0xbd, 0x2e,
+ 0xc1, 0x58, 0xbd, 0x5a, 0x05, 0x2c, 0xbd, 0xec, 0x19, 0xa1, 0xbd, 0xd7, 0x75,
+ 0x7b, 0x3d, 0x9a, 0xcf, 0x82, 0x3c, 0x46, 0xc6, 0xff, 0x3c, 0x37, 0xc8, 0xca,
+ 0x3d, 0xa0, 0xb7, 0x28, 0x3d, 0xaa, 0xb5, 0x2f, 0x3d, 0xaa, 0xa3, 0x9e, 0xbb,
+ 0x01, 0x2b, 0xd6, 0xbd, 0xa5, 0x6d, 0xb1, 0x3d, 0x2c, 0x3d, 0x97, 0xbc, 0x63,
+ 0xfb, 0x18, 0xbe, 0xb9, 0xa9, 0xcb, 0x3d, 0xb0, 0x7d, 0xb4, 0x3d, 0x22, 0x6a,
+ 0x65, 0x3d, 0x7a, 0xaf, 0xf5, 0xba, 0xed, 0x29, 0x0e, 0x3d, 0x5c, 0xd5, 0x6f,
+ 0xbd, 0xbe, 0xd9, 0xa0, 0xbc, 0x05, 0x8b, 0xe2, 0x3c, 0x35, 0xec, 0x8b, 0xbc,
+ 0xa9, 0x59, 0x0d, 0x3c, 0x0b, 0x4c, 0x56, 0x3c, 0x39, 0x59, 0xad, 0xbd, 0x41,
+ 0x06, 0xe3, 0xbd, 0xb1, 0xcd, 0xaa, 0x3d, 0xa8, 0xcc, 0xa1, 0xbd, 0x35, 0x63,
+ 0x36, 0xbd, 0x44, 0xf9, 0x43, 0x3c, 0xee, 0x2c, 0xdb, 0x3c, 0x79, 0xd4, 0x78,
+ 0x3d, 0x81, 0x34, 0x96, 0x3d, 0xc0, 0x43, 0xda, 0x3b, 0x9f, 0x9c, 0x0b, 0xbd,
+ 0xaf, 0x07, 0xac, 0x3d, 0xcf, 0xe3, 0xf0, 0x3c, 0x44, 0x9b, 0xf8, 0x3d, 0xd4,
+ 0x1f, 0x4e, 0xbd, 0xa6, 0xab, 0x9f, 0x3d, 0xcb, 0xd4, 0x30, 0x3d, 0x4b, 0xd4,
+ 0x17, 0x3d, 0x7e, 0xf2, 0x3d, 0x3b, 0x47, 0x47, 0xac, 0x3b, 0x2f, 0xda, 0xa8,
+ 0xbd, 0xb0, 0x53, 0xde, 0xbd, 0x2e, 0x06, 0xdc, 0x3d, 0x9a, 0x92, 0x9a, 0xbd,
+ 0x86, 0xf9, 0xf2, 0xbd, 0xb0, 0x9b, 0xd6, 0xbd, 0x8f, 0x36, 0x53, 0x3d, 0x09,
+ 0x68, 0x99, 0x3d, 0x25, 0xbb, 0xeb, 0x3d, 0x76, 0x5e, 0xfb, 0xbc, 0x24, 0x11,
+ 0x05, 0xbd, 0xcf, 0xaf, 0xb7, 0xbd, 0x97, 0xcd, 0x65, 0xbd, 0xeb, 0x59, 0xf7,
+ 0xb8, 0x95, 0x28, 0xb1, 0xbc, 0xff, 0xba, 0x91, 0xbd, 0x58, 0x33, 0xf0, 0x3c,
+ 0x42, 0x68, 0xd9, 0xbd, 0xa7, 0x71, 0x95, 0xbb, 0x41, 0x0b, 0x6a, 0x3d, 0xe4,
+ 0x83, 0x06, 0x3d, 0xae, 0x90, 0xa0, 0xbd, 0xfe, 0xf5, 0x27, 0xbd, 0x7f, 0xdc,
+ 0xb4, 0x3d, 0x32, 0xf0, 0x75, 0xbd, 0x99, 0xfa, 0x7b, 0x3d, 0x5f, 0xca, 0x7a,
+ 0x3d, 0xd9, 0x7e, 0x49, 0xbd, 0x7f, 0x2b, 0x5b, 0x3d, 0x02, 0x92, 0x46, 0xbb,
+ 0x20, 0x77, 0x5b, 0x3c, 0x57, 0xa6, 0xd1, 0x3a, 0x74, 0x68, 0xb2, 0xbd, 0xa2,
+ 0x4c, 0x0a, 0xbe, 0xb9, 0xcf, 0x43, 0xbd, 0xd6, 0x2e, 0x2d, 0xbc, 0x0f, 0x5d,
+ 0xde, 0x3d, 0xfc, 0xdc, 0x1c, 0xb9, 0x6d, 0x7b, 0x91, 0xbc, 0x33, 0x39, 0x97,
+ 0x3d, 0x37, 0xcf, 0x1f, 0x3d, 0xb3, 0x0b, 0xe3, 0x3d, 0x45, 0xbe, 0xa0, 0x3d,
+ 0xda, 0x7c, 0x0e, 0x3d, 0x66, 0xd7, 0x25, 0xbd, 0xa7, 0xe0, 0x0f, 0x3d, 0xd2,
+ 0x48, 0x8f, 0xbc, 0x2b, 0xbd, 0x9a, 0x3d, 0xf9, 0xe3, 0xd9, 0x3d, 0x0d, 0x1e,
+ 0xf3, 0x3c, 0x12, 0xc5, 0xfe, 0xbc, 0x59, 0x75, 0x9f, 0x3c, 0x76, 0x0e, 0x46,
+ 0xbd, 0xa3, 0x5d, 0xb9, 0x3d, 0x8c, 0x5a, 0xc9, 0x3c, 0xb5, 0x90, 0xbd, 0x3d,
+ 0xe5, 0xaa, 0x42, 0x3d, 0xaf, 0x43, 0x9b, 0xbd, 0x50, 0x0e, 0xc9, 0xbc, 0xea,
+ 0x53, 0x75, 0x3d, 0xfd, 0x0d, 0x4b, 0x3d, 0x7d, 0xc8, 0x17, 0x3d, 0xdd, 0xf0,
+ 0xb5, 0xbd, 0x00, 0x53, 0xf4, 0xba, 0xa6, 0x3a, 0x54, 0xbd, 0x7f, 0x57, 0x5f,
+ 0xbd, 0x00, 0x98, 0x56, 0xbd, 0xe6, 0x33, 0xbe, 0x3c, 0xe2, 0x66, 0x96, 0x3c,
+ 0x41, 0x08, 0x88, 0x3c, 0x66, 0x40, 0x88, 0xbd, 0xfd, 0x89, 0xbb, 0x3d, 0xa6,
+ 0xde, 0x99, 0x3a, 0xa4, 0x22, 0xf4, 0x3c, 0x94, 0xbc, 0xaf, 0xbd, 0x94, 0x01,
+ 0xcd, 0xbd, 0x89, 0x93, 0x0d, 0x3d, 0x74, 0x5a, 0xdf, 0x3b, 0x5b, 0x0a, 0xce,
+ 0xbd, 0xee, 0x6d, 0x87, 0x3d, 0x7c, 0x6a, 0xb0, 0x3d, 0x6d, 0xb0, 0x7b, 0x3c,
+ 0x6f, 0xb8, 0x4e, 0x3d, 0x06, 0x6a, 0x25, 0xbd, 0x7c, 0xb9, 0xcc, 0x3d, 0xf5,
+ 0x54, 0xb0, 0xbd, 0xf3, 0xf9, 0xe1, 0xbd, 0xcf, 0x6d, 0x91, 0x3c, 0x8d, 0x15,
+ 0xa4, 0x3c, 0x15, 0xa1, 0x86, 0x3d, 0x47, 0x35, 0xc3, 0xbd, 0x34, 0xa8, 0x16,
+ 0xbd, 0x11, 0xda, 0x49, 0x3d, 0x45, 0xb4, 0x61, 0x3d, 0x41, 0x15, 0xbf, 0xbc,
+ 0xd4, 0x07, 0xfa, 0x3d, 0xb0, 0x3a, 0x18, 0x3d, 0xda, 0x7f, 0x69, 0xbd, 0x6b,
+ 0xec, 0x9f, 0xbd, 0x6e, 0xfc, 0xe6, 0x3d, 0xc9, 0x5d, 0xb4, 0x3d, 0xa2, 0x1d,
+ 0x12, 0xbc, 0x51, 0x23, 0xce, 0xbd, 0x0a, 0x20, 0x86, 0xbc, 0xc4, 0x1f, 0xbe,
+ 0x3d, 0x18, 0x10, 0x6a, 0x3d, 0xe1, 0x58, 0x9f, 0x3c, 0x22, 0x7f, 0xc9, 0xbc,
+ 0x1a, 0xed, 0x1e, 0xbe, 0x47, 0x93, 0x87, 0x3c, 0x4d, 0x77, 0x31, 0xbc, 0xf9,
+ 0x29, 0xb2, 0x3d, 0xa9, 0xb3, 0x77, 0xbd, 0x43, 0x16, 0x0a, 0x3d, 0x88, 0x2f,
+ 0x98, 0x3d, 0x3b, 0x7c, 0x2b, 0x3d, 0xfc, 0x29, 0x07, 0x3e, 0xa6, 0x27, 0x93,
+ 0xbd, 0x5a, 0xa8, 0x13, 0xbe, 0xa8, 0xb8, 0x88, 0xbd, 0x9b, 0x64, 0xc5, 0xbc,
+ 0xef, 0xb1, 0xe6, 0x3d, 0x33, 0x47, 0xc3, 0x38, 0x56, 0x92, 0x7b, 0xbd, 0x87,
+ 0x81, 0xc7, 0x3c, 0x94, 0xe2, 0x21, 0x3c, 0xc2, 0x28, 0x75, 0x3d, 0xb7, 0x6f,
+ 0x8b, 0xbd, 0x2b, 0xdd, 0x09, 0xbc, 0x1f, 0xb9, 0xbc, 0xbd, 0xd6, 0xef, 0x90,
+ 0xbd, 0x52, 0xc7, 0xa5, 0xbc, 0xf7, 0x2c, 0x4d, 0x3c, 0xc7, 0xfe, 0x94, 0x3c,
+ 0x24, 0x12, 0x46, 0xbc, 0x95, 0x3b, 0x59, 0x3c, 0x64, 0x96, 0xd7, 0xbc, 0xb3,
+ 0x3c, 0xc7, 0xbd, 0xe6, 0x41, 0xbc, 0x3d, 0x70, 0xd8, 0x5c, 0x3b, 0xe2, 0x16,
+ 0x88, 0xbd, 0x21, 0x12, 0xfc, 0x3d, 0xbd, 0x55, 0x1e, 0xbe, 0x3a, 0xf9, 0x1f,
+ 0xbd, 0x59, 0xd3, 0x27, 0xbd, 0x14, 0x3b, 0xd7, 0x3d, 0x13, 0xf9, 0x66, 0x3d,
+ 0x79, 0x92, 0x77, 0xbd, 0x9a, 0x35, 0x63, 0x3d, 0x07, 0xf2, 0x75, 0xbc, 0xc1,
+ 0x6f, 0x73, 0x3d, 0x0f, 0x02, 0xc2, 0x3c, 0xd0, 0x45, 0x0c, 0x3d, 0x37, 0x87,
+ 0x5e, 0x3d, 0x03, 0x9e, 0xce, 0x3d, 0x2b, 0x90, 0x13, 0xbd, 0xf4, 0x1a, 0xc5,
+ 0xbd, 0xdf, 0x42, 0xdb, 0x3d, 0x47, 0x02, 0x58, 0xbd, 0x0f, 0x74, 0x1a, 0xbd,
+ 0x1d, 0x5f, 0x05, 0x3d, 0x99, 0x81, 0xff, 0xbc, 0x56, 0x85, 0xb3, 0x3d, 0xac,
+ 0x62, 0x17, 0xbd, 0xaa, 0x30, 0xc3, 0x3d, 0xdc, 0x53, 0x0f, 0xbe, 0x9b, 0x95,
+ 0x49, 0x3d, 0xf8, 0x4e, 0xa7, 0x3d, 0x76, 0x74, 0x10, 0xbd, 0x2c, 0xe0, 0x9c,
+ 0x3d, 0x7b, 0xc1, 0xc7, 0xbd, 0x15, 0x39, 0xe6, 0x3d, 0x52, 0xb3, 0xff, 0xbd,
+ 0x72, 0x77, 0xd3, 0x3d, 0x6a, 0xc4, 0xfb, 0x3c, 0x27, 0x15, 0x5b, 0x3d, 0xba,
+ 0xa2, 0x6b, 0xbd, 0x2b, 0xbc, 0x02, 0x3e, 0x6c, 0x7c, 0xda, 0x3c, 0x24, 0xa1,
+ 0x61, 0xbb, 0xfb, 0x9b, 0xc9, 0xbc, 0x20, 0xcb, 0x93, 0xbc, 0x95, 0x98, 0x6c,
+ 0xbd, 0x96, 0x34, 0xda, 0x3d, 0x5b, 0xa3, 0xe1, 0xbc, 0x71, 0xff, 0x07, 0x3d,
+ 0x5e, 0x18, 0xd0, 0xbd, 0xc1, 0x9e, 0x26, 0x3e, 0x8b, 0x3d, 0x9c, 0x3d, 0x90,
+ 0xe5, 0x84, 0x3d, 0x0d, 0xaa, 0x37, 0x3b, 0x99, 0x2d, 0xf6, 0x3c, 0x40, 0x23,
+ 0xca, 0x3d, 0x1c, 0x56, 0xb4, 0xbd, 0xa9, 0x04, 0x97, 0xbd, 0x41, 0xa7, 0x9e,
+ 0x3a, 0xb3, 0xfe, 0xb9, 0xbd, 0xf9, 0x34, 0x02, 0xbd, 0x44, 0x97, 0xb4, 0xbd,
+ 0x67, 0x43, 0x80, 0xbd, 0xb0, 0xce, 0x36, 0xbd, 0x28, 0x48, 0xa2, 0x3d, 0x32,
+ 0x52, 0xd3, 0x3d, 0x2a, 0xd4, 0x12, 0x3e, 0x8e, 0x41, 0xd5, 0x3c, 0x5e, 0x6b,
+ 0x64, 0xbd, 0x19, 0x1a, 0xee, 0xbd, 0x91, 0xf3, 0xb1, 0xbb, 0x9e, 0x4f, 0x9b,
+ 0x3d, 0x50, 0x3a, 0x9d, 0x3d, 0x25, 0xbc, 0xb5, 0xbd, 0xf7, 0xd6, 0x7b, 0x3d,
+ 0x69, 0x87, 0x94, 0xbb, 0xed, 0x33, 0x31, 0xbd, 0x8f, 0xf3, 0xaa, 0xbd, 0x5b,
+ 0x0b, 0xc0, 0x3d, 0xd9, 0xac, 0x60, 0xbd, 0x24, 0xa6, 0x9c, 0x3d, 0xfb, 0x17,
+ 0x3f, 0x3d, 0x49, 0x6a, 0x97, 0x3d, 0x02, 0xe9, 0xef, 0xbd, 0x44, 0xbe, 0xb5,
+ 0xbc, 0x61, 0x77, 0x94, 0xbb, 0x9e, 0x6d, 0xe1, 0xbc, 0xfa, 0x8c, 0xf2, 0xbc,
+ 0x9c, 0xfc, 0x45, 0xbd, 0xed, 0x91, 0xde, 0xbd, 0xcd, 0xa8, 0xe7, 0x3d, 0x4e,
+ 0x05, 0x10, 0xbe, 0x33, 0x4d, 0xa1, 0x3c, 0x01, 0x95, 0x91, 0x3d, 0x33, 0xf9,
+ 0x13, 0xbd, 0x78, 0x50, 0x03, 0xbd, 0x7f, 0xa1, 0xd7, 0xbd, 0x0f, 0xe3, 0x92,
+ 0x3d, 0x46, 0x19, 0x9e, 0x3d, 0xa8, 0xa7, 0x06, 0xbc, 0x0e, 0x64, 0xa6, 0x3d,
+ 0xb4, 0x52, 0xe8, 0xbd, 0x87, 0xc6, 0x8f, 0xbd, 0x50, 0x8c, 0xbf, 0xbb, 0x76,
+ 0x39, 0x34, 0x3d, 0xd2, 0x2f, 0x0b, 0xbd, 0xf4, 0xa3, 0x51, 0xbd, 0xb0, 0x28,
+ 0x7d, 0xbd, 0x83, 0x61, 0x57, 0x3d, 0xca, 0x95, 0xb5, 0x3d, 0xdc, 0x22, 0x32,
+ 0xbc, 0x58, 0xb3, 0x69, 0xbd, 0x09, 0x10, 0x79, 0x3c, 0x3c, 0x79, 0x35, 0xbd,
+ 0xa0, 0x99, 0xa9, 0xbd, 0xdf, 0x93, 0x18, 0x3e, 0x6f, 0x5f, 0xad, 0x3d, 0xb2,
+ 0x0b, 0x8e, 0xbd, 0xf5, 0xf2, 0xaa, 0x3d, 0xf2, 0x2e, 0xa9, 0xbd, 0xf6, 0xe2,
+ 0x23, 0x3d, 0x17, 0xa2, 0xaf, 0x3d, 0xd9, 0x35, 0x8e, 0xbd, 0xf1, 0x8d, 0x08,
+ 0x3e, 0xcc, 0x76, 0xb4, 0xbd, 0x71, 0xb4, 0xc9, 0xbd, 0x00, 0x10, 0xd4, 0xbc,
+ 0xbe, 0x87, 0xf0, 0x3c, 0xe8, 0x15, 0xad, 0xbd, 0xfb, 0x2e, 0x5e, 0xbd, 0x6f,
+ 0x3b, 0x99, 0xbc, 0x77, 0xc7, 0xe5, 0xbd, 0xf4, 0x52, 0x03, 0xbe, 0x74, 0x7b,
+ 0x00, 0xbe, 0xe8, 0x51, 0x8c, 0x3d, 0xe1, 0x8d, 0x1c, 0xbc, 0x3d, 0x3c, 0x16,
+ 0x3d, 0x94, 0x51, 0xd5, 0x3d, 0xff, 0x2e, 0xb0, 0x3d, 0xf5, 0x3c, 0xaa, 0xbc,
+ 0x39, 0x6b, 0xb2, 0x3d, 0x1f, 0x8b, 0x44, 0x3d, 0xe4, 0xa4, 0xa8, 0x3d, 0xa9,
+ 0xbc, 0x81, 0x3d, 0x67, 0x10, 0x83, 0xbd, 0x03, 0x1b, 0x08, 0x3d, 0xed, 0xef,
+ 0x29, 0x3d, 0x46, 0x38, 0x58, 0xbc, 0x98, 0x03, 0xa3, 0x3d, 0x7d, 0xd6, 0x34,
+ 0xbd, 0x36, 0xbd, 0xf7, 0x3d, 0xe7, 0xf9, 0x5d, 0xbd, 0x9c, 0x88, 0x87, 0x3d,
+ 0x85, 0x7d, 0xa3, 0x3d, 0x81, 0x29, 0x75, 0xbc, 0xca, 0x17, 0x97, 0x3d, 0xbf,
+ 0xd1, 0x04, 0x3e, 0xc9, 0x18, 0xfa, 0x3b, 0x0f, 0x59, 0xc3, 0x3d, 0x40, 0xa6,
+ 0x05, 0xbd, 0x5e, 0x98, 0x8d, 0x3c, 0x8f, 0x73, 0xff, 0x3c, 0xb2, 0x58, 0xde,
+ 0xbc, 0x97, 0x10, 0x04, 0xbd, 0x2d, 0xd2, 0x1c, 0x3d, 0xac, 0x03, 0x6e, 0xbd,
+ 0xa8, 0x9a, 0xa8, 0x3d, 0x1c, 0x0e, 0x41, 0x3d, 0x30, 0x7a, 0xab, 0xbd, 0xec,
+ 0x58, 0x14, 0xbd, 0xac, 0xe9, 0x9e, 0xbb, 0x0b, 0x14, 0x02, 0x3d, 0xac, 0x78,
+ 0x00, 0x3e, 0xa1, 0xb6, 0xc2, 0xbd, 0x04, 0x51, 0x91, 0xbc, 0x57, 0x51, 0xf1,
+ 0xbd, 0x95, 0x42, 0x49, 0x3d, 0x91, 0x54, 0xa2, 0x3c, 0xbd, 0x0f, 0x03, 0xbe,
+ 0x0a, 0xf8, 0x17, 0xbd, 0xbb, 0x25, 0x14, 0x3d, 0xf2, 0x00, 0x19, 0xbd, 0x79,
+ 0xea, 0x85, 0xbd, 0x4a, 0xf9, 0xb6, 0xbc, 0x4f, 0x1c, 0x34, 0xbc, 0x2e, 0x3e,
+ 0x31, 0x3d, 0xe3, 0x63, 0x5e, 0xbd, 0x63, 0xf1, 0xaf, 0x3d, 0x4e, 0xee, 0xaa,
+ 0x3d, 0x91, 0xc0, 0xcc, 0xbc, 0xc3, 0x43, 0xb2, 0xbc, 0xab, 0x9d, 0x54, 0xbd,
+ 0x0b, 0x92, 0xa3, 0xbc, 0xc5, 0xe0, 0xf6, 0x3d, 0xb5, 0x2d, 0x52, 0xbd, 0x89,
+ 0x8d, 0xf0, 0xbd, 0xd4, 0x40, 0x0c, 0xbe, 0x88, 0xf8, 0xaa, 0x3d, 0xc6, 0x0d,
+ 0x10, 0x3d, 0xe0, 0x7d, 0xcb, 0xbc, 0x14, 0x58, 0xba, 0x3a, 0x11, 0x9d, 0x24,
+ 0xbd, 0x14, 0x54, 0x03, 0x3b, 0x2c, 0xb4, 0x7d, 0x3c, 0x5a, 0x71, 0x99, 0xbd,
+ 0x5d, 0xa3, 0xa3, 0xbd, 0xfc, 0xd0, 0xe5, 0x39, 0x4a, 0x6c, 0xf8, 0xbd, 0x81,
+ 0x0e, 0xab, 0x3d, 0x0d, 0x40, 0x9a, 0x3d, 0x89, 0xff, 0x07, 0x3d, 0xd4, 0x8c,
+ 0x97, 0x3b, 0x8a, 0x7a, 0xc5, 0x3c, 0xbb, 0xbf, 0xe3, 0x3a, 0xcb, 0x47, 0x41,
+ 0x3d, 0x80, 0x8d, 0x29, 0x3d, 0x16, 0xe7, 0xf6, 0xbc, 0x01, 0x5f, 0xc0, 0x3d,
+ 0xf1, 0x20, 0xe3, 0xbc, 0xec, 0x9f, 0x29, 0x3e, 0x8f, 0x46, 0x8d, 0x3d, 0x20,
+ 0x99, 0xe9, 0x3c, 0x90, 0x04, 0x00, 0x3e, 0x35, 0xda, 0xba, 0xbd, 0x6c, 0xc5,
+ 0x5b, 0x3d, 0x9a, 0x42, 0x41, 0xbd, 0x1a, 0x84, 0x6f, 0x3d, 0x94, 0xc4, 0x0c,
+ 0xbd, 0x08, 0x43, 0x8a, 0x3d, 0xd8, 0xdb, 0xa4, 0x3d, 0xac, 0xc6, 0xa8, 0x3d,
+ 0xa5, 0xf4, 0xff, 0xb9, 0xdc, 0x01, 0x58, 0xbc, 0x43, 0x37, 0xf0, 0x3d, 0xed,
+ 0x73, 0x3b, 0xbd, 0x8d, 0x1f, 0x00, 0x3c, 0x4c, 0x89, 0x71, 0x3d, 0xb0, 0xbf,
+ 0x4e, 0x3d, 0x1e, 0x61, 0x83, 0xbd, 0x82, 0xf6, 0x02, 0xbe, 0x3c, 0x97, 0xf9,
+ 0x3d, 0x06, 0x96, 0x97, 0x3d, 0x5c, 0x13, 0xd7, 0xbd, 0xce, 0x77, 0x88, 0xbd,
+ 0x26, 0x76, 0xba, 0x3c, 0x46, 0x28, 0xc4, 0x3d, 0x35, 0x72, 0x8d, 0x3c, 0x3e,
+ 0x63, 0x81, 0xbd, 0x06, 0x13, 0x9b, 0x3d, 0xf9, 0x80, 0x20, 0x3d, 0x9c, 0xfb,
+ 0x94, 0x3c, 0x50, 0x2c, 0x16, 0xbd, 0xdb, 0x7d, 0x59, 0xbd, 0x7a, 0xa8, 0x8d,
+ 0x3d, 0x8b, 0x56, 0x94, 0xbd, 0xa5, 0x49, 0x8b, 0x3d, 0x76, 0xae, 0x99, 0xbc,
+ 0x6e, 0x40, 0x84, 0x3d, 0xe0, 0x5a, 0x40, 0xbd, 0x33, 0xb8, 0x0b, 0xbd, 0x96,
+ 0x14, 0x25, 0x3c, 0x3e, 0x5c, 0x78, 0xbd, 0x31, 0x40, 0x06, 0x3e, 0x05, 0x0b,
+ 0xb7, 0x3c, 0x24, 0x3e, 0xe5, 0xbd, 0x94, 0x06, 0x12, 0x3d, 0x14, 0x07, 0x96,
+ 0xbd, 0x14, 0x1d, 0x80, 0xbd, 0xfc, 0xd3, 0x66, 0xbd, 0xfa, 0xef, 0x67, 0x3d,
+ 0x62, 0x1e, 0x9f, 0x3c, 0x27, 0x05, 0x2a, 0xbc, 0xbb, 0x0b, 0xa2, 0x3d, 0x07,
+ 0x02, 0xaf, 0x3d, 0xcb, 0x9d, 0xc9, 0x3d, 0xbe, 0x5c, 0x15, 0x3b, 0x73, 0xc6,
+ 0x92, 0xbd, 0x70, 0x29, 0xe4, 0x3d, 0x46, 0xa2, 0xb2, 0xbc, 0x56, 0xb8, 0xe1,
+ 0x3d, 0x82, 0xf9, 0x0d, 0xbd, 0x9b, 0x59, 0xa8, 0xbd, 0x42, 0x59, 0x98, 0x3d,
+ 0xae, 0x31, 0x22, 0xbd, 0x0d, 0xa2, 0x1f, 0x3e, 0xc8, 0xfd, 0x58, 0xbc, 0x4e,
+ 0xd4, 0xca, 0x3d, 0xbd, 0x39, 0x81, 0xbd, 0x7c, 0x0a, 0x25, 0x3e, 0xdb, 0x88,
+ 0x7f, 0x3c, 0xf1, 0x64, 0x07, 0x3e, 0xd2, 0x99, 0x1d, 0x3d, 0x2c, 0xc9, 0xb0,
+ 0xbd, 0x7a, 0xe0, 0x9d, 0xbc, 0x9e, 0x93, 0x19, 0x3d, 0x7f, 0xfd, 0xd2, 0xbc,
+ 0xec, 0x44, 0xd5, 0x3d, 0x69, 0x81, 0xbf, 0x3d, 0x9e, 0xff, 0xac, 0x3c, 0x60,
+ 0x6b, 0x6a, 0xbd, 0xe6, 0x22, 0x48, 0xbd, 0x3b, 0xc4, 0xa3, 0xbd, 0x0c, 0xd3,
+ 0xf5, 0x3c, 0x08, 0x03, 0x62, 0x3c, 0x5c, 0x46, 0x16, 0x3e, 0xd3, 0x2a, 0xce,
+ 0x3c, 0xfc, 0x31, 0xa8, 0x3d, 0xbd, 0x02, 0x95, 0x3c, 0xe8, 0xc7, 0x7a, 0x3c,
+ 0xff, 0xc5, 0xf8, 0x3c, 0x3a, 0xb0, 0x79, 0x3b, 0xe6, 0xfd, 0x37, 0xbd, 0x5e,
+ 0xd3, 0x06, 0x3e, 0x21, 0x21, 0xe8, 0x3c, 0xa1, 0x6f, 0xf1, 0x3d, 0xa6, 0xc2,
+ 0x54, 0x3d, 0x9c, 0xae, 0x9c, 0x3d, 0xcb, 0xfd, 0x0a, 0x3c, 0x3e, 0x2e, 0x00,
+ 0xbd, 0xdc, 0xf2, 0x4b, 0xbd, 0x7a, 0xdf, 0xbd, 0x3d, 0xbd, 0x27, 0x8b, 0x3c,
+ 0x1c, 0x12, 0x2d, 0xbd, 0xf9, 0xf3, 0x28, 0x3e, 0x4c, 0x90, 0xb3, 0xbd, 0x49,
+ 0xfc, 0x84, 0x3d, 0x2e, 0xc1, 0x82, 0x3d, 0x54, 0xc7, 0x62, 0x3d, 0xcb, 0x24,
+ 0xf9, 0x3d, 0xf4, 0x6a, 0x2b, 0x3c, 0x38, 0x27, 0x1c, 0xbd, 0x05, 0xf1, 0xf5,
+ 0x3d, 0xc0, 0x87, 0xa2, 0x3d, 0x7e, 0x5c, 0x92, 0x3d, 0xef, 0x33, 0xad, 0x3d,
+ 0x34, 0xff, 0x43, 0x3d, 0x87, 0x47, 0xc6, 0x3d, 0x58, 0x18, 0x76, 0xbd, 0x1d,
+ 0x74, 0x9e, 0x3d, 0xae, 0x41, 0xb1, 0xbc, 0x7d, 0x42, 0x94, 0xbd, 0x37, 0x01,
+ 0x66, 0x3d, 0xb4, 0x18, 0x96, 0xbd, 0x69, 0x31, 0xc4, 0x3c, 0xe7, 0x09, 0x00,
+ 0xbe, 0x46, 0x1a, 0x2b, 0xbd, 0x76, 0xd4, 0x7b, 0xbd, 0x48, 0xcd, 0xfc, 0x3b,
+ 0xf9, 0x98, 0xf6, 0xbc, 0x33, 0x91, 0x2c, 0xbe, 0xe1, 0x08, 0xf5, 0xbd, 0xb0,
+ 0xcd, 0x79, 0x3d, 0xd3, 0x1d, 0x0f, 0x3e, 0x5a, 0x9f, 0x13, 0xbd, 0x7d, 0x6b,
+ 0x44, 0x3c, 0xcf, 0x14, 0x38, 0x3d, 0xe3, 0xfb, 0x47, 0x3d, 0x37, 0x1e, 0x2f,
+ 0x3c, 0x89, 0xa0, 0xb2, 0xbd, 0x89, 0x21, 0x81, 0xbd, 0x04, 0xda, 0xc5, 0x3d,
+ 0xa7, 0xa8, 0x16, 0xbc, 0x07, 0x2e, 0xc1, 0xbb, 0x8c, 0x6f, 0xc2, 0x3c, 0x3b,
+ 0x0c, 0x03, 0xbd, 0x74, 0xc2, 0xa5, 0x3d, 0x3f, 0xeb, 0xb2, 0xbd, 0x2f, 0x66,
+ 0x94, 0xbd, 0x4f, 0x30, 0xab, 0xbd, 0xc4, 0xdd, 0x45, 0x3d, 0x4a, 0xb7, 0x48,
+ 0x3d, 0x55, 0x77, 0x26, 0x3e, 0xbe, 0x1c, 0x96, 0xbb, 0x5b, 0xca, 0x62, 0xbd,
+ 0xcf, 0x1e, 0xd3, 0x3c, 0xa7, 0x0e, 0xb9, 0xbd, 0x67, 0x75, 0x2b, 0xbd, 0x26,
+ 0x12, 0xd5, 0xbc, 0xb6, 0x0f, 0xc0, 0xbd, 0x12, 0xab, 0x23, 0x3d, 0xf6, 0x23,
+ 0xb2, 0x3d, 0x3f, 0x71, 0x83, 0x3d, 0x2a, 0x08, 0x95, 0xbc, 0xd8, 0x6e, 0xdc,
+ 0xbd, 0x1c, 0x85, 0xa6, 0xbd, 0xc4, 0xbc, 0x52, 0xbd, 0xa8, 0xe0, 0x9c, 0x3d,
+ 0xf8, 0xa9, 0xe5, 0x3d, 0xfe, 0xbd, 0x9c, 0x3d, 0x9d, 0x62, 0xc3, 0x3c, 0xe6,
+ 0x95, 0xd6, 0xbc, 0x08, 0x07, 0x68, 0xbc, 0x99, 0x7b, 0xe4, 0xbd, 0xcf, 0x18,
+ 0xb0, 0x3d, 0xdb, 0x65, 0x8e, 0xbd, 0x47, 0x34, 0xa9, 0xbd, 0x65, 0xab, 0x0a,
+ 0xbe, 0xb3, 0x57, 0x24, 0xbe, 0x1f, 0xce, 0xa2, 0xbc, 0xd2, 0x8a, 0xb7, 0xbc,
+ 0x1e, 0xd4, 0x53, 0x3d, 0xec, 0x02, 0x14, 0xbd, 0xd7, 0xc2, 0x05, 0x3d, 0x05,
+ 0xe3, 0xcb, 0xbc, 0x18, 0xc7, 0x9d, 0x3d, 0x99, 0x69, 0x0a, 0xbe, 0xee, 0x58,
+ 0xa1, 0x3d, 0xae, 0xa3, 0x36, 0xbe, 0x5c, 0x5d, 0x9c, 0xbd, 0x39, 0xfb, 0x00,
+ 0xbd, 0x38, 0xcd, 0x70, 0xbd, 0x2f, 0x77, 0xf2, 0xbd, 0x8a, 0x7d, 0x74, 0xbd,
+ 0x4b, 0x08, 0x7b, 0xbd, 0x42, 0xaf, 0x4a, 0xba, 0x56, 0x2e, 0x80, 0xbd, 0x81,
+ 0x9b, 0xb9, 0x3d, 0xf0, 0x6d, 0x86, 0x3c, 0xfe, 0x53, 0x82, 0xbd, 0xb8, 0xac,
+ 0x56, 0xbd, 0xf7, 0xc9, 0x14, 0x3d, 0xea, 0xe6, 0x1f, 0xbd, 0x9f, 0x23, 0xd0,
+ 0xbd, 0x73, 0xd5, 0x6a, 0x3d, 0x24, 0xdb, 0xba, 0xbd, 0xf5, 0xf1, 0xda, 0xbc,
+ 0xe6, 0x8b, 0x34, 0xbd, 0x6c, 0x15, 0x8a, 0x3c, 0x26, 0x05, 0x63, 0x3d, 0x27,
+ 0xc2, 0x8b, 0xbd, 0x62, 0xb2, 0x83, 0x3d, 0x71, 0x11, 0x50, 0xbc, 0x67, 0x3d,
+ 0xe4, 0x3d, 0xa5, 0x3d, 0x59, 0xbd, 0x18, 0xa4, 0x70, 0x3c, 0x6b, 0x86, 0x9c,
+ 0x3d, 0xa6, 0xe4, 0xbf, 0x3d, 0x3a, 0x8f, 0xe2, 0xbd, 0xd7, 0xf8, 0x71, 0x3d,
+ 0x1d, 0x46, 0x00, 0xbd, 0x3c, 0x59, 0xc0, 0xbc, 0x1f, 0x60, 0x50, 0xbd, 0x91,
+ 0xe2, 0xe6, 0xbd, 0x4c, 0x72, 0xb6, 0xbd, 0x49, 0x1e, 0xba, 0x3d, 0xdd, 0x1e,
+ 0x77, 0xbc, 0x35, 0x26, 0xab, 0x3c, 0x63, 0x83, 0xd7, 0xbd, 0x41, 0x6f, 0xa8,
+ 0x3d, 0x6d, 0xf0, 0x50, 0xbd, 0xdc, 0x5f, 0x2f, 0xbd, 0x73, 0x67, 0xce, 0xbc,
+ 0x10, 0x47, 0x0b, 0xbd, 0xdc, 0x85, 0x41, 0x3c, 0xcd, 0x61, 0xc9, 0xbd, 0x9d,
+ 0x79, 0x77, 0x3d, 0xbd, 0xe5, 0xb5, 0xbd, 0xa4, 0x88, 0xf7, 0xbd, 0x43, 0xf7,
+ 0x5e, 0x3b, 0x95, 0x23, 0x26, 0xbd, 0x39, 0x1e, 0xa7, 0x3d, 0x60, 0xd5, 0x2e,
+ 0xbd, 0x78, 0xa7, 0x1b, 0x3d, 0xad, 0x5b, 0xcd, 0x3d, 0x73, 0xba, 0x9d, 0xbd,
+ 0xb7, 0xe0, 0x91, 0x3d, 0xa7, 0x90, 0x8e, 0x3d, 0x12, 0x0d, 0x11, 0x3d, 0x6d,
+ 0xf8, 0x9b, 0xbd, 0x7d, 0xd4, 0xdf, 0x3d, 0x67, 0x4c, 0xa3, 0x3d, 0x21, 0x33,
+ 0x88, 0xbc, 0xc8, 0xd2, 0xc7, 0xbd, 0x93, 0xea, 0x80, 0xbd, 0x4d, 0xe7, 0x42,
+ 0xbd, 0x0b, 0x43, 0xfb, 0xbc, 0xb0, 0x8c, 0x7f, 0xbc, 0x16, 0x83, 0xc3, 0x3d,
+ 0x42, 0xd0, 0x86, 0xbd, 0x7f, 0x6f, 0xa6, 0x3d, 0xed, 0xee, 0x4c, 0x3d, 0xc9,
+ 0x3e, 0x03, 0x3d, 0x72, 0x47, 0x9e, 0xbd, 0x2f, 0x66, 0xda, 0x3d, 0x3d, 0x45,
+ 0x80, 0x3b, 0x3c, 0xab, 0xa6, 0xbd, 0x73, 0xe8, 0x9f, 0xbd, 0xf6, 0x76, 0xc2,
+ 0xbd, 0x18, 0xaf, 0xb4, 0x3d, 0x94, 0x94, 0x9f, 0xbd, 0x46, 0xcd, 0xad, 0xbd,
+ 0xdb, 0xe6, 0x87, 0xbd, 0x67, 0x03, 0x07, 0x3d, 0x05, 0xc2, 0x84, 0xbc, 0xb7,
+ 0x1f, 0x8d, 0xbd, 0x19, 0x72, 0xa1, 0x3d, 0xd8, 0xa5, 0x52, 0x3d, 0x63, 0x90,
+ 0x03, 0xbd, 0xf5, 0xe3, 0xcd, 0x3d, 0xd8, 0xfb, 0x9c, 0x3d, 0x74, 0xd7, 0x06,
+ 0xbd, 0x8c, 0xb5, 0xdd, 0xbd, 0x20, 0x07, 0xba, 0xbd, 0x83, 0xa1, 0xd2, 0x3d,
+ 0x4c, 0x58, 0xe3, 0x3d, 0x31, 0x7d, 0xe1, 0xbd, 0x29, 0x06, 0xa1, 0xbd, 0x64,
+ 0xa9, 0x2e, 0xbd, 0x79, 0x6c, 0xb5, 0xbd, 0x8f, 0xe5, 0xac, 0x3d, 0x68, 0xc1,
+ 0xc3, 0x3c, 0xd5, 0xa7, 0xf2, 0xbd, 0x2e, 0x24, 0x40, 0xbd, 0xd6, 0x39, 0xe7,
+ 0x3d, 0xe0, 0xaf, 0x02, 0xbd, 0xe1, 0xd6, 0xe1, 0xbd, 0xfa, 0xa0, 0x25, 0x3d,
+ 0x26, 0xe8, 0x57, 0x3d, 0xa5, 0x58, 0xf6, 0xbd, 0xd2, 0x32, 0x0f, 0xbd, 0x8e,
+ 0xa1, 0x8d, 0x3c, 0xb6, 0x98, 0xce, 0xbc, 0x71, 0x96, 0xfa, 0xbc, 0xe2, 0x69,
+ 0x35, 0x3c, 0x3d, 0x07, 0x21, 0x3d, 0xc1, 0x9f, 0x8a, 0x3d, 0x0a, 0x9e, 0x64,
+ 0xbd, 0x3b, 0x91, 0x57, 0xbb, 0x99, 0x41, 0x8c, 0x3d, 0xcf, 0x60, 0x8f, 0xbd,
+ 0x5e, 0xe6, 0x25, 0xbd, 0xec, 0x60, 0xb0, 0xbd, 0xcf, 0xd7, 0x87, 0x3d, 0x1a,
+ 0x3f, 0x4e, 0xbd, 0xd7, 0xbf, 0x78, 0xbd, 0xe3, 0x77, 0xd9, 0x3d, 0x81, 0xd8,
+ 0x81, 0xbd, 0x52, 0x2a, 0xd3, 0x3d, 0xc1, 0x32, 0x80, 0xbd, 0xaa, 0xbf, 0x9d,
+ 0x3d, 0xbf, 0x21, 0x3b, 0x3d, 0x30, 0x5e, 0x9e, 0xbd, 0xfa, 0xf3, 0xda, 0xbc,
+ 0x41, 0xeb, 0x9c, 0xbd, 0x71, 0x88, 0xd3, 0xbc, 0xf1, 0x4c, 0x00, 0xbd, 0x38,
+ 0xd5, 0x2f, 0x3c, 0xcd, 0xd9, 0x3e, 0x3d, 0xf4, 0xf8, 0xa4, 0x3d, 0xbc, 0x2f,
+ 0x0e, 0xbd, 0x28, 0x35, 0x34, 0x3d, 0x3a, 0x20, 0x5c, 0x3d, 0x97, 0x22, 0xdb,
+ 0xbd, 0x75, 0xd3, 0x5f, 0xbd, 0xf9, 0x3b, 0x66, 0xbd, 0x4a, 0x18, 0xe7, 0xbb,
+ 0x4e, 0x21, 0x5d, 0xbd, 0x9c, 0x6c, 0x45, 0xbd, 0x2c, 0xb8, 0xe7, 0x3c, 0x65,
+ 0xbf, 0x45, 0x3d, 0x15, 0xbb, 0xa5, 0xbd, 0x7e, 0x1c, 0xba, 0xbd, 0xfa, 0x2d,
+ 0xfc, 0x3c, 0xc2, 0xfb, 0x20, 0xbd, 0x62, 0xc3, 0xa6, 0xbd, 0xae, 0x66, 0xc1,
+ 0x3b, 0x8e, 0x5e, 0x29, 0xbd, 0x1a, 0x5d, 0x27, 0xbd, 0xce, 0x36, 0xaf, 0xbd,
+ 0x6d, 0x03, 0xdd, 0x3d, 0xb5, 0x5d, 0x95, 0x3c, 0xd2, 0x9d, 0x60, 0xbd, 0xf0,
+ 0xb5, 0x60, 0xbc, 0x80, 0x21, 0x34, 0xbd, 0xf1, 0x05, 0xc8, 0x3b, 0x2c, 0x2a,
+ 0x2f, 0x3e, 0x99, 0x23, 0x3c, 0x3d, 0x73, 0x2f, 0xe4, 0x3d, 0xc8, 0x22, 0xce,
+ 0x3d, 0xbf, 0x98, 0xad, 0xbd, 0xa5, 0xb2, 0xd4, 0xbd, 0x6d, 0xca, 0x3b, 0xbe,
+ 0xd1, 0xa0, 0x95, 0x3c, 0xa0, 0xed, 0xe1, 0x3b, 0x8c, 0x5d, 0x6f, 0x3d, 0x10,
+ 0x04, 0x88, 0xbd, 0x76, 0x62, 0xe7, 0x3d, 0x53, 0x28, 0x8c, 0xbd, 0x7b, 0x4f,
+ 0x5d, 0xbd, 0x2e, 0x69, 0x8b, 0x3c, 0xe7, 0x7f, 0x79, 0x3c, 0x2e, 0xe5, 0xbf,
+ 0x3c, 0x56, 0x90, 0xf6, 0xbc, 0x8a, 0xc6, 0x3b, 0x3d, 0x86, 0xbf, 0xb8, 0xbd,
+ 0xe6, 0xf7, 0xd7, 0xbc, 0xc5, 0x96, 0xcb, 0x3d, 0x48, 0xe0, 0x9a, 0xbd, 0xd8,
+ 0xe1, 0x45, 0xbd, 0xa7, 0x00, 0xd7, 0xbd, 0xda, 0x57, 0x1c, 0xbc, 0x8e, 0x49,
+ 0x40, 0x3d, 0x8b, 0x52, 0x0a, 0x3d, 0xe2, 0xe8, 0x1b, 0xbd, 0x74, 0xd1, 0x0f,
+ 0x3e, 0x17, 0x20, 0xc1, 0x3d, 0x3a, 0xbe, 0x8a, 0xbd, 0xa4, 0xd5, 0xca, 0x3c,
+ 0x4f, 0x17, 0x82, 0xbc, 0x1f, 0xea, 0x09, 0xbd, 0x8e, 0xcb, 0xd0, 0x3d, 0x9c,
+ 0x1a, 0x36, 0xbd, 0x99, 0xee, 0x5b, 0xbd, 0x5c, 0x1d, 0x10, 0xbe, 0x9e, 0x99,
+ 0x22, 0x3d, 0x8f, 0x8f, 0xda, 0x3c, 0x42, 0xa7, 0x2e, 0x3d, 0x37, 0x33, 0x03,
+ 0xbe, 0x11, 0x7b, 0x8f, 0xbd, 0xb8, 0xa1, 0x7e, 0x3d, 0x31, 0x04, 0x62, 0x3d,
+ 0x93, 0x03, 0xfe, 0x3b, 0x59, 0x82, 0xa0, 0xbd, 0x07, 0xb8, 0x24, 0x3d, 0x7a,
+ 0x45, 0xf2, 0x3d, 0xab, 0xf4, 0xd7, 0xbd, 0x2f, 0xbd, 0xc6, 0x3d, 0xb2, 0x1c,
+ 0x47, 0x3d, 0xbe, 0xf6, 0xb2, 0x3d, 0xe2, 0xd0, 0x92, 0xbd, 0x0d, 0xec, 0xb2,
+ 0xbd, 0x40, 0x5c, 0xc0, 0xbd, 0xa8, 0xf7, 0x0e, 0x3c, 0xef, 0x56, 0xb1, 0xbd,
+ 0x91, 0x09, 0x4f, 0xbd, 0x47, 0x51, 0xcc, 0x3d, 0xcd, 0x6d, 0x85, 0xbd, 0xfe,
+ 0xb2, 0x6f, 0xbd, 0x3f, 0x9b, 0xec, 0x3c, 0x64, 0x20, 0x98, 0xbb, 0x82, 0x78,
+ 0x09, 0x3d, 0x2f, 0xbf, 0xe7, 0xbc, 0x5d, 0x5e, 0x01, 0xbd, 0x0c, 0xca, 0x4b,
+ 0x3d, 0xf2, 0xa2, 0x89, 0xbd, 0xa6, 0x59, 0x54, 0x3d, 0x62, 0x46, 0x04, 0x3c,
+ 0x99, 0x2f, 0x48, 0xbd, 0x22, 0x21, 0x1b, 0xbd, 0x07, 0x3b, 0xb4, 0xbd, 0x88,
+ 0x42, 0x0a, 0x3e, 0x7e, 0x29, 0xc3, 0xbb, 0xab, 0x7a, 0x86, 0x3d, 0xe7, 0x26,
+ 0xc0, 0x3c, 0xac, 0x99, 0x0f, 0xbd, 0x6e, 0xdb, 0x74, 0x3d, 0xba, 0x02, 0xdb,
+ 0x3d, 0x3c, 0x38, 0xae, 0x3d, 0xdf, 0x34, 0xe1, 0xbd, 0x53, 0xa6, 0x26, 0xbe,
+ 0x26, 0xa7, 0x82, 0x3d, 0x7b, 0x0f, 0x03, 0xbe, 0x85, 0xb6, 0xaa, 0xbc, 0xc5,
+ 0x08, 0xbf, 0x3c, 0x4f, 0xd1, 0xa8, 0xbb, 0x9f, 0x58, 0xa6, 0x3c, 0x51, 0xdc,
+ 0xfb, 0x3d, 0x2e, 0x30, 0xab, 0xbd, 0x38, 0x19, 0x19, 0x3c, 0xa2, 0x6a, 0x7c,
+ 0x3d, 0x1d, 0x52, 0xd5, 0xbc, 0x15, 0x5f, 0xb3, 0x3b, 0x9b, 0xd8, 0x75, 0xbd,
+ 0x5f, 0xa1, 0x13, 0xbd, 0xdc, 0xc7, 0xfd, 0xbb, 0x44, 0x9b, 0x73, 0xbd, 0x41,
+ 0x1d, 0x82, 0xbd, 0xa7, 0x0b, 0x15, 0x3c, 0x87, 0x91, 0x80, 0x3c, 0x74, 0x55,
+ 0xab, 0xbd, 0xf4, 0xb6, 0x3d, 0x3b, 0xa7, 0x2c, 0xcd, 0xbd, 0x19, 0xa5, 0x96,
+ 0xbc, 0xea, 0x8f, 0xfa, 0x3d, 0x98, 0x47, 0x12, 0xbd, 0xfc, 0x40, 0x62, 0x3d,
+ 0x72, 0x61, 0xa0, 0xbd, 0x79, 0x4d, 0x71, 0x3d, 0x2f, 0x4a, 0x89, 0x3d, 0xb8,
+ 0xdc, 0x98, 0x3d, 0x66, 0x46, 0x6f, 0x3d, 0xa2, 0xf2, 0x0d, 0x3d, 0x36, 0xf5,
+ 0xd4, 0x3c, 0xb9, 0xe5, 0x88, 0x3d, 0xa4, 0x93, 0x05, 0x3e, 0x64, 0x7e, 0x18,
+ 0xbe, 0xb6, 0x47, 0x76, 0x3d, 0x8e, 0x31, 0xca, 0x3d, 0x2f, 0x72, 0xf3, 0x3d,
+ 0x73, 0x45, 0x0d, 0x3e, 0xf4, 0x52, 0xfa, 0xbc, 0x40, 0x37, 0x88, 0xbd, 0x44,
+ 0x13, 0xae, 0xbc, 0x25, 0x7e, 0x0a, 0xbd, 0xbe, 0x26, 0x45, 0xbd, 0x2c, 0xf1,
+ 0x37, 0x3d, 0x29, 0xbd, 0x9f, 0xbd, 0xcb, 0xff, 0x1c, 0xbd, 0x62, 0xf2, 0xa0,
+ 0xba, 0x20, 0x57, 0xa8, 0xbc, 0xaa, 0xc1, 0x9c, 0xbd, 0xfb, 0xd0, 0x3b, 0x3d,
+ 0xe2, 0xae, 0x3f, 0x3d, 0x41, 0x4d, 0x93, 0x3d, 0x28, 0x11, 0xcc, 0x3d, 0x52,
+ 0x6e, 0x06, 0x3e, 0x8f, 0x9b, 0xc0, 0x3d, 0x40, 0xb0, 0xa4, 0xbc, 0xb0, 0x45,
+ 0x86, 0x3d, 0xc9, 0x85, 0x40, 0xbd, 0xfa, 0xdb, 0xe3, 0xbd, 0xf3, 0x0e, 0x9b,
+ 0x3d, 0x48, 0x39, 0x03, 0xbe, 0xc4, 0xfc, 0x2f, 0xbd, 0xb9, 0xbf, 0xbe, 0x3d,
+ 0xd9, 0x2f, 0x11, 0xbd, 0x71, 0x6a, 0x75, 0x3c, 0x89, 0x2b, 0xc2, 0xbd, 0x21,
+ 0x82, 0xd4, 0xbd, 0x36, 0xcc, 0xf5, 0x3d, 0xa3, 0x91, 0x3d, 0x3d, 0x16, 0xd1,
+ 0x7d, 0xbd, 0x40, 0xba, 0x75, 0x3b, 0x5a, 0x82, 0xfa, 0x3d, 0xc1, 0x09, 0xaf,
+ 0x3d, 0x1e, 0x44, 0xa3, 0x3d, 0xd7, 0x2a, 0x37, 0xbd, 0xd9, 0x72, 0xcc, 0x3d,
+ 0x58, 0x58, 0x9a, 0xbd, 0xea, 0x90, 0x35, 0xbc, 0x0e, 0x69, 0x92, 0x3c, 0x68,
+ 0x7e, 0x5c, 0xbc, 0x0a, 0xba, 0x55, 0x3d, 0x7e, 0xd4, 0xb9, 0x3b, 0x45, 0x5b,
+ 0xe7, 0xbd, 0x6b, 0xe6, 0xd5, 0xbc, 0xbc, 0x3e, 0x14, 0xbd, 0xe8, 0xb5, 0x09,
+ 0x3d, 0xbd, 0xde, 0xaf, 0x3d, 0xcf, 0x2d, 0x94, 0xbd, 0x12, 0x0f, 0xac, 0x3d,
+ 0x21, 0x99, 0xc2, 0xbd, 0x45, 0x93, 0x0d, 0x3d, 0x8a, 0x1e, 0xe4, 0x3d, 0xe8,
+ 0xfe, 0xb2, 0x3d, 0x0e, 0x69, 0xb8, 0xbd, 0xab, 0x2a, 0x91, 0xbc, 0x02, 0x24,
+ 0x8f, 0xbd, 0xef, 0x96, 0xa7, 0x3b, 0x39, 0x39, 0xda, 0xbd, 0x31, 0x03, 0xcd,
+ 0x3d, 0xe5, 0xf7, 0x4c, 0x3c, 0xca, 0x45, 0x3f, 0x3c, 0xb4, 0xf6, 0x8c, 0xbd,
+ 0x4a, 0x36, 0x4f, 0x3c, 0x5c, 0xe7, 0x56, 0x3d, 0xe3, 0x81, 0xd6, 0xbd, 0x44,
+ 0x9d, 0x3d, 0xbd, 0xb2, 0xf5, 0xe2, 0x3d, 0xaa, 0xd0, 0xff, 0xbc, 0x49, 0x86,
+ 0x4b, 0x3d, 0x79, 0x40, 0x51, 0xbd, 0x60, 0xd2, 0x91, 0xbd, 0x9d, 0x61, 0x26,
+ 0xbe, 0x32, 0x82, 0xe5, 0x3d, 0xa3, 0x28, 0xc5, 0xbc, 0x3f, 0x02, 0x08, 0xbd,
+ 0x9b, 0xe8, 0xca, 0x3d, 0xb4, 0x34, 0xed, 0x3c, 0x48, 0x7f, 0xea, 0x3d, 0xd6,
+ 0x07, 0xa1, 0xbd, 0xf9, 0xad, 0x18, 0x3c, 0xba, 0x0d, 0x8b, 0x3d, 0xa6, 0x13,
+ 0x0f, 0x3e, 0x25, 0xfc, 0x99, 0x3c, 0xc4, 0x8e, 0xc1, 0x3c, 0xfe, 0xa2, 0x14,
+ 0x3d, 0x0f, 0x96, 0xd5, 0xbc, 0x21, 0x99, 0xbb, 0xbc, 0xd7, 0x9c, 0xd1, 0x3d,
+ 0x14, 0xd2, 0xa2, 0x3d, 0x8b, 0x64, 0xd9, 0xbd, 0x11, 0x36, 0xa2, 0x3c, 0xec,
+ 0xbe, 0x24, 0xbd, 0x9f, 0x0f, 0x2a, 0x3d, 0x9d, 0xd5, 0xa6, 0xbd, 0xba, 0xe4,
+ 0x83, 0xbd, 0xc1, 0xce, 0x45, 0xbd, 0x4a, 0x99, 0x8c, 0xbd, 0xa0, 0x8d, 0x99,
+ 0x3b, 0xf1, 0x4b, 0x7a, 0xbc, 0x9d, 0x76, 0xd1, 0xbd, 0x65, 0x96, 0xd5, 0x3d,
+ 0x65, 0xd5, 0x0a, 0xbd, 0x03, 0xb9, 0x60, 0x3c, 0xbe, 0xb3, 0x0e, 0xbe, 0xf3,
+ 0x86, 0xf3, 0x3d, 0x28, 0xc1, 0x0f, 0x3d, 0x88, 0x69, 0xc0, 0xbc, 0x0e, 0x06,
+ 0x7e, 0x3d, 0x42, 0x82, 0xa5, 0x3d, 0x28, 0x95, 0x1b, 0x3d, 0xb7, 0x6d, 0xac,
+ 0xbd, 0xe0, 0xc9, 0x14, 0xbd, 0x5c, 0xf4, 0xb3, 0x3d, 0x74, 0x9e, 0xd4, 0xbd,
+ 0x8d, 0x9a, 0xed, 0x3c, 0x9c, 0xe3, 0x01, 0x3d, 0x08, 0x0d, 0xc5, 0xbd, 0xc5,
+ 0xba, 0xa7, 0xbd, 0xf2, 0xf8, 0x30, 0x3c, 0x41, 0x3c, 0xa8, 0x3d, 0x15, 0x63,
+ 0x60, 0xbd, 0x31, 0x27, 0xc6, 0xbc, 0x61, 0x0f, 0xe8, 0xbd, 0xcf, 0x0c, 0xbb,
+ 0xbc, 0xf5, 0x06, 0xbd, 0x3d, 0x99, 0x20, 0xb4, 0x3c, 0x5c, 0x27, 0x2d, 0xbd,
+ 0x5f, 0x29, 0x4b, 0xbd, 0xe6, 0x17, 0xef, 0x3d, 0x9c, 0x60, 0x84, 0xbd, 0x6a,
+ 0x76, 0xce, 0x3d, 0xf7, 0x48, 0x92, 0x3d, 0x6a, 0x72, 0xa3, 0x3d, 0x07, 0x7e,
+ 0x04, 0x3e, 0x71, 0x2a, 0xa8, 0x3d, 0x9a, 0x94, 0x74, 0x3d, 0x78, 0x1b, 0xf6,
+ 0x3d, 0x98, 0x1e, 0xfd, 0xbc, 0x3a, 0xf5, 0xc4, 0x39, 0x5f, 0x45, 0xc6, 0x3d,
+ 0x14, 0xc4, 0x8b, 0x3d, 0xea, 0x0c, 0x16, 0xbd, 0x43, 0x08, 0x98, 0x3c, 0x42,
+ 0x6d, 0x04, 0x3d, 0x8f, 0x4f, 0xc5, 0xbd, 0x88, 0x9e, 0x35, 0xbd, 0xfd, 0x1d,
+ 0xfc, 0xbc, 0x82, 0x9f, 0xa5, 0x3c, 0xfe, 0xe2, 0x30, 0xbc, 0x6a, 0x80, 0xf1,
+ 0x3c, 0xc0, 0x61, 0x39, 0x3d, 0xcd, 0x81, 0x08, 0xbe, 0x6f, 0xa9, 0xa9, 0xbd,
+ 0x51, 0x50, 0x2b, 0xba, 0xaa, 0xd4, 0xa1, 0xbd, 0x13, 0x64, 0xdf, 0xbd, 0xa4,
+ 0xd4, 0x5c, 0xbc, 0x2d, 0x83, 0xad, 0xbd, 0xc3, 0x31, 0x07, 0x3d, 0x7d, 0x7a,
+ 0x97, 0xbc, 0xa7, 0x23, 0xf7, 0xbd, 0x61, 0x7f, 0xda, 0xbd, 0x1d, 0x39, 0xd4,
+ 0xbd, 0x0b, 0x50, 0x8f, 0xbc, 0xfc, 0xa2, 0x06, 0x3e, 0x7b, 0x0e, 0x90, 0x3d,
+ 0xf8, 0xa0, 0x9d, 0xbd, 0x25, 0x0f, 0x6d, 0x3d, 0xae, 0x7f, 0xb7, 0xbc, 0xe9,
+ 0x1f, 0x10, 0xbe, 0x5b, 0x7f, 0x52, 0xbd, 0xe5, 0x86, 0x0d, 0xbd, 0x03, 0x12,
+ 0x58, 0x3c, 0xee, 0x04, 0xaa, 0xbd, 0x08, 0x85, 0x0a, 0x3d, 0x73, 0x0b, 0x93,
+ 0xbd, 0x4c, 0x42, 0x0d, 0xbd, 0xe9, 0xa4, 0x7f, 0x3d, 0x3b, 0x8a, 0xa8, 0x3c,
+ 0xa6, 0x4d, 0x88, 0x3d, 0x44, 0xe9, 0x1e, 0x3c, 0x05, 0x39, 0xd0, 0x3d, 0x09,
+ 0xc4, 0xc7, 0x3b, 0xdb, 0x43, 0x88, 0xbd, 0xb2, 0x44, 0x9d, 0x3d, 0x00, 0x42,
+ 0x13, 0xbe, 0x25, 0x15, 0x9a, 0x3d, 0xee, 0x5d, 0x9d, 0x3d, 0x04, 0x63, 0x5b,
+ 0xbb, 0x67, 0x1c, 0x9e, 0x3d, 0xe1, 0x8e, 0xb4, 0x3d, 0x68, 0xae, 0x8c, 0x3d,
+ 0x1a, 0xdc, 0xac, 0x3d, 0xdb, 0x00, 0x86, 0x3d, 0x60, 0xb7, 0x07, 0xbd, 0x92,
+ 0x7c, 0xbc, 0xbd, 0x47, 0xb6, 0x8f, 0x3c, 0x16, 0x03, 0xc1, 0x3d, 0xbb, 0x65,
+ 0x94, 0x3d, 0x0c, 0x98, 0x05, 0xbd, 0xf1, 0xe1, 0xc2, 0x3d, 0xb5, 0xf2, 0x01,
+ 0xbe, 0xf2, 0xe0, 0x01, 0x3d, 0xb4, 0x4a, 0xa5, 0x3d, 0x7c, 0x67, 0x97, 0x3d,
+ 0xa4, 0xbe, 0x52, 0x3d, 0x17, 0x60, 0x1c, 0x3d, 0x95, 0x83, 0x5b, 0xbc, 0x33,
+ 0x59, 0xd3, 0xbd, 0x45, 0x05, 0xf7, 0xbd, 0xa5, 0x82, 0xbe, 0x3d, 0x91, 0xc4,
+ 0x46, 0x3d, 0x5c, 0x4b, 0x27, 0xb8, 0x32, 0xe3, 0xf9, 0x3c, 0xdf, 0xcb, 0xcc,
+ 0x3d, 0xc3, 0x94, 0x6f, 0xbd, 0x10, 0xa2, 0xec, 0x3d, 0x2e, 0xaf, 0x09, 0xbc,
+ 0x49, 0x91, 0x8d, 0x3d, 0x6e, 0xc8, 0xc5, 0xbc, 0x45, 0x0e, 0x66, 0xbc, 0x37,
+ 0xd6, 0xfd, 0xbc, 0x2a, 0xea, 0x81, 0xbd, 0xf7, 0xc2, 0xc2, 0x3d, 0x12, 0x27,
+ 0x6b, 0x3c, 0x97, 0x69, 0xf3, 0x3b, 0xc8, 0xb7, 0xa6, 0xbc, 0xd6, 0xdf, 0x96,
+ 0xbc, 0xe0, 0x8a, 0x1b, 0x3e, 0xe3, 0x34, 0xc5, 0x3c, 0x96, 0xcd, 0x12, 0xbe,
+ 0xcd, 0x75, 0x5a, 0x3c, 0x81, 0xd5, 0xd6, 0xbd, 0x2f, 0x97, 0x6e, 0xbd, 0x92,
+ 0x28, 0x45, 0xbc, 0x81, 0xaf, 0xce, 0x3d, 0xc3, 0x35, 0xd3, 0x3d, 0x97, 0x1f,
+ 0x99, 0x3c, 0x48, 0xb6, 0x5b, 0x3d, 0x98, 0x96, 0x9d, 0x3d, 0xed, 0x0a, 0xa3,
+ 0x3c, 0x5e, 0x72, 0xe5, 0xbb, 0xad, 0x65, 0xaa, 0xbd, 0x16, 0x57, 0x8c, 0xbd,
+ 0x4a, 0x37, 0x6b, 0xbd, 0x18, 0x35, 0xbe, 0xbd, 0xa8, 0xaa, 0x07, 0xbd, 0xbe,
+ 0xcb, 0xf5, 0xbb, 0xbe, 0x69, 0xad, 0x3c, 0x1f, 0x82, 0x54, 0x3d, 0x32, 0xbe,
+ 0x87, 0xbd, 0x67, 0x54, 0x41, 0x3d, 0x46, 0xb6, 0x2e, 0xbd, 0x04, 0xb2, 0x75,
+ 0x3c, 0xb8, 0xf0, 0xcd, 0xbc, 0x63, 0x01, 0x7f, 0x3d, 0x92, 0xb6, 0x84, 0xbd,
+ 0x43, 0x6b, 0xe0, 0x3d, 0x4a, 0xa8, 0xb3, 0x3c, 0x05, 0x93, 0x8f, 0xbd, 0xca,
+ 0xa0, 0x84, 0x3d, 0x84, 0x4b, 0x27, 0x3e, 0x68, 0xce, 0xe2, 0xbd, 0x30, 0x5d,
+ 0x22, 0x3d, 0xa3, 0x3c, 0xc0, 0x3d, 0xc3, 0xa5, 0x37, 0xbd, 0xc8, 0xb2, 0xa3,
+ 0x3d, 0x79, 0xee, 0x82, 0x3d, 0xc6, 0xb3, 0xab, 0x3a, 0x72, 0xa4, 0x65, 0xbb,
+ 0x5c, 0x20, 0xa7, 0x3d, 0xdd, 0xd9, 0xe5, 0xba, 0xbe, 0xcb, 0x9d, 0xbd, 0xdc,
+ 0x19, 0xc5, 0xbd, 0xa8, 0x93, 0xc8, 0x3d, 0x4d, 0x2f, 0x1a, 0x3d, 0x24, 0x73,
+ 0xa2, 0x3d, 0x11, 0xb1, 0x08, 0x3e, 0x8a, 0x27, 0xcf, 0x3d, 0xb6, 0xee, 0xab,
+ 0xbd, 0x1f, 0xd7, 0xe1, 0x3d, 0x5d, 0xcf, 0x5f, 0xbd, 0x8e, 0xa9, 0xb0, 0x3c,
+ 0x86, 0xb9, 0x31, 0x3d, 0xd7, 0xa8, 0x92, 0xbd, 0x7f, 0x37, 0xd0, 0x3d, 0x4c,
+ 0xbb, 0xb6, 0x3d, 0xa4, 0x4d, 0x09, 0xbd, 0xc5, 0x8e, 0x0f, 0xbd, 0xbf, 0x27,
+ 0xa8, 0xbd, 0x62, 0x94, 0xb2, 0x3d, 0x2d, 0x35, 0xe8, 0x3d, 0xd5, 0x78, 0xee,
+ 0xbd, 0x2a, 0x5b, 0x5a, 0xbd, 0x72, 0x89, 0x4d, 0x3d, 0x7f, 0x5b, 0xfd, 0xb8,
+ 0x11, 0x80, 0x58, 0xbd, 0x69, 0xa9, 0xbc, 0xbc, 0xdb, 0xe9, 0xd3, 0xbc, 0x45,
+ 0x3b, 0xf5, 0xbc, 0xa6, 0x28, 0xc5, 0x3d, 0xe2, 0x48, 0x31, 0x3d, 0x49, 0xab,
+ 0x36, 0x3b, 0xca, 0xd2, 0xc6, 0xbc, 0x29, 0x1f, 0x5a, 0x3d, 0x90, 0xe6, 0x3b,
+ 0xbd, 0xf7, 0x5f, 0xa0, 0x3d, 0xb7, 0xc1, 0x91, 0x3d, 0x18, 0xcc, 0xc4, 0x3c,
+ 0x0a, 0xc0, 0x8a, 0xbd, 0x2a, 0x5e, 0x63, 0xbd, 0xa1, 0x2f, 0xb7, 0xbc, 0xf2,
+ 0xfb, 0xac, 0x3b, 0xa4, 0xed, 0x17, 0x3d, 0xc1, 0x09, 0x59, 0xbd, 0xe9, 0xf7,
+ 0xf4, 0x3d, 0xad, 0xe5, 0x8f, 0xbd, 0xa9, 0x9e, 0xd0, 0x3d, 0x0a, 0x98, 0x40,
+ 0xbd, 0xbc, 0x1f, 0x95, 0x3d, 0x0b, 0x17, 0xf0, 0x3c, 0x64, 0x3f, 0x60, 0xbd,
+ 0xc0, 0xb2, 0xc7, 0x3b, 0x42, 0x3f, 0x62, 0x3c, 0x6a, 0x39, 0x8c, 0xbd, 0xbf,
+ 0x72, 0xfd, 0xbd, 0x47, 0x3d, 0xd1, 0xbd, 0x7c, 0x0b, 0x6d, 0x3d, 0xf3, 0x4a,
+ 0xda, 0xbc, 0xce, 0x57, 0x9d, 0x3d, 0xf0, 0x13, 0x53, 0x3b, 0x94, 0x39, 0x31,
+ 0x3d, 0x3d, 0xa7, 0x3f, 0xbd, 0xfa, 0x3e, 0x6b, 0x3d, 0xfb, 0x19, 0xa9, 0x3d,
+ 0x07, 0xfc, 0x5e, 0xbd, 0xfa, 0x47, 0xd3, 0x3d, 0xd6, 0x83, 0x9a, 0xbd, 0x2c,
+ 0xa9, 0x14, 0x3e, 0x01, 0xb5, 0x7e, 0x3d, 0x27, 0xfb, 0x00, 0x3a, 0x7d, 0xe5,
+ 0x35, 0xbd, 0x68, 0x50, 0x05, 0xbc, 0x87, 0xdb, 0x19, 0x3d, 0xbe, 0x2e, 0xe3,
+ 0x3d, 0xe4, 0x41, 0x07, 0xbd, 0x53, 0x57, 0xcc, 0xb9, 0x28, 0x92, 0x96, 0x3d,
+ 0xb6, 0x14, 0xa4, 0xbc, 0xad, 0x84, 0x69, 0x3c, 0x19, 0xe4, 0xde, 0xbd, 0x3b,
+ 0xad, 0x04, 0xbe, 0xd9, 0xe3, 0xbc, 0x3d, 0x5b, 0x59, 0xd3, 0x3d, 0x00, 0x12,
+ 0xcc, 0xbd, 0x2d, 0x0c, 0x8a, 0xbd, 0xc6, 0x1c, 0x79, 0x3d, 0x03, 0xf3, 0x14,
+ 0xbc, 0xb7, 0x28, 0xa6, 0x3d, 0x28, 0x0d, 0xa5, 0xbd, 0xa9, 0x8e, 0x32, 0x3b,
+ 0x60, 0xef, 0x30, 0x3d, 0x21, 0x9f, 0x68, 0xbc, 0x13, 0x02, 0x83, 0xbc, 0x21,
+ 0x90, 0x9e, 0x3c, 0x78, 0xfa, 0xf4, 0xbc, 0xf9, 0x40, 0x6e, 0x3a, 0x11, 0xdb,
+ 0x05, 0x3e, 0xc1, 0xb7, 0xff, 0x3b, 0x04, 0x47, 0x65, 0xbd, 0x6b, 0x8a, 0x85,
+ 0xbd, 0x30, 0xd5, 0x95, 0x3d, 0x3c, 0x4a, 0x92, 0x3d, 0xa6, 0x20, 0x11, 0x3d,
+ 0x03, 0xd8, 0xb1, 0x3c, 0x7d, 0x1e, 0x0b, 0xbd, 0xe9, 0x0a, 0x92, 0x3d, 0x7e,
+ 0x9d, 0xb8, 0x3c, 0xb5, 0x1e, 0x6d, 0x3d, 0x6d, 0x4e, 0x6f, 0x3d, 0xbc, 0x1e,
+ 0xdc, 0x3c, 0x2e, 0x87, 0xa0, 0x3d, 0x2d, 0x00, 0x5c, 0xb8, 0x8f, 0xfb, 0xb3,
+ 0xbd, 0x9e, 0x36, 0x08, 0x3d, 0xa4, 0x19, 0xe0, 0xbb, 0x5f, 0xc0, 0xb7, 0xbb,
+ 0xc7, 0x3c, 0x78, 0x3d, 0x53, 0xe4, 0x65, 0x3d, 0xca, 0xdf, 0xc9, 0x3d, 0x18,
+ 0x8b, 0x27, 0xbd, 0x19, 0x05, 0xa6, 0x3d, 0x23, 0xa2, 0xa2, 0x3d, 0xc2, 0x4b,
+ 0xac, 0xbd, 0x1b, 0x23, 0xd7, 0xbd, 0xc2, 0x53, 0x97, 0x3d, 0x2e, 0xb2, 0x45,
+ 0xbd, 0x73, 0x7b, 0xbc, 0xbd, 0x33, 0xfc, 0x47, 0xbc, 0x0b, 0x36, 0x91, 0x3d,
+ 0xaa, 0x1e, 0x0b, 0xbd, 0xc8, 0x3a, 0xda, 0x3c, 0x22, 0x29, 0xc5, 0x3d, 0x62,
+ 0x18, 0xf3, 0x3c, 0x75, 0x25, 0xc1, 0xbc, 0xe8, 0x19, 0xb8, 0x3d, 0x30, 0x46,
+ 0x47, 0x3d, 0x22, 0x80, 0x9f, 0xbc, 0x59, 0xcc, 0xcf, 0x3d, 0x00, 0x51, 0x95,
+ 0xbc, 0x8b, 0x00, 0xbf, 0xbc, 0xf5, 0xca, 0x89, 0xbd, 0xca, 0x56, 0xe4, 0x3d,
+ 0x7f, 0x86, 0x24, 0x3e, 0x23, 0xd7, 0x14, 0x3d, 0xe2, 0x8f, 0xa7, 0xbc, 0x1d,
+ 0x6d, 0xb3, 0x3c, 0xa4, 0x8a, 0x85, 0xbd, 0x4a, 0x36, 0x40, 0xbd, 0x20, 0xa4,
+ 0xa7, 0xbd, 0xfe, 0x10, 0xa3, 0xbc, 0xa3, 0x3b, 0xce, 0x3d, 0x88, 0x99, 0x12,
+ 0xbd, 0x3d, 0x58, 0xd5, 0xbd, 0x76, 0xe5, 0x7f, 0x3c, 0x87, 0xa0, 0x68, 0xbd,
+ 0x8a, 0xd4, 0xb7, 0xbd, 0xdb, 0x68, 0x6f, 0x3c, 0x22, 0x84, 0x2e, 0xbc, 0x94,
+ 0x63, 0xa6, 0xbc, 0x35, 0xa4, 0xa9, 0x3d, 0x17, 0xec, 0x0d, 0xbd, 0xd4, 0x25,
+ 0x9b, 0xbd, 0xf1, 0x84, 0x04, 0xbd, 0x3a, 0x19, 0xdd, 0x3d, 0xd8, 0xba, 0xb1,
+ 0x3d, 0xb2, 0xb7, 0x21, 0xbd, 0xeb, 0x7e, 0x19, 0x3d, 0xb9, 0xd3, 0xb9, 0x3b,
+ 0xa5, 0x6a, 0x88, 0xbd, 0xdc, 0x78, 0x99, 0xbd, 0xf4, 0x9f, 0xc4, 0x3d, 0x23,
+ 0xfe, 0x49, 0xbb, 0xbe, 0xa0, 0x98, 0xbb, 0x05, 0xe8, 0x84, 0xbd, 0x0e, 0x24,
+ 0x20, 0x3d, 0x30, 0x96, 0x80, 0xbd, 0xd8, 0x1e, 0xef, 0x3c, 0x0a, 0xad, 0xfe,
+ 0x3d, 0xa3, 0xaa, 0x3b, 0xbd, 0x24, 0xd1, 0xb9, 0xbd, 0xfd, 0xb4, 0xd6, 0x3c,
+ 0xe7, 0xfe, 0xe9, 0xbb, 0xf7, 0xd6, 0xaa, 0x3c, 0xa5, 0x35, 0xc1, 0xbc, 0x39,
+ 0xbd, 0x00, 0xbe, 0x19, 0xed, 0x3b, 0x3d, 0x7f, 0x4e, 0x99, 0x3d, 0x09, 0x63,
+ 0xe3, 0xbd, 0x74, 0xc3, 0x73, 0xbd, 0xb7, 0x7d, 0xa4, 0x3d, 0x68, 0x37, 0x50,
+ 0xbd, 0xb0, 0xb0, 0xe8, 0xbd, 0x28, 0x4f, 0xa7, 0xbd, 0x22, 0x85, 0x9e, 0xbd,
+ 0x32, 0xce, 0x12, 0x3e, 0x60, 0x47, 0xbb, 0x3c, 0xdb, 0xa8, 0xc6, 0x3d, 0x50,
+ 0xcf, 0x0c, 0x3d, 0x4b, 0x7d, 0x9c, 0x3b, 0xa9, 0xeb, 0xb9, 0xbd, 0x07, 0x97,
+ 0x13, 0x3c, 0xbe, 0x6b, 0x8f, 0xbd, 0x9c, 0xb3, 0xa9, 0x3d, 0x64, 0xd6, 0x96,
+ 0xbd, 0x75, 0x6a, 0xc4, 0x3c, 0x20, 0xb6, 0x7e, 0x3d, 0x9b, 0x0e, 0x0c, 0x3e,
+ 0xf3, 0xd5, 0xc5, 0x3d, 0x54, 0xb8, 0xdf, 0xbd, 0x12, 0x6e, 0xf2, 0x3a, 0x7b,
+ 0xe4, 0xaa, 0x3c, 0xe3, 0x7c, 0xb5, 0xbd, 0xe6, 0x11, 0x05, 0x3d, 0xc6, 0x65,
+ 0xa2, 0x3d, 0x95, 0x9e, 0x0c, 0x3d, 0x7f, 0xfe, 0xea, 0xbc, 0x22, 0x51, 0xcf,
+ 0x3b, 0x7b, 0xdd, 0x98, 0xbc, 0x6e, 0x2f, 0xba, 0xbc, 0xb3, 0x8e, 0xe6, 0xbd,
+ 0x5e, 0x5e, 0x76, 0x3d, 0x3e, 0xd4, 0xaf, 0xbd, 0x25, 0xbc, 0xa8, 0x3d, 0xb0,
+ 0xd0, 0x81, 0x3c, 0x4c, 0x3f, 0x52, 0x3c, 0x10, 0xd7, 0x13, 0xbd, 0xd0, 0x83,
+ 0x02, 0x3e, 0xd3, 0x03, 0xa5, 0x3d, 0xeb, 0xa7, 0xca, 0xbd, 0x91, 0x09, 0x1b,
+ 0x3d, 0x7a, 0x8c, 0xbf, 0x3c, 0x89, 0x04, 0xdb, 0xbd, 0xf8, 0xfc, 0x56, 0xbd,
+ 0x8a, 0x66, 0x36, 0x3d, 0x42, 0x8f, 0x6e, 0xbd, 0xc9, 0x79, 0x87, 0x3d, 0xbf,
+ 0xfb, 0x26, 0x3d, 0x56, 0xeb, 0xbc, 0xbb, 0x3b, 0xa7, 0x17, 0x3d, 0x17, 0x46,
+ 0x27, 0x3d, 0x87, 0xfb, 0xb4, 0x3d, 0x09, 0x7b, 0x9d, 0xbc, 0xf4, 0xdc, 0x30,
+ 0x3d, 0xca, 0xee, 0xf7, 0xbd, 0x08, 0x73, 0xec, 0x3d, 0x60, 0xed, 0x24, 0x3d,
+ 0x77, 0xa3, 0x26, 0x3c, 0x07, 0x95, 0xe2, 0x3c, 0x27, 0x2f, 0xde, 0x3c, 0xd3,
+ 0x8a, 0x94, 0xbc, 0x58, 0x57, 0xaa, 0xbd, 0x86, 0xdd, 0x0d, 0x3d, 0x29, 0x14,
+ 0x56, 0x3d, 0x94, 0xdf, 0xa8, 0x3d, 0x33, 0x86, 0xbd, 0x3d, 0xb2, 0x8a, 0x7b,
+ 0x3c, 0x8d, 0x7b, 0x26, 0xbc, 0x2f, 0x59, 0xb8, 0xbd, 0x65, 0xc2, 0x87, 0xbd,
+ 0xd3, 0x4b, 0x76, 0x3d, 0x16, 0x20, 0x22, 0x3d, 0xb9, 0xef, 0x62, 0x3b, 0xda,
+ 0x3b, 0x6b, 0x3d, 0xce, 0x75, 0x59, 0x3d, 0x90, 0xde, 0x33, 0x3d, 0x77, 0x8b,
+ 0xf7, 0x3d, 0x98, 0xfd, 0xa0, 0xbd, 0xcc, 0xa0, 0xd2, 0x3d, 0xec, 0x73, 0x84,
+ 0xbd, 0x2c, 0x7a, 0x34, 0x3c, 0xbd, 0x44, 0x07, 0x3e, 0xd8, 0xf6, 0x74, 0xbd,
+ 0x0a, 0x72, 0x8c, 0xbd, 0xad, 0xd3, 0xd5, 0xbd, 0x78, 0xf7, 0xc9, 0x3d, 0x28,
+ 0xef, 0x5f, 0x3d, 0x01, 0xbf, 0x80, 0xbd, 0xcc, 0xd6, 0x01, 0xbd, 0x37, 0x34,
+ 0x75, 0xbd, 0x4a, 0x00, 0x87, 0x3d, 0x4c, 0xd9, 0x4c, 0xbb, 0xcd, 0x86, 0x42,
+ 0xbd, 0x7b, 0xef, 0x1a, 0x3d, 0x98, 0x2b, 0x3a, 0x3d, 0x97, 0x7a, 0x18, 0x3c,
+ 0xd0, 0x24, 0xe6, 0xbd, 0xcd, 0xc5, 0xc2, 0x3c, 0x8d, 0x69, 0x7f, 0xbc, 0xed,
+ 0xef, 0x88, 0xbd, 0x54, 0x72, 0xd6, 0x3d, 0xc4, 0x5b, 0xba, 0x3d, 0x13, 0xd9,
+ 0x1d, 0xbd, 0xa9, 0x69, 0xd5, 0x3d, 0xf6, 0xab, 0x4b, 0x3d, 0xaf, 0x3c, 0xab,
+ 0x3d, 0xad, 0x17, 0x02, 0x3d, 0xfe, 0x82, 0x97, 0xbd, 0xe7, 0x5b, 0xca, 0x3d,
+ 0x0d, 0x04, 0x1b, 0x3d, 0x6a, 0x95, 0xb5, 0x3d, 0xa7, 0x5f, 0xc5, 0x3d, 0x57,
+ 0xf4, 0xdc, 0x3d, 0x25, 0xf3, 0xa2, 0xbd, 0xad, 0x96, 0xd3, 0x3d, 0x16, 0xb7,
+ 0x2f, 0xbe, 0x61, 0x4c, 0xaa, 0x3d, 0x71, 0x82, 0xcc, 0x3d, 0x44, 0x36, 0xbb,
+ 0x3d, 0xba, 0x8f, 0xca, 0xbc, 0xe0, 0xa3, 0x63, 0x3c, 0xfa, 0x02, 0xb3, 0xbd,
+ 0x0a, 0xcf, 0x00, 0xbe, 0x4b, 0xce, 0x7e, 0xbd, 0xe9, 0x90, 0xcf, 0x3b, 0x32,
+ 0x0d, 0xa9, 0xbd, 0x54, 0x4d, 0x42, 0x3d, 0x30, 0x36, 0x32, 0x3d, 0x04, 0xa6,
+ 0xb2, 0xbd, 0x79, 0x05, 0x0a, 0x3e, 0xbb, 0x45, 0xe6, 0x3c, 0xfd, 0xf6, 0x79,
+ 0x3d, 0x1c, 0x9f, 0x1d, 0x3d, 0xe5, 0x27, 0x97, 0x3c, 0x31, 0xf4, 0x02, 0xbd,
+ 0x30, 0x19, 0x45, 0x3d, 0xa4, 0x54, 0x06, 0x3d, 0x94, 0x4d, 0xb9, 0xbd, 0x3b,
+ 0x21, 0xdf, 0xbd, 0xbb, 0x79, 0x1f, 0xbd, 0x41, 0x34, 0x9f, 0x3d, 0x02, 0x58,
+ 0xb8, 0x3d, 0xe1, 0xb2, 0x03, 0xbe, 0x5e, 0x71, 0x29, 0x3d, 0x9e, 0xf7, 0xbf,
+ 0xbd, 0xc7, 0x01, 0x75, 0xbd, 0x0d, 0xe3, 0x14, 0xbd, 0x38, 0x23, 0xa3, 0x3d,
+ 0x93, 0xbc, 0xaa, 0xbd, 0xc9, 0x19, 0x91, 0x3d, 0xcb, 0xba, 0x69, 0x3d, 0xfc,
+ 0xfa, 0xd7, 0x3d, 0x95, 0xd9, 0x38, 0xbd, 0x4e, 0x3f, 0x75, 0x3d, 0x73, 0xdb,
+ 0x15, 0xbe, 0xdf, 0x76, 0x8d, 0x3d, 0x0f, 0xb1, 0x13, 0x3d, 0x90, 0x32, 0x24,
+ 0x3e, 0x3a, 0x17, 0xf9, 0xbd, 0xcd, 0xd1, 0x38, 0xbd, 0x27, 0xf4, 0x9b, 0xbd,
+ 0x10, 0x6c, 0xa3, 0xbc, 0x1e, 0x12, 0x42, 0x3d, 0xee, 0x38, 0xff, 0xbc, 0xb4,
+ 0x28, 0x2e, 0x3d, 0xba, 0x69, 0xbd, 0xbc, 0x7c, 0x69, 0xbb, 0xbc, 0x1a, 0xe8,
+ 0xde, 0xbd, 0xd8, 0xa2, 0x17, 0x3c, 0xb8, 0x9e, 0xb6, 0xbb, 0xae, 0x5e, 0x96,
+ 0x3c, 0x4f, 0xbb, 0x03, 0xbd, 0x8f, 0x72, 0xb4, 0xbc, 0x94, 0x57, 0xd7, 0x3d,
+ 0xf5, 0xe3, 0xaf, 0xbc, 0xa4, 0x0c, 0x0d, 0xbd, 0x13, 0xbb, 0x83, 0x3d, 0x62,
+ 0x06, 0xda, 0x3d, 0xb7, 0xa5, 0x1c, 0x3e, 0x90, 0xd8, 0x86, 0xbd, 0xf5, 0x7e,
+ 0xd0, 0xbd, 0x8b, 0x5e, 0xcb, 0xbd, 0x0e, 0x81, 0xf5, 0xbd, 0xfe, 0xf3, 0xe4,
+ 0xbc, 0xe2, 0xc9, 0xd6, 0xbc, 0x4c, 0xa9, 0xc8, 0x3b, 0x04, 0xd2, 0x49, 0xbc,
+ 0xf0, 0xb2, 0xa5, 0xbd, 0xc7, 0xd6, 0xea, 0x3d, 0xa6, 0xa6, 0x77, 0x3d, 0xdf,
+ 0x24, 0x03, 0x3d, 0x05, 0x9e, 0x86, 0xbd, 0xce, 0x27, 0x31, 0x3d, 0x46, 0x54,
+ 0xa4, 0x3d, 0x27, 0x9b, 0x35, 0xbd, 0x28, 0x86, 0x68, 0xbb, 0x2c, 0x1e, 0xc1,
+ 0xbd, 0xda, 0x7e, 0xa2, 0x3b, 0xa6, 0xe6, 0xe9, 0x3d, 0x8a, 0xcf, 0x0f, 0x3d,
+ 0x5e, 0xf0, 0x6f, 0xbd, 0xa0, 0xc6, 0xb1, 0xbb, 0x08, 0xc6, 0x77, 0xbc, 0x6d,
+ 0x17, 0x16, 0xbd, 0xf5, 0xc6, 0x21, 0x3d, 0x70, 0x2a, 0x11, 0xbd, 0x3f, 0x5a,
+ 0x6c, 0xbd, 0xfb, 0xd9, 0xbc, 0x3d, 0x91, 0x33, 0xb4, 0x3c, 0xc1, 0xc7, 0x84,
+ 0x3d, 0xd9, 0xca, 0x41, 0xbd, 0xd8, 0x5d, 0xec, 0x3d, 0x17, 0xe2, 0x94, 0x3d,
+ 0xbf, 0x3f, 0x04, 0xbe, 0x24, 0xa8, 0x66, 0xbd, 0xc4, 0xcd, 0xc0, 0x3d, 0x07,
+ 0xce, 0x9e, 0xbd, 0x67, 0x5d, 0xe0, 0x3d, 0x9e, 0xdd, 0x1c, 0xbe, 0x77, 0xe5,
+ 0x5c, 0x3d, 0x98, 0x1f, 0xaf, 0x3d, 0x8a, 0xfd, 0x02, 0x3e, 0x9f, 0x9a, 0xba,
+ 0xbc, 0x40, 0xe9, 0xbb, 0x3c, 0x4e, 0x51, 0x10, 0xbc, 0xc6, 0xcc, 0x81, 0x3d,
+ 0x83, 0x18, 0x78, 0xbc, 0x7f, 0x25, 0xe8, 0xbd, 0x2e, 0xa6, 0xcb, 0x3c, 0x2f,
+ 0x8c, 0x3e, 0x3c, 0x38, 0xdc, 0x67, 0xbb, 0x57, 0xf8, 0xbd, 0x3d, 0xa2, 0x4b,
+ 0x13, 0x3e, 0x6d, 0x76, 0x64, 0x3d, 0xcf, 0x5e, 0x98, 0x3c, 0x09, 0xc1, 0x8a,
+ 0x3c, 0x42, 0x2b, 0x82, 0x3d, 0xa3, 0x83, 0x4a, 0x3d, 0xe3, 0x74, 0xb9, 0xbb,
+ 0x26, 0xf8, 0x62, 0x3d, 0xd6, 0x4d, 0xa4, 0xbc, 0x68, 0x44, 0x13, 0x3d, 0x3b,
+ 0x7d, 0x54, 0x3d, 0xf4, 0xdf, 0x8c, 0x3d, 0xef, 0x72, 0xcf, 0xbd, 0x4e, 0xd6,
+ 0x85, 0x3c, 0x6a, 0x11, 0x38, 0xbc, 0xa5, 0xec, 0x83, 0xbd, 0x23, 0x95, 0x86,
+ 0xbd, 0x93, 0xa0, 0xbf, 0x3c, 0x91, 0xc5, 0x11, 0xbd, 0x96, 0x1b, 0x23, 0x3d,
+ 0xbc, 0x6d, 0x00, 0x3d, 0x55, 0xb7, 0x9d, 0x3d, 0x44, 0x45, 0x8d, 0x3c, 0x83,
+ 0x34, 0x19, 0xbd, 0x1c, 0x2e, 0xbe, 0xbd, 0xfb, 0x4b, 0xd5, 0x3c, 0x25, 0xec,
+ 0xd9, 0xba, 0xe0, 0xcd, 0xa9, 0x3d, 0x72, 0x99, 0xa1, 0x3d, 0xa6, 0xa1, 0x91,
+ 0xbd, 0xc8, 0x70, 0x39, 0xbd, 0x33, 0x54, 0x24, 0x3d, 0x80, 0x25, 0xd8, 0x3c,
+ 0x3c, 0x36, 0xdb, 0x3b, 0x04, 0x22, 0x3c, 0xbd, 0xc8, 0x81, 0xfb, 0x3d, 0x89,
+ 0x15, 0xe1, 0x3d, 0xa5, 0x9d, 0x17, 0xbd, 0x68, 0xad, 0x64, 0xbd, 0xad, 0xbd,
+ 0x59, 0xbc, 0xfc, 0x1a, 0xa5, 0xbd, 0xf5, 0x88, 0x44, 0x3d, 0x53, 0xa7, 0x9b,
+ 0x3d, 0x2e, 0x00, 0x93, 0xbd, 0xbd, 0xb1, 0xb9, 0x3c, 0x61, 0x54, 0xc8, 0x3c,
+ 0xe3, 0xe9, 0xd7, 0x3d, 0x78, 0xe2, 0xe0, 0x3d, 0x6c, 0xe0, 0x08, 0xbe, 0x80,
+ 0xc2, 0xaf, 0x3d, 0x2a, 0x5c, 0x10, 0xbd, 0x60, 0xcb, 0xf0, 0x3d, 0x7a, 0xa1,
+ 0xf0, 0xbb, 0x02, 0x56, 0xa9, 0x3d, 0x11, 0xf1, 0x1c, 0x3c, 0x39, 0xec, 0xa9,
+ 0xbd, 0x73, 0xfd, 0x24, 0xbd, 0xd5, 0x86, 0x8c, 0x3d, 0xdc, 0x85, 0x21, 0x3c,
+ 0xa7, 0x6f, 0xf6, 0x3d, 0xe0, 0x6b, 0x0c, 0xbd, 0x08, 0x15, 0xf2, 0x3d, 0xd6,
+ 0x6a, 0xed, 0x3d, 0xda, 0xc1, 0x51, 0xbd, 0x27, 0x6e, 0x11, 0xbe, 0xbe, 0x8f,
+ 0xcf, 0xbc, 0xa9, 0xf1, 0x05, 0x3d, 0xa1, 0x30, 0x8d, 0xbd, 0x35, 0x5e, 0x97,
+ 0xbd, 0xee, 0x02, 0x9d, 0xbc, 0xf8, 0xba, 0xe9, 0xbd, 0x61, 0xe1, 0xb5, 0xbd,
+ 0xaa, 0x6d, 0x0c, 0xbd, 0xeb, 0x1f, 0x5d, 0xbd, 0x17, 0x11, 0xda, 0x3c, 0xe3,
+ 0x75, 0x55, 0xbd, 0x8b, 0x40, 0x4a, 0x3d, 0xb2, 0x5b, 0x17, 0xbd, 0xc2, 0xbb,
+ 0x66, 0xbd, 0x42, 0x20, 0xf7, 0x3d, 0x05, 0x75, 0xff, 0xbd, 0xce, 0xd3, 0xca,
+ 0x3c, 0x76, 0x10, 0xbb, 0x3d, 0x66, 0xa2, 0xcc, 0xbc, 0x96, 0x30, 0xf7, 0xba,
+ 0xad, 0xa8, 0x16, 0xbc, 0x32, 0x10, 0x77, 0x3b, 0x98, 0xde, 0x1f, 0xbd, 0xc7,
+ 0xd6, 0x72, 0x3d, 0x33, 0xea, 0xe1, 0x3d, 0xb5, 0x5d, 0x8d, 0x3c, 0xfe, 0xf1,
+ 0x64, 0x3d, 0x3f, 0xe1, 0x88, 0x3c, 0x0d, 0xa2, 0x92, 0x3d, 0x52, 0x90, 0x20,
+ 0xbd, 0xcd, 0x17, 0x88, 0xbd, 0xf7, 0xf1, 0x7b, 0x3d, 0x55, 0xbe, 0x9c, 0x3b,
+ 0x1a, 0x3f, 0xd1, 0x3c, 0x46, 0xbe, 0x0d, 0x3d, 0x53, 0xd7, 0xd9, 0x3d, 0xda,
+ 0x58, 0xb5, 0xbc, 0x3a, 0x41, 0x78, 0xbd, 0x78, 0xc0, 0x54, 0xbd, 0x3c, 0x27,
+ 0x10, 0x3e, 0x16, 0x00, 0xe9, 0x3b, 0x6e, 0xcd, 0xc5, 0x3d, 0xd9, 0xf0, 0x82,
+ 0x3d, 0x44, 0x3e, 0x82, 0x3d, 0xde, 0x31, 0x83, 0x3d, 0x10, 0x32, 0x4e, 0xbd,
+ 0x13, 0x46, 0xd7, 0xbd, 0x60, 0xa0, 0xbb, 0xbc, 0x33, 0xc9, 0xb0, 0xbd, 0x8d,
+ 0x52, 0xfb, 0x3d, 0x5e, 0xa7, 0x07, 0x3d, 0x05, 0xd7, 0xb7, 0x3d, 0x34, 0x8c,
+ 0x71, 0x3d, 0xcf, 0x5d, 0x66, 0xbd, 0x2a, 0x61, 0x1c, 0x3d, 0xa5, 0xa5, 0x70,
+ 0xbd, 0xd2, 0xb9, 0x67, 0x3b, 0x9e, 0x63, 0x5a, 0x3d, 0xbe, 0xea, 0xd4, 0xbc,
+ 0x57, 0xe9, 0xb5, 0x3d, 0x03, 0xe4, 0xa6, 0x3d, 0xc4, 0x6b, 0xb3, 0x3d, 0x6e,
+ 0x60, 0x9f, 0x3d, 0xac, 0x31, 0xa0, 0x3d, 0xcf, 0xcc, 0xb5, 0x3d, 0xd0, 0x80,
+ 0xd6, 0x3d, 0xb9, 0x3f, 0x96, 0xbd, 0x2d, 0x17, 0x17, 0xbb, 0x6f, 0xf2, 0xe4,
+ 0xbd, 0x17, 0x51, 0x6e, 0x3d, 0xc2, 0xe2, 0xc2, 0x3d, 0xfe, 0x71, 0x59, 0x3d,
+ 0x0e, 0x1c, 0x78, 0xbd, 0xc9, 0xc7, 0xbc, 0xbd, 0x40, 0xb0, 0xa8, 0x3d, 0xbf,
+ 0xff, 0x42, 0xbd, 0xe4, 0x2e, 0x67, 0x3d, 0xca, 0x73, 0x81, 0xbd, 0x0b, 0x0d,
+ 0xf3, 0x3d, 0xce, 0x97, 0x70, 0x3d, 0xe9, 0x59, 0xe9, 0x3d, 0x45, 0x22, 0x73,
+ 0xbd, 0x24, 0xb8, 0xdf, 0x3d, 0x96, 0xbb, 0x3f, 0x3c, 0x02, 0xed, 0x65, 0x3d,
+ 0x84, 0x40, 0x25, 0x3c, 0x6c, 0xc5, 0xd2, 0x3c, 0xea, 0x38, 0x4a, 0x3d, 0xf9,
+ 0xa2, 0xc9, 0x3d, 0x6f, 0x30, 0xbc, 0x3a, 0x2d, 0xd5, 0x81, 0xbd, 0xd2, 0xae,
+ 0xa3, 0xbb, 0x8e, 0x91, 0xe7, 0x3c, 0x28, 0x6b, 0xc4, 0xbd, 0xf3, 0x0c, 0xbf,
+ 0xbc, 0x66, 0xf8, 0xd3, 0x3b, 0x6d, 0x3e, 0x01, 0x3d, 0xf3, 0xbf, 0xc2, 0xbc,
+ 0x0d, 0xc5, 0x6f, 0xbd, 0xb7, 0x9b, 0x9c, 0x3d, 0xeb, 0x79, 0x88, 0x3d, 0x81,
+ 0x8a, 0x7d, 0xbc, 0xde, 0x8b, 0x14, 0x3d, 0xa4, 0x3f, 0x7d, 0x3d, 0xb4, 0x27,
+ 0xa9, 0x3d, 0xb7, 0x75, 0x51, 0x3d, 0xff, 0x73, 0x85, 0x3d, 0x3f, 0xf3, 0x51,
+ 0x3d, 0xe6, 0xdd, 0xe2, 0xbb, 0x83, 0xc7, 0x65, 0xbd, 0x6a, 0x16, 0xb6, 0xbd,
+ 0xcf, 0xe8, 0x90, 0x3d, 0x5b, 0xc8, 0xad, 0xbc, 0xa1, 0x27, 0x29, 0xbd, 0x57,
+ 0xbd, 0x3d, 0x3d, 0x61, 0x4e, 0x41, 0xbc, 0x21, 0x2f, 0x29, 0x3d, 0x55, 0x0b,
+ 0xba, 0x3d, 0xaa, 0x67, 0xf3, 0xba, 0x7d, 0x60, 0xe4, 0x3d, 0xab, 0xe7, 0x20,
+ 0xbd, 0x01, 0x71, 0x9f, 0x3d, 0x5a, 0xd5, 0x95, 0xbd, 0x2f, 0x75, 0xd5, 0x3d,
+ 0x7c, 0x91, 0xf6, 0x3d, 0xaa, 0xd6, 0x0c, 0x3d, 0x6d, 0x1c, 0xd9, 0xbd, 0xb4,
+ 0x4e, 0x82, 0xbc, 0x3f, 0x5a, 0x1a, 0x3b, 0xb4, 0x94, 0xfb, 0x3d, 0x0a, 0x71,
+ 0x3c, 0xbd, 0x97, 0xba, 0x12, 0xbc, 0xfd, 0x3d, 0x33, 0xbd, 0xa3, 0x4d, 0x01,
+ 0x3e, 0x54, 0xe2, 0x33, 0xbd, 0x8d, 0x32, 0x5d, 0x3d, 0x92, 0x84, 0xcb, 0x3d,
+ 0x91, 0x67, 0xde, 0xbd, 0x4b, 0xfd, 0xc7, 0xbd, 0x4b, 0x11, 0x04, 0xbe, 0x3e,
+ 0xde, 0xac, 0x3d, 0xe4, 0x9e, 0x3c, 0x3d, 0x5e, 0x7d, 0xfb, 0x3d, 0xfd, 0x4d,
+ 0xae, 0x3d, 0x63, 0xcf, 0x6f, 0xbd, 0xa0, 0x4f, 0x8b, 0x3d, 0x46, 0x2c, 0x84,
+ 0xbd, 0xda, 0x69, 0x11, 0x3b, 0xca, 0x5b, 0x1c, 0xbd, 0x59, 0x23, 0x26, 0x3e,
+ 0x16, 0xb1, 0x68, 0xbd, 0x1c, 0xd4, 0x98, 0xbd, 0x9c, 0x91, 0x6e, 0xbd, 0xa5,
+ 0xc6, 0x55, 0xbc, 0xd0, 0xf3, 0xcc, 0xbd, 0xe8, 0x91, 0xe0, 0xbd, 0xdf, 0xe3,
+ 0xb4, 0x3d, 0x04, 0x77, 0xc2, 0xbd, 0xcc, 0x21, 0xda, 0xbd, 0x7d, 0xed, 0x1d,
+ 0x3d, 0x1c, 0xa9, 0x0f, 0x3e, 0x25, 0x19, 0x67, 0x3d, 0xcc, 0x29, 0x65, 0xbd,
+ 0x34, 0x00, 0xdd, 0x3d, 0xe3, 0x04, 0x15, 0xbd, 0x79, 0xb8, 0x50, 0xbd, 0x98,
+ 0x5b, 0x44, 0xbc, 0x32, 0x55, 0xd1, 0x3d, 0x19, 0x20, 0x2a, 0xbd, 0xbd, 0x28,
+ 0xb6, 0x3c, 0x33, 0xf4, 0xc4, 0xbb, 0x95, 0x26, 0x9f, 0xbb, 0x93, 0xb7, 0x7f,
+ 0x3d, 0x16, 0xbc, 0x5f, 0x3d, 0x0a, 0x14, 0x82, 0x3c, 0x3a, 0x40, 0x12, 0x3e,
+ 0x99, 0x9c, 0xbe, 0x3c, 0x6c, 0x22, 0x72, 0x3d, 0xb3, 0x18, 0x10, 0xbe, 0x2b,
+ 0x6f, 0x4b, 0x3d, 0xaf, 0x83, 0x90, 0x3c, 0x67, 0x6b, 0x57, 0x3d, 0xae, 0xba,
+ 0x1d, 0xbd, 0x42, 0x58, 0xda, 0xbd, 0xcd, 0x16, 0xc6, 0xbd, 0x28, 0x11, 0xa1,
+ 0xbd, 0xc3, 0xfa, 0x6b, 0x3d, 0xff, 0x35, 0xc4, 0x3d, 0xca, 0x54, 0x9d, 0x3d,
+ 0x65, 0xc0, 0x0a, 0x3d, 0xbe, 0xbd, 0x73, 0xbc, 0xee, 0xf8, 0xfb, 0x3a, 0x88,
+ 0xcf, 0x2c, 0x3d, 0xa4, 0x2d, 0xb9, 0x3d, 0x30, 0xbf, 0x9c, 0xbd, 0x16, 0xf6,
+ 0x97, 0x3c, 0x72, 0xf4, 0x12, 0x3d, 0x4c, 0xc6, 0x01, 0xbd, 0x68, 0x2e, 0xc0,
+ 0xbd, 0x38, 0xd4, 0x2c, 0x3d, 0xe6, 0xb4, 0xbf, 0x3d, 0xf5, 0x15, 0x66, 0xbd,
+ 0x29, 0x0f, 0x83, 0x3d, 0x44, 0x2b, 0xb0, 0x3d, 0xa1, 0x53, 0xeb, 0x3d, 0xc6,
+ 0x86, 0x8a, 0x3d, 0xe0, 0x36, 0x48, 0xbd, 0x29, 0xff, 0x22, 0xbd, 0xff, 0x33,
+ 0xae, 0x3d, 0xa2, 0x5b, 0x13, 0xbd, 0x1d, 0x6f, 0x9e, 0x3d, 0x0e, 0x6d, 0x09,
+ 0x3d, 0x7f, 0x06, 0x01, 0xbe, 0xc8, 0x08, 0xc7, 0x3d, 0xc2, 0xe8, 0xae, 0x3d,
+ 0xe6, 0x4a, 0xc7, 0x3d, 0x29, 0x40, 0xb3, 0x3d, 0xb5, 0x99, 0x83, 0xbd, 0xa4,
+ 0x23, 0x8f, 0x3d, 0x4a, 0xa2, 0x9c, 0x3d, 0x0d, 0xe2, 0x04, 0x3d, 0x40, 0xff,
+ 0x07, 0x3d, 0xa4, 0x8c, 0x30, 0x3d, 0x75, 0x00, 0x1c, 0x3d, 0x45, 0x9b, 0x02,
+ 0x3e, 0xb2, 0xce, 0x2e, 0x3d, 0x16, 0x9d, 0x3f, 0xbd, 0x8e, 0xf1, 0x1b, 0xbc,
+ 0x9b, 0x59, 0x04, 0xbd, 0xae, 0xd7, 0xd3, 0x3d, 0x2b, 0x15, 0x05, 0x3b, 0x12,
+ 0xec, 0x5d, 0x3c, 0x30, 0xe9, 0xea, 0x3d, 0x58, 0xe5, 0xe4, 0xbd, 0x9b, 0x54,
+ 0x86, 0xbd, 0xf0, 0x47, 0x4e, 0xbd, 0x21, 0xa7, 0xef, 0x3b, 0x89, 0xf9, 0x23,
+ 0x3d, 0xec, 0x14, 0x48, 0xbd, 0xfc, 0x86, 0x20, 0x3e, 0x08, 0x69, 0x95, 0x3d,
+ 0x26, 0x08, 0xb6, 0xbd, 0xd9, 0xe2, 0xb3, 0xbd, 0x27, 0x6f, 0xf0, 0x3d, 0x9d,
+ 0xc4, 0x1c, 0xbe, 0x1a, 0x6e, 0x22, 0x3d, 0xc5, 0xe3, 0x68, 0x3d, 0x45, 0x2d,
+ 0x8a, 0xbb, 0xbe, 0xf3, 0x84, 0x3d, 0x63, 0xef, 0x10, 0x3d, 0x54, 0xfa, 0xde,
+ 0x3c, 0x57, 0x4c, 0xc4, 0x3d, 0xa7, 0x44, 0x8b, 0xbd, 0x9e, 0xf0, 0x33, 0xbd,
+ 0x9a, 0x6c, 0x89, 0x3d, 0x6c, 0xc9, 0x21, 0xbe, 0x0e, 0x60, 0x9d, 0xbd, 0xd9,
+ 0x35, 0x1f, 0xbd, 0x0d, 0x4f, 0x9a, 0x3d, 0xd4, 0x24, 0xca, 0x3d, 0xc4, 0x5c,
+ 0x45, 0xbd, 0x28, 0x24, 0xea, 0x3c, 0xee, 0xea, 0xef, 0xbd, 0x4d, 0xae, 0x89,
+ 0x3d, 0x91, 0x99, 0x79, 0xbc, 0xb6, 0x1b, 0xc2, 0x3d, 0xcb, 0x8d, 0xb4, 0xbc,
+ 0x63, 0xaa, 0x7f, 0xbd, 0x19, 0xbc, 0xe6, 0xbc, 0x82, 0x28, 0x4e, 0xbd, 0xf4,
+ 0x7a, 0xbc, 0x3d, 0xe4, 0xe7, 0xcd, 0xbd, 0x2c, 0xe3, 0xda, 0xbd, 0xc6, 0x98,
+ 0xec, 0x3d, 0xd7, 0xfc, 0xf8, 0xbc, 0xd4, 0x80, 0x76, 0x3d, 0xbf, 0x17, 0x3e,
+ 0xbd, 0x20, 0x69, 0x48, 0x3a, 0x1c, 0x2c, 0xa2, 0x3d, 0xc2, 0x8b, 0x95, 0x3d,
+ 0xc4, 0xb5, 0xa9, 0x3d, 0x43, 0x5b, 0xde, 0xbc, 0xf1, 0x1e, 0x0f, 0xbd, 0x52,
+ 0x3e, 0xbb, 0x3d, 0xff, 0xaf, 0xfd, 0x3d, 0x66, 0x65, 0x59, 0x3d, 0x03, 0x95,
+ 0x55, 0x3d, 0x97, 0x22, 0x04, 0xbe, 0xcb, 0x24, 0x32, 0xbd, 0xf3, 0x26, 0xa5,
+ 0xbd, 0xaa, 0xd3, 0xdb, 0xbc, 0x75, 0x5b, 0x41, 0xbd, 0x2e, 0x2c, 0xc4, 0x3d,
+ 0xd5, 0x98, 0xc4, 0x3c, 0xa3, 0x19, 0x01, 0x3c, 0x4e, 0x3f, 0x3c, 0x3d, 0xea,
+ 0xee, 0x2d, 0xbd, 0x3f, 0x97, 0x13, 0xbc, 0xed, 0xdd, 0x55, 0x3d, 0x49, 0xba,
+ 0xfb, 0xbd, 0x5c, 0xbd, 0xc9, 0xbd, 0xe8, 0x9f, 0xad, 0x3d, 0x9c, 0x26, 0x32,
+ 0xbd, 0xf6, 0xfa, 0x15, 0xbe, 0x09, 0x88, 0xc0, 0xbd, 0xe2, 0xcc, 0xaf, 0xbd,
+ 0xdb, 0x22, 0x56, 0x3d, 0x78, 0x3f, 0x0f, 0xbc, 0x50, 0xe5, 0x93, 0xbd, 0x55,
+ 0x90, 0x09, 0x3d, 0xac, 0xec, 0x6d, 0xbd, 0x93, 0x0e, 0xce, 0xbc, 0x5b, 0xde,
+ 0x85, 0x3d, 0x08, 0x1d, 0x4b, 0x3d, 0x8f, 0x16, 0xf4, 0xbd, 0x89, 0xf8, 0x83,
+ 0xbd, 0x65, 0xf3, 0xf8, 0xbc, 0xe3, 0x37, 0x09, 0x3b, 0x37, 0x89, 0x91, 0xbc,
+ 0x69, 0xea, 0x2f, 0xbd, 0x2c, 0xf2, 0xbf, 0x3c, 0xd0, 0x57, 0xa7, 0x3d, 0xae,
+ 0x94, 0xbf, 0x3d, 0x15, 0x1d, 0x63, 0x3d, 0x53, 0x20, 0x4b, 0xbd, 0x4f, 0xf2,
+ 0x00, 0x3e, 0x29, 0x36, 0x54, 0xbd, 0x49, 0x2d, 0x8c, 0xbd, 0x29, 0xbc, 0xb6,
+ 0x3d, 0x08, 0xc4, 0xc7, 0x3d, 0xb6, 0x3d, 0xf9, 0xbd, 0x84, 0x0f, 0xa1, 0x3d,
+ 0xe8, 0x20, 0xb1, 0xbd, 0x8b, 0xf6, 0xa8, 0xbd, 0x51, 0xec, 0x75, 0x3d, 0x85,
+ 0xeb, 0x13, 0xbe, 0x5c, 0xe5, 0x4f, 0x3d, 0xe5, 0x90, 0xf3, 0xbc, 0x5a, 0xb0,
+ 0x39, 0xbd, 0xbf, 0x7a, 0x63, 0x3d, 0xa4, 0x35, 0x08, 0x3e, 0xae, 0x8a, 0xa6,
+ 0xbd, 0x4d, 0x53, 0x46, 0xbd, 0x8e, 0xb0, 0x46, 0xbc, 0x9d, 0x94, 0x15, 0x3d,
+ 0x6d, 0xdc, 0x62, 0x3c, 0x75, 0x33, 0x29, 0x3d, 0x61, 0xba, 0x3d, 0x3d, 0x0a,
+ 0xdb, 0x72, 0xbc, 0x18, 0x43, 0xdb, 0xbc, 0xb0, 0xca, 0x83, 0xbc, 0x33, 0x9b,
+ 0x12, 0xbe, 0xdb, 0x85, 0xb2, 0xbd, 0xe1, 0x52, 0xc7, 0xbd, 0xd6, 0xbc, 0x12,
+ 0xbd, 0x19, 0x0f, 0x90, 0xbc, 0x75, 0xb0, 0x4c, 0x3d, 0x91, 0x46, 0xd2, 0x3b,
+ 0xae, 0x95, 0x0e, 0x3d, 0x51, 0xa0, 0x74, 0x3d, 0x9b, 0x73, 0x90, 0xba, 0xec,
+ 0x61, 0x85, 0x3c, 0xaa, 0x01, 0xb7, 0x3d, 0x83, 0x19, 0x96, 0xbd, 0xeb, 0x6f,
+ 0xce, 0x3c, 0x46, 0x50, 0x15, 0xbe, 0x4c, 0x9d, 0xe2, 0xbb, 0xee, 0x86, 0x59,
+ 0xbb, 0xd9, 0xea, 0x8c, 0x3d, 0x5e, 0x80, 0x96, 0x3b, 0x9e, 0x36, 0xf2, 0x3d,
+ 0xfc, 0x4e, 0xa8, 0x3c, 0x67, 0x32, 0xb0, 0x3d, 0x93, 0xf9, 0x1a, 0x3d, 0x71,
+ 0x3b, 0xaa, 0xbd, 0xd4, 0xcf, 0x34, 0x3d, 0x93, 0x11, 0x84, 0xbd, 0x76, 0x9c,
+ 0xc7, 0x3d, 0x6b, 0xee, 0xd5, 0xbd, 0xb6, 0x03, 0xd8, 0x3d, 0xb8, 0x56, 0x53,
+ 0xbd, 0x61, 0x89, 0xab, 0xbd, 0x69, 0x71, 0x46, 0xbc, 0x79, 0x31, 0x81, 0xbd,
+ 0xa0, 0xaa, 0x9d, 0xbc, 0xab, 0x17, 0x0c, 0x3d, 0x31, 0xb8, 0x0a, 0x3d, 0xc3,
+ 0x40, 0xb4, 0xbd, 0xab, 0xb6, 0x97, 0x3d, 0xc1, 0x3a, 0x47, 0x3d, 0x31, 0xdc,
+ 0xdb, 0xbc, 0xb4, 0x23, 0x60, 0xbc, 0x9d, 0x47, 0x93, 0x3d, 0xc9, 0x69, 0xa1,
+ 0x3d, 0xbb, 0x2f, 0x7a, 0x3d, 0x07, 0x8d, 0x91, 0x3d, 0x20, 0xdb, 0xca, 0x3d,
+ 0xf8, 0x44, 0xd3, 0xbd, 0x68, 0xfc, 0x66, 0xbc, 0xfa, 0xab, 0x29, 0x3d, 0xcb,
+ 0xb6, 0xa4, 0x3d, 0x9e, 0xbd, 0x06, 0x3d, 0xd1, 0x54, 0xb1, 0x3d, 0x06, 0x7e,
+ 0xcb, 0xbd, 0x24, 0x71, 0xc4, 0x3d, 0x08, 0x17, 0x40, 0x3d, 0x7a, 0xf7, 0xae,
+ 0xbd, 0xc0, 0x66, 0xc1, 0xbd, 0xfa, 0x2a, 0x22, 0xbd, 0xf0, 0x3d, 0xd2, 0xbc,
+ 0x2e, 0xc7, 0x71, 0xbd, 0xc5, 0x4f, 0xd0, 0xbd, 0xf7, 0x68, 0x85, 0xbd, 0xab,
+ 0xeb, 0x92, 0xbd, 0x5e, 0xb7, 0xe8, 0xbd, 0x66, 0xc1, 0xef, 0xbd, 0xb7, 0x07,
+ 0x06, 0xbd, 0x5b, 0x2f, 0x40, 0x3d, 0xd6, 0xb0, 0xa8, 0xbd, 0xb8, 0x1a, 0xe8,
+ 0x3d, 0x9f, 0xb7, 0xc4, 0x3d, 0x3c, 0xb5, 0x8f, 0xbd, 0x23, 0x9f, 0xbc, 0x3d,
+ 0xfd, 0x90, 0x88, 0xbd, 0xa2, 0xa9, 0x27, 0xbc, 0x41, 0xe4, 0xd7, 0xbd, 0x29,
+ 0x97, 0x07, 0xbd, 0xff, 0x72, 0x04, 0x3c, 0x56, 0x5a, 0x34, 0xbd, 0xf4, 0x8a,
+ 0x9d, 0xbd, 0x7e, 0x5d, 0x83, 0xbd, 0xd2, 0x00, 0x4e, 0x3d, 0xbe, 0x7e, 0x5d,
+ 0x3d, 0x03, 0xd1, 0x38, 0xbd, 0xb2, 0x2b, 0xbc, 0xbd, 0x04, 0xa8, 0x4d, 0x3d,
+ 0xa8, 0x0b, 0xaa, 0xbd, 0x84, 0x50, 0xac, 0xbd, 0x09, 0xef, 0xbf, 0xbc, 0xfa,
+ 0xb8, 0xb2, 0xbd, 0xeb, 0x7e, 0xd9, 0x3d, 0x54, 0x08, 0xda, 0xbd, 0x21, 0x24,
+ 0x61, 0xbd, 0xae, 0x1e, 0xae, 0xbd, 0xb4, 0x50, 0x3a, 0xbc, 0x2e, 0x07, 0xe9,
+ 0xbd, 0xec, 0xb1, 0x9d, 0xbd, 0x88, 0x5d, 0xca, 0xbc, 0x0c, 0x8a, 0x8c, 0x3d,
+ 0x58, 0x56, 0xf9, 0x3c, 0x57, 0x0f, 0xe7, 0x3d, 0xd4, 0xd9, 0x1c, 0xbd, 0x87,
+ 0xfe, 0x38, 0xbd, 0x1c, 0x08, 0x17, 0xbd, 0x72, 0xbb, 0xc1, 0xbc, 0x5b, 0xa9,
+ 0xf7, 0xba, 0xf2, 0xd5, 0x34, 0xbd, 0x71, 0x2f, 0x4b, 0xbd, 0x6a, 0xd6, 0xab,
+ 0xbd, 0x07, 0x81, 0xcd, 0x3d, 0x03, 0xf0, 0x2e, 0x3d, 0xcd, 0x20, 0xd4, 0xbd,
+ 0x0e, 0xf4, 0x3f, 0xbc, 0xf3, 0xed, 0xe1, 0x3d, 0xf6, 0xc4, 0x82, 0x3d, 0x0b,
+ 0x42, 0x48, 0x3d, 0xf9, 0xcd, 0x87, 0x3d, 0x91, 0x7d, 0x49, 0x3b, 0x9a, 0xc7,
+ 0x28, 0xbd, 0xf6, 0x02, 0xc3, 0x3d, 0x6e, 0x82, 0xa4, 0xbd, 0x41, 0x1f, 0xe7,
+ 0x3d, 0x44, 0x06, 0x76, 0x3d, 0x3b, 0xbc, 0xc1, 0x3b, 0x20, 0xf7, 0x7c, 0xbd,
+ 0x0d, 0x0d, 0xe0, 0xbd, 0x2b, 0xa5, 0xc5, 0x3d, 0x51, 0x84, 0x6f, 0xbd, 0xd0,
+ 0x24, 0x22, 0x3d, 0x33, 0x68, 0xb7, 0x3d, 0x37, 0x88, 0x87, 0x3d, 0x24, 0x04,
+ 0x98, 0xbd, 0x1b, 0xba, 0x04, 0xbd, 0x48, 0x09, 0xdf, 0x3b, 0xac, 0x9e, 0x3c,
+ 0xbd, 0x4b, 0xbf, 0x2c, 0x3c, 0x07, 0xba, 0xf4, 0xbd, 0x6e, 0x91, 0x84, 0x3d,
+ 0x99, 0x5a, 0x7e, 0x3c, 0x21, 0x9e, 0xeb, 0x3c, 0xde, 0x69, 0x18, 0x3d, 0x1f,
+ 0x8f, 0xaa, 0x3d, 0x09, 0x55, 0x08, 0xbd, 0x42, 0xf3, 0xe5, 0xbd, 0x61, 0x6b,
+ 0x82, 0xbd, 0xe1, 0xe2, 0xd2, 0x3d, 0x3f, 0xd1, 0xb6, 0x3d, 0xf9, 0xf5, 0xc7,
+ 0xbd, 0x47, 0x47, 0x90, 0xbd, 0x74, 0xa3, 0x42, 0xbd, 0xa5, 0xda, 0x3e, 0x3d,
+ 0xaf, 0x45, 0xc1, 0x3d, 0x68, 0x46, 0xe5, 0xbd, 0x79, 0x83, 0x31, 0x3d, 0x7e,
+ 0xd3, 0xce, 0x3c, 0xea, 0x30, 0xca, 0xbd, 0x00, 0xb0, 0xae, 0x3b, 0x66, 0x91,
+ 0xde, 0xbd, 0x0e, 0x11, 0xc0, 0xbd, 0xd0, 0x6a, 0x41, 0xbd, 0x6d, 0x7a, 0x8e,
+ 0xbd, 0x0a, 0xe2, 0x70, 0x3d, 0x7b, 0x4d, 0xcf, 0x3d, 0x2c, 0x2b, 0x3d, 0xbd,
+ 0x7e, 0xc3, 0x6f, 0xbd, 0xd0, 0x38, 0xac, 0x3c, 0xac, 0x35, 0xd0, 0xbd, 0x88,
+ 0x08, 0xe3, 0xbd, 0x78, 0x27, 0xbf, 0x3d, 0x80, 0x1e, 0xf8, 0xbc, 0x52, 0x7a,
+ 0x84, 0xbc, 0x77, 0x84, 0xbb, 0xbc, 0x22, 0xdf, 0x2b, 0x3d, 0xa8, 0x16, 0xe9,
+ 0xbd, 0xec, 0xab, 0xda, 0x3b, 0xb9, 0x2f, 0x9b, 0x3d, 0x28, 0x97, 0xd6, 0x3d,
+ 0x08, 0xde, 0x2c, 0xbc, 0x8a, 0x6c, 0x29, 0x3d, 0xdd, 0xfe, 0xa4, 0xbc, 0x13,
+ 0xb3, 0x4e, 0xbc, 0x4f, 0x72, 0x81, 0xbc, 0x33, 0x6c, 0xcc, 0x3d, 0x1c, 0xbc,
+ 0x76, 0xbc, 0xfd, 0xd7, 0x8f, 0xbd, 0x99, 0xfd, 0x53, 0xbd, 0x2c, 0x76, 0x80,
+ 0xbd, 0x65, 0x2e, 0x1d, 0xbd, 0x9d, 0xd5, 0x8e, 0x3d, 0xeb, 0x16, 0xac, 0x3d,
+ 0xa6, 0x14, 0x3d, 0x3d, 0x75, 0x14, 0x97, 0x3d, 0x5e, 0x11, 0xf5, 0xbc, 0xca,
+ 0x20, 0x46, 0xbb, 0xb1, 0x04, 0xa1, 0xbd, 0x90, 0xcd, 0x3a, 0x3d, 0x70, 0xaf,
+ 0x01, 0xbe, 0x9d, 0xe3, 0xb2, 0xbd, 0xc3, 0xdf, 0x99, 0x3d, 0x20, 0x09, 0xab,
+ 0x3d, 0x35, 0x91, 0x06, 0xbd, 0x10, 0x3a, 0xa0, 0xbc, 0xc2, 0xd1, 0xad, 0x3d,
+ 0x60, 0x90, 0xe4, 0x3d, 0x9f, 0x47, 0xfd, 0x3c, 0x84, 0xa1, 0x5f, 0x3d, 0x06,
+ 0x5e, 0xf0, 0x3c, 0xab, 0x8c, 0x07, 0xbc, 0xf4, 0x6c, 0x16, 0x3d, 0x64, 0x06,
+ 0x04, 0xbe, 0xa8, 0x16, 0x85, 0x3d, 0xea, 0x1a, 0xa1, 0xbd, 0x0d, 0xb4, 0xdc,
+ 0xbd, 0xf4, 0x77, 0xc0, 0xbc, 0x5d, 0x03, 0x28, 0xbd, 0x29, 0x7d, 0xcc, 0xbc,
+ 0xae, 0x19, 0x9f, 0x3d, 0x09, 0x2a, 0xcd, 0x3d, 0xa4, 0x58, 0xaa, 0xbd, 0x6d,
+ 0xb8, 0xa9, 0x3c, 0xa1, 0xb7, 0xe6, 0xbd, 0xa9, 0x41, 0x9a, 0xbd, 0x69, 0xa4,
+ 0xab, 0x3c, 0xdd, 0x32, 0xa9, 0x3d, 0x19, 0x90, 0xd4, 0x3d, 0x52, 0xa8, 0xea,
+ 0xbd, 0x1e, 0x3d, 0xd4, 0x39, 0x84, 0x91, 0x03, 0xbe, 0xc9, 0x63, 0x3f, 0x3d,
+ 0x81, 0x1e, 0xe0, 0x3d, 0x05, 0xc5, 0x95, 0xbd, 0x2e, 0x1d, 0xc9, 0xbd, 0xf2,
+ 0x9c, 0x7c, 0xbc, 0x69, 0x19, 0xdb, 0xbc, 0x09, 0x3d, 0x6f, 0xbd, 0x58, 0x94,
+ 0xf8, 0x3d, 0x2c, 0x78, 0xb6, 0x3d, 0x96, 0xbe, 0xf8, 0x3d, 0x98, 0x4e, 0xb6,
+ 0x3d, 0x1a, 0xa0, 0x90, 0x3d, 0xa3, 0xeb, 0xd2, 0xbd, 0x4c, 0xfb, 0x2d, 0xbd,
+ 0xcb, 0xca, 0xa8, 0xbc, 0xa7, 0xca, 0x80, 0xbd, 0x65, 0xe2, 0x87, 0xbd, 0x9d,
+ 0x9a, 0x25, 0x3c, 0xc7, 0xf2, 0xcc, 0x3c, 0x38, 0x81, 0x48, 0xbd, 0xd3, 0x83,
+ 0xea, 0x3d, 0x4f, 0x72, 0xad, 0xbd, 0x6d, 0xef, 0x3f, 0xbc, 0x22, 0xc7, 0xbf,
+ 0xbc, 0xb6, 0x25, 0x64, 0x3c, 0x82, 0x76, 0x53, 0xbd, 0xd7, 0x9a, 0x89, 0x3c,
+ 0x01, 0xa7, 0x40, 0x3d, 0xbe, 0x03, 0x69, 0xbd, 0x5c, 0x79, 0x0e, 0xbe, 0xeb,
+ 0x87, 0x9f, 0xbd, 0x14, 0xa6, 0xad, 0x3c, 0x78, 0x6b, 0x25, 0x3d, 0xea, 0xa0,
+ 0xd7, 0x3d, 0x19, 0xb6, 0x22, 0xbd, 0xc6, 0xf6, 0xba, 0xbc, 0xe9, 0xd6, 0xe4,
+ 0x3c, 0x55, 0x68, 0x2a, 0xbd, 0xc0, 0x4c, 0xb0, 0xbc, 0xf5, 0xa5, 0x01, 0x3e,
+ 0x59, 0x9a, 0xd0, 0xbd, 0x4a, 0xb2, 0xfc, 0x3d, 0x3a, 0x59, 0x8f, 0x3d, 0x4a,
+ 0x0a, 0xb4, 0xbd, 0x7d, 0xc4, 0x63, 0x3d, 0xb6, 0xb8, 0xb9, 0x3d, 0xb0, 0x95,
+ 0x81, 0x3c, 0x2f, 0x7a, 0x32, 0x3d, 0x32, 0x87, 0xe4, 0xbc, 0xf0, 0xfc, 0xd5,
+ 0x3d, 0xfc, 0xe6, 0xf1, 0x3d, 0x04, 0x66, 0x98, 0x3c, 0x14, 0x23, 0x72, 0x3c,
+ 0xfe, 0x50, 0x95, 0x3d, 0xdf, 0xe6, 0x4c, 0x3d, 0x84, 0x80, 0x8e, 0x3d, 0x13,
+ 0xe8, 0x4c, 0xbd, 0xd4, 0xca, 0x83, 0xbd, 0x20, 0x86, 0xb0, 0xbd, 0xed, 0x66,
+ 0x89, 0x3c, 0x6a, 0x59, 0x19, 0xbd, 0xc2, 0x32, 0xc3, 0xbd, 0x04, 0x3f, 0x8d,
+ 0xbc, 0x51, 0xcc, 0x23, 0xbc, 0xb4, 0x4f, 0xa3, 0xbc, 0x30, 0x98, 0xc8, 0x3d,
+ 0x29, 0xaa, 0xd4, 0xbb, 0x5c, 0x7d, 0x88, 0xbd, 0x3a, 0xe9, 0xa9, 0xbd, 0xc3,
+ 0x4f, 0x40, 0xbd, 0x2d, 0x12, 0x49, 0xbd, 0x9e, 0x4e, 0x9a, 0xbd, 0xf1, 0xa9,
+ 0x84, 0xbd, 0x29, 0x09, 0x94, 0x3d, 0x98, 0x3c, 0xf0, 0x3d, 0x5f, 0xfe, 0x2a,
+ 0xbd, 0xd8, 0xa8, 0x46, 0xbd, 0xa1, 0xc8, 0x1c, 0xbb, 0x12, 0x3d, 0xbc, 0x3d,
+ 0x38, 0x39, 0x51, 0x3c, 0x3a, 0x00, 0x95, 0x3d, 0xd8, 0x2e, 0x67, 0x3c, 0x48,
+ 0x7e, 0xe0, 0xbd, 0x8c, 0x90, 0x79, 0x3c, 0xf2, 0x3d, 0x50, 0x3d, 0xbc, 0x2f,
+ 0xa1, 0x3c, 0xf9, 0xf0, 0x8a, 0x3d, 0x0e, 0x11, 0x30, 0x3c, 0x7c, 0xc8, 0xf8,
+ 0x3c, 0xe0, 0x88, 0x10, 0x3d, 0x4b, 0xaa, 0xbe, 0xbd, 0xa4, 0x0a, 0x5b, 0x3d,
+ 0xe2, 0x3c, 0x94, 0x3d, 0xdd, 0x36, 0x95, 0xbd, 0xc7, 0x70, 0x89, 0xbd, 0x95,
+ 0xe7, 0x89, 0x3d, 0x91, 0x0e, 0x23, 0x3c, 0xfe, 0x32, 0x4f, 0x3b, 0xd4, 0x79,
+ 0xc2, 0x3d, 0x52, 0xab, 0xb4, 0xbd, 0xb3, 0x98, 0xd2, 0x3d, 0xb8, 0x70, 0x88,
+ 0xbd, 0x2e, 0x3e, 0x77, 0x3d, 0xb5, 0x44, 0x00, 0x3d, 0xb4, 0xe9, 0x59, 0x3d,
+ 0xae, 0x3b, 0x9d, 0x3d, 0x3d, 0x89, 0x36, 0x3d, 0x22, 0x67, 0x9b, 0xbb, 0xca,
+ 0xca, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0xcf, 0x02,
+ 0xcf, 0x3d, 0x6b, 0xe2, 0x84, 0x3d, 0x62, 0xaa, 0xdc, 0x3d, 0xdf, 0x55, 0xef,
+ 0x3b, 0xc1, 0x2b, 0x41, 0xbd, 0x6e, 0x82, 0xb3, 0xbd, 0x08, 0xc0, 0x6c, 0xbd,
+ 0x7c, 0xb9, 0x10, 0xbe, 0x97, 0x76, 0xbb, 0xbc, 0xa3, 0x52, 0x00, 0xbe, 0xd9,
+ 0x90, 0x32, 0xbe, 0xac, 0x38, 0x62, 0x3d, 0x6c, 0xdc, 0xae, 0xbc, 0x2a, 0x7d,
+ 0x01, 0xbe, 0x2f, 0xf8, 0x30, 0xbd, 0x8f, 0x24, 0x45, 0xbe, 0x0c, 0x74, 0x1f,
+ 0xbe, 0x5e, 0x0b, 0x0f, 0xbd, 0xf7, 0xb6, 0xc5, 0x3d, 0xe9, 0x3c, 0xbb, 0xbd,
+ 0x61, 0x11, 0x19, 0x3d, 0x68, 0xf0, 0x44, 0x3e, 0x26, 0x64, 0x95, 0x3c, 0xa1,
+ 0xde, 0x54, 0x3d, 0x25, 0x8b, 0x14, 0x3e, 0x0f, 0xed, 0xfe, 0x3b, 0x1b, 0x37,
+ 0xf4, 0xbd, 0x9e, 0x28, 0xbd, 0x3d, 0x26, 0x5c, 0xca, 0x3d, 0xbb, 0xad, 0x02,
+ 0x3d, 0x1f, 0xc1, 0x25, 0x3e, 0x85, 0x0a, 0x39, 0xbe, 0xfa, 0xc3, 0xf7, 0xbd,
+ 0xda, 0x75, 0xc6, 0xbd, 0x06, 0x2d, 0x4a, 0x3c, 0x1a, 0xc1, 0x94, 0xbd, 0xb0,
+ 0x62, 0xa0, 0xbd, 0x63, 0x0c, 0x0e, 0xbe, 0xf3, 0x67, 0x01, 0xbe, 0xd9, 0x42,
+ 0x48, 0xbe, 0xaa, 0xf0, 0xf6, 0xbd, 0xc7, 0xa6, 0x39, 0xbe, 0xf6, 0xef, 0xb2,
+ 0x3d, 0xe6, 0x6f, 0xd7, 0xbd, 0x14, 0x4f, 0xfb, 0xbc, 0x7f, 0xb1, 0x86, 0x3d,
+ 0xcc, 0xca, 0xd9, 0xbd, 0x34, 0x6f, 0x3e, 0xbc, 0x90, 0x24, 0xe8, 0x3d, 0xda,
+ 0x5a, 0xf9, 0x3d, 0x78, 0xc9, 0xf0, 0xbd, 0x1e, 0x50, 0xa5, 0x3d, 0xce, 0xed,
+ 0x6d, 0xbd, 0x65, 0x3b, 0x62, 0xbd, 0x52, 0x36, 0x3d, 0xbd, 0xf8, 0x54, 0x70,
+ 0x3d, 0x01, 0x85, 0x39, 0x3c, 0x57, 0xf0, 0xa8, 0xbc, 0xf5, 0x69, 0xda, 0xbd,
+ 0xd5, 0x00, 0xda, 0x3d, 0x47, 0x0a, 0xe6, 0x3d, 0xf1, 0xed, 0xae, 0xbd, 0x1b,
+ 0x51, 0x93, 0x3d, 0x25, 0x8d, 0x1e, 0x3e, 0x65, 0x36, 0x24, 0x3e, 0xab, 0x4e,
+ 0x3b, 0xbe, 0x73, 0x91, 0x7b, 0x3d, 0x79, 0x2a, 0xa6, 0x3c, 0x6e, 0x13, 0x29,
+ 0x3e, 0xae, 0x98, 0x8b, 0x3d, 0x61, 0xec, 0x36, 0xbe, 0xee, 0xd9, 0x8a, 0x3d,
+ 0xe8, 0xd8, 0xff, 0xbd, 0x87, 0xae, 0x13, 0xbe, 0x45, 0x02, 0xae, 0x3d, 0xbc,
+ 0x03, 0x94, 0xbd, 0xf6, 0x5b, 0x17, 0xbe, 0x3c, 0x46, 0x15, 0x3e, 0x99, 0xe3,
+ 0x3b, 0x3e, 0x6c, 0x0a, 0x82, 0xbd, 0x67, 0xb1, 0xb4, 0x3c, 0x68, 0xc6, 0x0a,
+ 0x3e, 0x7f, 0xe1, 0xa5, 0x3d, 0x38, 0x5c, 0x61, 0x3e, 0x0d, 0x37, 0xdd, 0xbd,
+ 0x14, 0xae, 0xff, 0xbc, 0x00, 0xba, 0x97, 0x3d, 0x61, 0xf4, 0xd7, 0x3c, 0xb9,
+ 0x7e, 0x0b, 0xbe, 0x87, 0xa5, 0x59, 0xbc, 0x01, 0x95, 0x19, 0x3c, 0x3e, 0xf3,
+ 0x72, 0xbd, 0x8b, 0x32, 0x0e, 0xbe, 0x8e, 0x5c, 0x30, 0x3e, 0xd1, 0x09, 0x10,
+ 0x3e, 0xfb, 0xc9, 0x13, 0x3e, 0x82, 0x6f, 0xe2, 0x3d, 0x71, 0xd7, 0xc8, 0xbd,
+ 0x57, 0x14, 0xbb, 0xbd, 0x0f, 0x10, 0x40, 0x3d, 0xa6, 0x30, 0x1e, 0x3d, 0xc8,
+ 0x3f, 0x4a, 0x3e, 0x06, 0xe9, 0x15, 0xbd, 0x8a, 0x87, 0x11, 0x3e, 0xe2, 0xa4,
+ 0x0b, 0xbe, 0xe5, 0x96, 0x3d, 0x3e, 0x5e, 0x78, 0x0c, 0x3e, 0x32, 0x79, 0x7a,
+ 0xba, 0x24, 0x9f, 0x1f, 0xbe, 0xe1, 0x2d, 0xc3, 0xbc, 0xdf, 0x43, 0xb4, 0xbd,
+ 0xb1, 0x00, 0xde, 0x3d, 0x7e, 0x34, 0x4b, 0xbe, 0xeb, 0x21, 0xdd, 0xbd, 0xbe,
+ 0x43, 0xe2, 0xbd, 0x4b, 0x49, 0x9f, 0x3d, 0xa3, 0xd0, 0x8e, 0x3d, 0xdf, 0x84,
+ 0x17, 0xbe, 0x12, 0x0b, 0xc8, 0xbd, 0xcb, 0x0e, 0x64, 0xbd, 0xdd, 0x25, 0x83,
+ 0xbd, 0xa0, 0x78, 0x1b, 0x3e, 0x2e, 0x77, 0x1e, 0xbe, 0x94, 0x81, 0xc8, 0xbd,
+ 0x8d, 0x3e, 0xba, 0xbd, 0xff, 0xe9, 0x32, 0x3e, 0xb0, 0x76, 0xb9, 0xbd, 0xfd,
+ 0x8a, 0x71, 0xbd, 0xab, 0xf3, 0x4c, 0xbc, 0x0c, 0xa0, 0x0c, 0x3e, 0xa2, 0x36,
+ 0xb2, 0xbc, 0x1b, 0x34, 0xb2, 0xbd, 0x44, 0x18, 0x8c, 0xbd, 0xa3, 0xe3, 0x83,
+ 0xbd, 0x45, 0x8c, 0xae, 0xbd, 0x4e, 0x7d, 0x09, 0xbe, 0xdf, 0x58, 0x19, 0xbd,
+ 0xae, 0x8f, 0x5f, 0x3d, 0xa7, 0x36, 0x80, 0xbd, 0xfb, 0x12, 0x22, 0x3e, 0x25,
+ 0x11, 0x99, 0xbb, 0x51, 0xc9, 0x4a, 0x3d, 0x99, 0x68, 0x32, 0x3e, 0x44, 0xcc,
+ 0x7a, 0xbc, 0xa8, 0x46, 0xb7, 0x3d, 0x5f, 0xbb, 0x8a, 0xbd, 0xd3, 0xbb, 0x3a,
+ 0x3e, 0x46, 0x2c, 0x89, 0x3d, 0x26, 0xcb, 0x79, 0x3d, 0xe1, 0x45, 0x40, 0xbd,
+ 0x01, 0xc4, 0xe3, 0x3d, 0x42, 0x18, 0x24, 0x3e, 0x34, 0x73, 0x19, 0x3e, 0x00,
+ 0x53, 0xb7, 0x3d, 0x33, 0x6d, 0xf8, 0x3c, 0x2c, 0x5d, 0x3f, 0xbd, 0x85, 0xa9,
+ 0x1b, 0xbe, 0x18, 0xda, 0xb8, 0xbc, 0xaa, 0x92, 0xb4, 0x3d, 0x53, 0x65, 0x43,
+ 0x3e, 0x4f, 0xda, 0x03, 0xbd, 0xba, 0x8e, 0x40, 0xbe, 0xc1, 0x11, 0xb8, 0xbb,
+ 0x3e, 0x07, 0x66, 0x3e, 0xb8, 0x25, 0xe0, 0x3c, 0x7f, 0x4d, 0x0f, 0xbd, 0x35,
+ 0x57, 0xaa, 0xbd, 0xe5, 0x8b, 0xec, 0xbd, 0x70, 0xda, 0x08, 0xbc, 0x03, 0xc2,
+ 0xf5, 0xbb, 0xa5, 0x57, 0x83, 0xbd, 0xf1, 0x0b, 0x74, 0x3e, 0x9a, 0x63, 0x5a,
+ 0xbd, 0x8f, 0xb3, 0xa1, 0xbb, 0xe3, 0x0a, 0xd1, 0x3c, 0xa8, 0xc3, 0xfd, 0x3d,
+ 0x58, 0x80, 0x04, 0xbe, 0xfb, 0xca, 0xe0, 0x3d, 0x01, 0x75, 0x04, 0xbe, 0xbe,
+ 0xa9, 0x55, 0xbd, 0x59, 0x90, 0xff, 0xbd, 0x6a, 0xf0, 0x64, 0xbd, 0x89, 0xdc,
+ 0x1d, 0xbe, 0xb8, 0x8f, 0x26, 0xbd, 0x3b, 0x31, 0xc8, 0xbd, 0x2c, 0x3d, 0x88,
+ 0xbd, 0x48, 0xea, 0x0f, 0xbd, 0xce, 0x3f, 0x22, 0x3d, 0x8b, 0x31, 0xe7, 0x3d,
+ 0xa1, 0x13, 0x55, 0xbd, 0x2a, 0x96, 0xcc, 0x3d, 0xa1, 0xd9, 0xcf, 0x3d, 0x9f,
+ 0x0f, 0xcf, 0x3c, 0xac, 0x8b, 0xa4, 0xbc, 0x88, 0x69, 0xb6, 0x3d, 0x35, 0x40,
+ 0xc8, 0x3d, 0x5a, 0x6e, 0x23, 0xbe, 0x5f, 0xd9, 0x17, 0xbe, 0x4b, 0x8e, 0x9f,
+ 0xbd, 0x44, 0xeb, 0x15, 0xbe, 0xe9, 0x93, 0xba, 0x3d, 0x4b, 0x93, 0x08, 0xbe,
+ 0x79, 0x4d, 0x09, 0x3e, 0x5a, 0x98, 0x6d, 0xbd, 0x02, 0x95, 0x24, 0xbe, 0x80,
+ 0x67, 0x9d, 0xbd, 0xd2, 0x10, 0x1f, 0xbe, 0x64, 0xd2, 0x62, 0xbd, 0x01, 0x92,
+ 0x09, 0x3e, 0x96, 0x6e, 0xca, 0xbd, 0x62, 0x32, 0xf3, 0xbd, 0xe1, 0x10, 0x50,
+ 0x3d, 0x61, 0x3e, 0xdc, 0x3d, 0x7e, 0x6e, 0xd5, 0xbd, 0xf4, 0xea, 0x1f, 0x3e,
+ 0x2a, 0xd2, 0x10, 0xbd, 0x04, 0xa4, 0xdd, 0x3b, 0x7f, 0x19, 0x50, 0xbd, 0xad,
+ 0x49, 0x0e, 0x3e, 0x63, 0x14, 0xe3, 0x3d, 0x6f, 0x2d, 0x99, 0x3d, 0x4a, 0x0b,
+ 0x08, 0xbe, 0xd6, 0x54, 0xdd, 0xbd, 0xfb, 0x6b, 0x9e, 0xbd, 0xc0, 0x42, 0xe9,
+ 0xbd, 0xba, 0xef, 0x40, 0xbb, 0x9c, 0x44, 0xc5, 0x3d, 0x1e, 0x3a, 0xde, 0xbd,
+ 0xce, 0x6d, 0xef, 0x3d, 0x92, 0x4d, 0xf6, 0xbd, 0xa3, 0xc5, 0x0c, 0xbe, 0x74,
+ 0x63, 0xd8, 0xbd, 0xff, 0xd4, 0x11, 0x3e, 0x02, 0x10, 0x28, 0xbd, 0x86, 0xf5,
+ 0x4f, 0x3d, 0x6a, 0xfb, 0xc6, 0x3d, 0x6d, 0x29, 0x1f, 0xbe, 0xa4, 0x55, 0xab,
+ 0x3d, 0xaa, 0xc8, 0xc7, 0x3d, 0xf4, 0xec, 0x59, 0x3d, 0xd1, 0x44, 0x75, 0x3d,
+ 0xe6, 0x18, 0x3c, 0x3e, 0xd7, 0x83, 0xb5, 0x3d, 0xdc, 0xa3, 0xb1, 0xbd, 0xbb,
+ 0xa7, 0x73, 0xbd, 0x03, 0x00, 0x3c, 0x3d, 0x3b, 0x59, 0x8d, 0xbd, 0x27, 0x1f,
+ 0x07, 0xbe, 0x46, 0x5f, 0xcf, 0xbd, 0x5b, 0xf5, 0x13, 0xbe, 0xe9, 0xa9, 0x1b,
+ 0x3e, 0x05, 0x6e, 0x0e, 0x3e, 0xd2, 0xa7, 0xad, 0xbc, 0x55, 0xda, 0x12, 0x3e,
+ 0xd4, 0xd5, 0xcc, 0xbd, 0x5e, 0x0d, 0x33, 0xbe, 0x5f, 0xfa, 0x99, 0xbd, 0xa1,
+ 0xd4, 0x96, 0xbd, 0x7b, 0xec, 0x08, 0x3d, 0xf0, 0x43, 0x04, 0xbe, 0xd6, 0x6a,
+ 0x3e, 0x3d, 0x9c, 0x4c, 0xa5, 0xbd, 0xc1, 0x25, 0xeb, 0x3c, 0x00, 0x84, 0x7f,
+ 0xbd, 0x8e, 0x5b, 0x2d, 0xbd, 0x5a, 0x0d, 0x93, 0x3c, 0x14, 0x09, 0x5e, 0x3d,
+ 0x0e, 0x7c, 0x25, 0x3d, 0x4b, 0x3f, 0x0f, 0xbe, 0xad, 0x31, 0xd8, 0xbd, 0x81,
+ 0xa4, 0x66, 0xbd, 0x25, 0x37, 0x32, 0xbe, 0x64, 0x42, 0x6f, 0x3d, 0x9c, 0xdb,
+ 0xc2, 0x3d, 0x1f, 0x78, 0xcc, 0x3c, 0x45, 0xa8, 0x0c, 0x3e, 0xe8, 0x27, 0xe3,
+ 0x3d, 0xbf, 0xb1, 0xff, 0x3d, 0x3e, 0x13, 0xc6, 0x3d, 0xf2, 0x5b, 0x64, 0x3d,
+ 0xf1, 0xf8, 0x16, 0x3e, 0x24, 0x46, 0x40, 0x3d, 0xa1, 0x7e, 0x99, 0x3c, 0x6d,
+ 0x30, 0x1e, 0xbe, 0x04, 0xdd, 0x2a, 0xbe, 0x03, 0x25, 0x20, 0xbd, 0x07, 0xf4,
+ 0x74, 0xbc, 0xc8, 0x71, 0x03, 0xbd, 0x46, 0xf3, 0xd9, 0xbc, 0x33, 0x6d, 0xbb,
+ 0xbd, 0xbd, 0x8a, 0xd5, 0x3d, 0x68, 0xbd, 0x9e, 0xbc, 0x1c, 0x26, 0x09, 0xbe,
+ 0x0f, 0x3c, 0x9d, 0xbd, 0xde, 0x13, 0x53, 0xbd, 0x73, 0xe9, 0x90, 0x3d, 0xdc,
+ 0x50, 0xef, 0x3c, 0x6f, 0x00, 0x32, 0xbc, 0x42, 0x79, 0x18, 0x3e, 0xa8, 0xe4,
+ 0xb3, 0xbd, 0x04, 0x2f, 0x6e, 0xbd, 0x41, 0xb2, 0x51, 0x3e, 0x56, 0x54, 0xe7,
+ 0x3d, 0x0c, 0x44, 0xbb, 0xbd, 0xa4, 0xce, 0x8b, 0x3c, 0xad, 0x8a, 0xec, 0x3d,
+ 0xf7, 0xc9, 0x44, 0xbd, 0xc5, 0xdc, 0x2a, 0x3b, 0xde, 0x9e, 0xb6, 0x3d, 0x20,
+ 0x2c, 0x1c, 0xbe, 0x04, 0x0c, 0x9f, 0xbd, 0x41, 0x5f, 0xd4, 0xbd, 0x76, 0x92,
+ 0x06, 0xbe, 0x6a, 0x98, 0x30, 0xbe, 0xc4, 0xa0, 0xd3, 0x3c, 0x38, 0x33, 0xf5,
+ 0xbd, 0x94, 0x28, 0x0d, 0xbd, 0x42, 0x60, 0x1e, 0x3d, 0xfd, 0x72, 0xca, 0x3d,
+ 0xee, 0xf6, 0x0d, 0x3e, 0x35, 0xb3, 0x27, 0x3e, 0x15, 0xde, 0x08, 0xbe, 0x34,
+ 0xc4, 0x8b, 0xbd, 0x4a, 0x4f, 0x9a, 0x3d, 0x87, 0x8f, 0x06, 0xbc, 0x68, 0x43,
+ 0x10, 0xbd, 0x36, 0x40, 0xb6, 0xbc, 0xf2, 0xad, 0x82, 0xbd, 0xc5, 0xef, 0x13,
+ 0xbe, 0x4c, 0x38, 0xcd, 0xbd, 0x4a, 0xdf, 0x9d, 0x3c, 0x9d, 0xb0, 0x9a, 0x3d,
+ 0xe8, 0xf7, 0xd4, 0x3d, 0x9d, 0x50, 0x34, 0x3d, 0xc9, 0x92, 0xdf, 0x3d, 0x20,
+ 0x66, 0xeb, 0x3d, 0x54, 0x5c, 0x85, 0xbd, 0x2d, 0x0e, 0xc6, 0x3d, 0x90, 0xea,
+ 0x64, 0xbd, 0xcd, 0xa5, 0x5c, 0xbd, 0x77, 0x8d, 0x7b, 0x3d, 0xf7, 0xda, 0x98,
+ 0xbd, 0xc2, 0x98, 0xcb, 0x3d, 0x79, 0xa4, 0x2d, 0x3d, 0x52, 0x42, 0x15, 0x3e,
+ 0xc5, 0x68, 0x47, 0xbd, 0xbf, 0xa0, 0xe7, 0xbd, 0xbf, 0xa4, 0xbd, 0x3b, 0x6f,
+ 0xe3, 0x05, 0xbd, 0xd3, 0xda, 0xdb, 0xbd, 0x40, 0x3a, 0xa8, 0xbd, 0x87, 0x88,
+ 0x36, 0xbe, 0xaf, 0x1d, 0xe5, 0x3d, 0xf6, 0xe8, 0x2e, 0xbe, 0xbc, 0x78, 0x9b,
+ 0x3d, 0x8b, 0x27, 0xf6, 0xbd, 0x18, 0x45, 0xef, 0xbd, 0x8c, 0x3f, 0x3e, 0x3e,
+ 0x94, 0x69, 0x16, 0xbe, 0x4f, 0xce, 0x48, 0xbe, 0x0c, 0xfa, 0x0b, 0xbc, 0x01,
+ 0x50, 0x37, 0x3e, 0x87, 0x13, 0x0b, 0xbe, 0xd0, 0xb1, 0x38, 0x3e, 0x71, 0x2c,
+ 0xa1, 0x3d, 0x4a, 0x15, 0xb4, 0xbd, 0x80, 0x28, 0x2b, 0xbd, 0xc7, 0x3d, 0x7e,
+ 0x3c, 0xe5, 0xe1, 0xf1, 0x3d, 0x43, 0x56, 0x2c, 0x3d, 0x18, 0xba, 0x20, 0xbe,
+ 0x4e, 0x30, 0x8d, 0x3d, 0x0b, 0x52, 0x20, 0x3b, 0x2d, 0xbc, 0x48, 0xbd, 0xf8,
+ 0xff, 0xcf, 0xbb, 0x34, 0xb2, 0xaf, 0x3c, 0xea, 0xad, 0xf0, 0x3d, 0xed, 0xbd,
+ 0x8d, 0x3d, 0x41, 0x8c, 0xde, 0xbd, 0xb0, 0xb4, 0x32, 0x3e, 0xf8, 0x16, 0x2e,
+ 0xbe, 0x0c, 0x4a, 0x8c, 0x3d, 0x89, 0x92, 0x13, 0x3e, 0x8b, 0xd2, 0xbb, 0xbd,
+ 0xf5, 0xce, 0x0f, 0x3e, 0x31, 0x82, 0x7b, 0xbb, 0x7f, 0xac, 0x0e, 0x3e, 0x9f,
+ 0xe7, 0x0a, 0xbe, 0x5b, 0xef, 0x2b, 0x3d, 0xa9, 0x7f, 0x0d, 0x3e, 0xa4, 0xc0,
+ 0xde, 0x3d, 0xde, 0x0d, 0xbc, 0xbc, 0x59, 0x6f, 0x81, 0x3a, 0x46, 0x0c, 0x1b,
+ 0xbe, 0xd0, 0xba, 0xf5, 0xbc, 0xe5, 0x6d, 0x1d, 0x3e, 0x31, 0x08, 0x5a, 0x3d,
+ 0xab, 0x1c, 0xb5, 0xbc, 0xe7, 0xaa, 0x18, 0x3e, 0xaa, 0xcc, 0x14, 0x3e, 0x4e,
+ 0x1e, 0x08, 0xbd, 0xfc, 0x9f, 0xbe, 0xbd, 0x44, 0x7b, 0x2b, 0xbe, 0xf1, 0xfa,
+ 0x90, 0x3c, 0xa4, 0x75, 0x16, 0xbe, 0x27, 0x3b, 0x05, 0xbe, 0xf3, 0x41, 0xde,
+ 0xbd, 0xb9, 0x96, 0x10, 0xbd, 0xd0, 0x44, 0x6a, 0x3b, 0x5b, 0x04, 0x02, 0xbe,
+ 0x3c, 0xf7, 0x41, 0xbd, 0xe6, 0xaf, 0x06, 0xbe, 0x52, 0x74, 0x08, 0x3e, 0xda,
+ 0x81, 0x54, 0x3d, 0xcd, 0xe8, 0xbc, 0x3d, 0xf8, 0x07, 0xdc, 0x3d, 0x84, 0x6f,
+ 0xd8, 0xbd, 0xe0, 0x65, 0x2a, 0x3e, 0x04, 0xae, 0xe1, 0xbd, 0x34, 0xd5, 0x27,
+ 0xbd, 0x5c, 0xb4, 0x70, 0xbd, 0x0d, 0x68, 0xfa, 0x3d, 0x04, 0xb0, 0xc5, 0xbd,
+ 0xa0, 0xf7, 0x87, 0x3d, 0xdc, 0x08, 0x18, 0x3e, 0x86, 0xb9, 0x0f, 0xbe, 0x21,
+ 0x03, 0x75, 0x3d, 0x2b, 0x4f, 0x15, 0xbd, 0x3c, 0x86, 0x8e, 0xbc, 0xc7, 0xd0,
+ 0x73, 0x3d, 0xe0, 0x50, 0x37, 0x3c, 0xd6, 0x8d, 0xce, 0x3d, 0x3b, 0x42, 0x1b,
+ 0x3e, 0xa9, 0xfc, 0x29, 0x3e, 0xe4, 0x58, 0x1d, 0x3d, 0x5d, 0xab, 0x3b, 0xbe,
+ 0x28, 0x32, 0x07, 0xbd, 0x54, 0x37, 0x9c, 0x3d, 0xd4, 0xdd, 0x04, 0x3d, 0x28,
+ 0xe1, 0xad, 0xbc, 0x98, 0x0e, 0x13, 0x3e, 0xae, 0x57, 0x2a, 0xbe, 0xc4, 0xf0,
+ 0x70, 0xbd, 0xf9, 0x8d, 0x0d, 0xbe, 0x5e, 0x46, 0x17, 0xbe, 0x90, 0x6a, 0xbc,
+ 0x3d, 0x12, 0xa1, 0xf3, 0xbd, 0x0f, 0xf9, 0x88, 0xbd, 0x60, 0xd9, 0x2f, 0xbd,
+ 0x07, 0x99, 0xa2, 0xbd, 0x0b, 0xa5, 0x1b, 0xbc, 0x92, 0x9d, 0xaf, 0xbc, 0x37,
+ 0xf5, 0x5a, 0x3c, 0x88, 0xf0, 0xcf, 0x3d, 0x96, 0xdd, 0x54, 0x3d, 0x2f, 0xd2,
+ 0x0a, 0x3e, 0xe5, 0xbd, 0x46, 0x3c, 0xd2, 0x65, 0xcb, 0xbd, 0x19, 0x00, 0x0b,
+ 0xbe, 0xd6, 0xf6, 0xb0, 0x3d, 0x39, 0xc2, 0x14, 0x3e, 0x44, 0x63, 0x3f, 0x3e,
+ 0x4a, 0x6c, 0x1d, 0x3e, 0xf3, 0x6a, 0xe1, 0xbc, 0x31, 0xa5, 0x28, 0xbe, 0x54,
+ 0x4d, 0x49, 0xbd, 0xd4, 0xbf, 0x64, 0xbd, 0xec, 0x58, 0xbc, 0xbd, 0xff, 0xc6,
+ 0xd0, 0x3c, 0xb7, 0xf1, 0xa7, 0x3d, 0x55, 0x15, 0x26, 0xbd, 0xe6, 0x14, 0xe2,
+ 0x3c, 0x6b, 0x28, 0x05, 0x3e, 0x83, 0xaf, 0xbc, 0xbd, 0xc6, 0xb7, 0x6a, 0x3d,
+ 0x6f, 0xa9, 0x01, 0x3e, 0x93, 0x78, 0x62, 0xb9, 0x23, 0x46, 0x3f, 0xbd, 0x89,
+ 0xbd, 0x88, 0x3d, 0x4d, 0xeb, 0xa0, 0x3d, 0x5e, 0x68, 0x74, 0xbd, 0x3d, 0xe2,
+ 0x86, 0xbd, 0x11, 0x15, 0x62, 0xbd, 0x01, 0xde, 0xc8, 0xbd, 0xf0, 0x96, 0xc0,
+ 0xbd, 0xf4, 0x9d, 0xff, 0xbd, 0x04, 0xcb, 0x80, 0x3c, 0x4f, 0x43, 0x35, 0x3d,
+ 0x65, 0x45, 0x6c, 0x3d, 0x45, 0x55, 0xaa, 0xbc, 0xe1, 0x1a, 0x59, 0x3d, 0x4c,
+ 0x54, 0x20, 0xbe, 0x35, 0xaf, 0xe3, 0x3d, 0xd2, 0x5e, 0xae, 0xbd, 0xa7, 0xaa,
+ 0x15, 0x3e, 0xea, 0x3c, 0xe9, 0x3c, 0xa4, 0xc9, 0x08, 0xbe, 0xca, 0xec, 0x82,
+ 0x3b, 0x8b, 0x49, 0xfa, 0xbd, 0x9d, 0x1e, 0x8b, 0xbc, 0x1b, 0xb4, 0xed, 0xbd,
+ 0x1d, 0xbe, 0xc9, 0x3d, 0x8c, 0xdf, 0x2a, 0xbe, 0x8c, 0xba, 0xe3, 0x3d, 0x1f,
+ 0xa2, 0x14, 0x3d, 0x61, 0xf2, 0xcf, 0xba, 0xd5, 0x67, 0x88, 0xbd, 0xa7, 0xd0,
+ 0x5d, 0x3e, 0x71, 0x6e, 0xfd, 0x3d, 0xd5, 0xcf, 0x02, 0xbd, 0x0c, 0x25, 0xb5,
+ 0x3c, 0xa6, 0x27, 0x90, 0x3c, 0x86, 0x80, 0x1c, 0x3e, 0x41, 0x4f, 0x02, 0xbe,
+ 0xe1, 0x7a, 0x28, 0x3e, 0xef, 0xf7, 0x96, 0xbd, 0x0f, 0x11, 0xd3, 0x3d, 0xd9,
+ 0x11, 0x00, 0x3e, 0x77, 0x16, 0x98, 0x3d, 0x6a, 0xbc, 0x03, 0xbe, 0xbc, 0x2b,
+ 0xc9, 0xbd, 0xc0, 0xc5, 0x99, 0x3d, 0xf4, 0x17, 0xc9, 0x3d, 0x37, 0xc7, 0xea,
+ 0x3d, 0xd0, 0x01, 0x29, 0xbe, 0xae, 0xfd, 0x37, 0xbd, 0x7a, 0xce, 0xba, 0xbc,
+ 0x7d, 0x16, 0x19, 0x3e, 0x2b, 0x5f, 0x32, 0x3a, 0x54, 0x01, 0x96, 0xbd, 0xd6,
+ 0xb6, 0x73, 0x3c, 0x8f, 0x5c, 0xa9, 0x3c, 0x67, 0x4e, 0xac, 0x3d, 0x52, 0x49,
+ 0xab, 0x3d, 0x05, 0x07, 0x29, 0x3e, 0x43, 0x4c, 0x28, 0xbe, 0x0c, 0x1a, 0x12,
+ 0xbe, 0x05, 0x18, 0x3c, 0x3c, 0x29, 0x0f, 0x22, 0x3e, 0xf3, 0x49, 0x54, 0x3e,
+ 0xbf, 0xcd, 0x46, 0x3d, 0xea, 0x9f, 0x53, 0x3d, 0xf6, 0xcc, 0xb5, 0x3d, 0x80,
+ 0x51, 0x9e, 0x3d, 0xff, 0xc1, 0x69, 0x3d, 0x94, 0x19, 0x41, 0xbd, 0x7b, 0x33,
+ 0x75, 0x3c, 0x9e, 0x51, 0x2f, 0x3e, 0x58, 0x6e, 0x21, 0x3c, 0x46, 0x38, 0x22,
+ 0x3e, 0x73, 0xf9, 0x15, 0xbe, 0xfa, 0x12, 0x04, 0xbe, 0xaf, 0x1d, 0x1e, 0xbe,
+ 0xad, 0x03, 0x11, 0xbe, 0xb3, 0xa7, 0x07, 0x3d, 0x4b, 0x76, 0x58, 0xbd, 0x68,
+ 0xaa, 0x21, 0xbe, 0x18, 0xb3, 0x24, 0xbe, 0x59, 0xa7, 0x9d, 0xbd, 0x8a, 0x64,
+ 0x92, 0x3d, 0xf4, 0xe8, 0x00, 0xbe, 0xed, 0xd4, 0x85, 0x3c, 0x77, 0x84, 0xf0,
+ 0xbd, 0x3f, 0x0d, 0x37, 0x3e, 0x2c, 0x42, 0x64, 0x3c, 0x5b, 0x23, 0x27, 0x3e,
+ 0x3e, 0xc6, 0xb0, 0x3d, 0x1c, 0xba, 0xfe, 0xbc, 0xcf, 0xde, 0xb4, 0xbc, 0x97,
+ 0x05, 0x1c, 0xbd, 0x0d, 0xa5, 0x92, 0xbb, 0x6a, 0x79, 0x50, 0x3e, 0x62, 0x30,
+ 0x19, 0x3e, 0xd7, 0x23, 0x02, 0x3e, 0x9d, 0xc1, 0x7e, 0x3d, 0xb5, 0x03, 0x9c,
+ 0xbd, 0x7b, 0xc5, 0x72, 0x3d, 0xc3, 0xd4, 0x22, 0xbe, 0x55, 0x27, 0x63, 0x3d,
+ 0xb7, 0x8f, 0x2e, 0xbe, 0x18, 0xe1, 0xbd, 0xbd, 0xa9, 0x10, 0xf0, 0xbd, 0x51,
+ 0xd4, 0x4d, 0x3d, 0x62, 0x08, 0xe2, 0x3d, 0x3b, 0xf4, 0x5e, 0x3d, 0xa1, 0xeb,
+ 0xb4, 0x3d, 0xed, 0x6f, 0x72, 0x3d, 0x1c, 0x3b, 0xba, 0xbd, 0x56, 0xa6, 0xc8,
+ 0xbd, 0x1e, 0x39, 0x3b, 0xbe, 0x83, 0xc7, 0xb4, 0x3d, 0x04, 0xe6, 0xd6, 0x3d,
+ 0x2a, 0x2c, 0x91, 0x3d, 0x78, 0x72, 0x9f, 0x3d, 0x62, 0xf9, 0xdd, 0xbd, 0x21,
+ 0x97, 0x28, 0xbe, 0x52, 0xaa, 0x06, 0x3e, 0x55, 0x9e, 0x26, 0xbe, 0xb0, 0x2a,
+ 0x4f, 0xbd, 0x72, 0x66, 0xeb, 0x3c, 0xa8, 0x84, 0xed, 0x3d, 0x02, 0xca, 0xaf,
+ 0xbd, 0xbd, 0x90, 0x64, 0xbd, 0x91, 0xd5, 0x81, 0xbd, 0xcd, 0x4a, 0x24, 0x3e,
+ 0x57, 0x13, 0x44, 0xbd, 0x35, 0x93, 0x1b, 0xbb, 0x9e, 0x75, 0xe0, 0x3d, 0x86,
+ 0xfb, 0x25, 0xbe, 0x7a, 0xe1, 0xe5, 0x3d, 0x15, 0x97, 0x28, 0x3d, 0xa5, 0x78,
+ 0xe4, 0x3d, 0x22, 0xf8, 0x0d, 0x3d, 0x18, 0xbb, 0xcb, 0xbc, 0xfc, 0x53, 0x99,
+ 0xbd, 0xd5, 0x40, 0xcc, 0xbd, 0x2e, 0x47, 0xf6, 0x3d, 0xd0, 0x5c, 0x1c, 0xbb,
+ 0xac, 0x38, 0xb3, 0x3c, 0x25, 0xfd, 0x8e, 0x3c, 0xd0, 0xc9, 0x4c, 0xbd, 0x37,
+ 0xc4, 0xfe, 0xbd, 0x1d, 0xca, 0x17, 0xbe, 0x54, 0x50, 0x8f, 0xbd, 0xc1, 0xfb,
+ 0xed, 0xbd, 0xb9, 0x2f, 0x24, 0x3e, 0xc0, 0x6d, 0x1c, 0xbe, 0xe2, 0xd7, 0x95,
+ 0x3d, 0x21, 0xa6, 0x7c, 0x3d, 0x1b, 0x02, 0x3c, 0x3d, 0xc6, 0x73, 0x4b, 0x3d,
+ 0x28, 0x7a, 0xcf, 0x3d, 0x6c, 0x4f, 0xf5, 0x3c, 0x0a, 0x47, 0x88, 0xbd, 0xe1,
+ 0xc9, 0x39, 0xbe, 0x0d, 0x2d, 0x04, 0x3c, 0x80, 0xf8, 0xd7, 0xbb, 0x8e, 0xa6,
+ 0xf3, 0xbd, 0x10, 0x3c, 0xe1, 0x3d, 0xde, 0x10, 0xb2, 0xbd, 0x9c, 0x3f, 0x46,
+ 0xbd, 0xd4, 0x42, 0x01, 0x3e, 0x63, 0x0f, 0x82, 0x3d, 0xab, 0x71, 0xe9, 0xbd,
+ 0x06, 0xe4, 0x11, 0x3e, 0x12, 0x15, 0x0a, 0xbe, 0x46, 0x0a, 0x5a, 0xbd, 0x83,
+ 0xff, 0x9a, 0xbc, 0xe4, 0x96, 0xdc, 0xbd, 0xc7, 0xaf, 0x7a, 0x3d, 0x64, 0x84,
+ 0xbe, 0x3d, 0x90, 0x0c, 0x04, 0xbd, 0xb4, 0x26, 0xb1, 0xbc, 0x35, 0xf6, 0x23,
+ 0x3e, 0x81, 0x0c, 0x89, 0xbd, 0x8a, 0xe7, 0xd7, 0xbc, 0x3b, 0xce, 0xa5, 0x3d,
+ 0xc1, 0x40, 0x83, 0x3d, 0x44, 0x14, 0x9a, 0x3d, 0xeb, 0x57, 0xbe, 0x3c, 0xde,
+ 0x7c, 0x01, 0x3d, 0xa0, 0x13, 0xe4, 0xbc, 0x54, 0xae, 0xca, 0x3d, 0x9d, 0xd5,
+ 0xc7, 0x3b, 0x59, 0x7b, 0xfc, 0xbd, 0xae, 0x12, 0x00, 0x3e, 0x79, 0xac, 0x07,
+ 0x3e, 0x40, 0x9b, 0x83, 0xbd, 0x7b, 0xb9, 0xeb, 0xbb, 0x12, 0x58, 0xf6, 0x3d,
+ 0x10, 0x80, 0x8c, 0xbd, 0x73, 0x18, 0xc8, 0xbd, 0x5e, 0x85, 0xbc, 0xbd, 0xf4,
+ 0x7c, 0xd0, 0xbd, 0x3b, 0x06, 0x66, 0xbd, 0x88, 0xaf, 0x82, 0xbc, 0x43, 0x81,
+ 0x80, 0x3d, 0x03, 0x7a, 0x20, 0x3e, 0xc1, 0x44, 0xd1, 0x3c, 0x2f, 0xa0, 0x76,
+ 0x3d, 0x63, 0x3e, 0x06, 0x3c, 0x80, 0xb6, 0xa4, 0x3d, 0x6d, 0x3d, 0x20, 0x3e,
+ 0xee, 0xe4, 0xb3, 0x3d, 0x3f, 0xb3, 0xfc, 0x3c, 0x66, 0x46, 0x52, 0x3e, 0x93,
+ 0x86, 0x14, 0xbd, 0x1f, 0x77, 0x8e, 0xbd, 0x99, 0x66, 0x88, 0x3c, 0xbb, 0xb7,
+ 0xc1, 0x3d, 0x30, 0x43, 0xcd, 0xbd, 0xd6, 0x81, 0xbe, 0x39, 0x60, 0x9d, 0x21,
+ 0xbe, 0x77, 0xb4, 0x16, 0x3e, 0x50, 0x6b, 0x88, 0xbb, 0xbe, 0x2a, 0xe1, 0xbc,
+ 0x7e, 0xfb, 0x13, 0xbe, 0x04, 0xd2, 0x01, 0x3e, 0xd7, 0xf2, 0xfb, 0xbd, 0xa1,
+ 0x97, 0xa5, 0x3d, 0x51, 0xb1, 0x1d, 0x3e, 0xa6, 0xe9, 0x11, 0x3e, 0x28, 0xe3,
+ 0xb0, 0xbc, 0xd6, 0xd7, 0xcf, 0xbd, 0xf7, 0x89, 0x10, 0x3e, 0x2d, 0x9d, 0x0b,
+ 0xbe, 0x08, 0x0a, 0x0e, 0xbd, 0xc7, 0x1e, 0x08, 0x3d, 0x18, 0x40, 0xad, 0xbd,
+ 0xef, 0x48, 0x05, 0xbd, 0xf6, 0xc0, 0x23, 0xbe, 0xf6, 0x7d, 0xa6, 0x3d, 0x05,
+ 0xb5, 0x6c, 0x3d, 0x7f, 0x05, 0xd4, 0xbd, 0xd5, 0x2a, 0x1f, 0x3e, 0x60, 0x90,
+ 0xee, 0xbd, 0x82, 0x03, 0x26, 0xbd, 0x27, 0x9d, 0x05, 0xbd, 0x2d, 0x05, 0x9c,
+ 0x3c, 0xa0, 0x72, 0xef, 0x3d, 0x4a, 0xd9, 0xad, 0x3d, 0x9f, 0x2a, 0x46, 0xbd,
+ 0x47, 0x6e, 0xfb, 0xbc, 0x43, 0x4b, 0xde, 0xbd, 0xf0, 0x40, 0x97, 0x3d, 0xd9,
+ 0xf7, 0xe1, 0xbd, 0xbd, 0xae, 0xce, 0x3c, 0x79, 0xae, 0x8c, 0xbd, 0x34, 0xc9,
+ 0x34, 0xbe, 0x99, 0x0a, 0xae, 0xbd, 0xae, 0xe2, 0xe9, 0x3d, 0xe7, 0x97, 0xf7,
+ 0x3d, 0xd1, 0x30, 0x05, 0x3e, 0x14, 0xd3, 0x0c, 0x3d, 0xcd, 0x90, 0x63, 0x3d,
+ 0x50, 0xac, 0x27, 0xbd, 0x06, 0x6c, 0x30, 0xbe, 0x31, 0x20, 0xa1, 0xbd, 0xf3,
+ 0x98, 0x87, 0x3d, 0x31, 0x34, 0xac, 0xbd, 0x2e, 0xc3, 0xb3, 0xbb, 0xec, 0xb6,
+ 0x4d, 0xbd, 0x6f, 0x2c, 0x02, 0xbc, 0xcc, 0xcb, 0x80, 0xbd, 0x7b, 0x15, 0x29,
+ 0xbe, 0x8f, 0xb6, 0x8b, 0x3c, 0xca, 0x8b, 0x51, 0xbd, 0x64, 0x5f, 0x45, 0xbd,
+ 0x0f, 0xa3, 0xa4, 0x3d, 0xed, 0x79, 0x9c, 0xbd, 0x31, 0xa0, 0xbb, 0x3d, 0xe9,
+ 0x06, 0x26, 0x3e, 0x85, 0x78, 0x21, 0x3e, 0x81, 0x35, 0xcd, 0xbd, 0x05, 0x31,
+ 0x11, 0xbe, 0x9d, 0x19, 0xde, 0xbd, 0x9a, 0xd3, 0x11, 0xbe, 0x58, 0xa7, 0xff,
+ 0xbc, 0x9f, 0x4a, 0x29, 0x3d, 0xda, 0x56, 0x8c, 0xbc, 0xf6, 0xf9, 0x79, 0x3d,
+ 0x11, 0xbe, 0x82, 0x3d, 0xda, 0x43, 0x04, 0x3e, 0xed, 0xce, 0xe1, 0x3d, 0x3a,
+ 0x95, 0x3a, 0x3d, 0x56, 0x31, 0x4e, 0x3d, 0x82, 0x65, 0xbd, 0x3b, 0x4c, 0x6f,
+ 0xa8, 0xbc, 0xa4, 0xa1, 0x25, 0xbc, 0xad, 0x79, 0x2f, 0xbe, 0x73, 0xac, 0x2b,
+ 0x3e, 0x2d, 0x80, 0x3f, 0xbd, 0x97, 0xee, 0x80, 0xbd, 0xd8, 0x02, 0x77, 0x3d,
+ 0xb2, 0xcb, 0x9b, 0x3d, 0x7c, 0x94, 0xc9, 0xbd, 0xce, 0xd1, 0xdd, 0x3d, 0x12,
+ 0xef, 0x8b, 0x3d, 0x3a, 0xbe, 0x08, 0x3e, 0x73, 0x80, 0x1d, 0xbe, 0x2f, 0xdb,
+ 0x2d, 0xbe, 0x58, 0x7d, 0xd7, 0xbd, 0x44, 0x0f, 0xae, 0x3d, 0xd6, 0xe7, 0x3d,
+ 0x3e, 0xe0, 0x3a, 0xad, 0x3c, 0x7b, 0x10, 0x19, 0x3e, 0x1b, 0x4e, 0x78, 0xbd,
+ 0x3f, 0xf3, 0x07, 0xbe, 0x8c, 0xcc, 0xf7, 0xbd, 0x5a, 0x20, 0xb9, 0xbd, 0x53,
+ 0x04, 0x34, 0x3d, 0x6b, 0xcf, 0x24, 0x3e, 0x32, 0x1b, 0xc2, 0xbd, 0x92, 0x01,
+ 0xee, 0x3c, 0x79, 0x75, 0xd8, 0xbd, 0xdf, 0x4b, 0x0a, 0x3c, 0xf3, 0x93, 0xce,
+ 0x3d, 0x76, 0xf7, 0x31, 0xbd, 0xd7, 0x71, 0x17, 0xbe, 0xac, 0xed, 0x1f, 0xbe,
+ 0xb5, 0x4d, 0x46, 0x3d, 0xb0, 0xb9, 0x0b, 0xbe, 0x02, 0xb8, 0x9f, 0x3d, 0x7d,
+ 0x42, 0x28, 0xbe, 0x65, 0x07, 0xc7, 0x3d, 0xb2, 0xd4, 0xb5, 0x3d, 0x28, 0x07,
+ 0xd3, 0x3c, 0x55, 0x93, 0x2c, 0xbe, 0x79, 0x7c, 0x29, 0x3e, 0x59, 0x10, 0x0a,
+ 0xbe, 0x9d, 0x0a, 0x08, 0xbd, 0xa3, 0x61, 0x5d, 0x3d, 0xf8, 0xb5, 0xde, 0xbb,
+ 0x54, 0x24, 0xa7, 0x3d, 0xe3, 0xe4, 0x32, 0xbe, 0x20, 0x3b, 0x3d, 0xbe, 0x48,
+ 0x67, 0xc2, 0xbd, 0x3c, 0x7b, 0x2b, 0xbd, 0x69, 0xee, 0x56, 0xbd, 0xa9, 0x90,
+ 0xcb, 0x3d, 0xff, 0xf1, 0xa7, 0xbd, 0xa9, 0xd8, 0x43, 0xbd, 0xb8, 0xcd, 0xb7,
+ 0x3c, 0xcd, 0xfb, 0xbb, 0x3d, 0xd6, 0x26, 0x8a, 0xbd, 0x45, 0xa4, 0x81, 0x3d,
+ 0xd2, 0xc9, 0x29, 0x3e, 0xdb, 0xf4, 0xdd, 0xbd, 0x93, 0x95, 0xa9, 0x3d, 0x11,
+ 0xbb, 0x12, 0x3e, 0xdf, 0xf4, 0xcd, 0xbd, 0xb9, 0xde, 0x82, 0x3c, 0xdf, 0x26,
+ 0x76, 0x3d, 0xb6, 0x47, 0x32, 0xbe, 0x91, 0x0f, 0x6f, 0x3b, 0x56, 0x16, 0x4c,
+ 0xbe, 0x77, 0x77, 0x00, 0xbe, 0x2c, 0x1f, 0xd1, 0xbd, 0xf6, 0x43, 0x12, 0x3e,
+ 0xd8, 0x7c, 0x16, 0x3e, 0x26, 0xec, 0x0c, 0xbe, 0xaf, 0x69, 0xe0, 0x3d, 0x5a,
+ 0x3b, 0xdf, 0x3d, 0xbb, 0x0f, 0x99, 0x3d, 0xe2, 0x32, 0x2b, 0xbd, 0xf3, 0x1e,
+ 0x1d, 0x3e, 0x9e, 0xdc, 0xf3, 0x3c, 0x77, 0x8b, 0xf7, 0xbd, 0x46, 0xb5, 0x48,
+ 0xbc, 0x28, 0xce, 0xbd, 0x3c, 0x22, 0x68, 0x1a, 0x3e, 0x92, 0x40, 0xf0, 0x3c,
+ 0x35, 0xf1, 0xbe, 0xbd, 0x8d, 0xed, 0xd0, 0x3d, 0x93, 0x67, 0x5e, 0xbd, 0xc8,
+ 0xa3, 0xb0, 0xbd, 0x83, 0x61, 0x2f, 0x3d, 0x39, 0xce, 0x81, 0x3b, 0xa5, 0x87,
+ 0x1d, 0x3e, 0xe0, 0x8f, 0x38, 0x3c, 0xce, 0x6f, 0x26, 0x3d, 0x09, 0x7f, 0x9a,
+ 0x3d, 0x6c, 0x04, 0x8f, 0xbd, 0x31, 0x13, 0x9c, 0xbb, 0xab, 0xbc, 0x3f, 0xbd,
+ 0xe1, 0x11, 0xc2, 0xbd, 0x47, 0xa8, 0x3a, 0x3d, 0x76, 0xc5, 0x0b, 0xbe, 0x0d,
+ 0x71, 0xff, 0x3d, 0x30, 0x8e, 0x41, 0x3d, 0xdc, 0xf6, 0x2d, 0xbe, 0x1a, 0x84,
+ 0x1f, 0x3d, 0xe2, 0xd4, 0x09, 0x3e, 0xe7, 0x1f, 0x1d, 0xbd, 0x20, 0x25, 0x26,
+ 0x3d, 0x68, 0x8f, 0x61, 0x3d, 0xe7, 0xdf, 0x1f, 0xbe, 0xad, 0x57, 0x1b, 0xbe,
+ 0x3e, 0xec, 0x1b, 0xbe, 0x6f, 0xe4, 0x09, 0xbe, 0x87, 0x7d, 0xb5, 0xbc, 0xce,
+ 0x89, 0x07, 0x3d, 0x8a, 0x34, 0xbe, 0x3b, 0x7a, 0x7d, 0x24, 0x3e, 0xde, 0xc8,
+ 0xfa, 0x3d, 0xa4, 0xc7, 0x9e, 0xbd, 0x5b, 0x97, 0xf0, 0xbd, 0x16, 0xf7, 0x3b,
+ 0xbe, 0x91, 0xad, 0x27, 0x3e, 0x06, 0x69, 0xf3, 0xbd, 0x6d, 0xb9, 0xe6, 0xbd,
+ 0xfc, 0xa1, 0x33, 0x3e, 0x73, 0x47, 0xd4, 0xbd, 0xd1, 0x35, 0xc0, 0x3d, 0x74,
+ 0x47, 0x12, 0x3d, 0x2d, 0x04, 0x23, 0x3d, 0xfc, 0xc6, 0x1b, 0x3d, 0x75, 0x18,
+ 0x0e, 0xbe, 0xa5, 0x96, 0x55, 0x3c, 0xb8, 0x10, 0xad, 0xbc, 0x93, 0x9b, 0xde,
+ 0xbd, 0x9f, 0xa2, 0xf4, 0x3d, 0xb8, 0x21, 0xf6, 0xba, 0xd7, 0x96, 0x09, 0xbd,
+ 0x2a, 0x6c, 0xd9, 0xbd, 0xb1, 0x32, 0x45, 0x3d, 0xc0, 0x16, 0x94, 0xbd, 0x78,
+ 0xac, 0x97, 0xbd, 0x97, 0xd4, 0xdf, 0xbd, 0x68, 0x97, 0x36, 0xbd, 0x28, 0xce,
+ 0x2f, 0x3d, 0x12, 0x02, 0x3d, 0xbd, 0x5b, 0x8f, 0x23, 0x3d, 0xf5, 0xc3, 0xda,
+ 0xba, 0xa6, 0x72, 0x41, 0x3e, 0x27, 0xa9, 0xcd, 0xbd, 0x9c, 0x9a, 0x3c, 0x3d,
+ 0xf2, 0x7f, 0x45, 0x3e, 0x1c, 0x9f, 0x40, 0x3e, 0xa9, 0xdf, 0x74, 0x3c, 0x6a,
+ 0x72, 0x6e, 0xbd, 0x46, 0x83, 0xa5, 0x3d, 0x3b, 0x67, 0x6c, 0x3c, 0xfc, 0x84,
+ 0x2a, 0x3d, 0x3c, 0xf4, 0x35, 0x3e, 0xb4, 0x2c, 0x79, 0xbd, 0x43, 0xb9, 0xd6,
+ 0x3d, 0xe6, 0xae, 0x13, 0xbd, 0xeb, 0x77, 0xd0, 0xbd, 0x31, 0x51, 0xbe, 0x3d,
+ 0x5f, 0x2e, 0x23, 0x3c, 0x7a, 0xbe, 0x15, 0x3e, 0x4b, 0x59, 0xdc, 0xbd, 0xa0,
+ 0x8f, 0xe7, 0xbd, 0x76, 0xa8, 0xf3, 0xbd, 0x88, 0x1c, 0x74, 0x3d, 0x85, 0x4d,
+ 0xdd, 0xbd, 0x45, 0x96, 0x36, 0xbd, 0xe8, 0x39, 0x98, 0x3d, 0xbe, 0x82, 0xf9,
+ 0x3d, 0x1d, 0xdb, 0x2d, 0x3b, 0x6f, 0xac, 0x63, 0xbd, 0x8c, 0xc8, 0xe1, 0xbd,
+ 0xcf, 0x49, 0x73, 0xbd, 0x8a, 0xdd, 0xe3, 0xbd, 0xf8, 0x00, 0x19, 0xbd, 0x17,
+ 0xe8, 0xdf, 0xbd, 0xba, 0x22, 0x5b, 0x3c, 0xf1, 0x54, 0x21, 0xbe, 0x7b, 0x38,
+ 0x58, 0xbd, 0x48, 0x88, 0x67, 0xbd, 0x5e, 0xe2, 0x6c, 0x3d, 0xa5, 0x44, 0x20,
+ 0xbe, 0x69, 0x7f, 0xbf, 0xbc, 0x7c, 0xfa, 0x25, 0x3e, 0xc1, 0xd9, 0xd5, 0xbd,
+ 0x46, 0x87, 0x75, 0xbd, 0x13, 0x1c, 0x01, 0xbd, 0xe5, 0xc3, 0x19, 0xbb, 0x2d,
+ 0xc8, 0x30, 0xbe, 0xad, 0xd8, 0xf2, 0x3d, 0xd9, 0x37, 0x14, 0xbd, 0xd2, 0xb5,
+ 0x9a, 0x3d, 0xf4, 0x37, 0x8d, 0x3c, 0x2f, 0x8f, 0xc0, 0x3d, 0x8e, 0xe9, 0xc5,
+ 0xbd, 0xf5, 0x4d, 0x21, 0xbe, 0xfd, 0x9a, 0xaa, 0xbd, 0x91, 0xb6, 0x00, 0xbe,
+ 0xf0, 0x0d, 0xbf, 0x3c, 0xe4, 0x94, 0xed, 0x3d, 0x64, 0xbe, 0x8d, 0x3c, 0x27,
+ 0xcf, 0x2f, 0x3e, 0x22, 0xa5, 0xf1, 0x3d, 0x96, 0xf2, 0xbf, 0xbd, 0x62, 0xde,
+ 0xe5, 0xbd, 0x4b, 0x4a, 0x89, 0x3d, 0x7a, 0x3c, 0x1d, 0x3e, 0xfc, 0x83, 0xab,
+ 0xbc, 0x0f, 0x00, 0x2e, 0xbe, 0xd5, 0xd1, 0x93, 0x3d, 0x32, 0x51, 0xca, 0xbd,
+ 0x27, 0x77, 0x31, 0xbd, 0x6e, 0xe6, 0xe2, 0x3d, 0xdd, 0xb0, 0x03, 0xbe, 0xd7,
+ 0xec, 0xe5, 0xbd, 0x97, 0x8e, 0x82, 0x3b, 0x7b, 0xaf, 0x03, 0xbe, 0xbe, 0x24,
+ 0xc3, 0x3d, 0x1e, 0x4c, 0x51, 0x3e, 0x07, 0x32, 0x10, 0x3e, 0xac, 0xdb, 0x01,
+ 0xbe, 0xef, 0x14, 0x38, 0x3e, 0x1b, 0xbb, 0x73, 0x3d, 0x6a, 0x42, 0x35, 0xbd,
+ 0x79, 0x72, 0x13, 0xbe, 0x05, 0x8c, 0xe9, 0x3d, 0xc1, 0x57, 0xe5, 0x3b, 0x50,
+ 0x38, 0x71, 0x3d, 0x47, 0xb5, 0xe4, 0xbd, 0x0f, 0x18, 0x01, 0xbe, 0xd6, 0x1c,
+ 0x76, 0x3b, 0x99, 0x36, 0x1c, 0xbe, 0x6d, 0xee, 0x1a, 0x3d, 0x2d, 0xcb, 0x39,
+ 0xbd, 0xc0, 0x54, 0x24, 0x3e, 0xcb, 0x5b, 0xfb, 0x3c, 0x8d, 0xc8, 0x85, 0x3a,
+ 0x10, 0xcb, 0xd6, 0x3c, 0xfd, 0x81, 0xd8, 0x3c, 0xc7, 0xab, 0x1b, 0xba, 0xf5,
+ 0xe1, 0xb5, 0xbd, 0x7a, 0x09, 0xfc, 0x3d, 0x98, 0x7b, 0x6b, 0xbd, 0x31, 0x74,
+ 0x46, 0xbe, 0x13, 0x26, 0x02, 0x3e, 0x67, 0x37, 0x03, 0xbe, 0x68, 0x29, 0xc4,
+ 0xbd, 0x8a, 0xc5, 0x8b, 0xbd, 0x50, 0x23, 0x22, 0xbc, 0x6d, 0x99, 0xf5, 0x3d,
+ 0x01, 0x6c, 0xc5, 0xbd, 0xd6, 0xce, 0x14, 0xbe, 0x29, 0xd4, 0xef, 0xbd, 0x7c,
+ 0xe1, 0x8b, 0x3c, 0x8f, 0x04, 0xd6, 0xbc, 0x29, 0xf1, 0x60, 0x3c, 0x02, 0x1a,
+ 0x2c, 0x3b, 0x76, 0x21, 0x00, 0xbe, 0x16, 0x98, 0x66, 0xbd, 0x2a, 0x64, 0x3f,
+ 0xbd, 0xbf, 0x81, 0x24, 0x3d, 0x30, 0x34, 0x27, 0x3e, 0x90, 0xee, 0x9b, 0x3d,
+ 0xe1, 0x6c, 0xdd, 0x3c, 0x25, 0x40, 0x25, 0x3e, 0xc0, 0x85, 0x57, 0x3b, 0x16,
+ 0xa8, 0x4f, 0x3e, 0xa9, 0xfb, 0x48, 0xbd, 0x38, 0x1c, 0xf8, 0x3b, 0x7a, 0x4a,
+ 0xb0, 0xbd, 0x29, 0xe7, 0xf3, 0xbd, 0xa5, 0x5c, 0x42, 0x3d, 0xab, 0x54, 0x09,
+ 0x3e, 0x94, 0x68, 0x75, 0x3d, 0x24, 0x37, 0x03, 0xbe, 0x4e, 0xba, 0x09, 0x3e,
+ 0x16, 0xba, 0x09, 0x3e, 0xbd, 0x97, 0x00, 0xbe, 0x92, 0xe4, 0x95, 0xbd, 0x74,
+ 0xf5, 0x9f, 0xbd, 0x40, 0x16, 0x81, 0x3d, 0x83, 0x4c, 0x26, 0x3e, 0x61, 0xd1,
+ 0x25, 0x3e, 0xfb, 0x74, 0x1d, 0xbe, 0x9b, 0x9f, 0x0f, 0x3d, 0xe8, 0x7e, 0x10,
+ 0x3d, 0x9e, 0xb0, 0x15, 0x3d, 0x34, 0xe6, 0xee, 0x3d, 0xaf, 0xef, 0xf0, 0xbb,
+ 0xaa, 0x06, 0x24, 0xbe, 0x43, 0x5e, 0xdb, 0x3d, 0x10, 0xd8, 0xa4, 0x3d, 0x6e,
+ 0xc9, 0x0c, 0xbd, 0x1c, 0xfe, 0xa9, 0x3d, 0xf0, 0xf3, 0x31, 0x3d, 0x38, 0xf5,
+ 0x7e, 0xba, 0x24, 0x31, 0xe0, 0x3d, 0x6e, 0xf2, 0xa2, 0x3d, 0xbe, 0x8b, 0xd4,
+ 0xbd, 0x65, 0xc3, 0x25, 0x3c, 0xa3, 0xde, 0x67, 0xba, 0x41, 0xe9, 0x13, 0xbe,
+ 0x83, 0xd0, 0x02, 0xbd, 0x8b, 0x91, 0x3a, 0x3d, 0x29, 0x20, 0x4c, 0xbc, 0xfc,
+ 0x3f, 0xcd, 0xbd, 0x5a, 0x01, 0xae, 0xbd, 0x6c, 0x48, 0x1e, 0xbe, 0xe0, 0x29,
+ 0x80, 0x3d, 0x18, 0x74, 0xa0, 0xbd, 0x2a, 0xeb, 0xbd, 0x39, 0x28, 0xe6, 0x2e,
+ 0xbe, 0x4b, 0x70, 0x59, 0x3d, 0xd7, 0xcf, 0xd7, 0xbc, 0x34, 0x77, 0xa5, 0x3c,
+ 0xef, 0x6d, 0x58, 0xbb, 0x31, 0xcc, 0xde, 0xbb, 0xf6, 0xe6, 0xc2, 0xbd, 0x8b,
+ 0xee, 0x14, 0x3e, 0xf3, 0x70, 0x12, 0xbe, 0x88, 0x93, 0xae, 0xbd, 0x57, 0xd4,
+ 0xfc, 0x3d, 0x48, 0x74, 0x36, 0x3e, 0xb5, 0xcb, 0x08, 0xbe, 0x32, 0x08, 0xbe,
+ 0xbd, 0x95, 0xe2, 0x2e, 0xbd, 0x6c, 0xa0, 0xc3, 0x3d, 0x83, 0xdb, 0xc4, 0x3a,
+ 0xc8, 0x25, 0xf0, 0x3d, 0x8a, 0x78, 0x0f, 0x3e, 0xed, 0xd4, 0x02, 0xbc, 0xd4,
+ 0x18, 0xad, 0xbd, 0x70, 0x10, 0xbf, 0xbd, 0x9f, 0x8e, 0x1c, 0xbe, 0x41, 0xdf,
+ 0xf2, 0x3d, 0x20, 0x72, 0x45, 0x3d, 0x7f, 0x52, 0x16, 0xbe, 0xd7, 0xf4, 0x25,
+ 0xbe, 0x6d, 0x3f, 0x3d, 0x3e, 0xd4, 0xb0, 0x26, 0xbe, 0x23, 0x8c, 0x87, 0x3d,
+ 0x6c, 0x4e, 0xb9, 0xbc, 0x67, 0x6c, 0x44, 0x3c, 0x35, 0x7b, 0xde, 0x3d, 0x19,
+ 0x66, 0xd7, 0x3d, 0x1c, 0xc9, 0xc2, 0x3d, 0xf1, 0xee, 0xba, 0xbd, 0xa3, 0xe1,
+ 0xc8, 0x3d, 0xf5, 0xf9, 0x82, 0x3c, 0x3d, 0x0e, 0x81, 0x3d, 0xea, 0xc7, 0x5d,
+ 0x3d, 0x19, 0x63, 0x25, 0x3e, 0x59, 0x2f, 0x13, 0xbd, 0xf2, 0x44, 0xeb, 0x3d,
+ 0xf0, 0xb5, 0xf1, 0xbc, 0x85, 0x77, 0x03, 0x3d, 0xda, 0x66, 0x11, 0xbd, 0xef,
+ 0xae, 0x1b, 0x3d, 0xe1, 0x4f, 0x94, 0xbd, 0x25, 0x17, 0x56, 0xbd, 0x74, 0x34,
+ 0x0c, 0x3e, 0xf8, 0x12, 0x88, 0x3d, 0x96, 0x08, 0x97, 0xbd, 0x04, 0xb9, 0x75,
+ 0xbc, 0x72, 0x9f, 0x8e, 0x3d, 0x0d, 0xf3, 0x7d, 0xbd, 0x51, 0xe7, 0x56, 0xbc,
+ 0x93, 0x6d, 0x08, 0xbe, 0xa7, 0xd8, 0x09, 0x3e, 0x80, 0xd5, 0xa8, 0xbd, 0x40,
+ 0x03, 0xd1, 0x3c, 0xe2, 0x44, 0x1f, 0xbd, 0x3e, 0x1f, 0xd6, 0xbd, 0x9f, 0x62,
+ 0xe7, 0x3c, 0xf7, 0x6d, 0xae, 0xbd, 0xf4, 0x14, 0xf6, 0x3a, 0x54, 0x99, 0xea,
+ 0x3b, 0x9c, 0xab, 0xf7, 0xbd, 0x74, 0x21, 0xdd, 0x3d, 0x87, 0x18, 0x95, 0xbd,
+ 0x49, 0x55, 0x0c, 0xbe, 0xd6, 0xdc, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20,
+ 0x01, 0x00, 0x00, 0x5a, 0xd4, 0xee, 0x3d, 0x38, 0x39, 0x64, 0x3e, 0x55, 0xb4,
+ 0x79, 0x3d, 0x1d, 0xa3, 0xb9, 0x3d, 0xb9, 0x79, 0xe0, 0x3b, 0x30, 0xff, 0xd1,
+ 0x3d, 0x7a, 0x3b, 0x2d, 0xbd, 0x18, 0x51, 0x07, 0xbe, 0x5c, 0x31, 0x3d, 0x3e,
+ 0x46, 0x0f, 0x51, 0xbe, 0x29, 0x32, 0x13, 0x3e, 0x7c, 0x11, 0xf3, 0xbd, 0x3a,
+ 0xbd, 0x4a, 0xbd, 0x56, 0xb3, 0xce, 0xbd, 0x37, 0xd0, 0xf6, 0x3d, 0xd5, 0x9b,
+ 0xd8, 0x3d, 0xa8, 0xbc, 0x5a, 0xbe, 0x1b, 0x22, 0x0e, 0xbc, 0x03, 0x98, 0xf9,
+ 0x3d, 0x64, 0xf4, 0x47, 0x3e, 0xa2, 0xb5, 0x2f, 0xbe, 0x70, 0x7a, 0x89, 0xbe,
+ 0x9c, 0x58, 0x60, 0x3e, 0x71, 0xac, 0x25, 0xbe, 0x17, 0x1c, 0x01, 0x3e, 0x48,
+ 0x73, 0x93, 0xbd, 0x0d, 0x92, 0xa3, 0x3d, 0xf1, 0xff, 0x62, 0xbe, 0x56, 0xe9,
+ 0x71, 0xbe, 0x09, 0xf7, 0x96, 0xbe, 0x91, 0x7a, 0x0a, 0x3e, 0xc1, 0x6d, 0x88,
+ 0x3c, 0x6c, 0xd0, 0x4f, 0xbe, 0x71, 0x75, 0x99, 0xbd, 0x7d, 0x92, 0x01, 0xbe,
+ 0x35, 0x21, 0x96, 0xbe, 0xd9, 0x0e, 0x2d, 0x3e, 0x63, 0x17, 0x8b, 0x3d, 0x53,
+ 0x6d, 0xb7, 0x3c, 0xb9, 0x06, 0x20, 0x3d, 0xdf, 0x56, 0x11, 0x3e, 0xc4, 0xcd,
+ 0xa9, 0x3c, 0x7d, 0x0a, 0x3b, 0x3e, 0xd6, 0x23, 0x7f, 0xbc, 0xaf, 0x06, 0xc4,
+ 0xbc, 0xe0, 0xe3, 0x63, 0xbd, 0x34, 0x50, 0x2a, 0x3e, 0x1f, 0xff, 0x4c, 0x3e,
+ 0x34, 0x98, 0x79, 0xbe, 0x4c, 0xbd, 0x18, 0x3e, 0x5b, 0x8b, 0x0f, 0x3e, 0x33,
+ 0x44, 0x34, 0xbd, 0xd6, 0xd7, 0x90, 0xbe, 0x51, 0x5e, 0x55, 0x3d, 0x46, 0x2b,
+ 0x54, 0xbe, 0xd8, 0x49, 0x30, 0xbe, 0x45, 0xb3, 0x72, 0xbe, 0x93, 0x18, 0xcd,
+ 0x3d, 0x86, 0xe1, 0x73, 0xbd, 0x94, 0x56, 0xf3, 0x3d, 0x0a, 0x54, 0xd7, 0xbd,
+ 0x01, 0xd9, 0x98, 0x3e, 0xd5, 0x11, 0x01, 0xbb, 0x69, 0x07, 0x62, 0xbe, 0x81,
+ 0x33, 0x03, 0xbb, 0x98, 0xf9, 0x9f, 0x3c, 0xe8, 0x77, 0x96, 0x3e, 0x3a, 0xc2,
+ 0x73, 0x3e, 0xa1, 0x45, 0x35, 0xbe, 0xea, 0x1c, 0x86, 0xbc, 0xad, 0x90, 0x45,
+ 0xbe, 0x0b, 0xd2, 0x03, 0x3d, 0x02, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x00, 0xa1, 0xc6, 0xcd, 0xbe, 0x46, 0xa7, 0xbd, 0x3e, 0x7c,
+ 0xe3, 0x00, 0x3f, 0x13, 0x8d, 0xb6, 0xbe, 0x21, 0x72, 0x8b, 0x3e, 0x16, 0x68,
+ 0x68, 0x3e, 0x05, 0xb7, 0xb6, 0xbe, 0xa0, 0xd3, 0xd4, 0x3e, 0x98, 0x82, 0x83,
+ 0xbd, 0x8c, 0xb1, 0xe2, 0x3d, 0xd6, 0x94, 0x82, 0x3e, 0x07, 0x6a, 0x70, 0xbe,
+ 0x6b, 0x74, 0x0b, 0x3f, 0xd8, 0xf5, 0x3d, 0x3e, 0xfb, 0xf3, 0x19, 0xbd, 0x2c,
+ 0x72, 0xbf, 0x3e, 0xff, 0x95, 0x49, 0x3d, 0xee, 0x70, 0x78, 0x3e, 0xb0, 0x3f,
+ 0x58, 0x3d, 0x78, 0xea, 0x9d, 0xbe, 0x53, 0x1d, 0x15, 0x3f, 0x0d, 0xfc, 0xbe,
+ 0xbe, 0xad, 0x10, 0x07, 0xbf, 0xb4, 0x11, 0x87, 0xbe, 0x20, 0x92, 0x62, 0x3e,
+ 0x58, 0x61, 0xbd, 0x3e, 0xea, 0x54, 0x4a, 0xbd, 0xbd, 0x55, 0xce, 0xbe, 0x12,
+ 0x48, 0xa2, 0x3e, 0xe0, 0x74, 0x90, 0x3d, 0xce, 0x80, 0xf5, 0x3e, 0xa5, 0xb7,
+ 0x15, 0x3f, 0x8e, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01,
+ 0x00, 0x2c, 0xcf, 0x79, 0xbd, 0x8c, 0x37, 0x5a, 0xbc, 0x00, 0x4c, 0x6f, 0x3c,
+ 0x14, 0x0b, 0x8e, 0x3d, 0xa8, 0xc3, 0x12, 0x3c, 0x10, 0x9f, 0xa5, 0xbb, 0xe8,
+ 0x7e, 0x17, 0xbd, 0x43, 0x60, 0x74, 0xbd, 0xc6, 0x62, 0x6f, 0x3d, 0x88, 0x83,
+ 0x6c, 0xbd, 0xf7, 0xf2, 0x36, 0xbd, 0xb7, 0x11, 0x81, 0xbd, 0x69, 0x1c, 0x30,
+ 0xbd, 0xde, 0xd0, 0x4e, 0x3c, 0xa4, 0x9f, 0x6e, 0xbc, 0x06, 0xd8, 0xd6, 0xbc,
+ 0x21, 0x75, 0x5f, 0xbd, 0x68, 0x6f, 0x0c, 0xbc, 0xbd, 0x21, 0xcf, 0xbb, 0x20,
+ 0x31, 0xb0, 0x3b, 0x88, 0xa3, 0x32, 0x3c, 0xa0, 0xec, 0x56, 0x3d, 0x19, 0xfd,
+ 0xf8, 0x3c, 0x99, 0xd1, 0x75, 0x3d, 0x99, 0x54, 0x3d, 0x3c, 0x4d, 0x0f, 0x12,
+ 0x3b, 0x34, 0xf2, 0x37, 0xbd, 0xaa, 0x3b, 0x85, 0xbb, 0x23, 0xfe, 0xde, 0xbb,
+ 0x8a, 0xe4, 0x21, 0x3c, 0xbd, 0x46, 0x8d, 0x3d, 0xd8, 0xf0, 0x03, 0x3d, 0xfa,
+ 0xb6, 0xb6, 0x3c, 0xb8, 0x2e, 0xc9, 0xbc, 0xac, 0x52, 0x4a, 0xbd, 0xd2, 0x5d,
+ 0x00, 0x3c, 0x7d, 0x64, 0x6f, 0xbd, 0xe6, 0x47, 0x77, 0x3d, 0xe0, 0x29, 0xbe,
+ 0x3b, 0x5a, 0xb3, 0xee, 0xbc, 0x40, 0x76, 0xe3, 0xbb, 0x18, 0xf0, 0x8b, 0x3c,
+ 0xbc, 0x5f, 0x3a, 0x3d, 0x47, 0xdd, 0x08, 0x3d, 0x0b, 0xae, 0x39, 0xbc, 0xa1,
+ 0xca, 0xd9, 0xbc, 0xf8, 0x6b, 0x92, 0xbc, 0xf8, 0x2b, 0x42, 0x3d, 0xef, 0x4c,
+ 0x14, 0xbd, 0x64, 0xd7, 0x4b, 0xbd, 0x22, 0x18, 0x18, 0x3c, 0x20, 0xf8, 0x29,
+ 0xbd, 0x00, 0x5d, 0xdd, 0x3a, 0x56, 0x0c, 0x5f, 0xbd, 0x47, 0x5d, 0x84, 0xbd,
+ 0x5e, 0xea, 0xa1, 0x3c, 0xc4, 0x53, 0x89, 0xbd, 0x53, 0xde, 0x4d, 0xbc, 0xe7,
+ 0xc7, 0x88, 0xbc, 0x35, 0xef, 0x56, 0x3d, 0x45, 0x2c, 0xb4, 0x3c, 0xd8, 0x97,
+ 0x7b, 0xbd, 0x17, 0xec, 0x89, 0x3d, 0xe1, 0x90, 0x45, 0x3d, 0x89, 0xf2, 0x3f,
+ 0xbd, 0xf1, 0x11, 0xff, 0xbb, 0x1b, 0x6f, 0x03, 0xbd, 0xf7, 0xf7, 0x3d, 0x3b,
+ 0xc4, 0x7d, 0x91, 0x3c, 0x44, 0x07, 0x0b, 0x3d, 0x4a, 0xc0, 0x6f, 0x3d, 0x79,
+ 0x51, 0x8f, 0x3d, 0x66, 0x5e, 0x41, 0x3d, 0xf1, 0x9b, 0x8c, 0xbd, 0x38, 0xb9,
+ 0xca, 0x3c, 0xe3, 0xf8, 0xe8, 0x3c, 0xcd, 0xce, 0x8f, 0xbb, 0xe4, 0xe9, 0x6b,
+ 0x3c, 0x92, 0xd8, 0x39, 0x3d, 0xbe, 0x6d, 0x52, 0xbd, 0x38, 0xed, 0x4a, 0xbd,
+ 0x68, 0xd4, 0x28, 0xbc, 0x6f, 0x16, 0x67, 0xbd, 0xd7, 0x55, 0x8a, 0x3d, 0xe0,
+ 0x69, 0xb0, 0xbb, 0xfa, 0x9c, 0x93, 0xbd, 0x14, 0xe4, 0x21, 0x3d, 0x96, 0x1c,
+ 0x7b, 0x3d, 0x4c, 0x31, 0x34, 0x3c, 0xa8, 0x41, 0x5c, 0x3c, 0x90, 0xe5, 0x8c,
+ 0x3d, 0x11, 0x9f, 0x98, 0x3c, 0xf0, 0x3d, 0x16, 0x3d, 0x53, 0xd1, 0x91, 0xbd,
+ 0x50, 0xc5, 0xef, 0x3c, 0x25, 0x52, 0x83, 0x3c, 0x9e, 0xce, 0x1f, 0x3c, 0x91,
+ 0xa7, 0x0c, 0xbd, 0xb8, 0x95, 0x03, 0x3c, 0x7a, 0x4c, 0x35, 0x3d, 0x8e, 0xc4,
+ 0x44, 0x3d, 0x1c, 0x66, 0x2c, 0x3d, 0x00, 0x89, 0x40, 0xba, 0xe1, 0xa3, 0x83,
+ 0x3d, 0x68, 0xf2, 0x2b, 0xbd, 0x30, 0xd4, 0xde, 0x3b, 0xcf, 0xa1, 0xbc, 0x3c,
+ 0x24, 0x79, 0x39, 0xbd, 0xe5, 0xf4, 0xb7, 0xbc, 0x79, 0x8d, 0x25, 0x3c, 0x95,
+ 0xb6, 0x38, 0x3d, 0xd8, 0xc2, 0x74, 0x3c, 0xaa, 0x8e, 0x80, 0xbd, 0x0d, 0x74,
+ 0xf3, 0x3c, 0x73, 0x5b, 0x98, 0xbc, 0x00, 0x64, 0x5e, 0xbc, 0x44, 0x82, 0xcb,
+ 0x3c, 0x5a, 0x25, 0x53, 0xbd, 0xe2, 0xd0, 0x93, 0xbd, 0x3b, 0x7a, 0x77, 0xbd,
+ 0x93, 0x3e, 0xd4, 0x3c, 0x39, 0x81, 0x28, 0xbd, 0x54, 0xd5, 0xef, 0x3c, 0x6c,
+ 0x29, 0xe1, 0x3c, 0x69, 0xc8, 0x09, 0x3d, 0x83, 0xb3, 0x36, 0xbd, 0x90, 0xe1,
+ 0xd4, 0xbb, 0x95, 0xa7, 0x1a, 0xbd, 0x39, 0xf5, 0x2b, 0xbc, 0x0c, 0xdf, 0x64,
+ 0xbd, 0x74, 0xec, 0xdc, 0xbc, 0x20, 0xc6, 0x3b, 0x3d, 0x40, 0x20, 0x46, 0x3c,
+ 0x18, 0x09, 0x3f, 0xbd, 0x96, 0x4c, 0xdc, 0xbc, 0x98, 0x98, 0x8d, 0xbd, 0xb4,
+ 0xdd, 0x27, 0xbd, 0x74, 0x45, 0xbb, 0x3c, 0x49, 0xd9, 0x08, 0xbd, 0x8e, 0x06,
+ 0xa8, 0x3b, 0x91, 0x10, 0xb4, 0x3c, 0xf8, 0x58, 0xf3, 0xbc, 0x06, 0xe9, 0x5e,
+ 0x3d, 0x14, 0xc8, 0x26, 0x3d, 0xc5, 0xf7, 0x20, 0xbb, 0x6b, 0x78, 0xc0, 0x3c,
+ 0xae, 0x64, 0x7f, 0x3c, 0xbb, 0xbf, 0x8b, 0x3c, 0x82, 0x4e, 0x0c, 0xbd, 0xb0,
+ 0xd0, 0xdf, 0xbc, 0xfe, 0x53, 0x97, 0xbc, 0x8a, 0x9e, 0x24, 0xbd, 0xdf, 0x79,
+ 0x84, 0x3d, 0x7e, 0xff, 0x8e, 0xbd, 0x66, 0x7b, 0xda, 0x3c, 0xb0, 0xdd, 0x8d,
+ 0xbd, 0xab, 0x91, 0xbb, 0xbc, 0x23, 0x20, 0xb0, 0xbc, 0xbe, 0x43, 0x3f, 0xbd,
+ 0x64, 0x80, 0xda, 0x3c, 0x32, 0x00, 0xde, 0x3c, 0xb2, 0x8a, 0x86, 0x3c, 0x68,
+ 0x45, 0x05, 0x3d, 0x8b, 0x7c, 0xd8, 0x3b, 0x68, 0x97, 0xe7, 0x3c, 0x82, 0x8d,
+ 0x6b, 0x3d, 0xa6, 0x53, 0x2d, 0x3d, 0xc0, 0x43, 0x23, 0x3c, 0xaa, 0xe6, 0x2d,
+ 0xbd, 0x34, 0x06, 0x57, 0xbc, 0xfc, 0x9f, 0x0c, 0xbd, 0x42, 0x77, 0xc6, 0x3c,
+ 0x51, 0x7a, 0x70, 0x3c, 0xe5, 0xe4, 0x7c, 0x3d, 0x86, 0x00, 0x67, 0xbd, 0x95,
+ 0xb8, 0x37, 0xbd, 0xdd, 0x7a, 0x8d, 0x3d, 0x97, 0x08, 0xa9, 0x3c, 0xfd, 0xb6,
+ 0x09, 0x3d, 0xdc, 0xb7, 0x81, 0x3d, 0xe0, 0x6c, 0x68, 0xbc, 0x79, 0x9b, 0x03,
+ 0xbd, 0xb8, 0xc7, 0x78, 0xbb, 0x94, 0x60, 0x0f, 0x3d, 0x3b, 0x0e, 0x80, 0x3d,
+ 0x11, 0xe6, 0x80, 0x3d, 0xb3, 0xab, 0x86, 0x3d, 0xed, 0xe6, 0x9d, 0xbc, 0xd8,
+ 0xeb, 0xd9, 0xbc, 0xaa, 0x62, 0x80, 0x3d, 0x12, 0xc5, 0x00, 0x3d, 0x2b, 0x4b,
+ 0x23, 0xbc, 0xc7, 0x31, 0xff, 0xbc, 0xe4, 0x95, 0xdb, 0x3b, 0xa7, 0x90, 0x66,
+ 0x3c, 0xd3, 0x65, 0xdb, 0xbc, 0x50, 0xe3, 0x47, 0x3d, 0xd4, 0x25, 0x84, 0xbd,
+ 0x5a, 0xd5, 0xae, 0xbc, 0x90, 0x5e, 0xba, 0x3c, 0x8c, 0x60, 0x90, 0xbd, 0xfc,
+ 0x57, 0x4c, 0x3d, 0x99, 0x08, 0x7d, 0xbd, 0x9f, 0xac, 0x3b, 0x3c, 0x1c, 0xb1,
+ 0x61, 0xbc, 0x6a, 0xb5, 0x33, 0xbc, 0x10, 0xb0, 0x28, 0x3c, 0x89, 0x5d, 0x9f,
+ 0x3c, 0xd2, 0x80, 0x84, 0xbc, 0xb4, 0xb1, 0xd5, 0xba, 0x41, 0x1e, 0xa0, 0x3c,
+ 0xd1, 0xd9, 0xd0, 0xbb, 0x04, 0xda, 0xd2, 0x3c, 0x58, 0x46, 0x90, 0xbc, 0xc1,
+ 0x5c, 0x19, 0xbc, 0x01, 0x66, 0x2c, 0xbd, 0xad, 0xdc, 0x88, 0xbd, 0x32, 0xab,
+ 0xb6, 0xbc, 0x14, 0x1f, 0x0b, 0x3d, 0x87, 0xf0, 0x69, 0x3d, 0x55, 0x30, 0x26,
+ 0xbd, 0x2e, 0x3a, 0x05, 0xbd, 0xda, 0x08, 0x0e, 0xbd, 0xef, 0x31, 0x57, 0xbd,
+ 0x0e, 0x44, 0x13, 0xbd, 0x53, 0x11, 0x29, 0xbd, 0x00, 0xd2, 0xea, 0x3a, 0x47,
+ 0x72, 0xae, 0xbc, 0x54, 0x4a, 0x4d, 0xbd, 0x8a, 0x13, 0x2b, 0xbd, 0xa3, 0xaf,
+ 0x92, 0x3d, 0x68, 0x15, 0x0d, 0x3c, 0x18, 0x17, 0x35, 0x3c, 0xb8, 0xf2, 0x6a,
+ 0x3c, 0x15, 0xf8, 0xb2, 0x3c, 0x1d, 0x9d, 0xcd, 0x3c, 0xd3, 0x90, 0x81, 0xbd,
+ 0x51, 0xe8, 0x21, 0x3d, 0x74, 0x43, 0xa9, 0x3c, 0x00, 0x0b, 0xa0, 0x3c, 0x8e,
+ 0x69, 0xfb, 0xba, 0x81, 0x27, 0xfa, 0x3c, 0x6b, 0x7c, 0xf5, 0xbc, 0x61, 0x68,
+ 0x84, 0x3d, 0xe4, 0x1a, 0x6b, 0xbd, 0xd0, 0xe9, 0xc8, 0x3c, 0x26, 0xff, 0x47,
+ 0xbd, 0x64, 0xb7, 0xe9, 0x3b, 0xf3, 0xad, 0x36, 0x3d, 0x8a, 0x00, 0x3f, 0xbd,
+ 0x94, 0x41, 0xcf, 0xbc, 0x01, 0xba, 0x55, 0x3d, 0x8c, 0x08, 0x36, 0xbd, 0xa4,
+ 0x6b, 0x1a, 0x3d, 0x59, 0xfd, 0x83, 0x3d, 0xcc, 0xdd, 0x60, 0xbd, 0x59, 0xc2,
+ 0xfe, 0xbc, 0xa6, 0x99, 0x2a, 0x3d, 0xbd, 0x45, 0x8b, 0x3d, 0xe2, 0x5e, 0x8c,
+ 0x3d, 0x18, 0x83, 0x87, 0xbc, 0x10, 0x63, 0xda, 0x3b, 0x58, 0xa1, 0xc2, 0x3c,
+ 0x78, 0xfa, 0x78, 0x3c, 0xfc, 0x33, 0xf0, 0x3c, 0xc4, 0xab, 0x5b, 0xbd, 0xde,
+ 0x4b, 0x07, 0x3d, 0x53, 0x76, 0x1b, 0xbd, 0xee, 0xd8, 0x86, 0x3d, 0x7f, 0xd6,
+ 0x7c, 0xbd, 0x68, 0xb5, 0x8e, 0x3c, 0x49, 0xdd, 0xd5, 0xbc, 0x83, 0x63, 0xed,
+ 0xbb, 0x4e, 0x00, 0x91, 0xbd, 0x69, 0xce, 0xd5, 0xbb, 0x2f, 0x57, 0x71, 0xbc,
+ 0x9a, 0xc3, 0x8f, 0xbd, 0x65, 0x27, 0x47, 0x3d, 0x2d, 0x6b, 0x77, 0xbd, 0xdd,
+ 0x54, 0x43, 0xbc, 0xf7, 0x1f, 0xe8, 0xbc, 0x12, 0x8f, 0x87, 0xbd, 0x4f, 0xcf,
+ 0x2f, 0x3d, 0x15, 0x51, 0x4b, 0xbd, 0x9d, 0x1f, 0x86, 0x3d, 0x68, 0x35, 0x58,
+ 0xbd, 0x16, 0xe4, 0x4e, 0xbd, 0xd0, 0x03, 0x91, 0xbd, 0x39, 0xc6, 0x90, 0x3c,
+ 0xdd, 0xbb, 0x0a, 0xbd, 0x58, 0x1b, 0x33, 0xbd, 0x55, 0x86, 0x91, 0xbd, 0x48,
+ 0xe7, 0x90, 0xbc, 0xf4, 0x14, 0x3f, 0xbc, 0xc0, 0x75, 0x9e, 0xba, 0x7e, 0x8f,
+ 0xa8, 0xbc, 0x8c, 0x2b, 0x55, 0x3d, 0x54, 0x4b, 0x70, 0xbd, 0x56, 0x74, 0x52,
+ 0x3d, 0x6d, 0xf4, 0x02, 0x3b, 0x7d, 0x46, 0x5c, 0x3b, 0x76, 0xf4, 0x0c, 0xbd,
+ 0xac, 0xa2, 0x1d, 0xbd, 0x5c, 0x63, 0xe2, 0xbc, 0x64, 0x4d, 0x31, 0x3c, 0xf9,
+ 0x3e, 0x3f, 0x3d, 0xed, 0x12, 0x2c, 0xbd, 0xc8, 0x12, 0xb0, 0xbc, 0x4d, 0x90,
+ 0x8f, 0x3d, 0x1d, 0xef, 0x89, 0x3d, 0xf0, 0x4f, 0x93, 0xbd, 0x88, 0x79, 0xd8,
+ 0x3c, 0x74, 0x42, 0x1f, 0xbd, 0xba, 0x43, 0x90, 0x3c, 0xd5, 0x7e, 0xe3, 0xbc,
+ 0x71, 0x49, 0x7b, 0xbd, 0x5d, 0x36, 0x16, 0x3d, 0x91, 0xb8, 0x22, 0xbd, 0xd4,
+ 0x0e, 0x1e, 0x3d, 0xaa, 0x17, 0x2d, 0x3c, 0xca, 0x4d, 0xb9, 0x3b, 0x8a, 0x9d,
+ 0x01, 0x3d, 0x60, 0xcf, 0xc3, 0xbb, 0xc4, 0xc0, 0x00, 0x3b, 0x6d, 0xeb, 0x09,
+ 0xbd, 0x88, 0x55, 0x9e, 0xbc, 0x04, 0x54, 0xc3, 0xbc, 0x00, 0x93, 0xf2, 0x3a,
+ 0xe2, 0x88, 0x6e, 0x3d, 0xa0, 0xdb, 0xd4, 0xbc, 0x12, 0x3b, 0xa4, 0x3b, 0x5d,
+ 0x20, 0x88, 0x3d, 0xb4, 0xe5, 0xdc, 0xbc, 0x93, 0xf0, 0x70, 0xbc, 0xf6, 0x1a,
+ 0x31, 0xbd, 0xe0, 0xc3, 0x75, 0x3c, 0xbc, 0x2b, 0x96, 0x3c, 0x5b, 0x81, 0x44,
+ 0xbd, 0x6e, 0x2f, 0xab, 0xbc, 0x4c, 0x4e, 0x82, 0x3d, 0x6c, 0x17, 0x9b, 0xbc,
+ 0x70, 0x5a, 0x16, 0xbc, 0x70, 0x5e, 0x10, 0x3c, 0x81, 0xf0, 0x7d, 0xbd, 0x55,
+ 0xca, 0x3d, 0x3d, 0xca, 0x75, 0xa2, 0xbc, 0x7f, 0xc2, 0xe2, 0xbb, 0xc4, 0x59,
+ 0x82, 0x3d, 0xbd, 0xde, 0xd0, 0xbc, 0xe6, 0x4c, 0x3a, 0x3d, 0x62, 0xc7, 0x62,
+ 0x3d, 0x3e, 0xd2, 0xc1, 0xba, 0xeb, 0xae, 0xb3, 0xbb, 0x39, 0xf0, 0xa2, 0x3c,
+ 0xd0, 0xa2, 0x18, 0xbd, 0x65, 0xea, 0x99, 0x3b, 0xd0, 0x01, 0x8d, 0xbc, 0x34,
+ 0x0c, 0x84, 0xbd, 0xc3, 0x10, 0x3f, 0xbd, 0xb0, 0x26, 0xc4, 0x3b, 0xde, 0xc4,
+ 0x2e, 0x3d, 0xb4, 0x3f, 0xe5, 0x3c, 0x80, 0x6d, 0xda, 0x3b, 0xd3, 0x01, 0x8f,
+ 0x3d, 0x7b, 0x2e, 0x70, 0x3b, 0x95, 0x55, 0x51, 0xbd, 0xc2, 0x13, 0x4a, 0x3d,
+ 0x70, 0xd8, 0x4a, 0x3d, 0x6d, 0xf3, 0xc7, 0xbb, 0x40, 0x46, 0xe8, 0x3c, 0x71,
+ 0x53, 0x85, 0x3a, 0xea, 0x87, 0xf9, 0x3c, 0xb0, 0xb0, 0xf5, 0x3c, 0xf2, 0x2a,
+ 0x58, 0x3d, 0xe8, 0xd7, 0xc4, 0x3c, 0x57, 0xd9, 0xc8, 0x3c, 0xf3, 0x05, 0x79,
+ 0xbd, 0x9c, 0x0e, 0xf5, 0xbb, 0xcd, 0xaa, 0x1b, 0xbc, 0x42, 0xa2, 0x22, 0x3d,
+ 0x3e, 0x81, 0xe3, 0x3c, 0x66, 0x13, 0x2a, 0xbd, 0x6d, 0xfd, 0x8f, 0x3d, 0xd3,
+ 0x64, 0xab, 0x3c, 0x1e, 0x94, 0xba, 0x3c, 0x68, 0x42, 0x45, 0xbd, 0x4c, 0x0e,
+ 0xaf, 0xbc, 0x90, 0xbf, 0x7e, 0x3d, 0x6f, 0x71, 0x91, 0x3d, 0xc3, 0xb6, 0x80,
+ 0x3d, 0x3a, 0xbd, 0x32, 0xbd, 0x08, 0x63, 0x11, 0xbc, 0xec, 0xf4, 0x08, 0x3d,
+ 0x60, 0x5c, 0xcc, 0x3b, 0x66, 0x5b, 0x59, 0xbd, 0xb9, 0xcb, 0x8d, 0xbd, 0xfd,
+ 0x30, 0x54, 0x3d, 0x2e, 0xaa, 0x0f, 0xbc, 0x80, 0x26, 0x1a, 0xbb, 0x47, 0x43,
+ 0x19, 0xbd, 0x2c, 0x5d, 0xb8, 0x3c, 0x6c, 0xa6, 0xe8, 0x3c, 0xec, 0x3c, 0xcb,
+ 0xbc, 0x61, 0x53, 0xa4, 0x3c, 0x68, 0xf1, 0x0a, 0x3c, 0x9c, 0x5f, 0x30, 0x3d,
+ 0x5b, 0x39, 0xb8, 0xbc, 0xd2, 0x8d, 0x99, 0xbc, 0xe7, 0x1e, 0x31, 0xbd, 0x61,
+ 0x4e, 0x2c, 0xbd, 0x11, 0xeb, 0xb3, 0xbc, 0x80, 0x2e, 0x0b, 0xbc, 0x57, 0xbf,
+ 0x75, 0x3c, 0xbb, 0xd3, 0x2b, 0x3d, 0xba, 0xc5, 0x1b, 0x3d, 0x43, 0x78, 0x80,
+ 0x3d, 0xeb, 0x30, 0x0a, 0x3c, 0xf7, 0xf8, 0x04, 0x3d, 0x1f, 0x88, 0x17, 0xbd,
+ 0x7c, 0x55, 0xf0, 0xbc, 0x4a, 0x93, 0x3c, 0x3d, 0x7a, 0x12, 0x5c, 0xbd, 0x54,
+ 0x6b, 0x42, 0xbd, 0xa0, 0x16, 0xd8, 0x3b, 0x20, 0x3e, 0x3b, 0x3b, 0x3c, 0xde,
+ 0x72, 0xbd, 0x68, 0x37, 0x68, 0xbd, 0x37, 0x55, 0x97, 0xbb, 0x19, 0x7b, 0x43,
+ 0xbd, 0x82, 0xce, 0x8a, 0xbd, 0xcf, 0xc2, 0x88, 0xbd, 0x30, 0xde, 0xd8, 0x3b,
+ 0xf1, 0xc1, 0xa9, 0x3c, 0x68, 0x51, 0x2d, 0x3d, 0x76, 0xd5, 0xac, 0x3c, 0xb8,
+ 0x4b, 0x78, 0xbb, 0x0f, 0x1c, 0x5d, 0xbd, 0xf7, 0x31, 0x25, 0xbd, 0x72, 0x4c,
+ 0x91, 0x3d, 0x6e, 0x4f, 0x51, 0x3d, 0xb4, 0x9b, 0x21, 0xbd, 0x03, 0x73, 0xdd,
+ 0xbc, 0x38, 0x49, 0x4f, 0x3c, 0xb8, 0xc7, 0x4f, 0x3d, 0x6a, 0x17, 0x0a, 0xba,
+ 0xf4, 0x4f, 0xcd, 0x3c, 0x93, 0x14, 0x86, 0xbd, 0xde, 0x1e, 0x31, 0x3c, 0x57,
+ 0x45, 0xf1, 0x3c, 0x53, 0xc3, 0x7c, 0x3d, 0xc8, 0x1a, 0xd8, 0x3c, 0x85, 0xf4,
+ 0x8d, 0x3d, 0xf2, 0xaa, 0x46, 0x3d, 0xa6, 0x5c, 0x73, 0x3d, 0xf8, 0x5a, 0x3c,
+ 0x3d, 0xd0, 0x85, 0xaf, 0x3c, 0x60, 0x1f, 0xa0, 0x3c, 0xef, 0xcb, 0x45, 0xbd,
+ 0x68, 0xc2, 0x24, 0x3d, 0x25, 0x65, 0x14, 0x3b, 0x0c, 0x01, 0x67, 0x3d, 0x43,
+ 0x57, 0x65, 0xbd, 0x50, 0x8f, 0xec, 0x3b, 0x88, 0xf5, 0x16, 0x3d, 0xde, 0xa3,
+ 0xe2, 0xbc, 0x92, 0x11, 0xfb, 0x3c, 0x35, 0x93, 0x26, 0x3d, 0x96, 0xe4, 0x70,
+ 0x3d, 0x30, 0xea, 0x40, 0x3c, 0x50, 0x65, 0x37, 0x3c, 0x56, 0xf8, 0x84, 0xbd,
+ 0x36, 0xc0, 0x8e, 0x3d, 0x58, 0x45, 0x6b, 0xbd, 0x46, 0xcc, 0x5e, 0xbc, 0x41,
+ 0x2a, 0x4f, 0xbd, 0x5f, 0xce, 0x80, 0xbb, 0xfb, 0x75, 0xae, 0xbc, 0x19, 0xe3,
+ 0x0b, 0xbd, 0x54, 0x3e, 0x8a, 0x3c, 0x41, 0x54, 0xb7, 0x39, 0x8f, 0xb4, 0x80,
+ 0x3d, 0xfb, 0x42, 0x00, 0x3d, 0x5e, 0x0b, 0x19, 0xbd, 0x5d, 0x03, 0xb5, 0x3c,
+ 0xd8, 0x30, 0x78, 0x3c, 0x3e, 0xef, 0x90, 0xbc, 0xe0, 0x2c, 0xdb, 0x3b, 0x0a,
+ 0x5a, 0xfc, 0xbc, 0x24, 0x7e, 0x90, 0xbd, 0x1a, 0xd4, 0x1b, 0x3d, 0x10, 0x0a,
+ 0x87, 0x3d, 0xa3, 0x8c, 0x3b, 0xbd, 0x3f, 0x54, 0xda, 0xbc, 0x0f, 0x59, 0xd8,
+ 0x3b, 0xbe, 0xea, 0xea, 0x3c, 0x39, 0x2d, 0x7e, 0xbd, 0x19, 0xa0, 0x73, 0xba,
+ 0x3c, 0xc5, 0x60, 0xbd, 0x57, 0x9e, 0x70, 0xbd, 0xdc, 0x65, 0xfb, 0x3b, 0xbc,
+ 0x13, 0x32, 0xbd, 0xa4, 0xd0, 0x81, 0xbd, 0x5f, 0x74, 0x85, 0x3d, 0x1a, 0xf5,
+ 0x58, 0x3d, 0xa3, 0x35, 0x7c, 0x3d, 0xb3, 0x3d, 0x87, 0x3c, 0x83, 0xc6, 0x6b,
+ 0x3d, 0xff, 0xe3, 0x8e, 0x3d, 0x97, 0xab, 0x01, 0xbd, 0x7c, 0xd4, 0x85, 0x3d,
+ 0xa0, 0xbd, 0x83, 0xbc, 0x04, 0x12, 0x41, 0x3d, 0x9e, 0x3d, 0x57, 0xbd, 0xa2,
+ 0x37, 0xc1, 0x3c, 0xf2, 0xa6, 0x81, 0xbd, 0xe0, 0xde, 0xe6, 0xbc, 0xa0, 0x4b,
+ 0xd4, 0xbb, 0xe8, 0x33, 0xd8, 0xbc, 0x9a, 0x4c, 0x55, 0x3d, 0x16, 0xc0, 0x91,
+ 0xbd, 0x28, 0xa0, 0x1e, 0x3c, 0xfc, 0xc7, 0x5f, 0xbc, 0xc1, 0x5e, 0x95, 0x3c,
+ 0xc4, 0x85, 0xa0, 0x3c, 0xf5, 0x01, 0xd7, 0xbc, 0xf3, 0x15, 0xcc, 0xbb, 0x52,
+ 0x0c, 0x2c, 0xbd, 0xea, 0xdf, 0x7b, 0x3d, 0x06, 0xe0, 0x26, 0xbc, 0x7a, 0x9a,
+ 0x8d, 0xbd, 0x9c, 0xdb, 0xac, 0x3c, 0x4b, 0xfa, 0x2f, 0x3d, 0xe4, 0x93, 0xf1,
+ 0x3c, 0x89, 0xe5, 0x91, 0xbd, 0xda, 0x41, 0x28, 0xbd, 0x52, 0x6f, 0x58, 0x3d,
+ 0x89, 0x2f, 0x43, 0xbd, 0x74, 0xe4, 0x00, 0xbd, 0x59, 0xd4, 0x26, 0xbd, 0x97,
+ 0x79, 0xa9, 0x3c, 0xb0, 0x62, 0x9f, 0xb9, 0xbc, 0xac, 0x04, 0x3d, 0x5c, 0xce,
+ 0x3d, 0xbd, 0x15, 0x58, 0x67, 0xbd, 0x0a, 0xce, 0xf4, 0xbc, 0x3a, 0x8f, 0x01,
+ 0xbd, 0x50, 0xd2, 0x73, 0xbc, 0x8e, 0x54, 0x16, 0xbc, 0xea, 0xd7, 0x3c, 0x3d,
+ 0xf0, 0xbe, 0xd7, 0x3c, 0x1a, 0x3d, 0x82, 0xbd, 0xba, 0x91, 0x2f, 0x3d, 0x10,
+ 0xb0, 0x92, 0xbd, 0xf8, 0x36, 0x1c, 0x3d, 0x50, 0x2a, 0x8f, 0xbd, 0xb0, 0x09,
+ 0x5e, 0x3d, 0x3b, 0xc8, 0x8f, 0xba, 0xf4, 0xce, 0x92, 0xbd, 0x38, 0xc4, 0x78,
+ 0xbd, 0xe0, 0x8c, 0x5c, 0xbc, 0x98, 0x6b, 0x8b, 0x3d, 0x16, 0x7f, 0x4a, 0x3d,
+ 0x18, 0xc0, 0xfe, 0xbc, 0x66, 0xbb, 0x4b, 0xbd, 0x90, 0xb6, 0xe1, 0x3b, 0x98,
+ 0xca, 0x8c, 0x3c, 0x05, 0xfe, 0xec, 0xbc, 0x58, 0x1c, 0x17, 0x3d, 0x37, 0x17,
+ 0x80, 0x3d, 0x41, 0x6e, 0x14, 0x3d, 0xee, 0x95, 0xcb, 0xbb, 0x1a, 0x56, 0x1f,
+ 0xbd, 0xae, 0xc7, 0x2c, 0x3c, 0x28, 0x3a, 0x80, 0x3b, 0x00, 0x13, 0x76, 0xbc,
+ 0x69, 0xaf, 0x5e, 0xbc, 0x80, 0xcc, 0x02, 0xbd, 0xa8, 0xea, 0x04, 0xba, 0xb8,
+ 0xae, 0x09, 0x3d, 0xb3, 0x0d, 0x8d, 0x3d, 0xc0, 0x22, 0x84, 0xba, 0x04, 0x62,
+ 0x5c, 0xbd, 0xd8, 0x28, 0x09, 0x3c, 0x68, 0xd3, 0x41, 0x3c, 0x62, 0x52, 0x1e,
+ 0x3d, 0x99, 0x42, 0x03, 0xbd, 0x3b, 0x4b, 0xd9, 0xba, 0x68, 0x5e, 0x32, 0xbd,
+ 0x8b, 0x9e, 0x26, 0xbb, 0x9c, 0xd7, 0xcd, 0x3c, 0x4e, 0xdc, 0x16, 0x3d, 0x42,
+ 0x1a, 0x07, 0x3d, 0xbb, 0xa6, 0x96, 0xbb, 0xf4, 0x47, 0x59, 0xbc, 0x13, 0xa3,
+ 0xa1, 0xbc, 0x8f, 0x58, 0x0f, 0xbc, 0x88, 0xd1, 0x1d, 0xbd, 0xe0, 0x0f, 0xfb,
+ 0x3c, 0x81, 0xd3, 0x90, 0x3d, 0xe0, 0x4b, 0x4f, 0xbd, 0x3f, 0x4a, 0x80, 0x3d,
+ 0x3a, 0x63, 0x67, 0x3d, 0xe2, 0xee, 0x1e, 0x3c, 0xf8, 0x65, 0xdd, 0x3b, 0x1c,
+ 0x30, 0x09, 0xbd, 0xe9, 0x2f, 0xdb, 0xbc, 0x94, 0x36, 0x55, 0xbd, 0x2c, 0xa4,
+ 0x95, 0x3a, 0x78, 0x24, 0x2f, 0x3d, 0xc7, 0x9c, 0x44, 0xbd, 0xb5, 0x09, 0x10,
+ 0xbd, 0x7d, 0x10, 0x49, 0xbd, 0x60, 0xd3, 0x43, 0x3c, 0xef, 0x67, 0x05, 0xbd,
+ 0x0a, 0x1d, 0x6c, 0x3d, 0xaa, 0x4d, 0x0c, 0x3d, 0x84, 0xfc, 0x8a, 0xbc, 0x0d,
+ 0xf7, 0x65, 0xbd, 0x5c, 0x71, 0x93, 0xbc, 0xd8, 0xe9, 0x2a, 0x3d, 0x1d, 0xd9,
+ 0xc6, 0xbc, 0xd6, 0xeb, 0x70, 0xbd, 0xef, 0x92, 0x41, 0xbd, 0x4a, 0xd3, 0x83,
+ 0xbd, 0x1e, 0xf1, 0x74, 0x3b, 0xa3, 0xb4, 0x1e, 0xbc, 0x4f, 0x0c, 0x12, 0x3d,
+ 0x69, 0xf6, 0x25, 0x3d, 0x5a, 0x52, 0x35, 0x3d, 0xb5, 0x14, 0x37, 0x3d, 0x2b,
+ 0xf9, 0x2d, 0xbd, 0xb8, 0xc6, 0x12, 0x3d, 0x2e, 0xeb, 0xf8, 0xbb, 0x31, 0xe0,
+ 0x43, 0xbd, 0x37, 0x68, 0xf4, 0x3b, 0x4e, 0xd7, 0x55, 0xbd, 0xf2, 0x8f, 0x06,
+ 0x3d, 0xa3, 0xe0, 0x8a, 0x3d, 0x47, 0xcb, 0x91, 0x3d, 0xc3, 0xaa, 0x1c, 0xbd,
+ 0x43, 0x44, 0x24, 0x3d, 0x5a, 0xcc, 0x30, 0xbd, 0x72, 0xbe, 0x27, 0x3c, 0xfc,
+ 0xd5, 0xbe, 0x3c, 0x34, 0x0e, 0x3f, 0x3d, 0xdc, 0x3d, 0x7b, 0xbc, 0x64, 0xe1,
+ 0xa9, 0x3c, 0x00, 0x61, 0x80, 0x3b, 0x19, 0xd4, 0x82, 0xbd, 0x41, 0xef, 0x8c,
+ 0x3d, 0x90, 0x50, 0x11, 0xbd, 0x0d, 0x32, 0x8d, 0x3d, 0x56, 0x78, 0x5f, 0x3c,
+ 0x71, 0x44, 0x6c, 0x3d, 0x21, 0xe4, 0x22, 0x3d, 0x31, 0xfd, 0xb4, 0xbb, 0xcc,
+ 0x10, 0x7e, 0x3c, 0x7a, 0xb4, 0x06, 0x3d, 0xc5, 0xde, 0x22, 0xbc, 0xd2, 0x57,
+ 0xfe, 0x3c, 0x30, 0x95, 0x81, 0xbd, 0x00, 0x6d, 0xde, 0x39, 0xfd, 0x2b, 0x3f,
+ 0x3d, 0x8f, 0xe7, 0xf4, 0x3b, 0x2b, 0xf8, 0xa3, 0xbc, 0xcf, 0x7c, 0x4e, 0x3d,
+ 0x86, 0xee, 0xf7, 0x3c, 0x20, 0x5a, 0x22, 0xbb, 0x1a, 0xa9, 0x62, 0xbd, 0x0f,
+ 0x24, 0x7f, 0x3d, 0x74, 0x7e, 0x00, 0x3d, 0x24, 0xd2, 0xcb, 0xbc, 0x06, 0xc6,
+ 0x44, 0xbd, 0xe1, 0x53, 0xa3, 0x3c, 0x7d, 0x24, 0x08, 0x3d, 0xf6, 0x9f, 0x23,
+ 0xbd, 0x3f, 0xb0, 0x84, 0xbd, 0xb0, 0xbb, 0xbc, 0x3c, 0x74, 0x6c, 0x22, 0xbc,
+ 0x0b, 0x32, 0x50, 0xbd, 0x81, 0x6f, 0x8b, 0x3d, 0x98, 0x37, 0xc3, 0x3c, 0xfd,
+ 0x30, 0x08, 0xbd, 0x11, 0x42, 0x01, 0xbd, 0xd6, 0x91, 0x16, 0x3c, 0x6e, 0xf1,
+ 0xc2, 0x3a, 0xed, 0x4b, 0x8c, 0xbd, 0x51, 0x70, 0x34, 0xbd, 0x2a, 0x7e, 0x1c,
+ 0x3b, 0x5a, 0x96, 0xcd, 0x37, 0x9a, 0x8e, 0xf8, 0x3c, 0xce, 0x8a, 0x6d, 0x3d,
+ 0x62, 0xb2, 0x38, 0x3d, 0x70, 0x0a, 0xbe, 0xbc, 0xd0, 0x3f, 0x66, 0xbc, 0xf4,
+ 0xfe, 0x24, 0x3d, 0xbe, 0xf9, 0x89, 0x3c, 0xa0, 0x2b, 0xc1, 0xbc, 0x02, 0x6d,
+ 0x41, 0x3c, 0xa4, 0x00, 0x14, 0xbd, 0xbc, 0xa1, 0xd1, 0x3b, 0xbc, 0x27, 0xa6,
+ 0x3c, 0xc8, 0x08, 0xfd, 0xbc, 0xa1, 0x0e, 0x9c, 0xbc, 0xa1, 0x28, 0x07, 0xbc,
+ 0x33, 0xf3, 0x71, 0x3c, 0x96, 0xed, 0x1f, 0x3d, 0xf6, 0x6d, 0x5e, 0xbd, 0x30,
+ 0x7c, 0x12, 0xbc, 0xf2, 0xaf, 0x7b, 0x3d, 0x56, 0xfa, 0x36, 0xbd, 0x7a, 0x6f,
+ 0x3a, 0x3d, 0x40, 0x65, 0x8f, 0x3c, 0x2c, 0xa1, 0x4f, 0xbc, 0x80, 0x0f, 0x7b,
+ 0x3b, 0xaf, 0xc3, 0xf2, 0x3c, 0xae, 0x39, 0x8a, 0xbd, 0xd5, 0xf6, 0x42, 0xbd,
+ 0x12, 0x9c, 0x33, 0x3d, 0x88, 0x27, 0x4d, 0x3d, 0x61, 0x05, 0x1e, 0xbd, 0x02,
+ 0xcd, 0x04, 0xbd, 0xe8, 0x6f, 0xe1, 0x3c, 0xf8, 0xd2, 0x73, 0x3d, 0xb9, 0xa3,
+ 0x61, 0xbd, 0x64, 0x01, 0x92, 0x3c, 0x4f, 0x8e, 0x21, 0xbc, 0x8b, 0xf5, 0x18,
+ 0x3d, 0xce, 0x3b, 0x77, 0x3d, 0x8d, 0x0e, 0x97, 0x3a, 0x30, 0xfc, 0x85, 0x3c,
+ 0x1f, 0x24, 0x8e, 0x3a, 0xca, 0xdd, 0x4e, 0x3d, 0x5f, 0x7c, 0xfe, 0x3b, 0x84,
+ 0xdf, 0x2d, 0x3d, 0x7a, 0x5c, 0x8c, 0x3d, 0x90, 0xf3, 0x79, 0xbc, 0x4f, 0x99,
+ 0x17, 0xbd, 0x30, 0xb1, 0xd2, 0xbb, 0x1c, 0x5a, 0x32, 0xbd, 0xd4, 0x8c, 0xd9,
+ 0x3c, 0x08, 0x56, 0xec, 0x3c, 0xf0, 0xcf, 0x64, 0xbd, 0xf0, 0x2a, 0xf1, 0xbb,
+ 0x28, 0x09, 0x0c, 0xbc, 0x0f, 0xf7, 0x8d, 0xbd, 0x86, 0x8f, 0x59, 0xbd, 0xfa,
+ 0xbf, 0x52, 0xbd, 0x76, 0x65, 0x4c, 0xbd, 0x79, 0xaa, 0x16, 0xbd, 0x9e, 0x6f,
+ 0xa7, 0xbc, 0xac, 0x9e, 0x8f, 0xbd, 0x5a, 0xfc, 0x7b, 0xbd, 0x90, 0xe3, 0x20,
+ 0x3d, 0xd0, 0x2b, 0x81, 0x3d, 0xc1, 0xbf, 0x85, 0x3d, 0x48, 0x79, 0x44, 0x3d,
+ 0x3e, 0x7b, 0x6d, 0x3d, 0x2b, 0x83, 0x11, 0x3d, 0x45, 0x84, 0x38, 0x3d, 0xbd,
+ 0x6d, 0x47, 0xb8, 0xe9, 0x7c, 0x29, 0xbd, 0x51, 0xd2, 0xc9, 0x3c, 0x77, 0x53,
+ 0xf0, 0x3b, 0xca, 0xc2, 0x17, 0xbd, 0xb2, 0xbc, 0x13, 0x3d, 0xbc, 0x58, 0xf9,
+ 0x3c, 0xed, 0x65, 0xed, 0x3c, 0x05, 0xdd, 0x8e, 0xbc, 0x0f, 0xa5, 0x96, 0xbc,
+ 0xd2, 0x96, 0x00, 0x3d, 0x90, 0xfe, 0x5c, 0x3d, 0x1f, 0x18, 0x90, 0xbd, 0x68,
+ 0xbb, 0xc8, 0x3c, 0x86, 0xae, 0xbb, 0xbc, 0x8a, 0x69, 0xea, 0xbc, 0x28, 0x6a,
+ 0x7c, 0x3c, 0x32, 0x5f, 0x70, 0x3d, 0xdd, 0x12, 0xd4, 0xba, 0xca, 0x54, 0x56,
+ 0xbd, 0x46, 0x94, 0x3f, 0xbd, 0x28, 0x3e, 0xa6, 0x3c, 0x93, 0x06, 0x43, 0xbd,
+ 0x58, 0xc7, 0xf0, 0x3c, 0x5d, 0x14, 0xa9, 0xbb, 0x58, 0x98, 0xc8, 0xbc, 0x89,
+ 0x34, 0x8d, 0x3d, 0x39, 0x90, 0x7b, 0x3d, 0x66, 0x18, 0x63, 0x3d, 0x60, 0x47,
+ 0x4d, 0x3b, 0x1d, 0x50, 0x6c, 0xbd, 0x55, 0x74, 0x27, 0x3d, 0x11, 0xf1, 0x66,
+ 0xbd, 0x14, 0xe6, 0x90, 0x3d, 0xdf, 0x99, 0x88, 0x3d, 0x9b, 0xc6, 0x67, 0x3d,
+ 0x16, 0xca, 0xd3, 0xbc, 0x79, 0xad, 0x87, 0x3d, 0x52, 0x56, 0x7b, 0x3d, 0x6e,
+ 0x19, 0x14, 0xbc, 0x12, 0x02, 0x26, 0x3d, 0xaf, 0x26, 0x1b, 0xbd, 0x5e, 0x09,
+ 0x8c, 0xbd, 0xa2, 0x3c, 0x5f, 0x3d, 0x60, 0x7e, 0x7d, 0xbd, 0x10, 0xc0, 0x85,
+ 0xbd, 0x70, 0x15, 0xc4, 0x3b, 0xe0, 0xfa, 0xf8, 0x3b, 0xe6, 0x2e, 0x00, 0x3d,
+ 0xf7, 0xd5, 0x1f, 0x3d, 0x48, 0x70, 0x60, 0x3d, 0x2a, 0x3a, 0xed, 0xbc, 0xfd,
+ 0x05, 0x26, 0xbc, 0x67, 0xf0, 0xee, 0x3a, 0x7e, 0x6e, 0x46, 0x3d, 0x57, 0x87,
+ 0x90, 0x3d, 0x22, 0xdb, 0x65, 0xbd, 0x70, 0xad, 0x7a, 0x3c, 0xa6, 0xb5, 0xc3,
+ 0x3c, 0xd4, 0xfa, 0x12, 0x3c, 0x4e, 0x84, 0x2f, 0xbd, 0x00, 0x37, 0x63, 0xbb,
+ 0xfb, 0x25, 0x41, 0xbc, 0x38, 0xa5, 0x84, 0x3d, 0x8a, 0xd7, 0x5a, 0xbd, 0x11,
+ 0xf7, 0xd6, 0xbb, 0xd1, 0x99, 0x22, 0xbd, 0xc8, 0xfc, 0x83, 0x3c, 0xd8, 0x91,
+ 0xd8, 0xbc, 0xa6, 0xf0, 0x3f, 0xbd, 0x08, 0x4d, 0x3b, 0x3d, 0xdd, 0x56, 0x4c,
+ 0xbd, 0xeb, 0x23, 0x8d, 0xbd, 0x23, 0x09, 0xcc, 0x3c, 0xbb, 0x3d, 0x8a, 0x3d,
+ 0x47, 0xb9, 0x75, 0xbd, 0x69, 0x75, 0x82, 0x3d, 0x30, 0x78, 0x86, 0x3c, 0x0c,
+ 0xc2, 0xd6, 0xbc, 0x2a, 0x22, 0x51, 0x3d, 0x9c, 0xfa, 0x3b, 0xbc, 0x00, 0x4b,
+ 0xbf, 0x39, 0x10, 0x58, 0xe6, 0xbb, 0x22, 0xa4, 0x47, 0x3d, 0x8b, 0xd1, 0x6f,
+ 0x3c, 0xf3, 0x8b, 0x23, 0xbd, 0xad, 0x67, 0x71, 0xbd, 0xa4, 0xbb, 0x71, 0xbc,
+ 0x68, 0x9d, 0x36, 0x3d, 0x79, 0xda, 0x00, 0x3d, 0x30, 0x88, 0x15, 0x3d, 0xc4,
+ 0x55, 0xab, 0x3c, 0xd0, 0xbe, 0x4f, 0x3d, 0x43, 0xa2, 0x8b, 0x3d, 0xc0, 0x0b,
+ 0x27, 0xbc, 0xfe, 0x35, 0x91, 0xbd, 0x27, 0x33, 0x5b, 0xbc, 0xc5, 0x00, 0x91,
+ 0xb9, 0x3e, 0x30, 0x74, 0xbd, 0x1c, 0x92, 0x70, 0xbd, 0xfe, 0x13, 0x56, 0xbb,
+ 0x63, 0x1b, 0x84, 0x3d, 0x24, 0x9a, 0xa1, 0x3c, 0x93, 0x78, 0x83, 0xbc, 0x29,
+ 0xb2, 0xce, 0x3c, 0x05, 0x6f, 0x8f, 0x3d, 0xe8, 0xb4, 0x3b, 0xbd, 0x12, 0x90,
+ 0x8e, 0x3d, 0x58, 0x6a, 0x76, 0xbd, 0xee, 0x8f, 0x90, 0xbd, 0x1e, 0x98, 0xde,
+ 0xbc, 0x88, 0x22, 0x40, 0x3d, 0x1b, 0x7f, 0x87, 0xbd, 0x3e, 0x25, 0x5e, 0x3d,
+ 0x38, 0xf3, 0x0c, 0xbc, 0x77, 0x6a, 0x8b, 0xbd, 0x0c, 0x98, 0x08, 0xbc, 0xbd,
+ 0x52, 0xf6, 0x3c, 0x2d, 0x2f, 0x03, 0xbd, 0x15, 0xbf, 0x91, 0x3d, 0xba, 0x41,
+ 0xef, 0xbc, 0xdf, 0x02, 0xab, 0xbc, 0xe4, 0xac, 0x7e, 0x3d, 0x9e, 0x8c, 0x51,
+ 0x3d, 0xcc, 0x12, 0x01, 0x3d, 0xfc, 0xfb, 0x1b, 0xbd, 0x75, 0x2b, 0x81, 0xbd,
+ 0x6a, 0xbf, 0x20, 0x3d, 0xbb, 0x3c, 0x77, 0xbd, 0xae, 0x2f, 0x74, 0xbd, 0x58,
+ 0x94, 0x53, 0xbd, 0xa0, 0xcf, 0xd4, 0x3c, 0x68, 0x51, 0xd1, 0x3c, 0x1c, 0x40,
+ 0x22, 0xbd, 0x86, 0x62, 0x04, 0x3d, 0x9c, 0x10, 0x02, 0xbd, 0x5d, 0x31, 0x49,
+ 0xbb, 0x5d, 0x8e, 0xf5, 0xbc, 0xb8, 0xef, 0x44, 0xbc, 0x06, 0xe5, 0x50, 0xbd,
+ 0xe6, 0x33, 0x40, 0xbd, 0x20, 0x2e, 0x39, 0x3b, 0x00, 0x2f, 0x96, 0xbb, 0x75,
+ 0x2e, 0x80, 0xbd, 0x2c, 0x9f, 0x4e, 0x3d, 0xd0, 0x40, 0xf6, 0x3b, 0x2e, 0x56,
+ 0x8e, 0x3d, 0xcf, 0x00, 0x15, 0x3d, 0xae, 0x5d, 0xc7, 0x3b, 0x44, 0x47, 0x05,
+ 0x3d, 0x80, 0x19, 0x71, 0xbb, 0x8c, 0xce, 0x87, 0xbd, 0xd2, 0x30, 0x78, 0xbd,
+ 0xcc, 0x7b, 0x14, 0xbd, 0xf4, 0xb8, 0x91, 0xbd, 0xbe, 0x76, 0x64, 0x3d, 0xf9,
+ 0x7e, 0x80, 0x3d, 0xda, 0xf8, 0x13, 0xbd, 0x92, 0xd0, 0x11, 0xbd, 0x03, 0x64,
+ 0x55, 0xbc, 0x50, 0x1a, 0xe8, 0xbc, 0x97, 0xeb, 0x5e, 0xbd, 0x7c, 0xf8, 0x90,
+ 0x3d, 0xc4, 0x26, 0x4b, 0x3d, 0xc2, 0x04, 0x7d, 0xbd, 0x25, 0x41, 0x14, 0x3b,
+ 0xac, 0xc2, 0xdf, 0x3c, 0xda, 0x60, 0xd3, 0xbc, 0x1b, 0x00, 0x45, 0xbd, 0x7e,
+ 0x09, 0xac, 0xbc, 0x28, 0x65, 0xcb, 0xbc, 0xe6, 0xd0, 0xb2, 0xbc, 0xb8, 0xdf,
+ 0xae, 0x3c, 0xc8, 0xb7, 0xca, 0x3c, 0x98, 0x50, 0xa1, 0x3c, 0x5c, 0xa2, 0xa0,
+ 0xbc, 0x8c, 0x18, 0x56, 0x3d, 0xea, 0x98, 0x8e, 0xbd, 0xb5, 0xba, 0x49, 0x3b,
+ 0xff, 0x2b, 0xaf, 0x3c, 0x91, 0xf6, 0x49, 0xbd, 0x0a, 0x19, 0x4d, 0x3d, 0xa1,
+ 0x7e, 0x69, 0xbd, 0x6c, 0x77, 0x3e, 0xbc, 0xa0, 0x00, 0x6e, 0x3d, 0x81, 0xc6,
+ 0xb1, 0x3b, 0x8b, 0xbf, 0x40, 0xbd, 0x5e, 0x71, 0xf5, 0xbc, 0x74, 0x2c, 0x96,
+ 0xbc, 0x3d, 0x0c, 0x8b, 0xbd, 0x45, 0x9a, 0x8a, 0xbd, 0xdb, 0x49, 0xcb, 0x3c,
+ 0x9b, 0x5b, 0x10, 0x3d, 0xf5, 0x79, 0x45, 0x3d, 0x5a, 0x50, 0x86, 0xbd, 0xf9,
+ 0x2f, 0x7c, 0xbd, 0xf6, 0x3d, 0x19, 0xbd, 0x54, 0x10, 0x0c, 0x3b, 0xaf, 0x59,
+ 0x27, 0xbd, 0x1f, 0x75, 0x78, 0x3d, 0x10, 0xb2, 0x9a, 0xbc, 0xc3, 0xb1, 0x99,
+ 0xbc, 0xb4, 0x08, 0xac, 0x3c, 0x15, 0x41, 0x86, 0x3d, 0xc0, 0x2d, 0x46, 0xbb,
+ 0xc4, 0x49, 0x56, 0xbc, 0xef, 0x2e, 0x7b, 0xbd, 0x6c, 0xee, 0x14, 0x3d, 0x70,
+ 0xe7, 0x9c, 0x3c, 0x78, 0x7e, 0xfb, 0xbc, 0xf7, 0x06, 0x51, 0xbd, 0x52, 0xd4,
+ 0x1a, 0xbd, 0xb0, 0x2b, 0xeb, 0xbc, 0xad, 0xad, 0x4e, 0xbd, 0xa4, 0x7c, 0xe3,
+ 0x3c, 0x18, 0xa1, 0xd8, 0xbc, 0x6e, 0xa6, 0x8f, 0xbd, 0x79, 0x0d, 0xb7, 0xba,
+ 0xb2, 0x10, 0x10, 0x3d, 0xe6, 0xcf, 0x52, 0x3d, 0x8e, 0x88, 0x35, 0x3d, 0xdd,
+ 0x92, 0x8d, 0x3d, 0x54, 0x69, 0x83, 0xbc, 0xab, 0xa9, 0x88, 0xbd, 0xe0, 0xa7,
+ 0x1c, 0xbb, 0x86, 0x10, 0x2c, 0xbd, 0x24, 0xde, 0x18, 0x3d, 0x4a, 0x04, 0x87,
+ 0xbd, 0x42, 0x3c, 0x16, 0xbd, 0x62, 0x25, 0x90, 0xbd, 0xce, 0x01, 0x64, 0xbd,
+ 0x2c, 0x76, 0x6f, 0xbd, 0xd2, 0x15, 0x0b, 0xbd, 0x45, 0x72, 0x73, 0x3b, 0xeb,
+ 0x46, 0x02, 0xbd, 0x05, 0x12, 0x1c, 0xbd, 0xb8, 0x16, 0x22, 0xbd, 0xe5, 0x22,
+ 0x89, 0x3d, 0x8c, 0x8a, 0xf4, 0x3c, 0x40, 0x6b, 0xe4, 0x3a, 0x5c, 0xe2, 0x70,
+ 0xbd, 0x56, 0x08, 0x67, 0xbd, 0x5b, 0xec, 0x4d, 0x3d, 0xba, 0x4d, 0x2a, 0xbd,
+ 0xb9, 0x55, 0xa4, 0xbc, 0xb7, 0xd7, 0x39, 0x3d, 0xa0, 0x88, 0xfe, 0x3c, 0xbf,
+ 0x7d, 0x6b, 0xbd, 0xcd, 0xdf, 0xe3, 0xbc, 0x26, 0xa0, 0x3e, 0x3d, 0x19, 0x4b,
+ 0x17, 0x3d, 0x54, 0x84, 0xa7, 0xbc, 0x78, 0x9a, 0x6a, 0xbd, 0x80, 0xcc, 0xa7,
+ 0x3c, 0x58, 0x48, 0x3a, 0x3d, 0xd9, 0x9a, 0xe3, 0xbc, 0xe0, 0xa2, 0xb8, 0x3c,
+ 0x3f, 0x32, 0x4d, 0x3d, 0x8e, 0xa6, 0x80, 0xbc, 0x0f, 0xfc, 0xd6, 0xbb, 0x40,
+ 0x70, 0x8b, 0xbd, 0xe3, 0xa3, 0xf6, 0xbb, 0x40, 0x26, 0x33, 0xbb, 0x43, 0xb2,
+ 0x01, 0xbd, 0x2e, 0xf9, 0x27, 0xbd, 0x6c, 0xcf, 0x54, 0x3c, 0xae, 0xca, 0x4d,
+ 0x3c, 0x6e, 0x2d, 0x1d, 0x3a, 0x04, 0xda, 0x94, 0xbc, 0x2c, 0x2b, 0xc6, 0x3c,
+ 0x59, 0xc8, 0x1a, 0xbd, 0x80, 0x56, 0xcb, 0x3b, 0xf4, 0xce, 0xa1, 0x3c, 0x84,
+ 0xdd, 0xeb, 0x3c, 0x95, 0x36, 0x83, 0xbd, 0x60, 0xeb, 0x47, 0x3d, 0x90, 0xf8,
+ 0x63, 0x3d, 0x8a, 0xc4, 0x6a, 0xbc, 0x40, 0x25, 0xa9, 0x3b, 0x7a, 0xfc, 0x65,
+ 0x3d, 0xe2, 0xcd, 0x33, 0x3d, 0x69, 0x80, 0xe5, 0xbc, 0xf7, 0xc5, 0x42, 0xbc,
+ 0x17, 0xf4, 0x31, 0xbd, 0xbe, 0xb3, 0x79, 0x3d, 0xff, 0xfc, 0x6c, 0x3d, 0xc5,
+ 0x04, 0x7d, 0xbc, 0xd9, 0x4f, 0x8e, 0x3d, 0xfe, 0xd3, 0x86, 0xbd, 0xcd, 0xeb,
+ 0x3f, 0x3d, 0xd8, 0x90, 0x2e, 0xbd, 0x56, 0x17, 0xbf, 0x3c, 0xbb, 0x23, 0x83,
+ 0xbd, 0x69, 0x4a, 0x43, 0x3d, 0x0a, 0x76, 0x5e, 0xbd, 0xee, 0x69, 0x8d, 0x3d,
+ 0x75, 0xda, 0x1c, 0x3c, 0xe8, 0xf7, 0xe0, 0xbc, 0x53, 0xbe, 0xda, 0xb8, 0xc2,
+ 0x03, 0x2e, 0xbd, 0xe4, 0xa0, 0x38, 0xbc, 0xbc, 0x5e, 0x3b, 0xbd, 0xfc, 0xfc,
+ 0xb7, 0x3c, 0xd4, 0xfb, 0x13, 0xbd, 0xf6, 0x8c, 0x44, 0x3d, 0x70, 0x13, 0x9d,
+ 0x3c, 0xf8, 0xb8, 0x11, 0xbc, 0xcc, 0x9b, 0x3b, 0xbd, 0xf7, 0x18, 0xe4, 0xbc,
+ 0x89, 0xc3, 0x31, 0x3d, 0xde, 0x7c, 0x32, 0xbd, 0x3c, 0xc7, 0x97, 0x3c, 0x2e,
+ 0xc0, 0xb8, 0xbc, 0xa2, 0xfe, 0x29, 0xbd, 0x17, 0xb2, 0x35, 0xbd, 0xaa, 0x83,
+ 0xdd, 0x3c, 0x1e, 0xfa, 0x83, 0x3d, 0xc6, 0x4c, 0x16, 0x3d, 0xfd, 0x0f, 0x29,
+ 0x3d, 0x2d, 0x90, 0xac, 0x3b, 0xfe, 0xe5, 0xc8, 0x3b, 0xac, 0x11, 0xc7, 0xbc,
+ 0x2d, 0xf3, 0xfa, 0x3c, 0x2a, 0x75, 0x81, 0xbd, 0x2d, 0x84, 0xb4, 0x3c, 0xfd,
+ 0xad, 0x66, 0xbc, 0xaa, 0x80, 0x2a, 0xbd, 0x58, 0x82, 0x8c, 0x3d, 0x75, 0x06,
+ 0x78, 0x3d, 0x1b, 0xdd, 0x21, 0xbc, 0x1c, 0x40, 0x38, 0x3d, 0xe0, 0xdc, 0x6e,
+ 0x3d, 0x50, 0xb8, 0x32, 0xbc, 0x80, 0x13, 0x4f, 0xbb, 0x32, 0x50, 0x6c, 0x3d,
+ 0xce, 0x1b, 0xf1, 0xbc, 0xd8, 0x20, 0x02, 0x3d, 0x43, 0x68, 0xa2, 0x3c, 0x9a,
+ 0x6c, 0x29, 0xbd, 0x8d, 0x90, 0x22, 0xbd, 0x14, 0xff, 0xe6, 0xbb, 0xb8, 0xcf,
+ 0xc1, 0x3c, 0xa6, 0x3b, 0x4a, 0x3d, 0xac, 0xad, 0x11, 0x3d, 0x60, 0x19, 0xc9,
+ 0x3c, 0x55, 0xae, 0xf1, 0xbc, 0x3d, 0xc0, 0x23, 0xbd, 0xa3, 0x00, 0xcd, 0xbb,
+ 0x44, 0x9e, 0x17, 0x3d, 0xc0, 0x31, 0xe2, 0x3a, 0x30, 0xdf, 0xf4, 0x3c, 0x31,
+ 0x09, 0x92, 0xbc, 0xa8, 0xbd, 0x66, 0x3c, 0xa5, 0x06, 0x4f, 0x3c, 0xdc, 0x2e,
+ 0x92, 0xbd, 0xfb, 0x54, 0x87, 0xb9, 0x9b, 0x34, 0x1f, 0x3d, 0xd8, 0xf7, 0xa7,
+ 0xbb, 0xff, 0x1d, 0x62, 0xbd, 0xe0, 0xf8, 0x3c, 0x3d, 0x85, 0x58, 0x8f, 0xbd,
+ 0x75, 0xf9, 0x62, 0xbd, 0xef, 0xf5, 0x7a, 0xbd, 0x58, 0x32, 0x86, 0x3d, 0x90,
+ 0x17, 0x29, 0x3c, 0x64, 0xcc, 0x4a, 0xbd, 0xf0, 0x07, 0xc1, 0xbc, 0x72, 0xdc,
+ 0x64, 0xbd, 0x68, 0x3e, 0x2e, 0x3c, 0x38, 0x6d, 0x60, 0xbd, 0x46, 0x1f, 0x59,
+ 0x3d, 0xd0, 0xa7, 0x3e, 0x3d, 0x77, 0x1d, 0x49, 0x3d, 0xcb, 0xed, 0x7f, 0xbd,
+ 0xd8, 0x47, 0x40, 0x3c, 0x00, 0xf0, 0xee, 0x39, 0xcc, 0xea, 0x57, 0x3d, 0x10,
+ 0x1d, 0x8a, 0xbd, 0xb9, 0x55, 0x5f, 0xbd, 0x17, 0x3c, 0x66, 0xbc, 0x02, 0xb8,
+ 0x06, 0xbd, 0x5f, 0xfb, 0x16, 0xbd, 0x58, 0x15, 0x8c, 0x3d, 0x18, 0x99, 0x5f,
+ 0x3d, 0x5f, 0x73, 0xb3, 0xbc, 0x61, 0x73, 0x63, 0x3d, 0x61, 0xf2, 0x7b, 0xbc,
+ 0xbd, 0x2b, 0xad, 0x3a, 0xda, 0x99, 0x5c, 0xbd, 0x81, 0xd1, 0xd0, 0x3c, 0xf0,
+ 0xf9, 0xb0, 0x3c, 0x84, 0x54, 0x68, 0x3c, 0x24, 0x10, 0x84, 0x3d, 0x4d, 0xec,
+ 0xa2, 0x3b, 0xd3, 0xab, 0x1e, 0xbd, 0xbd, 0x4d, 0x84, 0x3d, 0xd0, 0xd9, 0xb6,
+ 0x3c, 0x84, 0xdc, 0x71, 0xbd, 0x84, 0x4a, 0x03, 0x3d, 0x54, 0xb8, 0xc6, 0x3c,
+ 0x0a, 0x84, 0x0e, 0x3d, 0xdc, 0xfe, 0x64, 0xbd, 0xa6, 0xc2, 0x19, 0x3d, 0xd1,
+ 0x79, 0x4c, 0x3c, 0x7c, 0x16, 0xbd, 0x3c, 0xc1, 0x7d, 0x3c, 0xbc, 0xb2, 0xe7,
+ 0x94, 0xbc, 0xf0, 0x46, 0x69, 0xbc, 0x2d, 0x5f, 0x68, 0x3c, 0xbc, 0x78, 0x44,
+ 0xbd, 0xcf, 0x27, 0x97, 0xbd, 0x03, 0xfb, 0x4b, 0xbd, 0x0c, 0xc4, 0xcd, 0xbc,
+ 0xd7, 0xc5, 0x11, 0xbd, 0x6b, 0xe3, 0xf5, 0xbb, 0xda, 0x4d, 0x75, 0x3d, 0xb0,
+ 0xf1, 0x39, 0xbd, 0x02, 0x4e, 0x00, 0xbd, 0xcf, 0x22, 0x81, 0x3d, 0x48, 0x54,
+ 0x10, 0xbd, 0x93, 0x8c, 0x42, 0x3a, 0x62, 0x1e, 0x18, 0x3d, 0xb5, 0x1d, 0x8d,
+ 0x3d, 0xbe, 0x37, 0x54, 0xbc, 0x9e, 0xa3, 0x92, 0xbc, 0x6a, 0x91, 0x7b, 0x3d,
+ 0xc5, 0x13, 0x8c, 0xbb, 0x30, 0x93, 0x55, 0xbd, 0x01, 0x29, 0x2b, 0xbd, 0xd4,
+ 0x57, 0x3a, 0xbd, 0xaf, 0xbc, 0xed, 0x3c, 0x65, 0xfe, 0x66, 0xbd, 0x2c, 0x98,
+ 0x11, 0x3d, 0x6e, 0xcf, 0x7c, 0xbd, 0xbe, 0xb4, 0x49, 0x3d, 0x17, 0x7c, 0x4f,
+ 0xbc, 0x13, 0xfc, 0x28, 0x3d, 0x28, 0xca, 0x2b, 0xbd, 0xdf, 0x3e, 0xa3, 0x3b,
+ 0x7e, 0xf4, 0x99, 0xbd, 0x9d, 0x89, 0x35, 0xbc, 0x70, 0x4c, 0x8a, 0xbd, 0xf9,
+ 0x58, 0x3a, 0xbd, 0x6f, 0xa9, 0x4f, 0x3d, 0x30, 0xce, 0x59, 0xbc, 0x52, 0xd4,
+ 0x41, 0xbd, 0x0d, 0x88, 0x2d, 0xbd, 0x94, 0xe1, 0x30, 0x3d, 0x7a, 0x53, 0xcd,
+ 0xbb, 0x2d, 0xcc, 0x75, 0x3c, 0x18, 0x30, 0x24, 0x3d, 0xfb, 0xa8, 0x07, 0x3d,
+ 0xa8, 0x1f, 0x19, 0xbc, 0xdf, 0x0a, 0x1c, 0x3d, 0x76, 0x06, 0x31, 0x3d, 0x6c,
+ 0x40, 0x82, 0x3c, 0x72, 0xb0, 0x82, 0xbd, 0x10, 0xae, 0x67, 0x3d, 0x00, 0x02,
+ 0xb5, 0x3a, 0x0a, 0xcd, 0x29, 0x3d, 0x7a, 0xf4, 0x27, 0x3c, 0x9d, 0xe2, 0x75,
+ 0xbd, 0x1e, 0xcd, 0x09, 0x3c, 0xa7, 0x3e, 0x25, 0xbd, 0x90, 0xb7, 0x8b, 0xbd,
+ 0xac, 0x2e, 0x6c, 0x3c, 0x22, 0x59, 0x79, 0x3d, 0xaf, 0x3b, 0x02, 0xba, 0x40,
+ 0xb8, 0x2c, 0x3d, 0xe8, 0x48, 0x6e, 0x3d, 0x13, 0xdb, 0x2f, 0x3b, 0x89, 0x0e,
+ 0x82, 0x3c, 0xdf, 0xe9, 0xc4, 0xbc, 0xc9, 0x26, 0x19, 0xbc, 0x67, 0x6b, 0x50,
+ 0x3d, 0xc0, 0x4c, 0x10, 0xbd, 0x30, 0xa9, 0x40, 0x3c, 0x12, 0x2f, 0xb1, 0x3c,
+ 0x3e, 0x0e, 0x00, 0xbd, 0xe9, 0x1b, 0x6f, 0xbd, 0xe4, 0x4b, 0x81, 0xbd, 0x93,
+ 0xc1, 0x7f, 0x3d, 0xb7, 0x8d, 0x04, 0xbd, 0x68, 0x33, 0x29, 0xbc, 0xa4, 0x5e,
+ 0x60, 0x3d, 0x23, 0xc0, 0x0a, 0xbd, 0xf0, 0x22, 0x80, 0xbd, 0x79, 0xea, 0x47,
+ 0x3d, 0x10, 0x77, 0x87, 0x3d, 0xc1, 0xfb, 0x19, 0xbd, 0x9c, 0xf7, 0x7c, 0x3d,
+ 0x27, 0x74, 0xb9, 0xbc, 0xc6, 0xea, 0x25, 0x3d, 0x54, 0xbc, 0xa4, 0x3c, 0x88,
+ 0x18, 0x36, 0x3d, 0x74, 0xd5, 0xd3, 0x3c, 0x68, 0x6e, 0x24, 0x3d, 0x36, 0xb4,
+ 0x49, 0x3d, 0x3e, 0x98, 0x2c, 0xbd, 0x99, 0x3e, 0x47, 0xbd, 0x21, 0xac, 0x15,
+ 0x3d, 0xef, 0x4f, 0x26, 0xbd, 0xb4, 0x49, 0x3f, 0xbd, 0xf5, 0xbc, 0x0a, 0xbd,
+ 0x04, 0x05, 0x6f, 0x3d, 0xf1, 0x5f, 0x15, 0x3d, 0xca, 0x51, 0x3f, 0x3d, 0xc2,
+ 0x88, 0x3a, 0xbd, 0x40, 0xeb, 0xbf, 0x3c, 0x4c, 0x13, 0xb6, 0x3c, 0xe6, 0x26,
+ 0xfe, 0x3c, 0xda, 0xab, 0x95, 0xbd, 0xd8, 0xcf, 0x81, 0x3d, 0xa2, 0x19, 0x53,
+ 0xbd, 0x5d, 0x5e, 0x0d, 0xbd, 0xfe, 0x6b, 0x36, 0x3d, 0xfb, 0x27, 0x4c, 0xbd,
+ 0x36, 0x92, 0x43, 0xbd, 0x94, 0xee, 0x45, 0xbc, 0x8a, 0x6d, 0xe4, 0x3c, 0xa8,
+ 0xb1, 0x52, 0xbc, 0x1f, 0x82, 0x88, 0xbb, 0x73, 0x6b, 0x53, 0xbd, 0x56, 0xc3,
+ 0x6f, 0x3d, 0x78, 0x17, 0x4a, 0x3d, 0xf2, 0x2e, 0x77, 0xbd, 0x2e, 0xae, 0x2a,
+ 0x3d, 0xa0, 0xd4, 0xa8, 0x3c, 0xe0, 0xb4, 0xd8, 0x3c, 0x24, 0x6d, 0x6a, 0xbd,
+ 0x16, 0xd2, 0x58, 0xbd, 0x56, 0xf5, 0x5d, 0x3b, 0xae, 0xdb, 0x76, 0xbd, 0x16,
+ 0x9a, 0x9a, 0xbd, 0x7c, 0x79, 0x51, 0x3d, 0x72, 0x5b, 0xa7, 0xbc, 0xce, 0xbf,
+ 0x62, 0x3d, 0xab, 0xd8, 0x23, 0x3d, 0x7e, 0xfd, 0x23, 0x3d, 0x0c, 0x3d, 0x6b,
+ 0x3d, 0x6c, 0x2f, 0x87, 0x3c, 0x1e, 0x26, 0x00, 0xbc, 0xc3, 0x94, 0x6f, 0xbd,
+ 0xb3, 0x7d, 0x24, 0xbd, 0x2a, 0xfb, 0x71, 0x3d, 0xee, 0x5a, 0xeb, 0xbc, 0x6c,
+ 0x3e, 0x60, 0xbd, 0x6c, 0x46, 0xf5, 0x3c, 0x83, 0xe3, 0x17, 0x3b, 0xe6, 0x15,
+ 0x32, 0xbd, 0x45, 0xba, 0x05, 0xbd, 0x18, 0x9a, 0x72, 0x3d, 0x45, 0x9c, 0x83,
+ 0xbd, 0x08, 0x2b, 0x5e, 0x3d, 0x75, 0xea, 0xe8, 0xbc, 0x81, 0xb6, 0x84, 0x3b,
+ 0x4b, 0xf4, 0x16, 0xbd, 0x90, 0xf4, 0x16, 0x3d, 0x2b, 0x95, 0x53, 0xbc, 0x53,
+ 0x27, 0x4b, 0xbd, 0x00, 0x6c, 0xe7, 0x3b, 0x62, 0xbd, 0x83, 0xbd, 0xd8, 0x6f,
+ 0x87, 0x3c, 0x3c, 0x17, 0x65, 0x3c, 0x3b, 0x64, 0x7e, 0x3d, 0xbd, 0x05, 0x09,
+ 0xbd, 0x7f, 0x37, 0x88, 0xbd, 0x63, 0x0e, 0x98, 0xbd, 0x03, 0x67, 0x71, 0x3c,
+ 0x02, 0x06, 0xe5, 0x39, 0xe4, 0x9f, 0xe7, 0x3b, 0x93, 0x66, 0x93, 0xbd, 0xc6,
+ 0xcd, 0x7c, 0xbd, 0xde, 0xaf, 0x20, 0x3d, 0xd2, 0x18, 0x54, 0x3c, 0xac, 0xeb,
+ 0x62, 0xbd, 0x93, 0xf7, 0xa2, 0x3c, 0x4c, 0x4b, 0x00, 0x3d, 0x38, 0x67, 0x3d,
+ 0xbd, 0x81, 0xcb, 0xa2, 0x3c, 0x9b, 0xd5, 0x90, 0x3c, 0x35, 0x26, 0x0f, 0x3c,
+ 0xcb, 0x77, 0x45, 0xbd, 0x38, 0xe0, 0x48, 0xbd, 0x96, 0x9e, 0x1d, 0x3b, 0x7c,
+ 0x3f, 0xaf, 0xbc, 0xef, 0x49, 0xac, 0xbc, 0x07, 0x74, 0xcc, 0x3c, 0xc0, 0x22,
+ 0x42, 0xbb, 0x5b, 0x72, 0x62, 0x3d, 0xd0, 0x55, 0x95, 0xbd, 0xf7, 0x7d, 0x82,
+ 0x3d, 0x90, 0x79, 0xd9, 0x3b, 0xd0, 0xa1, 0x96, 0x3c, 0xbf, 0x32, 0x8a, 0x3d,
+ 0xbd, 0xf0, 0x57, 0x3d, 0x5f, 0xf9, 0x3b, 0x3c, 0x4f, 0xea, 0x86, 0x3d, 0xbb,
+ 0x72, 0xaa, 0x3c, 0x42, 0x3b, 0x4c, 0x3d, 0x86, 0x1d, 0x86, 0x3c, 0x90, 0xc6,
+ 0x2a, 0xbd, 0x4f, 0x86, 0x76, 0x3d, 0x92, 0x79, 0x3d, 0x3d, 0x0d, 0x95, 0x92,
+ 0x3d, 0xbf, 0x77, 0x4e, 0x3d, 0x8b, 0x45, 0x03, 0xbd, 0x95, 0x0c, 0xff, 0xbc,
+ 0x62, 0x35, 0x11, 0xbb, 0xbd, 0x74, 0x28, 0x3d, 0xaf, 0x87, 0x7f, 0xbd, 0x8e,
+ 0xb8, 0x06, 0xbd, 0x0f, 0xbd, 0x3e, 0x3d, 0xe6, 0xd4, 0x41, 0xbd, 0x80, 0x81,
+ 0xac, 0x3c, 0x7a, 0xec, 0x82, 0xbc, 0x01, 0xac, 0x93, 0xbd, 0xe8, 0xba, 0xb3,
+ 0xbb, 0xcf, 0x47, 0x8f, 0xbb, 0x11, 0x6f, 0x57, 0x3d, 0x74, 0xf5, 0x9d, 0x3c,
+ 0x67, 0x6e, 0x01, 0xbd, 0xa6, 0x8c, 0x8f, 0xbd, 0xe4, 0x48, 0x30, 0xbd, 0x80,
+ 0xa7, 0x88, 0xbb, 0x48, 0x69, 0xea, 0x3c, 0x20, 0x78, 0x14, 0x3b, 0x18, 0xc4,
+ 0xca, 0xbc, 0xd6, 0x83, 0xcb, 0x3c, 0x88, 0x63, 0xd1, 0x3c, 0x02, 0x3a, 0x1b,
+ 0xbc, 0x02, 0x15, 0x13, 0x3c, 0xbe, 0x71, 0xf0, 0xbb, 0xe1, 0x3c, 0x12, 0xbd,
+ 0xa6, 0x23, 0x33, 0x3c, 0xc8, 0x04, 0xee, 0x3c, 0x78, 0x7e, 0x4d, 0x3c, 0x7f,
+ 0xd1, 0x95, 0xbc, 0xa3, 0x48, 0x22, 0x3c, 0x6d, 0x33, 0x77, 0xbd, 0xfc, 0x4f,
+ 0xc7, 0xbc, 0x8c, 0x5c, 0x8c, 0xbd, 0x98, 0x32, 0x02, 0xbd, 0x5f, 0x37, 0x00,
+ 0x3d, 0x41, 0xea, 0x7f, 0x3d, 0x4b, 0x38, 0x77, 0xbc, 0x47, 0x90, 0x92, 0xbd,
+ 0x56, 0x10, 0x1f, 0xbd, 0x10, 0x70, 0x8e, 0xbb, 0x0a, 0x99, 0x7a, 0x3c, 0x46,
+ 0x4c, 0x7d, 0x3d, 0xc0, 0x71, 0x6d, 0x3d, 0xd8, 0x3f, 0x28, 0x3d, 0x84, 0xe3,
+ 0x2b, 0x3d, 0x31, 0xdc, 0x55, 0xbd, 0x6e, 0x0a, 0x34, 0x3d, 0x10, 0xff, 0x85,
+ 0x3c, 0x72, 0x7b, 0x1d, 0xbd, 0x7f, 0xf5, 0xb4, 0xbb, 0xfb, 0xef, 0x87, 0x3d,
+ 0xb5, 0x8a, 0x4f, 0x3c, 0x20, 0xd7, 0x40, 0xbd, 0x17, 0x2c, 0x38, 0xbd, 0xcb,
+ 0xd4, 0x6d, 0x3d, 0x3c, 0x24, 0x7a, 0xbd, 0xb3, 0x3d, 0x92, 0xbd, 0x18, 0xbe,
+ 0x99, 0xba, 0x29, 0xe3, 0x42, 0xbc, 0xf7, 0x2c, 0x8f, 0xbd, 0x34, 0xd9, 0xc7,
+ 0x3c, 0xac, 0x8c, 0x99, 0xbd, 0x40, 0xe4, 0xa5, 0x3c, 0x8d, 0xcf, 0x3d, 0x3d,
+ 0x81, 0xe9, 0x3e, 0x3d, 0x7a, 0xbb, 0x3f, 0x3d, 0xc7, 0x9b, 0x25, 0xbc, 0x84,
+ 0x26, 0xc3, 0xbb, 0x52, 0x3f, 0x7a, 0x3d, 0x7b, 0xdb, 0x69, 0xbd, 0x99, 0x0e,
+ 0x71, 0xbd, 0x4c, 0xb5, 0xa5, 0x3b, 0xcf, 0x2f, 0xfd, 0xbb, 0x6b, 0x5b, 0x0c,
+ 0x3b, 0x9e, 0xeb, 0x04, 0xbc, 0x00, 0x9d, 0xdc, 0xbb, 0x10, 0xc2, 0xc0, 0x3c,
+ 0x08, 0xa2, 0x31, 0xbd, 0xc0, 0x3c, 0xf9, 0x3a, 0xad, 0xd5, 0x55, 0xbd, 0x11,
+ 0xea, 0xf3, 0x3c, 0x80, 0x63, 0xfa, 0x3a, 0x30, 0x82, 0x48, 0x3b, 0x58, 0x5f,
+ 0x2c, 0xbd, 0xd4, 0x00, 0x83, 0xbd, 0x12, 0x38, 0x8a, 0xbd, 0xd2, 0xdf, 0x1e,
+ 0x3c, 0xd0, 0x71, 0x1b, 0x3d, 0x92, 0x5f, 0x56, 0xbd, 0x51, 0x29, 0x94, 0xbd,
+ 0x40, 0x81, 0x92, 0xbd, 0x04, 0x93, 0x82, 0xbd, 0x8c, 0xf7, 0x84, 0x3d, 0x8a,
+ 0x96, 0x85, 0xbd, 0x2a, 0x93, 0x3b, 0xba, 0xc7, 0x7c, 0x3b, 0xbd, 0xb0, 0x3d,
+ 0x50, 0x3d, 0xa0, 0xcb, 0x42, 0x3d, 0xad, 0x3c, 0x16, 0xbc, 0x59, 0xaa, 0x30,
+ 0xbd, 0xcd, 0x10, 0x91, 0xbc, 0xe8, 0xea, 0x35, 0xbd, 0x53, 0x63, 0x36, 0xbd,
+ 0xa9, 0x85, 0x82, 0x3c, 0x23, 0xbd, 0x36, 0xbd, 0x25, 0x81, 0xe9, 0x3c, 0x76,
+ 0x54, 0x6d, 0x3d, 0xc1, 0x4f, 0x69, 0xbd, 0x55, 0x6c, 0x8f, 0x3d, 0xd5, 0x0a,
+ 0x7d, 0xbd, 0x48, 0xbe, 0xd2, 0x3c, 0x5b, 0xce, 0x84, 0x3d, 0xaa, 0x8e, 0x46,
+ 0xbc, 0x9c, 0x93, 0xc9, 0x3c, 0x66, 0xb1, 0x45, 0x3d, 0xf1, 0xc0, 0x90, 0xbc,
+ 0x2d, 0x09, 0x22, 0x3d, 0xcc, 0x52, 0x20, 0x3d, 0xaa, 0xec, 0x70, 0x3d, 0x3a,
+ 0xbd, 0xac, 0xbb, 0x70, 0x69, 0x81, 0x3d, 0x43, 0x3f, 0x8b, 0xbc, 0x46, 0x6a,
+ 0x04, 0xbd, 0xac, 0x25, 0x5a, 0xbd, 0xc2, 0xb9, 0x74, 0xbd, 0x35, 0x78, 0xeb,
+ 0x3c, 0xe2, 0x31, 0x54, 0xbd, 0xa0, 0xb1, 0xfe, 0x3c, 0xaf, 0xd2, 0xf8, 0x3c,
+ 0x00, 0x44, 0x82, 0x3a, 0x70, 0xcc, 0x91, 0xbd, 0x82, 0x1f, 0x57, 0xbd, 0xc2,
+ 0xe4, 0x03, 0x3d, 0xd0, 0xbd, 0x80, 0xbd, 0x7a, 0xde, 0x41, 0xbd, 0xe9, 0xf4,
+ 0x3b, 0x3c, 0xf9, 0x96, 0x1a, 0xbd, 0xe2, 0x2e, 0x46, 0xbd, 0xae, 0xbd, 0x34,
+ 0xbd, 0xb4, 0xa2, 0x8c, 0xbc, 0xa8, 0x0e, 0x30, 0xbd, 0x56, 0xf8, 0x33, 0xbd,
+ 0xce, 0x69, 0x35, 0x3d, 0x52, 0x2f, 0xeb, 0xbc, 0x9f, 0xe0, 0x0f, 0xbd, 0xc9,
+ 0x34, 0x29, 0xbd, 0x43, 0x26, 0x1e, 0x3d, 0xc8, 0x03, 0x05, 0x3c, 0x0f, 0x46,
+ 0x97, 0x3c, 0x18, 0x4c, 0x0c, 0xbd, 0xb8, 0xf9, 0x1c, 0xbd, 0xbd, 0x84, 0x86,
+ 0xbd, 0xbe, 0x50, 0xb1, 0xbc, 0x26, 0x15, 0x57, 0x3c, 0xca, 0x9f, 0x77, 0xbc,
+ 0xc0, 0xea, 0xca, 0xba, 0x23, 0xde, 0x41, 0xbd, 0x9d, 0xb4, 0x5c, 0xbd, 0x46,
+ 0x03, 0x30, 0xbd, 0xd0, 0xb3, 0x37, 0x3d, 0xfd, 0xe6, 0x3e, 0x3d, 0x8a, 0x0e,
+ 0x6a, 0xbd, 0xf8, 0x91, 0x64, 0x3d, 0xb4, 0x0b, 0x76, 0x3d, 0xf2, 0x94, 0x5f,
+ 0x3d, 0x98, 0xe6, 0x78, 0x3c, 0xc4, 0xab, 0x1e, 0xbd, 0xdd, 0xb6, 0x77, 0xbd,
+ 0x56, 0x1e, 0x8c, 0x3d, 0x0f, 0xee, 0x15, 0xbd, 0x42, 0xb6, 0x92, 0xbd, 0x2c,
+ 0xea, 0x96, 0xbc, 0x90, 0xc4, 0x30, 0xbd, 0x2e, 0xdc, 0xc8, 0xbb, 0xe4, 0x79,
+ 0xb0, 0xbc, 0x2e, 0xe6, 0x08, 0x3d, 0x74, 0x81, 0x34, 0x3d, 0xc0, 0xd5, 0x48,
+ 0xbc, 0xd3, 0xf2, 0x3c, 0xbd, 0x34, 0x47, 0xef, 0x3c, 0x9a, 0xcb, 0xe5, 0x3c,
+ 0xe0, 0x94, 0xef, 0xba, 0x80, 0x36, 0x23, 0xbc, 0x08, 0xf9, 0x35, 0xbd, 0x0f,
+ 0x9d, 0x99, 0xbd, 0x71, 0xdf, 0x2e, 0xbd, 0xb5, 0xa6, 0x78, 0xbd, 0xfa, 0xa8,
+ 0x69, 0x3d, 0x97, 0xc3, 0xda, 0xbb, 0x37, 0x74, 0xdf, 0x3c, 0x7f, 0xc2, 0x88,
+ 0xbd, 0x53, 0x20, 0xbe, 0x3b, 0x9c, 0x7a, 0xd9, 0x3c, 0xa9, 0x4b, 0x01, 0xbd,
+ 0xfb, 0xf7, 0x00, 0xbd, 0xd5, 0xda, 0x41, 0x3d, 0x9d, 0x2a, 0x82, 0x3d, 0x9a,
+ 0x03, 0x01, 0x3d, 0x38, 0xa7, 0x1b, 0x3d, 0x40, 0x75, 0xef, 0x3c, 0x4a, 0xdc,
+ 0x1b, 0xbc, 0xd1, 0x1a, 0x41, 0x3d, 0x04, 0xee, 0x74, 0x3d, 0xdb, 0x3f, 0x71,
+ 0xbd, 0x86, 0xc4, 0x22, 0x3d, 0x99, 0x74, 0x78, 0xbc, 0x48, 0x90, 0x54, 0xbd,
+ 0x88, 0xae, 0xf9, 0x3c, 0x4f, 0xbe, 0x10, 0x3d, 0x7d, 0x35, 0x68, 0xbd, 0xb3,
+ 0xf9, 0x3d, 0x3d, 0x1b, 0x89, 0x85, 0xbb, 0x85, 0x05, 0xae, 0x3c, 0xfd, 0x18,
+ 0x5b, 0xbd, 0x2d, 0xfa, 0x7f, 0xbd, 0x6e, 0xad, 0x8c, 0xbd, 0x67, 0x72, 0x28,
+ 0x3d, 0x2c, 0x8b, 0x9a, 0x3c, 0xb3, 0x94, 0x57, 0xbd, 0xa4, 0x3e, 0xa8, 0xbc,
+ 0xa6, 0x6a, 0x06, 0x3d, 0xf8, 0x03, 0x33, 0x3d, 0x56, 0xb0, 0x7a, 0xbd, 0x47,
+ 0x97, 0x68, 0xbc, 0xd0, 0x17, 0x7a, 0xbd, 0xe8, 0xab, 0x7d, 0xbd, 0xec, 0x67,
+ 0xf9, 0xbb, 0x3d, 0x92, 0x83, 0xbd, 0x36, 0xa4, 0x00, 0xbd, 0x00, 0x1b, 0x45,
+ 0x3a, 0x39, 0x13, 0x88, 0xbd, 0x05, 0x63, 0x26, 0x3c, 0x53, 0x7b, 0xc9, 0x3c,
+ 0x67, 0x97, 0x7a, 0xbb, 0xfe, 0x71, 0xd6, 0xbc, 0x24, 0x84, 0x1e, 0xbd, 0x02,
+ 0xa3, 0x76, 0x3d, 0xff, 0x16, 0x69, 0x3d, 0x80, 0xf0, 0x21, 0x3d, 0x90, 0x11,
+ 0x48, 0xbd, 0xc8, 0xa9, 0x3f, 0xbd, 0xc8, 0x06, 0x25, 0xbd, 0xaa, 0xfe, 0x96,
+ 0xbd, 0xa4, 0xbe, 0x57, 0xbc, 0x6e, 0x82, 0x1d, 0x3d, 0xd6, 0xfa, 0x66, 0xbb,
+ 0x9a, 0x25, 0x20, 0x3d, 0xa3, 0x94, 0x27, 0xbb, 0x23, 0x2f, 0xcd, 0x3c, 0x5e,
+ 0xa4, 0x4e, 0x3d, 0x2a, 0x3b, 0x09, 0xbd, 0x4a, 0x40, 0x6f, 0x3d, 0xfe, 0xd8,
+ 0xe4, 0x3c, 0xab, 0xce, 0x56, 0xbd, 0x1d, 0x9a, 0x65, 0x3d, 0xb6, 0xf5, 0x76,
+ 0xbd, 0x88, 0x3d, 0x52, 0x3d, 0x0f, 0x1c, 0x50, 0xbd, 0x1d, 0x0d, 0x6a, 0x3d,
+ 0x99, 0x66, 0x98, 0xbd, 0x6e, 0xe2, 0xb9, 0x3c, 0x4c, 0x26, 0x82, 0xbd, 0xe2,
+ 0x3f, 0x65, 0xbd, 0x09, 0xa4, 0x8a, 0x3c, 0x19, 0x7d, 0x7d, 0xbd, 0xe6, 0xf8,
+ 0x1d, 0xbd, 0xfc, 0xe2, 0xee, 0xbc, 0x1d, 0xab, 0x89, 0x3d, 0x8e, 0xb4, 0xfe,
+ 0xbc, 0x68, 0x9c, 0x83, 0x3c, 0xf7, 0xa9, 0x0b, 0xbd, 0x3c, 0xed, 0x92, 0x3c,
+ 0x90, 0x72, 0xa5, 0x3c, 0x02, 0xd9, 0x69, 0xbd, 0xa9, 0x64, 0x2a, 0xbb, 0x6d,
+ 0x20, 0xf5, 0xbc, 0x0e, 0x44, 0x37, 0xbd, 0xc7, 0xf0, 0xde, 0x3c, 0xb6, 0xdb,
+ 0x71, 0x3d, 0xea, 0x6b, 0xda, 0xbc, 0xc8, 0x8f, 0x1d, 0xbd, 0xb9, 0x43, 0x05,
+ 0xbd, 0x6c, 0x4a, 0x78, 0xbc, 0xc0, 0xc3, 0x82, 0x3b, 0x4b, 0x41, 0x49, 0xbd,
+ 0xc1, 0xfc, 0xcb, 0x3b, 0x93, 0x21, 0x8d, 0xbd, 0xcf, 0x67, 0x7a, 0xbd, 0x58,
+ 0x9d, 0xdb, 0x3c, 0xd3, 0x71, 0x03, 0x3d, 0xaf, 0x55, 0x84, 0x3d, 0x71, 0x0c,
+ 0x5d, 0xbd, 0x4c, 0x19, 0x89, 0x3c, 0x7f, 0x29, 0x8b, 0x3d, 0xf6, 0xcd, 0xa9,
+ 0x3c, 0xaa, 0x00, 0x4c, 0x3d, 0x2b, 0xaa, 0x19, 0xbc, 0x93, 0xde, 0x16, 0xb9,
+ 0xda, 0xaf, 0x90, 0xbb, 0xf6, 0xde, 0x48, 0x3d, 0x00, 0x08, 0x29, 0x3b, 0xb2,
+ 0xe0, 0x82, 0xbc, 0x84, 0xf3, 0x40, 0xbc, 0xd4, 0x75, 0x08, 0x3d, 0x88, 0xe7,
+ 0x64, 0xbd, 0x68, 0xd6, 0x95, 0x3c, 0x1b, 0x70, 0x3f, 0x3d, 0x64, 0xfa, 0xfd,
+ 0xbc, 0xfc, 0x82, 0x61, 0x3d, 0x8e, 0x6e, 0x11, 0xbd, 0x0a, 0x0a, 0x9f, 0xbc,
+ 0xb5, 0x1d, 0x68, 0x3c, 0x7d, 0x9f, 0x86, 0x3d, 0xe6, 0x3f, 0x83, 0x3d, 0xf9,
+ 0xd6, 0xfe, 0x3c, 0x68, 0x0c, 0x61, 0xbd, 0x65, 0x33, 0x27, 0x3d, 0x2c, 0xcf,
+ 0x68, 0x3d, 0xb0, 0xc0, 0x14, 0xbd, 0xb0, 0xb2, 0x81, 0x3d, 0xc0, 0x9c, 0x89,
+ 0xbc, 0xae, 0x60, 0x8e, 0xbd, 0x92, 0xdd, 0x91, 0xbd, 0xc9, 0x0b, 0x85, 0x3d,
+ 0xa4, 0x00, 0xb1, 0xbc, 0x80, 0x9d, 0xf8, 0x3c, 0x1d, 0xc1, 0x98, 0xbd, 0x3e,
+ 0x88, 0xcd, 0x3c, 0x67, 0xc9, 0x66, 0x3c, 0x00, 0x46, 0x64, 0xba, 0x80, 0x3e,
+ 0x19, 0xbd, 0x18, 0xe0, 0x20, 0x3c, 0x50, 0xcb, 0xc0, 0x3b, 0xe3, 0xf3, 0x8c,
+ 0xbc, 0xac, 0x02, 0xd6, 0x3c, 0xca, 0x7a, 0x45, 0x3d, 0x95, 0xab, 0x47, 0xbd,
+ 0xe6, 0x14, 0x55, 0x3d, 0x88, 0x82, 0x09, 0x3d, 0x1c, 0x74, 0x91, 0x3c, 0xbf,
+ 0x00, 0x2f, 0x3c, 0x8c, 0xfc, 0x96, 0xbd, 0xcb, 0xa8, 0x9e, 0xbb, 0xb5, 0x6b,
+ 0x42, 0x3d, 0x0f, 0xed, 0x99, 0xbd, 0x6a, 0x9e, 0x45, 0xba, 0x50, 0xa3, 0x2d,
+ 0xbc, 0x6a, 0x95, 0x52, 0x3d, 0x18, 0x66, 0xd7, 0xbb, 0x65, 0x63, 0x7c, 0xbd,
+ 0xfe, 0xa8, 0xe1, 0xbc, 0x48, 0x89, 0x50, 0xbd, 0x64, 0x1d, 0xbe, 0x3c, 0x54,
+ 0xe9, 0x07, 0x3d, 0x2f, 0x27, 0x2b, 0x3d, 0x55, 0x02, 0x00, 0x3d, 0xb2, 0xbe,
+ 0x53, 0xbd, 0xd8, 0x03, 0x72, 0xbd, 0xd4, 0x63, 0x69, 0x3d, 0x1c, 0x9b, 0x7c,
+ 0xbd, 0x87, 0x6b, 0x83, 0xbd, 0xc8, 0x0e, 0x0f, 0xbd, 0xed, 0x88, 0x30, 0xbd,
+ 0xce, 0x02, 0x31, 0xbd, 0xae, 0xdd, 0x17, 0xbd, 0x03, 0x61, 0x43, 0xbd, 0xcf,
+ 0xd3, 0x03, 0xbd, 0x56, 0x0b, 0x57, 0xbd, 0x85, 0x33, 0x0d, 0xbd, 0x36, 0x8f,
+ 0x0b, 0xbd, 0x8e, 0x7d, 0x2c, 0xbc, 0x99, 0x21, 0x40, 0xbd, 0x9b, 0xf2, 0x62,
+ 0xbb, 0xcc, 0xaf, 0x3f, 0x3d, 0x3f, 0xc0, 0xab, 0x3c, 0xc1, 0x4d, 0x27, 0x3c,
+ 0x4b, 0x78, 0x30, 0x3d, 0x04, 0x65, 0xfe, 0x3b, 0xbe, 0x78, 0xb0, 0xbc, 0x9a,
+ 0xb9, 0xe8, 0xbc, 0x58, 0x9c, 0x5d, 0x3d, 0x95, 0x93, 0x65, 0x3d, 0xd9, 0xa8,
+ 0x41, 0xbd, 0x91, 0xb5, 0x36, 0x3d, 0x48, 0xc5, 0x84, 0xbd, 0xf8, 0x98, 0x3c,
+ 0x3c, 0x07, 0x2e, 0x96, 0xbd, 0xf2, 0xa1, 0x2b, 0xba, 0xdc, 0xa1, 0x10, 0xbd,
+ 0x3a, 0xa4, 0xdb, 0xbc, 0x03, 0x75, 0x63, 0xbd, 0x5f, 0x46, 0x3d, 0x3a, 0x75,
+ 0x7d, 0x56, 0x3d, 0x68, 0x12, 0xa8, 0xbc, 0x03, 0xf5, 0x98, 0xbd, 0xe0, 0x3c,
+ 0xe7, 0xbc, 0x90, 0xb6, 0xbb, 0xbb, 0x48, 0x0e, 0x08, 0x3d, 0x68, 0x30, 0x35,
+ 0x3c, 0xb4, 0x17, 0xcf, 0x3c, 0xf9, 0xd9, 0xf8, 0x3c, 0xc8, 0x7e, 0x09, 0xbc,
+ 0x84, 0xde, 0x45, 0xbd, 0xfe, 0xad, 0xf7, 0xbc, 0xdb, 0x10, 0x8b, 0xbd, 0x65,
+ 0xac, 0x40, 0x3d, 0x2f, 0xc7, 0x12, 0x3c, 0x60, 0x81, 0x62, 0x3d, 0x96, 0xbd,
+ 0xf6, 0x3c, 0xee, 0x7e, 0x80, 0x3d, 0x76, 0x78, 0x25, 0x3d, 0xec, 0x17, 0x1b,
+ 0xbc, 0x17, 0xa7, 0x2f, 0xbd, 0x5c, 0x17, 0x4e, 0x3d, 0x92, 0x4e, 0x99, 0xbb,
+ 0xe6, 0xec, 0x1d, 0xbd, 0xcf, 0xd4, 0x15, 0x3d, 0x36, 0x68, 0xcb, 0x3c, 0x05,
+ 0xd3, 0x68, 0x3c, 0x4d, 0x37, 0x96, 0x3c, 0x85, 0x4b, 0x98, 0x3b, 0x3e, 0xf9,
+ 0x6a, 0x3d, 0x42, 0xd5, 0x85, 0xbc, 0x35, 0xf1, 0x48, 0xbd, 0xae, 0x5a, 0x69,
+ 0x3b, 0xfc, 0xc3, 0x81, 0xbd, 0x3d, 0xe3, 0x71, 0xbd, 0xdb, 0x3b, 0x18, 0xbd,
+ 0x40, 0x90, 0x26, 0xbd, 0x5d, 0xef, 0x80, 0xbc, 0x94, 0x89, 0x9a, 0xbc, 0x96,
+ 0x7a, 0x33, 0xbd, 0x94, 0x61, 0x71, 0x3d, 0xe6, 0xaf, 0x5a, 0x3d, 0x5f, 0x3d,
+ 0x6a, 0x3b, 0x22, 0xcf, 0x23, 0xbc, 0xb1, 0x6f, 0x4b, 0xbb, 0x9a, 0x4b, 0xbe,
+ 0x3c, 0xd7, 0x02, 0x95, 0xbc, 0xb5, 0xfa, 0x4b, 0xbd, 0x8d, 0x7e, 0x85, 0xbc,
+ 0x12, 0x0b, 0x3c, 0x3d, 0xa5, 0x2c, 0xfc, 0xbb, 0xb0, 0xcc, 0xb2, 0xbb, 0xf2,
+ 0x03, 0x4a, 0xbd, 0x87, 0xe3, 0x1d, 0xbd, 0xcc, 0xd7, 0xed, 0x3c, 0x16, 0x63,
+ 0x73, 0xbc, 0x18, 0x4e, 0x47, 0x3d, 0x70, 0x95, 0x37, 0xbd, 0xfb, 0xdd, 0xc4,
+ 0x3c, 0x3d, 0x65, 0xfb, 0x3c, 0x96, 0xa0, 0x84, 0x3d, 0x60, 0x19, 0xff, 0xbb,
+ 0xa4, 0xbf, 0x4b, 0x3c, 0x5b, 0x63, 0x03, 0xbd, 0x8d, 0x86, 0xcb, 0xbb, 0x62,
+ 0xee, 0x76, 0xbd, 0x9c, 0x16, 0x73, 0x3d, 0x4f, 0xd8, 0x81, 0x3d, 0xe2, 0x7d,
+ 0xba, 0xbc, 0xd6, 0x7a, 0xb4, 0x3b, 0x61, 0x45, 0x87, 0x3d, 0xe1, 0x5e, 0x8a,
+ 0xbd, 0xfc, 0x1f, 0xc0, 0xbc, 0xc0, 0x87, 0x14, 0xbd, 0x3d, 0x53, 0x16, 0x3d,
+ 0x86, 0x91, 0x17, 0x3c, 0xa6, 0x1a, 0x71, 0xbc, 0xe7, 0x57, 0xf9, 0xbc, 0x27,
+ 0x13, 0x87, 0x3d, 0x98, 0x4e, 0x02, 0x3d, 0xe5, 0x9d, 0x13, 0x3d, 0x89, 0xbf,
+ 0x2e, 0x3c, 0xa0, 0x5f, 0x21, 0x3b, 0x80, 0xc1, 0xf4, 0x3b, 0x14, 0x22, 0x2a,
+ 0xbc, 0x33, 0xd3, 0x93, 0x3c, 0xd7, 0x3d, 0x6e, 0x3d, 0x2e, 0xcd, 0x81, 0xbd,
+ 0x71, 0xa3, 0x45, 0xbd, 0xde, 0xd6, 0x4f, 0x3d, 0xb7, 0xe7, 0x41, 0xbd, 0x27,
+ 0x86, 0xd6, 0x3c, 0x6b, 0x72, 0x85, 0x3d, 0x6d, 0x89, 0x11, 0xbd, 0x21, 0x7b,
+ 0x1a, 0xbd, 0x18, 0xf1, 0x38, 0xbd, 0xc3, 0xf7, 0xb1, 0x3c, 0xd7, 0xa0, 0x8e,
+ 0xbd, 0x6e, 0x16, 0x24, 0x3d, 0xc2, 0x2b, 0x2f, 0x3d, 0xc8, 0x1c, 0x82, 0x3c,
+ 0x53, 0x30, 0x24, 0xbc, 0xd9, 0x49, 0x1f, 0xbd, 0xea, 0x81, 0x3f, 0x3d, 0xc4,
+ 0xb7, 0x1a, 0x3d, 0xc3, 0x0a, 0x0b, 0xbd, 0x29, 0x5d, 0x88, 0x3d, 0x3f, 0xb6,
+ 0x9f, 0xbc, 0x97, 0x16, 0x72, 0xbd, 0x67, 0x40, 0xa4, 0xbc, 0x67, 0x64, 0x59,
+ 0xbc, 0xd0, 0x90, 0xfd, 0xbc, 0x48, 0xa3, 0x1b, 0xbd, 0x5f, 0x6c, 0xf2, 0x3c,
+ 0xe4, 0x81, 0x97, 0xbd, 0x2b, 0xe9, 0x86, 0x3d, 0x6c, 0xa1, 0x06, 0xbd, 0xa8,
+ 0x7c, 0x2a, 0x3c, 0x07, 0xca, 0x8d, 0x3b, 0x1f, 0x0c, 0x21, 0xbd, 0xb0, 0x7f,
+ 0x90, 0xbd, 0xe5, 0x3f, 0x17, 0x3d, 0x03, 0x58, 0x43, 0xbd, 0xe7, 0x24, 0x42,
+ 0xbd, 0xdd, 0xf2, 0x95, 0xbd, 0x58, 0xd0, 0xd9, 0x3c, 0xa9, 0xbe, 0x00, 0x3d,
+ 0x40, 0x4c, 0x97, 0xbd, 0x06, 0x0f, 0x63, 0xbd, 0x44, 0x04, 0x42, 0xbd, 0x69,
+ 0xfa, 0xd6, 0xbb, 0x40, 0x95, 0xca, 0xba, 0xba, 0x29, 0x80, 0xbd, 0x40, 0x04,
+ 0x8f, 0xbd, 0x9b, 0xd2, 0x71, 0xbd, 0x16, 0x0f, 0x36, 0xbd, 0xcf, 0xe9, 0x77,
+ 0x3d, 0x00, 0x20, 0xe2, 0xb8, 0x77, 0xed, 0x89, 0xba, 0x27, 0x9d, 0x7d, 0xbd,
+ 0x8b, 0x7d, 0xa1, 0x3c, 0xaf, 0x02, 0x41, 0xbd, 0x76, 0x0a, 0x80, 0xbd, 0xc5,
+ 0xbe, 0x0c, 0x3c, 0x65, 0xbc, 0x53, 0x3c, 0x23, 0x57, 0x71, 0x3d, 0x4c, 0x69,
+ 0xad, 0x3c, 0xe6, 0x35, 0x70, 0xbd, 0x4a, 0x71, 0x0f, 0x3d, 0x60, 0x74, 0x60,
+ 0xbd, 0x00, 0x21, 0xff, 0xbc, 0x2e, 0x9e, 0x15, 0xbd, 0x5b, 0xfa, 0xfb, 0xbc,
+ 0x70, 0x17, 0xe6, 0x3c, 0xb8, 0x5a, 0x03, 0x3d, 0x26, 0x71, 0x82, 0x3d, 0x40,
+ 0xf1, 0xe2, 0xbb, 0xad, 0xa1, 0x7d, 0xbd, 0xbb, 0x38, 0xb0, 0xbc, 0xa8, 0x2e,
+ 0x18, 0x3d, 0x29, 0xe4, 0x01, 0xbd, 0x3d, 0xed, 0x75, 0xbc, 0xc1, 0x90, 0x09,
+ 0x3d, 0x7a, 0x35, 0xf9, 0xbc, 0x0a, 0x1f, 0x8e, 0xbc, 0x7b, 0x9e, 0x05, 0xbc,
+ 0x00, 0xe1, 0x18, 0x3c, 0x90, 0xf1, 0xc1, 0xbc, 0xbc, 0xfc, 0x87, 0x3d, 0x28,
+ 0x2a, 0x48, 0x3c, 0xcf, 0x41, 0xf4, 0xbc, 0xa3, 0x20, 0x7a, 0xbd, 0x58, 0x65,
+ 0x0c, 0x3b, 0x5b, 0x8e, 0xd7, 0xbc, 0x09, 0x03, 0x87, 0x3d, 0xfa, 0xcf, 0xaa,
+ 0xbc, 0x12, 0x45, 0x83, 0xbd, 0x29, 0x24, 0x89, 0xbd, 0x77, 0x6e, 0x98, 0xbd,
+ 0x50, 0xf7, 0x91, 0xbb, 0x3e, 0x17, 0x86, 0x3c, 0xcf, 0x82, 0x54, 0x3d, 0x12,
+ 0x48, 0xff, 0xbb, 0xa8, 0x39, 0xa6, 0x3c, 0x57, 0xfc, 0xb4, 0xbc, 0xc5, 0x25,
+ 0x30, 0xbd, 0xcd, 0xbc, 0x04, 0xbd, 0x10, 0x87, 0xb4, 0xbc, 0x16, 0x7b, 0x6e,
+ 0xbd, 0xba, 0x00, 0x5f, 0xbd, 0xf8, 0x14, 0xac, 0x3c, 0xdf, 0x4d, 0x88, 0xbd,
+ 0x2e, 0xd2, 0xb6, 0xbc, 0x8e, 0x7a, 0x8e, 0xbd, 0xac, 0xdb, 0xe2, 0x3c, 0x7b,
+ 0x12, 0x8b, 0x3d, 0x03, 0xe2, 0x91, 0xbd, 0x43, 0xac, 0x3c, 0xbc, 0x5a, 0xc7,
+ 0x52, 0x3d, 0x5e, 0xec, 0x40, 0x3d, 0x1a, 0xb0, 0x1f, 0xbc, 0x1d, 0x9c, 0x92,
+ 0xbd, 0xd3, 0x03, 0xfd, 0x3c, 0xdd, 0x22, 0x0a, 0xbb, 0xe2, 0x2a, 0x89, 0x3d,
+ 0x94, 0xb6, 0xd4, 0xbb, 0x74, 0x26, 0xb8, 0xbc, 0xc6, 0x7a, 0x35, 0xbd, 0xa8,
+ 0xb7, 0x8e, 0xbd, 0xbe, 0x94, 0x36, 0xbd, 0x22, 0xc0, 0x03, 0xbd, 0x40, 0xb4,
+ 0xe5, 0x3a, 0x53, 0xb5, 0x14, 0xbc, 0xac, 0x00, 0x3a, 0xbc, 0xb3, 0xd9, 0xee,
+ 0x3c, 0xb5, 0x7c, 0xae, 0xbb, 0xd6, 0xb2, 0x75, 0x3c, 0x2f, 0x0e, 0x1a, 0xbd,
+ 0xf0, 0xb2, 0x47, 0xbd, 0xad, 0x36, 0x50, 0xbb, 0x19, 0x86, 0x36, 0xbd, 0xb4,
+ 0x02, 0xe4, 0xbc, 0xe2, 0x37, 0x10, 0x3d, 0x17, 0xcb, 0x86, 0xbd, 0x33, 0x35,
+ 0x5e, 0x3c, 0x63, 0xfe, 0x8f, 0x3d, 0x8e, 0x91, 0x6c, 0xbd, 0xf8, 0x55, 0x6f,
+ 0x3c, 0x60, 0xc0, 0xb6, 0x3c, 0x09, 0x23, 0x8d, 0xbd, 0x75, 0xae, 0x89, 0x3d,
+ 0x4e, 0xb2, 0x76, 0x3d, 0xbc, 0x52, 0x57, 0xbd, 0x5c, 0xf2, 0xde, 0xbc, 0x5a,
+ 0xc5, 0xc5, 0xbc, 0x01, 0xbf, 0x1a, 0xbd, 0xc4, 0x10, 0x37, 0xbd, 0xe9, 0xe5,
+ 0x7a, 0x3b, 0xa0, 0x03, 0x58, 0xbd, 0x4f, 0xe4, 0x66, 0x3d, 0xbd, 0xc0, 0xa8,
+ 0xbc, 0xd0, 0x05, 0xb9, 0x3c, 0xd3, 0xb7, 0xd9, 0x3c, 0xf2, 0x28, 0x2d, 0x3d,
+ 0x69, 0x78, 0x38, 0xbd, 0x55, 0x58, 0x49, 0xbc, 0xc5, 0x5b, 0xc2, 0x3c, 0x67,
+ 0x0d, 0x40, 0x3d, 0x02, 0xec, 0x2b, 0x3d, 0x60, 0x6a, 0xac, 0x3c, 0x6a, 0x9c,
+ 0x65, 0x3d, 0x19, 0x18, 0x4d, 0xbd, 0x05, 0xaf, 0xbd, 0xbc, 0x22, 0x2b, 0x54,
+ 0xbd, 0x1d, 0x0c, 0xd9, 0xbc, 0x0a, 0xf7, 0xfd, 0x3a, 0x5a, 0x18, 0x23, 0x3d,
+ 0xeb, 0xfc, 0x84, 0xbd, 0xaf, 0x71, 0x0c, 0xbc, 0x98, 0x72, 0x5e, 0x3c, 0x18,
+ 0x8b, 0x88, 0x3c, 0xa4, 0x1d, 0x8f, 0xbb, 0x3c, 0x3d, 0xbf, 0xbc, 0x18, 0x7a,
+ 0xc7, 0x3c, 0x2e, 0x1c, 0x77, 0xbd, 0x50, 0x47, 0x55, 0x3c, 0x5c, 0xa7, 0x23,
+ 0xbc, 0x0c, 0x4e, 0xda, 0x3c, 0x00, 0x25, 0x7f, 0x3d, 0xdc, 0xbd, 0x85, 0xbd,
+ 0xee, 0x84, 0x91, 0xbc, 0x0b, 0xcb, 0x81, 0x3d, 0x7a, 0x5f, 0x04, 0xbc, 0xde,
+ 0x3d, 0x7b, 0xbb, 0x05, 0xa9, 0x79, 0x3d, 0x6c, 0x47, 0x2e, 0xbd, 0x9a, 0x8c,
+ 0x7c, 0x3d, 0xee, 0xc6, 0x93, 0xbd, 0xaf, 0xd0, 0xd9, 0xbc, 0x33, 0x14, 0x3c,
+ 0xbd, 0xe3, 0x36, 0x6e, 0x3d, 0x0b, 0x9a, 0x55, 0xbc, 0xe9, 0x83, 0x84, 0x3d,
+ 0xd6, 0xb4, 0x6c, 0x3d, 0xc4, 0xea, 0xd4, 0x3c, 0x48, 0xb4, 0x20, 0x3d, 0x6e,
+ 0xc9, 0x53, 0x3d, 0x4e, 0x95, 0xbb, 0xbc, 0x15, 0x0c, 0x86, 0x3d, 0xdc, 0x7a,
+ 0x40, 0xbd, 0x98, 0x24, 0x6d, 0xbc, 0x2f, 0xea, 0x8a, 0xbd, 0x78, 0x00, 0xb4,
+ 0x3c, 0x8f, 0x53, 0x52, 0x3d, 0xc2, 0xfb, 0x11, 0x3d, 0x10, 0x7e, 0x81, 0x3c,
+ 0xae, 0xf3, 0x3e, 0x3d, 0x34, 0x8d, 0xeb, 0x3c, 0x72, 0x86, 0xd6, 0xbc, 0xd5,
+ 0x02, 0xad, 0x3b, 0x9d, 0x1c, 0x41, 0xbd, 0xda, 0x6b, 0x23, 0x3d, 0xaf, 0xa0,
+ 0x2b, 0x3d, 0x91, 0xd9, 0x5c, 0x3d, 0xce, 0x13, 0x4c, 0xbd, 0xa8, 0x7a, 0x4a,
+ 0x3d, 0xfd, 0xc5, 0x29, 0xbd, 0xff, 0xa6, 0x50, 0xbd, 0x9d, 0x04, 0x43, 0x3d,
+ 0x49, 0x9f, 0x82, 0xbd, 0xe0, 0x8c, 0x87, 0xbd, 0xb7, 0xb5, 0x64, 0xbd, 0x5e,
+ 0x55, 0x27, 0x3d, 0x8d, 0xde, 0x41, 0x3d, 0x19, 0x6b, 0x23, 0xbc, 0x6f, 0x71,
+ 0xf6, 0x3c, 0x04, 0x56, 0x24, 0x3d, 0xb8, 0x20, 0x3a, 0x3c, 0x97, 0xb4, 0x91,
+ 0xbd, 0x87, 0xf5, 0x6d, 0x3d, 0x80, 0x5b, 0x9d, 0x3c, 0x70, 0x4c, 0xad, 0x3b,
+ 0xff, 0x49, 0x81, 0x3d, 0x88, 0x14, 0x89, 0xbc, 0x72, 0xde, 0x25, 0xbd, 0x62,
+ 0xa9, 0x21, 0x3d, 0x94, 0x43, 0x59, 0xbc, 0xb1, 0x5a, 0x92, 0x3d, 0x9d, 0x57,
+ 0x6b, 0x3c, 0x5d, 0xa8, 0x8d, 0x3d, 0xd7, 0xf7, 0x08, 0x3d, 0x1c, 0x07, 0xe3,
+ 0xbc, 0xdd, 0xfc, 0xb5, 0xbc, 0xbc, 0xca, 0x84, 0x3d, 0x5c, 0x9e, 0x18, 0xbd,
+ 0xd5, 0x6d, 0x86, 0x3d, 0x42, 0x2b, 0x58, 0x3c, 0x0a, 0xc6, 0x33, 0x3d, 0x2c,
+ 0x1e, 0xf6, 0xbc, 0xb8, 0x48, 0x46, 0xbd, 0x26, 0xd6, 0x88, 0xbd, 0xd8, 0x45,
+ 0x2e, 0x3d, 0x7f, 0x28, 0x4f, 0x3d, 0x52, 0x42, 0x40, 0xbc, 0xad, 0xc8, 0x45,
+ 0xbd, 0xaa, 0x1c, 0x27, 0xbd, 0x32, 0x83, 0x72, 0xbb, 0xd2, 0xc5, 0x33, 0x3b,
+ 0x1e, 0x2f, 0x6f, 0x3d, 0x9e, 0x5c, 0x1c, 0x3d, 0x2d, 0xfb, 0xc5, 0xbc, 0x3d,
+ 0x12, 0x68, 0x3b, 0xb4, 0x98, 0xe9, 0x3c, 0xb9, 0xbd, 0xdf, 0x3a, 0xe0, 0xac,
+ 0x2c, 0x3d, 0x10, 0x5c, 0x87, 0x3c, 0x80, 0xd6, 0x2d, 0xba, 0x18, 0x73, 0x94,
+ 0x3c, 0xb8, 0x3c, 0x39, 0xbc, 0x48, 0x64, 0xda, 0x3c, 0x54, 0xdf, 0x05, 0x3d,
+ 0x04, 0x35, 0xdf, 0x3c, 0xdb, 0xf8, 0xfb, 0xba, 0xc3, 0x2d, 0xc1, 0xb8, 0x0e,
+ 0x8c, 0xd1, 0x3c, 0x4f, 0x12, 0x14, 0x3d, 0x50, 0xbc, 0x7d, 0xbc, 0xc7, 0x20,
+ 0x88, 0xbd, 0x79, 0x45, 0x2f, 0xbd, 0x77, 0x83, 0x55, 0xbc, 0x42, 0x7e, 0x95,
+ 0xbd, 0x9d, 0xfb, 0x4d, 0xbd, 0x92, 0xcc, 0x89, 0xbd, 0x84, 0x1d, 0x03, 0xbd,
+ 0x1f, 0xe1, 0x86, 0xbb, 0xca, 0xee, 0x4e, 0x3c, 0x15, 0x39, 0x55, 0xbd, 0x94,
+ 0x4b, 0x87, 0xbd, 0xf3, 0xf0, 0x0d, 0xbd, 0x4d, 0x17, 0x7b, 0x3d, 0xe5, 0x0b,
+ 0x95, 0xbc, 0x10, 0x50, 0x20, 0xbd, 0x60, 0x74, 0x7c, 0xbd, 0x50, 0x76, 0xad,
+ 0xbc, 0xdd, 0x59, 0x89, 0x3c, 0xa1, 0xcc, 0x10, 0x3d, 0x23, 0x4c, 0x37, 0x3c,
+ 0x50, 0x0e, 0xa6, 0x3c, 0x02, 0x0e, 0x24, 0xbd, 0x9d, 0x9f, 0x40, 0xbd, 0xba,
+ 0xe1, 0x51, 0xbd, 0x9e, 0xe5, 0x2a, 0xbd, 0x44, 0x07, 0xc8, 0x3c, 0xc0, 0x11,
+ 0x85, 0x3c, 0x1c, 0xde, 0x40, 0xbd, 0x34, 0xd3, 0xe3, 0x3c, 0xf1, 0xae, 0xdb,
+ 0xbc, 0xea, 0xbb, 0xf0, 0xbc, 0x32, 0x81, 0xb7, 0x3c, 0x1b, 0xe9, 0x4f, 0xbd,
+ 0x47, 0xd3, 0xb7, 0xbc, 0xc4, 0x4b, 0xe7, 0xbc, 0xf3, 0x52, 0x3b, 0x3d, 0x10,
+ 0xb8, 0xb6, 0x3b, 0x0b, 0xb8, 0x33, 0xbc, 0xb1, 0xba, 0x29, 0x3d, 0x93, 0xfc,
+ 0x00, 0xbd, 0xdf, 0x63, 0x30, 0xbd, 0xac, 0x1d, 0x1e, 0x3d, 0x52, 0xf7, 0x15,
+ 0xbd, 0x7f, 0xea, 0x53, 0xbd, 0x29, 0xe4, 0x2f, 0xbc, 0x5e, 0xf0, 0xb7, 0x3c,
+ 0xb1, 0xff, 0x09, 0xbd, 0xc9, 0x0f, 0xae, 0x3c, 0x5a, 0xc0, 0x06, 0xbd, 0x34,
+ 0x15, 0x10, 0xbd, 0x76, 0xea, 0x95, 0xbc, 0x60, 0xd8, 0x2d, 0x3c, 0x4c, 0x12,
+ 0x77, 0xbc, 0x2d, 0xb6, 0x88, 0x3d, 0x7f, 0x15, 0xe4, 0x3c, 0xb0, 0xef, 0xf0,
+ 0xbc, 0x79, 0x32, 0x1c, 0xbd, 0x4d, 0xbc, 0x4b, 0xbd, 0xae, 0x6d, 0x64, 0x3d,
+ 0x0c, 0x44, 0x82, 0xbc, 0x15, 0x4f, 0x3e, 0xbd, 0x86, 0x54, 0xab, 0xbc, 0x78,
+ 0xea, 0x0d, 0xbd, 0x73, 0xc6, 0x87, 0xbd, 0x06, 0xed, 0x32, 0xbd, 0xfd, 0x03,
+ 0x8a, 0xbd, 0x89, 0x8b, 0x30, 0xbd, 0x40, 0x73, 0x0d, 0xbd, 0xcf, 0x80, 0x84,
+ 0xbd, 0x3c, 0x00, 0x69, 0xbd, 0xeb, 0x8a, 0xf8, 0x3b, 0xc1, 0xa4, 0x93, 0xbd,
+ 0x25, 0x74, 0x69, 0xbd, 0x11, 0xe5, 0x00, 0x3d, 0x2d, 0xa0, 0x01, 0x3d, 0xf9,
+ 0x7d, 0x02, 0xbc, 0x55, 0x26, 0x30, 0x3d, 0xad, 0xf7, 0x50, 0x3c, 0xd6, 0xb1,
+ 0x68, 0x3d, 0xce, 0x49, 0x71, 0xbd, 0xcf, 0xde, 0xaa, 0x3b, 0x5d, 0x6e, 0x91,
+ 0xbd, 0xb4, 0xf1, 0x1a, 0xbd, 0xc7, 0xeb, 0xc2, 0x3c, 0x50, 0x74, 0xd4, 0xbb,
+ 0xe8, 0x25, 0x1f, 0x3d, 0xdb, 0x0a, 0x8e, 0xbc, 0x9d, 0x5d, 0x73, 0xbd, 0x70,
+ 0xce, 0x01, 0xbc, 0xc4, 0x22, 0x84, 0x3d, 0x80, 0x3b, 0x1d, 0x3c, 0x3d, 0xfa,
+ 0x15, 0xbd, 0x45, 0xd7, 0x9a, 0xbd, 0x4d, 0xa2, 0x4e, 0xbd, 0x41, 0x6e, 0x96,
+ 0xbc, 0xbf, 0xe4, 0x6c, 0x3d, 0x90, 0x3c, 0x21, 0x3d, 0x99, 0x76, 0x83, 0x3c,
+ 0xe1, 0xb9, 0x6f, 0x3d, 0x24, 0xb9, 0xcf, 0xbc, 0xc0, 0x33, 0xee, 0xbb, 0x8d,
+ 0xa6, 0xf0, 0xbc, 0x40, 0x81, 0x3f, 0x3d, 0x43, 0x82, 0x7e, 0x3c, 0xfa, 0x13,
+ 0x7a, 0x3d, 0x91, 0xcd, 0x0a, 0xbc, 0x80, 0x3e, 0x61, 0x3d, 0x65, 0xef, 0x56,
+ 0xbd, 0x44, 0x57, 0x90, 0xbd, 0xb4, 0x86, 0x7a, 0x3c, 0x70, 0xf5, 0xbd, 0x3c,
+ 0x90, 0x5c, 0xdc, 0x3c, 0x13, 0xe5, 0xeb, 0xbc, 0x30, 0x7a, 0x48, 0x3d, 0xfa,
+ 0x4c, 0xbe, 0x3c, 0x4d, 0x35, 0x2e, 0xbd, 0x32, 0x33, 0xdb, 0xbc, 0xab, 0x4c,
+ 0x0a, 0xbd, 0x12, 0x58, 0xad, 0xbc, 0x20, 0x07, 0x0c, 0x3c, 0xbc, 0xb5, 0xa6,
+ 0x3c, 0xb6, 0x70, 0x8f, 0xbd, 0xbc, 0x9a, 0x57, 0x3d, 0xb3, 0x6f, 0x82, 0xbd,
+ 0x52, 0xb9, 0x5c, 0x3c, 0x0d, 0x71, 0xd9, 0x3c, 0x18, 0x70, 0x0a, 0x3d, 0x80,
+ 0x7b, 0x0a, 0x3b, 0xee, 0x75, 0x27, 0xbc, 0x63, 0x74, 0x56, 0xbd, 0xf0, 0x20,
+ 0x5f, 0x3b, 0xfb, 0x77, 0x1e, 0xba, 0xb8, 0x6c, 0xee, 0x3c, 0x01, 0xd0, 0xef,
+ 0x3c, 0xb2, 0x68, 0x12, 0xbd, 0x51, 0xf6, 0x3c, 0xbd, 0x12, 0xb0, 0x2e, 0xbd,
+ 0x11, 0xfd, 0x5e, 0xbd, 0x48, 0xea, 0xb4, 0xbc, 0xce, 0xca, 0x88, 0x3d, 0x38,
+ 0x57, 0x40, 0x3d, 0x11, 0xfa, 0x8b, 0x3d, 0xc0, 0x34, 0x36, 0x3d, 0xe4, 0x82,
+ 0x8e, 0xbd, 0xbd, 0x95, 0x59, 0xbd, 0xf0, 0x8b, 0x43, 0xbd, 0x93, 0x9b, 0x0a,
+ 0xbc, 0xb7, 0x99, 0x4d, 0x3c, 0x46, 0x42, 0x1d, 0x3d, 0x00, 0x19, 0x3a, 0xbd,
+ 0x1c, 0xd3, 0x5a, 0xbd, 0xff, 0x09, 0x02, 0xbd, 0xa1, 0x01, 0x8e, 0x3d, 0xc3,
+ 0x9e, 0xd8, 0xbb, 0x28, 0xb5, 0x2d, 0x3d, 0x56, 0x9c, 0x16, 0x3d, 0x78, 0xe6,
+ 0x1e, 0xbc, 0x06, 0x56, 0x14, 0x3d, 0xbc, 0x3f, 0x88, 0xbd, 0x34, 0x45, 0x94,
+ 0xbc, 0xfb, 0xb1, 0x0a, 0xbd, 0x67, 0x87, 0x90, 0xbd, 0x4d, 0x75, 0x27, 0xbd,
+ 0x9f, 0xc8, 0x60, 0x3b, 0x02, 0xc4, 0xb0, 0xbc, 0x54, 0x5b, 0x5f, 0xbd, 0xe3,
+ 0x43, 0xff, 0xbc, 0xf6, 0xf7, 0x39, 0xbc, 0x99, 0x4c, 0x82, 0xbd, 0xda, 0x99,
+ 0xa9, 0x3b, 0x6a, 0xd5, 0xee, 0xbc, 0x1e, 0xc1, 0x93, 0xbd, 0xc2, 0x21, 0x52,
+ 0xbc, 0x52, 0xfc, 0x06, 0xbc, 0x70, 0x59, 0x85, 0xbd, 0x5d, 0xbd, 0x8a, 0xbd,
+ 0xe2, 0x10, 0x77, 0x3d, 0x36, 0x83, 0x90, 0xbd, 0x66, 0x9f, 0x90, 0xbc, 0x30,
+ 0x78, 0x4c, 0x3d, 0xd4, 0x2c, 0x8b, 0x3c, 0xe0, 0x8b, 0x4e, 0xbc, 0x31, 0x0f,
+ 0x80, 0xbd, 0x4a, 0xb7, 0x5b, 0xbd, 0x52, 0xd0, 0x1a, 0xbd, 0x5c, 0x20, 0xe3,
+ 0x3c, 0x5a, 0x77, 0x29, 0xbd, 0x90, 0x0b, 0x00, 0xbd, 0x62, 0x10, 0x4c, 0x3d,
+ 0x40, 0x52, 0x58, 0x3c, 0x18, 0x5e, 0x46, 0x3c, 0xc6, 0x6b, 0x37, 0x3d, 0x17,
+ 0x5c, 0x90, 0x3d, 0x28, 0x6c, 0xfd, 0xbc, 0x7e, 0x4b, 0x28, 0xbd, 0x86, 0x7b,
+ 0x1d, 0xbd, 0x2b, 0x78, 0x83, 0x3d, 0x48, 0x65, 0x53, 0x3d, 0x91, 0x41, 0x7b,
+ 0xbd, 0x0a, 0x32, 0x65, 0xbd, 0x80, 0xb5, 0x83, 0xbd, 0x93, 0x10, 0x8b, 0x3d,
+ 0x40, 0xc2, 0x9b, 0x3a, 0xe8, 0xe9, 0xcc, 0x3c, 0xb8, 0xf5, 0x00, 0x3d, 0x2a,
+ 0x60, 0x70, 0x3d, 0xbb, 0xa9, 0x18, 0xbd, 0xbf, 0xca, 0x76, 0xbd, 0xf4, 0x83,
+ 0xda, 0xbc, 0xcc, 0x89, 0xeb, 0x3c, 0xa0, 0x01, 0x27, 0xbb, 0x90, 0x98, 0x1e,
+ 0x3d, 0x2d, 0x7a, 0x91, 0xbd, 0x00, 0x8e, 0x71, 0xbd, 0xc7, 0x30, 0x1a, 0xbd,
+ 0x22, 0xe9, 0x3d, 0x3d, 0x1a, 0xb3, 0x46, 0x3d, 0xbe, 0x20, 0x5a, 0x3d, 0x02,
+ 0x34, 0x0b, 0xbd, 0x8d, 0x91, 0x5c, 0xbd, 0x84, 0xeb, 0xdc, 0xbc, 0xaa, 0x4b,
+ 0xd6, 0xbc, 0xab, 0xd1, 0x91, 0x3d, 0xb8, 0x2c, 0x95, 0x3c, 0x0c, 0xf7, 0x59,
+ 0x3d, 0xc9, 0xea, 0x8e, 0xbd, 0x23, 0xb1, 0x83, 0xbd, 0x27, 0x20, 0x85, 0xbd,
+ 0x40, 0xdb, 0xaa, 0x3a, 0x4c, 0x7b, 0x48, 0xbc, 0x00, 0x62, 0x9d, 0x3b, 0xaf,
+ 0xeb, 0x83, 0x3d, 0xe0, 0x4e, 0x1d, 0x3b, 0x90, 0xf9, 0xdc, 0xbc, 0xd6, 0x49,
+ 0x60, 0x3d, 0x4e, 0x96, 0x66, 0x3d, 0xbe, 0x9e, 0x9b, 0xbc, 0xec, 0x9e, 0xff,
+ 0x3c, 0xd0, 0xa1, 0x0b, 0x3d, 0xb4, 0x2d, 0x39, 0x3d, 0x28, 0x62, 0x9a, 0x3c,
+ 0xce, 0xdc, 0x67, 0x3d, 0xe8, 0xb6, 0x68, 0x3c, 0xb6, 0x37, 0x87, 0xbd, 0xee,
+ 0xd3, 0x67, 0x3d, 0x18, 0xfb, 0x31, 0x3c, 0x27, 0x89, 0x26, 0xbd, 0x30, 0x9e,
+ 0xc0, 0x3c, 0xd0, 0x5b, 0x30, 0xbd, 0x90, 0x96, 0x33, 0x3c, 0x1e, 0xf8, 0x20,
+ 0xbd, 0x48, 0xa2, 0xa2, 0x3c, 0x2e, 0x6b, 0x3f, 0xbd, 0x32, 0x37, 0x1e, 0x3d,
+ 0x10, 0x9e, 0x26, 0xbd, 0x1c, 0xd5, 0x60, 0xbd, 0xf5, 0x5f, 0x06, 0xbd, 0x87,
+ 0xff, 0x71, 0xbd, 0x1d, 0xba, 0x8c, 0xbd, 0x00, 0xe0, 0x8c, 0xba, 0x20, 0x94,
+ 0x0d, 0xbc, 0x5a, 0x15, 0x84, 0xbc, 0x36, 0x58, 0x50, 0x3d, 0x7a, 0x21, 0x5c,
+ 0x3d, 0x78, 0x57, 0x39, 0xbd, 0x8d, 0x3b, 0x59, 0xbd, 0x90, 0x90, 0x80, 0xbb,
+ 0xf0, 0x93, 0xbe, 0x3b, 0x50, 0x34, 0xe1, 0xbb, 0xc0, 0xac, 0xd3, 0xba, 0x42,
+ 0x75, 0xb4, 0xbc, 0x38, 0xaa, 0x30, 0xbd, 0xa6, 0x79, 0x49, 0x3d, 0xfc, 0xd2,
+ 0x37, 0xbc, 0xe0, 0x0d, 0xd6, 0xbb, 0xc1, 0x2d, 0x73, 0xbd, 0x4a, 0xf1, 0x5b,
+ 0xbd, 0xd4, 0x0c, 0x82, 0x3c, 0xce, 0x51, 0x0c, 0xbd, 0xe0, 0x9c, 0x4e, 0xbd,
+ 0x3e, 0x98, 0x6a, 0x3d, 0x7e, 0xbf, 0x27, 0x3d, 0x00, 0xb2, 0x6f, 0xbd, 0x0c,
+ 0xcd, 0x4d, 0x3d, 0xfa, 0x7b, 0x22, 0x3d, 0x18, 0x3f, 0x02, 0xbc, 0xa4, 0x1a,
+ 0xb7, 0xbc, 0xe2, 0xf5, 0x45, 0x3d, 0xf0, 0x66, 0xe6, 0xbb, 0xd2, 0x56, 0x54,
+ 0x3d, 0x72, 0xff, 0x64, 0x3d, 0x68, 0xbf, 0x41, 0x3d, 0x8c, 0xa8, 0x39, 0xbd,
+ 0x4b, 0x80, 0x88, 0x3d, 0x40, 0x05, 0x8f, 0x3c, 0x9a, 0x58, 0x6b, 0xbd, 0xb6,
+ 0xc7, 0x58, 0xbd, 0x66, 0x73, 0x12, 0x3d, 0x9c, 0x2b, 0x50, 0xbd, 0xc8, 0x47,
+ 0x7d, 0xbc, 0xb7, 0x6a, 0x04, 0xbd, 0xe6, 0x6a, 0x23, 0x3d, 0xdb, 0x11, 0x1f,
+ 0xbd, 0x60, 0x1d, 0x5e, 0xbc, 0x80, 0x70, 0x72, 0xbd, 0x08, 0xed, 0x51, 0x3c,
+ 0xb8, 0x35, 0x0c, 0xbc, 0x2e, 0xef, 0x47, 0x3d, 0xd0, 0xfb, 0xdf, 0x3b, 0xee,
+ 0xea, 0x5c, 0x3d, 0x52, 0xa6, 0x7f, 0x3d, 0x1c, 0xd4, 0x92, 0x3c, 0x0c, 0xe1,
+ 0xe3, 0x3c, 0x0b, 0x0e, 0x8b, 0x3d, 0x1e, 0x6f, 0x20, 0x3d, 0xee, 0xf3, 0x45,
+ 0xbd, 0x28, 0xef, 0xfc, 0x3c, 0x48, 0x19, 0x8c, 0xbd, 0x02, 0x87, 0x7f, 0xbd,
+ 0x6c, 0xc1, 0x4b, 0x3d, 0x30, 0x88, 0x72, 0xbc, 0x00, 0xb2, 0xce, 0x39, 0x68,
+ 0x2f, 0xf1, 0xbc, 0x00, 0xa0, 0x3b, 0xb8, 0x0c, 0x90, 0x7b, 0xbd, 0xd0, 0x97,
+ 0x45, 0xbd, 0xf6, 0xf5, 0x5d, 0x3d, 0x50, 0x0b, 0x0e, 0x3c, 0x48, 0x51, 0xf9,
+ 0x3c, 0xb7, 0xe4, 0x4d, 0xbd, 0xca, 0x8d, 0xcf, 0xbc, 0x49, 0x0d, 0x88, 0xbd,
+ 0xb1, 0x3c, 0x8f, 0x3d, 0xef, 0x72, 0x8a, 0x3d, 0x90, 0x23, 0x02, 0x3d, 0xe8,
+ 0x60, 0x05, 0x3c, 0xc0, 0x9f, 0xb6, 0xba, 0xd5, 0x57, 0x03, 0xbd, 0x22, 0xae,
+ 0x66, 0x3d, 0x61, 0x03, 0x8b, 0xbd, 0xcc, 0x23, 0xea, 0xbc, 0x80, 0x58, 0x4f,
+ 0x3c, 0x60, 0xea, 0xd0, 0x3b, 0xae, 0x19, 0x2e, 0xbd, 0x5e, 0xee, 0xb5, 0xbc,
+ 0x50, 0x19, 0x18, 0x3c, 0x6d, 0xd7, 0x78, 0xbd, 0x40, 0xcb, 0xe9, 0xbc, 0xea,
+ 0x76, 0x53, 0xbd, 0x2c, 0x0e, 0x6b, 0xbc, 0xd8, 0xd6, 0x6a, 0x3c, 0xe0, 0x3d,
+ 0x80, 0xbd, 0x80, 0x36, 0xf1, 0xba, 0x30, 0x30, 0x51, 0x3c, 0x40, 0x41, 0xa3,
+ 0xba, 0xc8, 0xe8, 0x80, 0xbd, 0x72, 0x33, 0x67, 0x3d, 0xdd, 0x7d, 0x0c, 0xbd,
+ 0x1c, 0xcf, 0xbe, 0x3c, 0x8c, 0x1d, 0x8f, 0xbd, 0x4c, 0x5a, 0x3a, 0x3d, 0xa0,
+ 0x35, 0xff, 0x3b, 0x50, 0xb8, 0xea, 0xbb, 0x58, 0x63, 0x26, 0xbc, 0x70, 0x33,
+ 0x0c, 0xbc, 0x58, 0xbb, 0x09, 0xbc, 0x1a, 0xd0, 0xf6, 0xbc, 0x02, 0xb0, 0x08,
+ 0x3d, 0x4c, 0x72, 0xa7, 0x3c, 0x10, 0xa0, 0xa7, 0x3b, 0x7c, 0xab, 0x3f, 0x3d,
+ 0x12, 0x95, 0xc6, 0xbc, 0x58, 0xe5, 0xac, 0xbc, 0x80, 0xbc, 0x56, 0x3b, 0x00,
+ 0xd2, 0xda, 0xbb, 0x26, 0xff, 0xaa, 0xbc, 0xf2, 0xdc, 0x71, 0x3d, 0x30, 0xaf,
+ 0x85, 0xbb, 0x88, 0xf9, 0x14, 0x3d, 0x50, 0x89, 0xc5, 0xbb, 0xc0, 0xd0, 0xf1,
+ 0x3b, 0x95, 0xf2, 0x7b, 0xbd, 0x66, 0x43, 0xfa, 0xbc, 0xa0, 0x68, 0xf3, 0xbb,
+ 0x60, 0xa0, 0xdc, 0x3c, 0x0e, 0x67, 0x6e, 0x3d, 0xdd, 0xec, 0x8a, 0xbd, 0xca,
+ 0x1e, 0x8f, 0xbd, 0x64, 0x84, 0x6c, 0xbd, 0xee, 0x7b, 0x7a, 0xbd, 0xd2, 0xdc,
+ 0x97, 0xbc, 0x84, 0x44, 0x77, 0xbd, 0xf8, 0xec, 0x0e, 0xbd, 0xea, 0x25, 0x03,
+ 0x3d, 0x8e, 0x42, 0x27, 0xbd, 0x31, 0x0b, 0x87, 0x3d, 0xba, 0x5e, 0x31, 0xbd,
+ 0x74, 0xee, 0xa5, 0x3c, 0xb5, 0xa1, 0x83, 0x3d, 0x48, 0x87, 0xad, 0x3c, 0x5c,
+ 0xc4, 0x04, 0xbd, 0xe6, 0xe7, 0x4e, 0x3d, 0x24, 0xa4, 0xb2, 0xbc, 0x02, 0x4a,
+ 0x8d, 0xbd, 0xfa, 0x96, 0x92, 0xbd, 0xf8, 0x1e, 0xaf, 0x3c, 0x80, 0xdb, 0xfe,
+ 0x3a, 0x20, 0x48, 0xff, 0xbb, 0xf2, 0xdd, 0x63, 0x3d, 0x2c, 0x12, 0xaf, 0x3c,
+ 0x8a, 0x05, 0xcf, 0xbc, 0xd8, 0x3a, 0x23, 0x3d, 0x2b, 0x32, 0x89, 0xbd, 0xd0,
+ 0xff, 0x8b, 0x3b, 0x58, 0xd1, 0x13, 0xbd, 0x00, 0xac, 0x96, 0x3a, 0x8a, 0x92,
+ 0x33, 0x3d, 0x1c, 0xdb, 0x2f, 0xbc, 0x8a, 0x30, 0x69, 0xbd, 0x80, 0xcc, 0x7a,
+ 0x3b, 0x88, 0xaa, 0x7b, 0xbd, 0x03, 0xda, 0x8e, 0xbd, 0x10, 0x40, 0xfe, 0x3b,
+ 0x74, 0x92, 0x0b, 0x3d, 0x54, 0x61, 0x7e, 0xbd, 0xdd, 0x2f, 0x75, 0xbd, 0xa8,
+ 0xcd, 0x52, 0x3c, 0x20, 0xf1, 0x57, 0x3d, 0x98, 0x18, 0x05, 0xbc, 0x86, 0x14,
+ 0x3a, 0x3d, 0xf0, 0xa5, 0x94, 0x3b, 0x13, 0xd7, 0x8b, 0x3d, 0xbe, 0x38, 0x1e,
+ 0x3d, 0xe6, 0xa2, 0x8d, 0xbc, 0xc0, 0x39, 0xdf, 0x3c, 0xf8, 0x3f, 0x8b, 0xbd,
+ 0xc9, 0x86, 0x8a, 0x3d, 0x51, 0xa4, 0x6d, 0xbd, 0x7b, 0xe0, 0x82, 0x3d, 0x50,
+ 0x6e, 0x6d, 0x3c, 0xd0, 0x15, 0x60, 0xbd, 0x46, 0xec, 0x06, 0xbd, 0x50, 0x8b,
+ 0x0f, 0x3d, 0x8e, 0x36, 0xab, 0xbc, 0x7f, 0x46, 0x74, 0xbd, 0x4e, 0x2b, 0x63,
+ 0xbd, 0x6e, 0xdf, 0x2c, 0x3d, 0xee, 0x87, 0x60, 0x3d, 0x4e, 0x24, 0x6e, 0xbd,
+ 0x06, 0xbf, 0x7d, 0x3d, 0x40, 0xf6, 0x25, 0x3c, 0xba, 0xea, 0x01, 0x3d, 0x29,
+ 0x4f, 0x8c, 0xbd, 0xf3, 0x02, 0x8b, 0xbd, 0x7c, 0x06, 0x30, 0xbd, 0xda, 0x97,
+ 0x1e, 0x3d, 0xad, 0x89, 0x8b, 0xbd, 0x90, 0x78, 0xd1, 0x3b, 0x2c, 0x75, 0xb5,
+ 0x3c, 0x41, 0x04, 0x40, 0xbd, 0x52, 0x9d, 0x08, 0x3d, 0xf4, 0x53, 0xbf, 0x3c,
+ 0x48, 0x82, 0x16, 0x3c, 0x3a, 0xa1, 0x72, 0x3d, 0xc8, 0x73, 0x32, 0x3d, 0x5a,
+ 0x20, 0x20, 0x3d, 0x08, 0xb1, 0x48, 0x3d, 0x46, 0x6e, 0x73, 0x3d, 0x59, 0x17,
+ 0x0f, 0xbd, 0xb8, 0xa7, 0x01, 0x3c, 0x10, 0x53, 0x46, 0x3c, 0x27, 0xc2, 0x3f,
+ 0xbd, 0x77, 0x6b, 0x91, 0x3d, 0xa8, 0x1c, 0xec, 0x3c, 0xfd, 0x09, 0x92, 0xbd,
+ 0x1c, 0x87, 0x89, 0xbd, 0x60, 0x10, 0xdc, 0xbb, 0x00, 0x40, 0xd1, 0x36, 0x48,
+ 0xb3, 0x28, 0x3c, 0xc8, 0xb3, 0x94, 0x3c, 0xfa, 0x6c, 0x8e, 0xbc, 0x98, 0x5b,
+ 0x68, 0xbc, 0x32, 0xc1, 0x3b, 0x3d, 0xb7, 0xd5, 0x81, 0x3d, 0x48, 0xb6, 0x10,
+ 0x3d, 0x5c, 0x95, 0x58, 0xbd, 0xf6, 0xb9, 0x00, 0xbd, 0xaa, 0xbe, 0x51, 0xbd,
+ 0x2e, 0xbc, 0x70, 0x3d, 0xc8, 0x89, 0x06, 0x3c, 0x00, 0x00, 0x41, 0xb9, 0x31,
+ 0x3e, 0x10, 0xbd, 0xf0, 0x26, 0x14, 0xbc, 0x98, 0xfc, 0xf2, 0x3c, 0xf3, 0x6d,
+ 0x27, 0xbd, 0xd0, 0xdd, 0x2e, 0xbc, 0xee, 0x5b, 0x92, 0xbd, 0xc6, 0x4c, 0x24,
+ 0x3d, 0x3c, 0x5e, 0x01, 0x3d, 0x6a, 0xe6, 0x26, 0xbd, 0x90, 0xd6, 0x1f, 0x3c,
+ 0xbc, 0x88, 0xcd, 0x3c, 0xb0, 0xad, 0xee, 0x3c, 0xd4, 0xc5, 0xdf, 0x3c, 0xa6,
+ 0x0f, 0xe7, 0xbc, 0x51, 0x99, 0x84, 0x3d, 0xc4, 0x84, 0x6a, 0xbc, 0xa8, 0xb6,
+ 0x5c, 0xbc, 0x00, 0xba, 0x3a, 0x39, 0x28, 0x4f, 0x59, 0x3d, 0x80, 0x55, 0x45,
+ 0xba, 0x48, 0x20, 0x84, 0xbc, 0x3f, 0xfd, 0x90, 0x3d, 0x74, 0x17, 0x82, 0xbd,
+ 0x93, 0xd5, 0x26, 0xbd, 0xc0, 0x02, 0xbf, 0xbc, 0x42, 0xdf, 0x24, 0x3d, 0x0e,
+ 0xac, 0xd5, 0xbc, 0x42, 0xcc, 0x7a, 0xbd, 0xd0, 0x21, 0xf6, 0x3b, 0x88, 0x2e,
+ 0x63, 0xbd, 0x08, 0xdd, 0xc4, 0xbc, 0x08, 0xa7, 0x6b, 0x3c, 0x17, 0x07, 0x83,
+ 0xbd, 0x31, 0xfd, 0x81, 0x3d, 0x68, 0xb0, 0x3f, 0x3c, 0xec, 0x78, 0xc0, 0xbc,
+ 0x40, 0x91, 0x3b, 0x3c, 0x80, 0x96, 0xbf, 0x3a, 0x94, 0xed, 0xa7, 0x3c, 0xb0,
+ 0xf7, 0x2a, 0x3c, 0x00, 0x90, 0xc6, 0x37, 0xb4, 0x0d, 0x89, 0xbd, 0xd0, 0x28,
+ 0xb0, 0xbb, 0xf0, 0x65, 0x06, 0x3c, 0xcd, 0xc8, 0x8d, 0x3d, 0x66, 0xa5, 0x6f,
+ 0x3d, 0x36, 0x46, 0x4c, 0x3d, 0x00, 0x80, 0x67, 0x36, 0xaf, 0x78, 0x20, 0xbd,
+ 0xce, 0x83, 0x08, 0x3d, 0x7f, 0x32, 0x84, 0xbd, 0x23, 0x80, 0x8e, 0x3d, 0xb4,
+ 0xa5, 0x56, 0x3d, 0xe4, 0xc2, 0x10, 0xbd, 0xc0, 0xf4, 0xe9, 0xba, 0xa6, 0x4e,
+ 0x6d, 0x3d, 0x04, 0x19, 0xad, 0xbc, 0x0c, 0xf2, 0x38, 0x3d, 0xc6, 0x2c, 0x29,
+ 0xbd, 0xba, 0x51, 0x5c, 0x3d, 0x20, 0x92, 0xae, 0x3c, 0x68, 0x55, 0xf7, 0x3c,
+ 0x40, 0x10, 0x08, 0x3d, 0x86, 0x95, 0x62, 0x3d, 0x36, 0xef, 0x80, 0xbd, 0xd8,
+ 0x21, 0x37, 0xbd, 0x28, 0x37, 0x93, 0xbc, 0x20, 0xb5, 0x35, 0x3b, 0x2f, 0x41,
+ 0x86, 0xbd, 0xf0, 0xf4, 0xfd, 0xbc, 0x3e, 0xa1, 0x8a, 0xbd, 0x38, 0xf3, 0x8f,
+ 0xbd, 0x15, 0xd9, 0x6e, 0xbd, 0xb8, 0xd9, 0x4b, 0x3d, 0x6e, 0x7c, 0x61, 0xbd,
+ 0x00, 0x0e, 0x4d, 0xbb, 0xf8, 0xa5, 0x58, 0xbc, 0x20, 0x15, 0xb6, 0x3b, 0xa0,
+ 0x58, 0x09, 0x3b, 0xed, 0x15, 0x72, 0xbd, 0x00, 0xc6, 0x1a, 0x3a, 0x90, 0xdf,
+ 0x44, 0x3d, 0x70, 0xb4, 0x28, 0xbd, 0x66, 0x55, 0x7d, 0xbd, 0x94, 0x94, 0x84,
+ 0x3c, 0x49, 0xde, 0x32, 0xbd, 0x32, 0x47, 0x13, 0x3d, 0x2e, 0x3b, 0x4a, 0xbd,
+ 0x8a, 0x6d, 0x53, 0xbd, 0x88, 0x9e, 0x8b, 0xbc, 0xfe, 0x9b, 0xd0, 0xbc, 0xf0,
+ 0xb2, 0x16, 0x3c, 0x8c, 0x8a, 0x85, 0x3c, 0xd5, 0x73, 0x8b, 0xbd, 0xd6, 0xd6,
+ 0x02, 0xbd, 0x70, 0x96, 0x22, 0x3d, 0x8a, 0x4b, 0x1c, 0x3d, 0x80, 0x91, 0xeb,
+ 0x3a, 0x80, 0x29, 0x95, 0x3c, 0x71, 0xf1, 0x8d, 0x3d, 0x3e, 0x5e, 0x5e, 0xbd,
+ 0xd2, 0x53, 0x63, 0x3d, 0x0b, 0xcb, 0x8d, 0xbd, 0x58, 0x76, 0x5f, 0xbc, 0xc2,
+ 0xe8, 0x02, 0x3d, 0x9c, 0x96, 0x99, 0x3c, 0xbc, 0xe8, 0x96, 0x3c, 0xff, 0x05,
+ 0x45, 0xbd, 0x48, 0xa6, 0x02, 0x3d, 0x83, 0x34, 0x87, 0xbd, 0xe4, 0x9a, 0x47,
+ 0x3d, 0xd8, 0x5f, 0xc5, 0x3c, 0x0c, 0x1c, 0xee, 0xbc, 0x3e, 0x65, 0x46, 0x3d,
+ 0xe5, 0xd2, 0x10, 0xbd, 0x00, 0x98, 0x9a, 0xbb, 0x06, 0x89, 0x8d, 0xbc, 0xb8,
+ 0x08, 0xc5, 0xbc, 0x9e, 0xeb, 0xbd, 0xbc, 0x98, 0x4b, 0x78, 0xbd, 0x7d, 0x8a,
+ 0x7d, 0xbd, 0x00, 0x70, 0xf6, 0x39, 0xe0, 0x0c, 0xba, 0x3b, 0xa2, 0xf4, 0xdf,
+ 0xbc, 0xca, 0x61, 0x79, 0xbd, 0x44, 0x6f, 0xa3, 0xbc, 0x3c, 0x56, 0xe1, 0x3c,
+ 0x90, 0xfd, 0x3c, 0xbd, 0x71, 0x08, 0x35, 0xbd, 0xde, 0x28, 0x6b, 0xbd, 0xae,
+ 0xe2, 0x36, 0x3d, 0xe7, 0x04, 0x1e, 0xbd, 0x94, 0x0b, 0x1a, 0x3d, 0x3a, 0x8f,
+ 0x26, 0x3d, 0x40, 0xbe, 0x07, 0xbc, 0x10, 0x36, 0x8d, 0xbd, 0x40, 0x7b, 0x06,
+ 0x3b, 0xd8, 0x7b, 0x2c, 0x3d, 0x4f, 0x09, 0x59, 0xbd, 0x28, 0xc9, 0xeb, 0x3c,
+ 0x1c, 0xee, 0x7c, 0xbc, 0xf0, 0x79, 0x19, 0x3c, 0xf8, 0x06, 0x72, 0x3c, 0xe0,
+ 0x83, 0xb5, 0x3b, 0xc8, 0xca, 0x47, 0x3c, 0x88, 0x99, 0x0c, 0x3d, 0xe6, 0x5f,
+ 0xaf, 0xbc, 0x14, 0x1b, 0x4f, 0xbc, 0x13, 0x70, 0x80, 0xbd, 0xdd, 0x13, 0x18,
+ 0xbd, 0x4e, 0xae, 0xe3, 0xbc, 0xaa, 0x98, 0x7d, 0x3d, 0x00, 0xf9, 0x2f, 0x3c,
+ 0xdd, 0xd1, 0x8c, 0x3d, 0x28, 0x5c, 0x3c, 0x3d, 0x90, 0x81, 0x38, 0x3d, 0x3a,
+ 0xf4, 0x5d, 0x3d, 0xc2, 0x24, 0x53, 0x3d, 0x00, 0x34, 0x42, 0xbb, 0x32, 0xc8,
+ 0x78, 0x3d, 0x7a, 0x94, 0xe6, 0xbc, 0x76, 0x8f, 0x80, 0xbc, 0x83, 0xca, 0x8b,
+ 0x3d, 0x62, 0xfb, 0x78, 0x3d, 0xe9, 0x00, 0x90, 0x3d, 0xe8, 0x9b, 0x1c, 0xbd,
+ 0x66, 0xd9, 0x8d, 0xbd, 0xa2, 0xe7, 0x73, 0x3d, 0xd8, 0xb6, 0xb9, 0xbc, 0xa0,
+ 0x55, 0x70, 0x3b, 0x08, 0x5b, 0x00, 0x3c, 0xb4, 0xd0, 0x58, 0xbd, 0xe4, 0x3b,
+ 0x52, 0xbd, 0xb0, 0x22, 0x3d, 0x3d, 0x4a, 0x4f, 0x81, 0xbd, 0x48, 0xf0, 0x6a,
+ 0x3c, 0x61, 0xf4, 0x65, 0xbd, 0x34, 0x4e, 0x00, 0x3d, 0xd1, 0x71, 0x3c, 0xbd,
+ 0x8e, 0x3e, 0x70, 0x3d, 0x55, 0x7a, 0x27, 0xbd, 0x68, 0x22, 0xd5, 0xbc, 0x59,
+ 0x71, 0x90, 0xbd, 0xc8, 0xb0, 0x60, 0x3c, 0x74, 0x5b, 0x36, 0xbd, 0xdc, 0x16,
+ 0xbf, 0x3c, 0x62, 0x7a, 0xe3, 0xbc, 0x00, 0x21, 0x8e, 0xba, 0x1e, 0x0d, 0x08,
+ 0xbd, 0xa3, 0x7a, 0x07, 0xbd, 0xb4, 0x92, 0xee, 0x3c, 0x8d, 0xd2, 0x81, 0x3d,
+ 0x40, 0xc6, 0x98, 0x3c, 0x78, 0xc1, 0x69, 0x3c, 0x36, 0x9a, 0x72, 0x3d, 0xd2,
+ 0xfa, 0xe3, 0xbc, 0x42, 0x4c, 0x0e, 0x3d, 0x97, 0x2c, 0x88, 0x3d, 0x78, 0x6f,
+ 0x13, 0xbc, 0x40, 0x90, 0x7a, 0x3b, 0x66, 0x40, 0x95, 0xbc, 0xb8, 0xe6, 0x33,
+ 0x3d, 0x64, 0x0c, 0xf1, 0x3c, 0xb3, 0xc0, 0x1f, 0xbd, 0x67, 0x03, 0x03, 0xbd,
+ 0xe4, 0x7c, 0xfb, 0x3c, 0x7e, 0x22, 0x0e, 0xbd, 0xd6, 0x60, 0x8d, 0xbd, 0xcc,
+ 0xa2, 0x2c, 0xbd, 0x00, 0xa4, 0xd6, 0x39, 0xf8, 0x7d, 0x8d, 0xbd, 0xe4, 0x27,
+ 0x9a, 0xbc, 0xd8, 0x19, 0x61, 0xbd, 0xb8, 0x49, 0x54, 0xbd, 0x70, 0xcb, 0xd3,
+ 0x3b, 0x49, 0xe1, 0x89, 0x3d, 0x06, 0x6c, 0x78, 0x3d, 0xc0, 0xbe, 0x82, 0x3c,
+ 0x4d, 0x99, 0x8f, 0x3d, 0xd8, 0x0d, 0xe6, 0x3c, 0x4e, 0x2d, 0x60, 0x3d, 0x1c,
+ 0xab, 0x99, 0x3c, 0x66, 0xc6, 0xcc, 0xbc, 0x28, 0x76, 0x0b, 0xbc, 0x7b, 0x6e,
+ 0x90, 0x3d, 0x3b, 0x2f, 0x1c, 0xbd, 0x60, 0x1e, 0x83, 0x3b, 0xc8, 0x88, 0xfd,
+ 0x3c, 0x00, 0x48, 0xa8, 0x3c, 0x40, 0x3d, 0xd4, 0x3b, 0xa4, 0x83, 0xfc, 0x3c,
+ 0x3c, 0xe7, 0xd8, 0x3c, 0xfe, 0xaa, 0x6f, 0x3d, 0xbb, 0x22, 0x90, 0xbd, 0xd6,
+ 0xf5, 0x29, 0x3d, 0x8e, 0x7e, 0x65, 0x3d, 0xae, 0x3b, 0xe4, 0xbc, 0xea, 0x04,
+ 0x54, 0x3d, 0x64, 0x22, 0x1f, 0x3d, 0x24, 0x95, 0x90, 0x3c, 0xcd, 0x7b, 0x21,
+ 0xbd, 0xd0, 0xf8, 0xb9, 0x3b, 0x26, 0xf8, 0x28, 0xbd, 0x6a, 0x37, 0x5b, 0x3d,
+ 0x6e, 0x7e, 0x70, 0x3d, 0xa0, 0x90, 0xec, 0x3c, 0x00, 0x8e, 0x0d, 0xbb, 0xe0,
+ 0xbe, 0x5b, 0xbb, 0x58, 0xf6, 0x9c, 0x3c, 0xbe, 0x59, 0xc0, 0xbc, 0x64, 0x78,
+ 0xa4, 0x3c, 0x79, 0xfb, 0x86, 0x3d, 0x60, 0x6c, 0x85, 0xbc, 0xba, 0x44, 0x18,
+ 0xbd, 0x5e, 0xea, 0x6a, 0xbd, 0x6c, 0xf4, 0x36, 0xbd, 0xee, 0xd4, 0x4c, 0xbd,
+ 0xa2, 0x17, 0x16, 0x3d, 0x98, 0x59, 0xb9, 0x3c, 0x90, 0x41, 0x3d, 0x3c, 0x66,
+ 0x14, 0x06, 0x3d, 0x40, 0xa2, 0x17, 0xbb, 0xdd, 0x83, 0x75, 0xbd, 0x2c, 0x19,
+ 0x8f, 0x3c, 0xfe, 0xde, 0x49, 0xbd, 0x57, 0x3d, 0x85, 0x3d, 0x1c, 0xb3, 0xef,
+ 0xbc, 0x58, 0xdb, 0x3f, 0xbd, 0x0e, 0x38, 0x20, 0x3d, 0x80, 0xbf, 0xa7, 0x3a,
+ 0xf0, 0xe2, 0x91, 0xbd, 0xcc, 0x0f, 0x0a, 0x3d, 0xc7, 0xad, 0x4d, 0xbd, 0x64,
+ 0x33, 0x69, 0xbd, 0xc0, 0xc0, 0xd7, 0xbb, 0xb0, 0x16, 0x83, 0xbd, 0xd0, 0xbf,
+ 0x3c, 0x3d, 0x11, 0x62, 0x87, 0x3d, 0x68, 0x04, 0x0f, 0x3d, 0x6e, 0xee, 0x2a,
+ 0x3d, 0xb8, 0x70, 0x37, 0xbc, 0x62, 0x76, 0x7e, 0x3d, 0x84, 0xbc, 0xa0, 0x3c,
+ 0xc0, 0xc9, 0x26, 0xbd, 0x82, 0x1a, 0x85, 0xbd, 0x80, 0x55, 0x8e, 0xbd, 0xe4,
+ 0xdb, 0x48, 0x3d, 0x60, 0xa5, 0xd6, 0x3b, 0x39, 0x18, 0x92, 0x3d, 0x36, 0x5a,
+ 0x6c, 0xbd, 0xe8, 0x77, 0xcb, 0x3c, 0x48, 0x9e, 0x12, 0x3d, 0x3b, 0x40, 0x91,
+ 0xbd, 0x00, 0xe0, 0xf6, 0x38, 0xd6, 0xa0, 0x2f, 0xbd, 0xe0, 0xe2, 0x0f, 0xbc,
+ 0xf4, 0x85, 0x50, 0x3d, 0x64, 0xf7, 0x9b, 0x3c, 0xdc, 0x72, 0x53, 0x3d, 0x28,
+ 0x0b, 0x45, 0xbc, 0x4e, 0xb5, 0x3f, 0xbd, 0x34, 0x7a, 0xea, 0x3c, 0x58, 0xe1,
+ 0x71, 0x3c, 0x60, 0x5b, 0xf8, 0xbc, 0xf8, 0x3d, 0x52, 0x3c, 0xd0, 0xdc, 0x67,
+ 0xbd, 0xee, 0x2d, 0x0c, 0x3d, 0x70, 0x47, 0xb0, 0x3c, 0x70, 0x7c, 0x29, 0x3d,
+ 0xf4, 0x97, 0xc9, 0x3c, 0x74, 0x63, 0x32, 0x3d, 0x6c, 0x17, 0x94, 0x3c, 0x87,
+ 0xdc, 0x7a, 0xbd, 0xb6, 0xf5, 0x7c, 0x3d, 0x62, 0xd2, 0xe7, 0xbc, 0x99, 0xa5,
+ 0x50, 0xbd, 0x4c, 0xa2, 0xb1, 0xbc, 0xf0, 0x38, 0xdd, 0xbb, 0xac, 0x44, 0x3f,
+ 0xbd, 0x34, 0xb7, 0x06, 0x3d, 0xf6, 0x65, 0x25, 0x3d, 0xdb, 0x01, 0x1e, 0xbd,
+ 0x68, 0xee, 0x19, 0xbc, 0x4c, 0xdd, 0x8a, 0x3c, 0xe0, 0xe4, 0x14, 0xbc, 0x9e,
+ 0x6f, 0x21, 0x3d, 0x18, 0xd1, 0x59, 0x3d, 0x0c, 0xdd, 0xe1, 0xbc, 0x84, 0xa1,
+ 0xe6, 0x3c, 0x5c, 0x56, 0xfa, 0x3c, 0xc4, 0x30, 0x8d, 0x3c, 0x9c, 0xba, 0x12,
+ 0xbd, 0xe0, 0x85, 0xbf, 0xbc, 0x00, 0x1d, 0x62, 0xbb, 0xe4, 0x7a, 0x13, 0x3d,
+ 0x36, 0x6c, 0x07, 0x3d, 0x88, 0xb1, 0x2a, 0x3c, 0x06, 0xba, 0x16, 0xbd, 0x24,
+ 0x12, 0xaf, 0x3c, 0x7c, 0x97, 0x3b, 0xbc, 0xe4, 0x3d, 0x2e, 0xbd, 0x8c, 0x86,
+ 0xa9, 0xbc, 0x6c, 0x70, 0x06, 0x3d, 0x0b, 0x2c, 0x76, 0xbd, 0x72, 0x24, 0xe8,
+ 0xbc, 0x22, 0xeb, 0x70, 0x3d, 0xf0, 0xfb, 0x7b, 0x3c, 0x62, 0x51, 0x08, 0xbd,
+ 0x52, 0x97, 0x88, 0xbd, 0x58, 0x8d, 0x76, 0x3c, 0x3c, 0x79, 0xf1, 0x3c, 0x6c,
+ 0x9b, 0xbd, 0xbc, 0xa4, 0xf4, 0xe9, 0x3c, 0x80, 0x4d, 0x22, 0x3a, 0x78, 0x12,
+ 0x81, 0x3c, 0x9a, 0xc5, 0x4a, 0x3d, 0xfa, 0x9b, 0x4a, 0x3d, 0x0c, 0x20, 0x7f,
+ 0xbd, 0x36, 0x46, 0x06, 0xbd, 0x60, 0x13, 0xbd, 0xbb, 0x8e, 0x08, 0x92, 0xbc,
+ 0xca, 0x25, 0x1c, 0x3d, 0xb2, 0x84, 0x3f, 0x3d, 0x98, 0x3f, 0x47, 0x3d, 0x58,
+ 0x18, 0x4b, 0x3d, 0x60, 0x91, 0x63, 0xbb, 0xa2, 0x5c, 0xea, 0xbc, 0xc4, 0x8e,
+ 0x86, 0x3c, 0x5c, 0x76, 0x91, 0xbd, 0x10, 0xa2, 0x1d, 0xbc, 0xe0, 0xcb, 0xb5,
+ 0xbb, 0x50, 0xd2, 0xe2, 0x3c, 0x98, 0xbd, 0x88, 0xbd, 0x00, 0xd8, 0x0f, 0x39,
+ 0x72, 0x33, 0x20, 0x3d, 0x00, 0x13, 0xbd, 0x39, 0xae, 0xc3, 0xd1, 0xbc, 0xec,
+ 0x7e, 0xb8, 0xbc, 0x78, 0xb4, 0x90, 0xbc, 0xc2, 0x01, 0x68, 0x3d, 0x40, 0x0a,
+ 0x4f, 0xbb, 0xb7, 0xe6, 0x87, 0x3d, 0x35, 0xe8, 0x85, 0x3d, 0x94, 0x2a, 0xe6,
+ 0x3c, 0xd8, 0x5c, 0x69, 0x3c, 0x20, 0x8e, 0xc2, 0xbb, 0x4c, 0xa2, 0x92, 0x3c,
+ 0xd6, 0xc7, 0x73, 0x3d, 0xf8, 0x0c, 0xb8, 0x3c, 0x40, 0x90, 0xb9, 0x3a, 0x2e,
+ 0x2b, 0x31, 0x3d, 0x18, 0xf5, 0x8a, 0x3c, 0x91, 0x95, 0x5b, 0xbd, 0xc0, 0xfa,
+ 0xc8, 0x3a, 0x72, 0xf1, 0xa9, 0xbc, 0x36, 0x77, 0x48, 0xbd, 0x73, 0x0d, 0x6c,
+ 0xbd, 0x70, 0x22, 0xe4, 0xbb, 0x88, 0x5c, 0x28, 0x3d, 0xc6, 0x18, 0x3e, 0x3d,
+ 0x94, 0x3c, 0xd1, 0xbc, 0x7f, 0x43, 0x15, 0xbd, 0xee, 0x0d, 0x9e, 0xbc, 0x62,
+ 0xff, 0x29, 0x3d, 0xf0, 0x56, 0xf2, 0x3b, 0x22, 0x3f, 0x4e, 0x3d, 0xb6, 0x94,
+ 0x39, 0xbd, 0x9e, 0xf1, 0x45, 0xbd, 0x87, 0xdb, 0x85, 0x3d, 0xd8, 0x35, 0x65,
+ 0x3c, 0xcc, 0x13, 0x8a, 0x3c, 0x44, 0x89, 0x64, 0xbc, 0xe6, 0xb5, 0x2a, 0xbd,
+ 0x28, 0x4f, 0x69, 0x3c, 0x36, 0x45, 0x53, 0x3d, 0x3a, 0xd2, 0xfe, 0xbc, 0xce,
+ 0xa8, 0xa2, 0xbc, 0x8a, 0x16, 0x7d, 0xbd, 0xc2, 0xd5, 0xd9, 0xbc, 0xa0, 0x4a,
+ 0x87, 0xbd, 0x9e, 0xc2, 0x2c, 0x3d, 0xfc, 0x3a, 0xaf, 0x3c, 0x9e, 0x10, 0x40,
+ 0xbd, 0xe0, 0x3a, 0x82, 0x3b, 0x0c, 0xe4, 0xfc, 0x3c, 0xd8, 0x07, 0x57, 0xbd,
+ 0xba, 0x34, 0x91, 0xbd, 0xc6, 0x42, 0x51, 0x3d, 0xc0, 0xe9, 0xe1, 0x3b, 0x9c,
+ 0x4a, 0x2a, 0xbc, 0xc6, 0x92, 0x7b, 0x3d, 0x12, 0x9f, 0x59, 0xbd, 0x0c, 0x62,
+ 0xfd, 0xbc, 0x6c, 0x1a, 0xe6, 0x3c, 0x72, 0x2c, 0x4b, 0x3d, 0x7a, 0xa5, 0x3b,
+ 0xbd, 0xfa, 0x37, 0x7b, 0x3d, 0xc0, 0xf0, 0x87, 0xbc, 0x28, 0xd1, 0x5a, 0x3c,
+ 0xd7, 0x35, 0x6b, 0xbd, 0x7e, 0x9c, 0x6f, 0x3d, 0x1a, 0xf6, 0x23, 0xbd, 0x66,
+ 0x3b, 0xa2, 0xbc, 0x00, 0xb5, 0x5d, 0xba, 0xbb, 0xc3, 0x52, 0xbd, 0x24, 0x0d,
+ 0x14, 0x3d, 0x6f, 0x6f, 0x7d, 0xbd, 0x74, 0x88, 0x90, 0xbd, 0xda, 0x8a, 0x68,
+ 0xbd, 0xb4, 0xe0, 0x5f, 0xbc, 0xb8, 0x32, 0x88, 0xbd, 0x13, 0xc0, 0x81, 0x3d,
+ 0x2c, 0x07, 0x2e, 0xbd, 0xd0, 0x8a, 0x8a, 0x3b, 0xe2, 0x9e, 0x8a, 0xbd, 0x60,
+ 0x09, 0x8a, 0x3b, 0xd5, 0x6b, 0x92, 0xbd, 0x90, 0x61, 0x50, 0x3d, 0x62, 0x32,
+ 0x0f, 0xbd, 0x9b, 0x7c, 0x6f, 0xbd, 0x10, 0x7c, 0xa3, 0x3c, 0x80, 0x22, 0xcc,
+ 0xbb, 0x20, 0xc6, 0x3a, 0x3d, 0x40, 0xcb, 0x3f, 0x3b, 0xca, 0xa4, 0xdd, 0xbc,
+ 0xc0, 0x36, 0xbf, 0x3c, 0x40, 0x4f, 0x85, 0x3b, 0x13, 0x52, 0x6c, 0xbd, 0x6b,
+ 0xa9, 0x6f, 0xbd, 0x58, 0x41, 0x5d, 0xbc, 0xa8, 0x0e, 0x82, 0x3c, 0x7c, 0x92,
+ 0xf5, 0x3c, 0xfa, 0xd8, 0x5a, 0xbd, 0xcc, 0x79, 0x54, 0x3d, 0xc4, 0x8f, 0x2a,
+ 0xbc, 0x78, 0xec, 0xdb, 0x3c, 0xf0, 0x95, 0xa9, 0x3b, 0x78, 0x9d, 0xf6, 0xbc,
+ 0x53, 0x59, 0x55, 0xbd, 0x08, 0x4e, 0xca, 0x3c, 0xcc, 0x95, 0xbb, 0x3c, 0xe4,
+ 0x91, 0xb4, 0xbc, 0xfb, 0x9d, 0x86, 0xbd, 0x08, 0x68, 0x3f, 0xbc, 0x5d, 0x1b,
+ 0x84, 0xbd, 0xd0, 0xc8, 0x83, 0x3b, 0x4a, 0x39, 0x54, 0x3d, 0x3c, 0x6e, 0xb6,
+ 0xbc, 0x70, 0xdd, 0x1b, 0x3c, 0xf4, 0xfc, 0x21, 0xbd, 0x68, 0x25, 0x5e, 0x3c,
+ 0x01, 0xfc, 0x8e, 0xbd, 0x60, 0xe5, 0x2a, 0x3b, 0x98, 0x51, 0x23, 0xbc, 0x00,
+ 0xef, 0x0a, 0xba, 0xfc, 0x95, 0x1f, 0xbc, 0xf4, 0x89, 0x55, 0x3d, 0x76, 0x2e,
+ 0x29, 0x3d, 0xdb, 0x02, 0x86, 0x3d, 0x64, 0xaa, 0x31, 0xbc, 0x7c, 0x3a, 0x9c,
+ 0xbc, 0x00, 0xf2, 0x64, 0xbd, 0x86, 0xf3, 0x51, 0xbd, 0xc0, 0x2f, 0x9a, 0x3a,
+ 0xf2, 0xf2, 0xd3, 0xbc, 0x1e, 0x43, 0xcb, 0xbc, 0x6d, 0x44, 0x92, 0x3d, 0x40,
+ 0xc6, 0x90, 0xba, 0xaa, 0xc9, 0x3e, 0xbd, 0x02, 0xc1, 0x5b, 0x3d, 0x66, 0xeb,
+ 0x1e, 0x3d, 0xf2, 0x34, 0x63, 0xbd, 0xea, 0xba, 0x66, 0x3d, 0xee, 0x8c, 0x1a,
+ 0x3d, 0x3b, 0xb9, 0x1e, 0xbd, 0x0a, 0xd2, 0x13, 0x3d, 0xa0, 0xaf, 0x3e, 0x3c,
+ 0xc0, 0x24, 0x83, 0x3c, 0x90, 0x69, 0xf0, 0xbb, 0x1f, 0x73, 0x86, 0x3d, 0x9d,
+ 0x21, 0x77, 0xbd, 0x45, 0x4f, 0x8c, 0x3d, 0x40, 0x6d, 0xfe, 0x3c, 0xcb, 0xa5,
+ 0x8d, 0xbd, 0x00, 0x8d, 0xe5, 0x39, 0x56, 0x9b, 0x55, 0x3d, 0x26, 0x49, 0x5a,
+ 0xbd, 0x66, 0x93, 0x7a, 0x3d, 0x80, 0x29, 0x4f, 0xba, 0xff, 0xff, 0x82, 0xbd,
+ 0x50, 0xf9, 0x65, 0x3c, 0x28, 0xa6, 0xb5, 0xbc, 0xdf, 0x70, 0x54, 0xbd, 0x17,
+ 0xd1, 0x8e, 0xbd, 0x00, 0x3a, 0xb9, 0x3b, 0x26, 0x45, 0x86, 0xbc, 0xad, 0x85,
+ 0x33, 0xbd, 0x94, 0x78, 0x32, 0x3d, 0x70, 0xcb, 0xa1, 0x3b, 0x40, 0xe5, 0x21,
+ 0x3d, 0x32, 0xd5, 0xc2, 0xbc, 0xf8, 0x3d, 0x27, 0x3d, 0x28, 0xc0, 0x39, 0xbc,
+ 0xac, 0xc8, 0x7a, 0xbc, 0xe6, 0xc2, 0xd4, 0xbc, 0x91, 0x81, 0x5c, 0xbd, 0xe1,
+ 0x6a, 0x90, 0xbd, 0xa9, 0xc8, 0x1d, 0xbd, 0x00, 0x94, 0xcb, 0xb9, 0xe0, 0x0d,
+ 0x31, 0x3c, 0x00, 0x2a, 0xbe, 0xbb, 0x9a, 0x1e, 0x2a, 0xbd, 0x06, 0xef, 0x7f,
+ 0x3d, 0xc0, 0xcc, 0x0d, 0x3c, 0xd6, 0x50, 0x74, 0xbd, 0x10, 0x24, 0xcd, 0x3b,
+ 0x22, 0x4f, 0x0c, 0xbd, 0xc8, 0xf2, 0xaa, 0x3c, 0x9e, 0x84, 0xc8, 0xbc, 0x80,
+ 0xf2, 0x4e, 0x3c, 0x0c, 0x38, 0x77, 0xbd, 0x6c, 0xab, 0x63, 0xbd, 0xb7, 0x31,
+ 0x11, 0xbd, 0x25, 0x39, 0x84, 0x3d, 0x31, 0x0b, 0x91, 0x3d, 0xe3, 0x1d, 0x08,
+ 0xbd, 0x92, 0xb6, 0x1b, 0xbd, 0x65, 0xca, 0x88, 0x3d, 0x1c, 0x62, 0x2c, 0xbd,
+ 0xda, 0x7b, 0x73, 0x3d, 0xff, 0xbb, 0x85, 0xbd, 0xc4, 0xc7, 0x51, 0x3d, 0x98,
+ 0xd2, 0x6f, 0xbd, 0x70, 0xa4, 0xe9, 0x3c, 0x74, 0x65, 0xd7, 0x3c, 0x18, 0xdd,
+ 0x5e, 0x3c, 0x78, 0x1d, 0x04, 0x3d, 0x2c, 0xef, 0x43, 0xbd, 0x48, 0x7d, 0x5e,
+ 0xbd, 0xd6, 0x02, 0x9f, 0xbc, 0x80, 0x29, 0xa1, 0x3c, 0x70, 0x64, 0x54, 0x3d,
+ 0x3e, 0xe0, 0x50, 0x3d, 0xd3, 0x7d, 0x2e, 0xbd, 0x64, 0xdf, 0x55, 0xbd, 0x72,
+ 0x47, 0x8c, 0xbd, 0xfb, 0x45, 0x12, 0xbd, 0xd6, 0x49, 0x9d, 0xbc, 0xca, 0xd5,
+ 0x67, 0x3d, 0x50, 0xb9, 0xf4, 0x3c, 0x93, 0xca, 0x1f, 0xbd, 0xa7, 0xe1, 0x8f,
+ 0xbd, 0xcc, 0x00, 0x52, 0x3d, 0x07, 0xd3, 0x20, 0xbd, 0xd0, 0x26, 0x82, 0xbc,
+ 0x2a, 0x6e, 0x69, 0x3d, 0x0c, 0x67, 0x70, 0xbd, 0xaa, 0x35, 0xe9, 0xbc, 0xae,
+ 0x97, 0xba, 0xbc, 0xea, 0x69, 0x3d, 0xbd, 0x28, 0xa0, 0x6f, 0xbc, 0x2a, 0x6a,
+ 0x67, 0x3d, 0x50, 0xd0, 0x6e, 0x3c, 0x16, 0x90, 0x06, 0x3d, 0x4a, 0xdf, 0x3f,
+ 0x3d, 0xa0, 0x4e, 0x07, 0x3d, 0x48, 0x0d, 0x55, 0xbd, 0x50, 0x0b, 0xc6, 0xbc,
+ 0xc4, 0xf3, 0x47, 0xbd, 0x90, 0x09, 0xb3, 0xbb, 0x20, 0xe9, 0x7f, 0xbd, 0xbf,
+ 0x2e, 0x86, 0xbd, 0xba, 0xcf, 0x74, 0x3d, 0x86, 0xd8, 0xf6, 0xbc, 0x20, 0x65,
+ 0x57, 0x3d, 0x82, 0xc5, 0x50, 0xbd, 0xac, 0x70, 0x41, 0x3d, 0x0e, 0xb0, 0x40,
+ 0xbd, 0x4c, 0x30, 0x39, 0xbd, 0x80, 0xa0, 0xe5, 0x3c, 0x20, 0xc2, 0x86, 0xbb,
+ 0xb8, 0x3d, 0x8c, 0x3c, 0xdf, 0x7e, 0x5f, 0xbd, 0xe0, 0xfd, 0x37, 0x3b, 0x0b,
+ 0x70, 0x15, 0xbd, 0x00, 0xc1, 0x97, 0xba, 0x9a, 0x38, 0x56, 0xbd, 0x32, 0x67,
+ 0xdb, 0xbc, 0x4a, 0x22, 0x38, 0x3d, 0x12, 0x1c, 0x7f, 0x3d, 0x88, 0x38, 0xee,
+ 0x3c, 0x0a, 0x76, 0x61, 0x3d, 0x6d, 0xd7, 0x0a, 0xbd, 0xba, 0xb0, 0x3c, 0x3d,
+ 0x28, 0xbe, 0x91, 0xbc, 0xa8, 0x3e, 0x0b, 0x3c, 0x54, 0x53, 0xb7, 0x3c, 0x50,
+ 0x41, 0x57, 0x3c, 0xb4, 0x5d, 0x9b, 0x3c, 0x04, 0xb9, 0x18, 0xbd, 0xa8, 0xd5,
+ 0x9c, 0xbc, 0x7c, 0x5f, 0x15, 0xbd, 0x64, 0xf3, 0x0d, 0x3d, 0x17, 0x85, 0x90,
+ 0x3d, 0x5d, 0xf4, 0x51, 0xbd, 0x97, 0x93, 0x30, 0xbd, 0x40, 0x65, 0xe6, 0xbb,
+ 0x20, 0xa7, 0xc3, 0x3c, 0x10, 0xb1, 0x90, 0x3c, 0xc8, 0x2f, 0x36, 0x3c, 0x6b,
+ 0x38, 0x8e, 0xbd, 0xd6, 0x6c, 0x62, 0x3d, 0x94, 0x52, 0x4b, 0xbd, 0x48, 0xe5,
+ 0x15, 0x3d, 0x48, 0x7a, 0x3f, 0x3d, 0x60, 0xb0, 0xdf, 0xbb, 0xc2, 0x53, 0x05,
+ 0xbd, 0xc0, 0xaa, 0x94, 0x3a, 0xf2, 0xef, 0x68, 0xbd, 0xb0, 0x4d, 0x46, 0xbc,
+ 0xa0, 0xdc, 0x0e, 0x3b, 0x9c, 0x99, 0x5d, 0xbd, 0xd0, 0x37, 0x63, 0xbd, 0x61,
+ 0x02, 0x03, 0xbd, 0x80, 0x26, 0x51, 0x3a, 0xa0, 0xab, 0xb5, 0xbb, 0x65, 0x1e,
+ 0x8d, 0x3d, 0xa0, 0x46, 0xc6, 0x3c, 0x00, 0x48, 0xa3, 0x3c, 0x4d, 0xdf, 0x84,
+ 0x3d, 0x1c, 0xf1, 0x34, 0xbd, 0x1a, 0xb0, 0x00, 0x3d, 0x86, 0x6e, 0x5a, 0x3d,
+ 0x02, 0xfe, 0x8b, 0xbd, 0x0e, 0x96, 0x32, 0x3d, 0xe6, 0x1e, 0x91, 0xbc, 0x8a,
+ 0xe9, 0x6b, 0xbd, 0x4c, 0x53, 0x38, 0x3d, 0x39, 0xf5, 0x90, 0xbd, 0x66, 0x81,
+ 0x7e, 0x3d, 0xec, 0x33, 0xaa, 0xbc, 0x3e, 0xc4, 0x5c, 0x3d, 0xd8, 0x19, 0x87,
+ 0xbc, 0x70, 0xd6, 0x52, 0x3d, 0x00, 0x6a, 0xab, 0x3a, 0xda, 0x41, 0x81, 0xbc,
+ 0xf0, 0xbd, 0xe3, 0x3c, 0x38, 0x66, 0x1e, 0x3c, 0x62, 0x7d, 0x8e, 0xbd, 0xa5,
+ 0x2a, 0x15, 0xbd, 0xf6, 0x6a, 0x72, 0x3d, 0x72, 0x22, 0x33, 0x3d, 0x8c, 0xb7,
+ 0x8e, 0xbd, 0xe2, 0xf8, 0x6a, 0xbd, 0x01, 0x40, 0x35, 0xbd, 0xb3, 0xe4, 0x79,
+ 0xbd, 0xdc, 0xb4, 0x65, 0xbc, 0x3d, 0x74, 0x91, 0x3d, 0x94, 0x0a, 0xe8, 0x3c,
+ 0x16, 0x25, 0x57, 0xbd, 0xd6, 0x05, 0x0b, 0x3d, 0x16, 0x2b, 0x5f, 0x3d, 0x38,
+ 0x59, 0xcd, 0xbc, 0x8c, 0x9f, 0x0e, 0x3d, 0xac, 0x67, 0x9c, 0x3c, 0x00, 0xe1,
+ 0xb3, 0x39, 0x1c, 0x2e, 0xf8, 0x3c, 0xed, 0xfd, 0x80, 0x3d, 0xc6, 0x8b, 0x2b,
+ 0xbd, 0x08, 0x4d, 0xe0, 0x3c, 0xff, 0x55, 0x85, 0x3d, 0x3c, 0xd0, 0xe9, 0x3c,
+ 0x30, 0x7c, 0x79, 0x3c, 0xd0, 0xf7, 0x8c, 0x3b, 0x82, 0xe9, 0x7d, 0xbd, 0x54,
+ 0x3f, 0x46, 0x3d, 0xb8, 0x88, 0xc0, 0x3c, 0xc8, 0xf4, 0x35, 0xbc, 0xe9, 0x19,
+ 0x85, 0x3d, 0x01, 0x5f, 0x62, 0xbd, 0xea, 0x7f, 0x0f, 0x3d, 0xf8, 0x73, 0x42,
+ 0xbd, 0x41, 0x97, 0x8f, 0x3d, 0x13, 0xec, 0x80, 0x3d, 0xe7, 0xa8, 0x40, 0xbd,
+ 0x08, 0x47, 0x4b, 0x3c, 0x80, 0xce, 0x77, 0xbc, 0xb6, 0x2d, 0x4f, 0xbd, 0xe0,
+ 0xa7, 0x0b, 0x3b, 0xda, 0xb6, 0x76, 0x3d, 0xc8, 0xce, 0x14, 0x3c, 0xe0, 0xbf,
+ 0x20, 0xbb, 0x10, 0xa1, 0x94, 0x3b, 0x02, 0x4e, 0x3f, 0x3d, 0xa0, 0xe9, 0x0c,
+ 0xbc, 0x6a, 0x57, 0x2b, 0xbd, 0x22, 0x09, 0x1d, 0xbd, 0xa8, 0xa6, 0x4c, 0x3c,
+ 0x21, 0x7d, 0x40, 0xbd, 0x91, 0xdf, 0x87, 0x3d, 0x65, 0xe4, 0x05, 0xbd, 0xdc,
+ 0xd6, 0x84, 0xbd, 0x22, 0x49, 0x79, 0x3d, 0xf4, 0xf7, 0x40, 0xbc, 0x2c, 0x16,
+ 0x86, 0xbc, 0xa8, 0x26, 0x40, 0x3d, 0xaa, 0x89, 0xa9, 0xbc, 0xc4, 0x74, 0xc5,
+ 0xbc, 0x3c, 0x76, 0x83, 0xbc, 0x2b, 0xf7, 0x90, 0x3d, 0xa8, 0x0c, 0x6f, 0xbc,
+ 0xdc, 0x96, 0x2c, 0x3d, 0xe0, 0x71, 0x88, 0x3c, 0x66, 0x9f, 0x2a, 0xbd, 0xf1,
+ 0x10, 0x82, 0x3d, 0x41, 0x73, 0x41, 0xbd, 0x7e, 0x2c, 0x21, 0xbd, 0xf0, 0xea,
+ 0x08, 0x3c, 0x54, 0xb4, 0x2a, 0xbc, 0xf6, 0xf5, 0x64, 0xbd, 0x46, 0xf9, 0x2a,
+ 0xbd, 0x54, 0xa4, 0x29, 0x3d, 0x1e, 0x79, 0xee, 0xbc, 0xf5, 0x8b, 0x83, 0x3d,
+ 0x30, 0x04, 0x10, 0x3d, 0x14, 0x83, 0x4e, 0x3d, 0x67, 0x9f, 0x62, 0xbd, 0x00,
+ 0x01, 0x10, 0xbd, 0x96, 0xc8, 0x2c, 0x3d, 0x3f, 0x58, 0x8e, 0x3d, 0x34, 0xeb,
+ 0xe1, 0x3c, 0x12, 0x5d, 0x87, 0xbc, 0x0b, 0x23, 0x80, 0x3d, 0x0a, 0x55, 0x81,
+ 0xbd, 0xc2, 0x80, 0x16, 0xbd, 0x58, 0xa6, 0x7a, 0x3c, 0xec, 0x9a, 0xf1, 0x3c,
+ 0xf0, 0x0e, 0xaa, 0x3c, 0xe2, 0x06, 0x9a, 0xbc, 0x20, 0x57, 0xec, 0xbb, 0xe8,
+ 0x5b, 0xc6, 0x3c, 0x40, 0x51, 0x3b, 0x3c, 0x47, 0xf6, 0x8e, 0x3d, 0x6e, 0xc5,
+ 0x06, 0xbd, 0xac, 0xf6, 0x2b, 0x3d, 0xec, 0x29, 0x05, 0x3d, 0x76, 0xd9, 0x2e,
+ 0x3d, 0x7c, 0x02, 0x40, 0xbc, 0x5e, 0x98, 0x8b, 0xbc, 0x20, 0xf8, 0x8b, 0x3c,
+ 0xcc, 0x04, 0x59, 0xbc, 0xd7, 0xfe, 0x8a, 0x3d, 0xda, 0xed, 0x1a, 0xbd, 0x82,
+ 0x45, 0x9b, 0xbc, 0xfc, 0xa0, 0x7b, 0xbc, 0x14, 0x19, 0x0a, 0x3d, 0x7c, 0x3a,
+ 0x7d, 0xbd, 0x46, 0x32, 0x91, 0xbd, 0xc0, 0xea, 0x8b, 0x3c, 0x0e, 0x44, 0x78,
+ 0x3d, 0x96, 0x53, 0x2a, 0x3d, 0x3a, 0xbb, 0x79, 0x3d, 0x1f, 0xe3, 0x19, 0xbd,
+ 0x56, 0xbb, 0x67, 0x3d, 0x44, 0x48, 0x86, 0x3c, 0x33, 0x5f, 0x8e, 0xbd, 0xc0,
+ 0x86, 0x8c, 0xbc, 0xb0, 0x2a, 0x8e, 0x3b, 0x20, 0xd2, 0x8f, 0xbd, 0x16, 0x08,
+ 0x67, 0x3d, 0x4a, 0xc7, 0x67, 0x3d, 0x50, 0x7c, 0xfd, 0xbc, 0xb0, 0xc1, 0x3f,
+ 0xbd, 0xc0, 0x77, 0xde, 0x3b, 0x98, 0x6b, 0x98, 0xbc, 0x10, 0x91, 0xa0, 0x3b,
+ 0x80, 0x9a, 0xed, 0x3c, 0xdd, 0xc9, 0x82, 0x3d, 0x2c, 0x20, 0x4d, 0x3d, 0x05,
+ 0xe9, 0x78, 0xbd, 0x44, 0xae, 0xcd, 0x3c, 0xd8, 0x92, 0x81, 0x3c, 0x57, 0xa3,
+ 0x77, 0xbd, 0xbe, 0x2e, 0x65, 0xbd, 0x74, 0xfc, 0x41, 0x3d, 0xa2, 0x99, 0x7b,
+ 0x3d, 0xe0, 0x55, 0x98, 0x3b, 0xe4, 0xdf, 0xa5, 0x3c, 0xcf, 0x0c, 0x16, 0xbd,
+ 0x68, 0x3f, 0x78, 0xbd, 0xbe, 0xe3, 0x4e, 0x3d, 0xf4, 0x7f, 0x4a, 0x3d, 0xaa,
+ 0x64, 0x3b, 0xbd, 0xa7, 0xe7, 0x83, 0xbd, 0xe0, 0x45, 0x60, 0x3b, 0x41, 0x1e,
+ 0x0c, 0xbd, 0x14, 0xa6, 0x90, 0xbd, 0x71, 0x37, 0x5f, 0xbd, 0x72, 0x90, 0xb8,
+ 0xbc, 0xc6, 0x6e, 0x3b, 0xbd, 0x4d, 0x5e, 0xe0, 0xbc, 0x40, 0x74, 0x5b, 0xbb,
+ 0xb2, 0x61, 0x06, 0x3d, 0xc8, 0xd6, 0xc1, 0x3c, 0xa9, 0x80, 0x85, 0xbd, 0x76,
+ 0xe9, 0x20, 0x3d, 0x1a, 0xcc, 0x80, 0x3d, 0x39, 0x17, 0xdf, 0xbc, 0xe1, 0x45,
+ 0x8c, 0x3c, 0x67, 0x35, 0x48, 0x3d, 0x9d, 0x17, 0x76, 0xbd, 0x38, 0xa6, 0xb2,
+ 0xba, 0xad, 0x55, 0xaf, 0x3c, 0xf4, 0x50, 0x5e, 0x3d, 0x02, 0x7b, 0xd9, 0xba,
+ 0x0a, 0x74, 0x0f, 0xbd, 0xa9, 0x69, 0x54, 0x3d, 0x3e, 0xa8, 0x6c, 0x3d, 0xcc,
+ 0xde, 0x27, 0xbd, 0x4f, 0x51, 0xa7, 0xbb, 0xbf, 0x78, 0x26, 0xbd, 0x66, 0xcc,
+ 0x84, 0xbd, 0xce, 0x30, 0xcd, 0xbc, 0xab, 0x28, 0x60, 0x3d, 0x97, 0xdb, 0x31,
+ 0xbd, 0x6f, 0x6f, 0xc3, 0x3b, 0xe0, 0x7e, 0x8c, 0xbd, 0x06, 0xe2, 0xc0, 0xbc,
+ 0xce, 0x5b, 0x7a, 0xbd, 0xa5, 0xfb, 0xe1, 0xbc, 0xbd, 0x3b, 0x44, 0xbd, 0x90,
+ 0xa1, 0xbd, 0x3b, 0xc9, 0xba, 0x34, 0xbc, 0x5f, 0xab, 0x08, 0xbd, 0xf8, 0x5a,
+ 0x5f, 0x3c, 0x23, 0xbe, 0x8c, 0x3d, 0xbc, 0x19, 0xad, 0xbc, 0xb1, 0xd8, 0x19,
+ 0xbd, 0x33, 0x7a, 0x85, 0x3d, 0xa5, 0x19, 0xc7, 0x3b, 0x83, 0x55, 0x83, 0xbc,
+ 0x9d, 0x63, 0x08, 0x3d, 0x36, 0x98, 0x1c, 0x3d, 0x20, 0x2d, 0x2d, 0xbc, 0x6b,
+ 0xc3, 0x68, 0xbd, 0xbc, 0x22, 0xb6, 0x3c, 0x93, 0xdb, 0xc0, 0x3a, 0x88, 0x17,
+ 0xdf, 0x3c, 0x0d, 0x0d, 0x2c, 0xbd, 0xc0, 0x40, 0x60, 0x3b, 0xea, 0xf9, 0x3f,
+ 0xbd, 0x0d, 0xd7, 0x03, 0xbd, 0x45, 0x08, 0x68, 0xbd, 0xb3, 0xa4, 0xe9, 0xbc,
+ 0xfd, 0xe9, 0x5f, 0x3d, 0x4c, 0x45, 0x0c, 0x3d, 0xff, 0xdb, 0xa3, 0xbc, 0x12,
+ 0x16, 0x88, 0xbd, 0x70, 0x42, 0xe5, 0xbc, 0x60, 0xda, 0x1c, 0x3c, 0x2b, 0x55,
+ 0xf8, 0x3b, 0x07, 0x82, 0x87, 0x3c, 0x08, 0x94, 0x83, 0xbd, 0x66, 0xf3, 0x44,
+ 0x3d, 0x0b, 0xed, 0x10, 0x3c, 0x1b, 0x7e, 0x8f, 0xbd, 0xbe, 0x4c, 0xb5, 0xbc,
+ 0xc4, 0x84, 0x26, 0x3d, 0x80, 0x5f, 0x6a, 0xbc, 0xb8, 0x41, 0x29, 0x3d, 0xfa,
+ 0xbc, 0x4a, 0x3d, 0xbe, 0x44, 0x47, 0xbc, 0xc1, 0x9b, 0x21, 0x3d, 0x33, 0xb8,
+ 0xd7, 0xbc, 0x54, 0xe6, 0x53, 0x3d, 0xd8, 0x95, 0x3d, 0xbd, 0x2b, 0x4d, 0x90,
+ 0x3d, 0x0c, 0x3c, 0x3a, 0xbc, 0x6c, 0x41, 0x24, 0xbd, 0x31, 0xfd, 0x66, 0xbd,
+ 0x43, 0x29, 0x4a, 0x3d, 0x00, 0x8d, 0xc3, 0xb9, 0x20, 0xd6, 0xe2, 0xbb, 0xb7,
+ 0xf6, 0x22, 0xbd, 0xe9, 0xd7, 0x3f, 0x3d, 0x8d, 0xb7, 0xf7, 0x3c, 0x2b, 0x56,
+ 0x8b, 0x3d, 0xa6, 0xa7, 0x70, 0xbd, 0xdf, 0x62, 0x56, 0x3d, 0xe9, 0x4b, 0xb0,
+ 0x3c, 0x40, 0xb6, 0x04, 0x3c, 0x34, 0x8c, 0x04, 0xbd, 0xb9, 0x1a, 0x1b, 0x3d,
+ 0x25, 0xbc, 0x05, 0xbd, 0x3d, 0x10, 0x1c, 0xbd, 0x77, 0x24, 0x8c, 0xbd, 0x53,
+ 0x9b, 0xdf, 0x3b, 0x80, 0xc9, 0x53, 0x3d, 0x40, 0xc7, 0x6c, 0xbc, 0x00, 0xb3,
+ 0xbe, 0xba, 0xe5, 0xe9, 0x89, 0x3d, 0xb0, 0x72, 0x88, 0xbd, 0xcd, 0x2d, 0x0c,
+ 0xbd, 0x27, 0x35, 0x07, 0xbd, 0x6b, 0x6a, 0x49, 0xbd, 0x99, 0x9b, 0x51, 0xbd,
+ 0x1c, 0x94, 0x51, 0x3c, 0x78, 0x26, 0x6a, 0xbd, 0xc2, 0x3e, 0x04, 0x3d, 0xf3,
+ 0x19, 0x16, 0xbd, 0x9c, 0xb7, 0x0b, 0xbd, 0xb8, 0x3d, 0xf9, 0x3c, 0x69, 0xdb,
+ 0x14, 0x3d, 0x0a, 0xe3, 0x0f, 0xbd, 0x1a, 0xd5, 0x80, 0xbd, 0xed, 0x79, 0x8d,
+ 0x3c, 0x1b, 0x21, 0x00, 0xbb, 0x9a, 0x88, 0x0e, 0x3d, 0xc0, 0x1c, 0x66, 0x3d,
+ 0x60, 0x74, 0x82, 0xbd, 0x7b, 0x96, 0x1c, 0x3d, 0x53, 0x16, 0x49, 0x3d, 0xeb,
+ 0xfc, 0x8d, 0x3d, 0xb0, 0x52, 0x32, 0x3c, 0xa0, 0xa5, 0x5a, 0xbd, 0xfe, 0xf7,
+ 0x9c, 0xbc, 0x19, 0x78, 0x4a, 0x3c, 0x78, 0xd1, 0xc2, 0x3c, 0xb4, 0x51, 0x91,
+ 0xbd, 0x47, 0x08, 0x76, 0xbd, 0x7e, 0x70, 0x02, 0x3d, 0x8b, 0x90, 0x80, 0xbd,
+ 0xc0, 0xad, 0x10, 0xbd, 0xc6, 0x2e, 0x4d, 0xbd, 0x0e, 0xe4, 0x0b, 0x3d, 0x9e,
+ 0x8e, 0x8f, 0x3b, 0xd6, 0x81, 0x8a, 0xbd, 0xb9, 0x43, 0x05, 0xbd, 0xfd, 0xb4,
+ 0x3d, 0xbd, 0x69, 0x1b, 0xa9, 0xbb, 0x0b, 0xb6, 0x88, 0xbd, 0xe3, 0x8f, 0x64,
+ 0x3d, 0xd9, 0xda, 0x4d, 0x3c, 0xa8, 0xa9, 0x66, 0xbd, 0x87, 0x10, 0x23, 0x3d,
+ 0xf6, 0x03, 0x3b, 0x3d, 0xa4, 0xcb, 0x83, 0x3c, 0x36, 0xd0, 0x2a, 0xbd, 0x22,
+ 0x31, 0x27, 0x3d, 0xf0, 0xfb, 0x18, 0x3d, 0x8e, 0xa1, 0x04, 0x3d, 0x67, 0x0e,
+ 0x67, 0xbc, 0x77, 0x07, 0x90, 0x3d, 0xaf, 0x11, 0x72, 0x3d, 0x7b, 0xdd, 0x80,
+ 0x3d, 0x18, 0xd2, 0x6e, 0xbc, 0x0c, 0xfa, 0x5e, 0xbd, 0xe8, 0x92, 0xaf, 0xbc,
+ 0x8f, 0x89, 0xe9, 0x3c, 0x15, 0x06, 0x1d, 0x3c, 0x02, 0x7f, 0x81, 0x3d, 0x88,
+ 0xe0, 0x0f, 0xbd, 0x16, 0x6a, 0xab, 0xbc, 0xc4, 0x1f, 0xdf, 0x3c, 0x38, 0xab,
+ 0x4b, 0x3c, 0x40, 0xfd, 0x83, 0x3b, 0x71, 0x9a, 0x52, 0xbd, 0x90, 0x3f, 0x04,
+ 0xbd, 0xe4, 0x23, 0x81, 0x3d, 0x4a, 0xaa, 0x39, 0xbd, 0xc1, 0xb6, 0x7c, 0x3d,
+ 0xa4, 0xb4, 0x2d, 0x3d, 0x3c, 0x8b, 0xea, 0x3b, 0xf3, 0x93, 0x8e, 0x3d, 0x9b,
+ 0xea, 0x87, 0xbc, 0x25, 0x22, 0x91, 0xbd, 0xeb, 0x03, 0x1a, 0x3d, 0xde, 0xb3,
+ 0x41, 0x3d, 0xb3, 0x03, 0x59, 0xbd, 0x98, 0xea, 0x1d, 0xbd, 0xaf, 0x46, 0xd9,
+ 0xbc, 0xc0, 0x55, 0x3e, 0xbd, 0x4d, 0xe2, 0x45, 0x3d, 0x85, 0xa0, 0x44, 0x3c,
+ 0x00, 0xe5, 0x3e, 0xbd, 0x6f, 0x4e, 0x4b, 0xbb, 0xe1, 0xcd, 0x86, 0x3c, 0x90,
+ 0xaa, 0x08, 0xbd, 0xb6, 0xb9, 0x7a, 0x3d, 0x45, 0x80, 0x5c, 0x3d, 0xda, 0x7b,
+ 0x28, 0xbd, 0x4e, 0x73, 0xc1, 0xbc, 0x8b, 0xff, 0x1b, 0x3d, 0xe0, 0xad, 0x71,
+ 0xbc, 0x5c, 0xa3, 0xd3, 0xbc, 0x93, 0x08, 0x85, 0x3d, 0xce, 0x42, 0x3a, 0x3d,
+ 0x31, 0x10, 0x86, 0x3d, 0x28, 0x95, 0x86, 0x3a, 0x81, 0x0e, 0x39, 0xbd, 0xa6,
+ 0xb2, 0x57, 0x3d, 0x97, 0xab, 0xf8, 0xbc, 0x53, 0x5b, 0x9f, 0xbc, 0x79, 0x78,
+ 0x54, 0x3d, 0xdc, 0x5b, 0x8b, 0x3d, 0xf5, 0xe7, 0x2d, 0x3d, 0xe7, 0x23, 0xa4,
+ 0xbc, 0x6a, 0xff, 0x83, 0x3d, 0x53, 0xe7, 0x48, 0x3d, 0x27, 0x3c, 0x8c, 0x3d,
+ 0x44, 0xdf, 0x74, 0xbd, 0x58, 0xe8, 0xf3, 0xbc, 0x4c, 0x9f, 0x57, 0x3c, 0x6c,
+ 0xb6, 0x95, 0x3c, 0xbd, 0x8e, 0x65, 0x3d, 0x11, 0x3e, 0xcb, 0x3c, 0x88, 0x0e,
+ 0x02, 0xbd, 0x68, 0x1c, 0x8d, 0xbb, 0xe9, 0xaa, 0x81, 0x3d, 0x00, 0xcc, 0x35,
+ 0xbd, 0x4f, 0x0b, 0x8f, 0xbd, 0xa4, 0xaa, 0x40, 0xbc, 0x0a, 0x00, 0xac, 0xbc,
+ 0xe2, 0x2a, 0x40, 0xbd, 0xc3, 0xff, 0x05, 0xbd, 0x09, 0xbe, 0x65, 0xbd, 0xe6,
+ 0xde, 0x7e, 0xbd, 0x30, 0x36, 0x17, 0x3c, 0x50, 0x30, 0x0e, 0xbc, 0x64, 0x36,
+ 0xfa, 0x3c, 0x9d, 0x5a, 0x85, 0xbb, 0x50, 0x2c, 0x65, 0xbc, 0x90, 0x5a, 0xae,
+ 0xbb, 0x37, 0xe6, 0x41, 0xbd, 0xfd, 0x21, 0xf7, 0xbc, 0xb5, 0x91, 0x8b, 0xbb,
+ 0x15, 0xaa, 0xbe, 0x3c, 0x86, 0x46, 0x78, 0xbd, 0xd4, 0x41, 0xf8, 0xbc, 0xf2,
+ 0xb7, 0xe4, 0x3c, 0x1b, 0x84, 0x5a, 0x3c, 0x5a, 0xc8, 0x5e, 0x3d, 0x74, 0xad,
+ 0xa8, 0x3c, 0x71, 0xbe, 0xa0, 0xbc, 0x9b, 0xaf, 0x2b, 0x3d, 0x43, 0x1b, 0x69,
+ 0xbd, 0xb3, 0xe7, 0x88, 0x3d, 0xbd, 0xe2, 0x5c, 0x3d, 0x6b, 0xa4, 0x35, 0xbd,
+ 0xe9, 0xbc, 0x8f, 0xbd, 0x16, 0xc0, 0x74, 0x3d, 0x92, 0xb9, 0x4c, 0x3d, 0x5d,
+ 0xee, 0x91, 0x3c, 0x74, 0xda, 0x1d, 0xbd, 0xda, 0x42, 0x5a, 0xbb, 0x70, 0x1b,
+ 0xbc, 0x3c, 0xc3, 0x23, 0xd9, 0xba, 0x6c, 0xf4, 0xa4, 0x3c, 0x9c, 0x95, 0x0a,
+ 0x3d, 0xb8, 0x03, 0x9e, 0x3c, 0x05, 0x7b, 0x84, 0x3d, 0x88, 0x24, 0x29, 0x3d,
+ 0x6e, 0xb3, 0x72, 0x3d, 0x36, 0x31, 0x62, 0x3c, 0xea, 0x27, 0x24, 0xbd, 0x6d,
+ 0xf3, 0xe5, 0x3c, 0x2e, 0x24, 0x1f, 0x3d, 0x69, 0x95, 0x6b, 0xbd, 0xa6, 0xdf,
+ 0x42, 0xba, 0xdd, 0x6e, 0x90, 0xbd, 0xb3, 0x52, 0x00, 0xbd, 0xbe, 0x22, 0x02,
+ 0x3d, 0xbf, 0x61, 0x80, 0xbd, 0x8d, 0xde, 0x82, 0x3d, 0xf4, 0x40, 0x28, 0x3d,
+ 0x7b, 0xeb, 0xb7, 0xba, 0xe1, 0x73, 0x94, 0x3c, 0xae, 0x7f, 0x12, 0xba, 0x02,
+ 0xf0, 0x40, 0xbb, 0xf1, 0xb7, 0x05, 0x3d, 0x0d, 0xbb, 0x6b, 0xbd, 0xe2, 0x4f,
+ 0x12, 0xbd, 0x0a, 0x66, 0x09, 0xbd, 0xb7, 0xe9, 0x8f, 0x3d, 0x0d, 0x7c, 0x14,
+ 0x3d, 0x11, 0xf4, 0xbe, 0xba, 0x09, 0x4d, 0x38, 0xbd, 0x80, 0x94, 0x41, 0x3a,
+ 0xd3, 0x89, 0xc2, 0x3c, 0xd8, 0x3a, 0x3d, 0x3c, 0x28, 0x00, 0x5f, 0xbc, 0xc4,
+ 0x2a, 0x91, 0xbc, 0x50, 0x98, 0xe6, 0xbc, 0xfa, 0x52, 0x16, 0x3d, 0x3c, 0xb5,
+ 0x87, 0x3d, 0xed, 0xcf, 0x70, 0x3c, 0x78, 0x9e, 0x72, 0xbb, 0x93, 0x6b, 0x23,
+ 0x3d, 0xf0, 0xaf, 0x64, 0xbd, 0xce, 0xd7, 0x5e, 0xbd, 0x6c, 0x20, 0x7b, 0xbc,
+ 0xd0, 0x7a, 0xe0, 0xbb, 0x60, 0xfd, 0xef, 0x3b, 0x95, 0xe5, 0x5f, 0xbd, 0xdf,
+ 0x49, 0x33, 0x3c, 0x11, 0x3d, 0x80, 0x3d, 0xd4, 0x04, 0xc8, 0x3c, 0x58, 0xc0,
+ 0x41, 0xbd, 0x50, 0x35, 0x63, 0x3d, 0xd2, 0x8a, 0xc8, 0xbc, 0x67, 0xf0, 0x8b,
+ 0xbd, 0x69, 0x02, 0x55, 0x3d, 0x0c, 0xa1, 0x76, 0xbd, 0xa8, 0x5e, 0x05, 0xbb,
+ 0xd0, 0xc3, 0x16, 0x3d, 0x78, 0x7f, 0x23, 0xbc, 0x59, 0x25, 0x5c, 0xbd, 0xb4,
+ 0xaf, 0x36, 0xbd, 0x26, 0xc1, 0xd0, 0xb9, 0xa3, 0xb9, 0x54, 0x3d, 0xd3, 0x99,
+ 0xea, 0xbc, 0x56, 0x87, 0xfc, 0xbc, 0x86, 0x17, 0x16, 0xbd, 0x80, 0x75, 0x17,
+ 0xbd, 0xe9, 0xe9, 0x26, 0xbd, 0x73, 0xd9, 0x7f, 0xbd, 0x78, 0xf7, 0x08, 0x3d,
+ 0xb4, 0x6e, 0x24, 0x3d, 0xdb, 0x78, 0x04, 0x3d, 0x91, 0x4e, 0x5e, 0x3d, 0x93,
+ 0x73, 0x86, 0x3d, 0xd5, 0xc8, 0x41, 0xbd, 0x18, 0x68, 0x79, 0x3d, 0x1e, 0x5e,
+ 0x74, 0xbd, 0x05, 0x92, 0x43, 0x3d, 0xed, 0xd7, 0xcb, 0x3c, 0x90, 0x04, 0x48,
+ 0xbd, 0x2a, 0x81, 0x59, 0xbd, 0xa6, 0xf8, 0x8f, 0xbd, 0x21, 0x1b, 0x82, 0x3d,
+ 0x47, 0x2f, 0x03, 0xbd, 0x49, 0x8a, 0xea, 0x3b, 0x82, 0x20, 0x29, 0x3d, 0x3e,
+ 0x06, 0x0a, 0x3b, 0x0d, 0xe3, 0x93, 0x3c, 0x3f, 0xb2, 0x83, 0x3d, 0x57, 0x42,
+ 0xe4, 0x3b, 0x02, 0x82, 0xde, 0xbc, 0x75, 0x96, 0x0a, 0xbd, 0x66, 0xb5, 0x0a,
+ 0x3d, 0x11, 0xed, 0x8d, 0xbd, 0xc5, 0x7c, 0x61, 0xbd, 0x85, 0xde, 0x56, 0xbc,
+ 0x2f, 0x3e, 0x41, 0xbd, 0x65, 0x92, 0x70, 0x3d, 0x10, 0x6d, 0xd8, 0xbb, 0x6e,
+ 0x7b, 0x45, 0x3d, 0xe0, 0xcd, 0x58, 0x3d, 0x5a, 0xa0, 0x6c, 0xbd, 0x25, 0x13,
+ 0x2f, 0xbd, 0x95, 0xcf, 0x6b, 0xbd, 0x42, 0x36, 0x20, 0xbc, 0x3c, 0x82, 0x47,
+ 0x3c, 0x71, 0xef, 0x16, 0x3c, 0x50, 0xa2, 0xb8, 0xba, 0x7e, 0xc4, 0x61, 0x3c,
+ 0xa6, 0xc5, 0x78, 0xbd, 0xb9, 0x33, 0x32, 0xbd, 0x47, 0x60, 0x81, 0x3d, 0x58,
+ 0xd9, 0x16, 0x3d, 0x3a, 0x50, 0x7a, 0xbd, 0x47, 0xc7, 0x15, 0x3d, 0x00, 0xca,
+ 0x8a, 0xbd, 0x6f, 0x8f, 0x83, 0xbd, 0x7b, 0x4f, 0x58, 0xba, 0x30, 0x8f, 0x43,
+ 0xbd, 0xd1, 0x28, 0xd6, 0xbb, 0x20, 0x94, 0xf7, 0xbc, 0x84, 0xef, 0x25, 0xbd,
+ 0x06, 0x79, 0x6f, 0x3d, 0xdb, 0x3e, 0xcd, 0x3c, 0xc7, 0xce, 0x79, 0x3d, 0x23,
+ 0x71, 0x97, 0xbc, 0x5c, 0x5c, 0x38, 0x3d, 0xc8, 0xb6, 0x03, 0xbd, 0xd6, 0x31,
+ 0xc6, 0xbc, 0x33, 0xe1, 0xd0, 0xbb, 0x66, 0xf2, 0xd5, 0xbc, 0xe2, 0x07, 0x49,
+ 0x3d, 0x2c, 0x67, 0xc9, 0xbc, 0x71, 0xd2, 0x41, 0xbd, 0x1a, 0xb4, 0x81, 0x3c,
+ 0xf0, 0x27, 0x7d, 0x3d, 0xca, 0xcc, 0xd5, 0xbc, 0x3f, 0x3e, 0x30, 0xbd, 0x50,
+ 0xe1, 0x26, 0xba, 0x53, 0x7d, 0x00, 0x3d, 0x8e, 0x75, 0x4d, 0x3b, 0x0a, 0x56,
+ 0x20, 0x3d, 0x61, 0xaf, 0xf4, 0xbc, 0x55, 0x41, 0x98, 0xbc, 0x16, 0x66, 0x13,
+ 0x3d, 0x40, 0x96, 0x67, 0xbd, 0x40, 0x3a, 0x0b, 0xbd, 0xbe, 0x16, 0x88, 0xbc,
+ 0x54, 0xd1, 0x56, 0xbd, 0xd5, 0xa2, 0xba, 0xbb, 0x97, 0x30, 0x1f, 0xbb, 0x37,
+ 0x2d, 0x18, 0xbd, 0xe7, 0xe3, 0x8e, 0xbd, 0x82, 0x9b, 0x29, 0x3c, 0x8f, 0x41,
+ 0x24, 0xbd, 0xa2, 0x55, 0x8f, 0x3b, 0x25, 0xa4, 0x18, 0x3c, 0xb6, 0xee, 0xe7,
+ 0x3c, 0x3a, 0x0b, 0x12, 0xbd, 0x27, 0xfb, 0xb4, 0xb9, 0x70, 0x41, 0x0a, 0xbc,
+ 0xe8, 0x8b, 0x62, 0xbd, 0x04, 0x95, 0xc5, 0x3c, 0xa4, 0x51, 0x46, 0xbd, 0x42,
+ 0x1e, 0x65, 0xbd, 0x4f, 0x3d, 0x4a, 0x3d, 0x6f, 0x9d, 0x19, 0x3d, 0xb8, 0xdb,
+ 0x8c, 0xbd, 0x9a, 0xfe, 0x23, 0x3c, 0x0c, 0x8a, 0x58, 0x3d, 0xe2, 0x61, 0x62,
+ 0xbd, 0x1f, 0xee, 0x64, 0x3c, 0x0c, 0xb0, 0x9a, 0x3b, 0xe8, 0x9f, 0xf7, 0xbc,
+ 0x54, 0xf9, 0xef, 0xbc, 0xbb, 0x3b, 0x57, 0x3a, 0xcc, 0x92, 0xa6, 0x3c, 0xfa,
+ 0x7f, 0xf0, 0x3c, 0x92, 0x0c, 0x03, 0x3d, 0xc4, 0xa7, 0x0b, 0xbd, 0x3d, 0xf1,
+ 0x8b, 0xbd, 0x6a, 0x7a, 0x4c, 0xbd, 0xfe, 0x96, 0xdc, 0x3c, 0xf8, 0x93, 0x99,
+ 0x3b, 0xe4, 0xd7, 0x70, 0x3d, 0x72, 0x25, 0x4f, 0x3d, 0xc0, 0xa1, 0x80, 0xbd,
+ 0xb8, 0xac, 0x50, 0x3d, 0x87, 0x18, 0x87, 0xbc, 0xcc, 0xe2, 0x01, 0xbd, 0x70,
+ 0x67, 0xfb, 0xbb, 0xda, 0x29, 0x7c, 0x3d, 0xe6, 0xf0, 0x67, 0x3d, 0x98, 0xd8,
+ 0x0e, 0x3d, 0xe8, 0xf6, 0x45, 0xbd, 0xcc, 0x76, 0x57, 0xbd, 0x12, 0xec, 0x02,
+ 0x3d, 0x02, 0x73, 0xbf, 0x3c, 0xea, 0x67, 0x9e, 0x3a, 0x29, 0x29, 0x1f, 0x3d,
+ 0x19, 0x65, 0x2a, 0x3d, 0x9c, 0x3a, 0x86, 0x3d, 0xd8, 0xcd, 0x15, 0xbd, 0xf3,
+ 0xed, 0x75, 0xbd, 0xa6, 0x30, 0xff, 0xbc, 0x87, 0x2e, 0xc7, 0x3c, 0xe6, 0x41,
+ 0xb9, 0x3c, 0x38, 0xf9, 0xb0, 0x3c, 0x49, 0x88, 0x8c, 0xbd, 0xf2, 0x2b, 0x70,
+ 0x3d, 0x3d, 0x58, 0xec, 0x3b, 0xa2, 0x59, 0x3a, 0x3c, 0x3f, 0x5f, 0x3a, 0x3d,
+ 0x5f, 0xb9, 0x48, 0xbd, 0x09, 0x9a, 0xc5, 0x3b, 0x12, 0x63, 0x84, 0xbd, 0x11,
+ 0x76, 0x5e, 0x3d, 0x4f, 0xa0, 0x84, 0x3d, 0x90, 0x8b, 0x29, 0xbd, 0x03, 0xcc,
+ 0x2c, 0xbd, 0xbe, 0x89, 0x8f, 0xbd, 0xa5, 0x7a, 0x81, 0x3d, 0x54, 0xa8, 0xd0,
+ 0x3c, 0x54, 0x70, 0x9d, 0xbb, 0x4a, 0xe4, 0xb9, 0xbc, 0x94, 0x65, 0xfe, 0xbc,
+ 0x3c, 0xef, 0xac, 0x3c, 0x4c, 0x87, 0x16, 0xbd, 0x0a, 0xda, 0x85, 0xbc, 0x89,
+ 0x04, 0x88, 0x3d, 0xb6, 0xe7, 0x19, 0x3d, 0x38, 0x06, 0x08, 0xbd, 0x37, 0x6c,
+ 0x3d, 0xbd, 0x75, 0x70, 0x09, 0x3d, 0x13, 0x5c, 0x7f, 0xbd, 0xe2, 0x25, 0xfb,
+ 0x3c, 0x74, 0xe4, 0x06, 0x3d, 0xd8, 0xcb, 0x82, 0x3d, 0xbc, 0xa0, 0xeb, 0xbc,
+ 0xaf, 0xb1, 0x8e, 0xbd, 0x30, 0x53, 0xdc, 0x3b, 0x4b, 0x94, 0x84, 0x3d, 0xc9,
+ 0x6d, 0xcd, 0x3c, 0xd1, 0x47, 0x8e, 0x3d, 0x5e, 0x1a, 0x15, 0xbc, 0x0b, 0xe3,
+ 0xb2, 0x3c, 0x4c, 0x7f, 0xfb, 0x3c, 0x6e, 0x6d, 0x53, 0x3d, 0xdc, 0xa5, 0x8d,
+ 0x3d, 0x71, 0x25, 0x85, 0xbd, 0xc8, 0xa9, 0x17, 0xbc, 0xe1, 0xcd, 0xf3, 0xbc,
+ 0xbd, 0xc5, 0x5f, 0xbd, 0xde, 0xbc, 0x07, 0x3d, 0x2a, 0x50, 0x91, 0x3c, 0x12,
+ 0x64, 0x9a, 0x3b, 0x54, 0x8b, 0x02, 0x3d, 0x2d, 0x77, 0x8b, 0xbd, 0x83, 0x37,
+ 0x82, 0x3d, 0x5f, 0xdb, 0x50, 0xbd, 0xba, 0xe6, 0x63, 0x3d, 0x2d, 0x97, 0x21,
+ 0x3d, 0xfe, 0xba, 0x80, 0x3d, 0xe4, 0xc2, 0x39, 0xbd, 0x8d, 0x37, 0x94, 0x3c,
+ 0x8d, 0xe8, 0xb0, 0xbc, 0x0e, 0xbc, 0xa9, 0xbc, 0xbb, 0xfb, 0xb1, 0xbb, 0xff,
+ 0xdb, 0x13, 0xbd, 0x15, 0x1e, 0x1f, 0xbd, 0xe6, 0x81, 0x51, 0xbd, 0xf1, 0x39,
+ 0xaf, 0xbc, 0x86, 0x69, 0x68, 0xbd, 0x33, 0x5c, 0xe8, 0x3c, 0x25, 0xd3, 0x5d,
+ 0xbd, 0x77, 0xf4, 0x0e, 0xbd, 0x5f, 0x4b, 0xec, 0x3c, 0xc4, 0x6c, 0xfc, 0x3c,
+ 0x39, 0x1e, 0xc9, 0x3c, 0x2c, 0xdc, 0x6f, 0xbd, 0xf0, 0xdd, 0x5b, 0x3c, 0xba,
+ 0x58, 0x63, 0x3d, 0x20, 0xb8, 0x9c, 0x3b, 0x58, 0x4e, 0xb6, 0xbc, 0x47, 0x2d,
+ 0xc4, 0xbc, 0x0c, 0x5b, 0x6b, 0x3d, 0x00, 0x18, 0xed, 0xb9, 0x96, 0xa9, 0x9e,
+ 0x3c, 0x42, 0x5c, 0x4a, 0xbb, 0x94, 0x9f, 0x85, 0xbd, 0x10, 0xdd, 0xcd, 0x3c,
+ 0x47, 0x98, 0x8c, 0xbd, 0x28, 0x33, 0x6f, 0xbd, 0x6c, 0x52, 0x21, 0x3d, 0x41,
+ 0x5c, 0x45, 0x3c, 0xf7, 0x7c, 0x36, 0xbd, 0x6d, 0xf5, 0xdb, 0xbc, 0x30, 0x95,
+ 0x87, 0x3d, 0xed, 0x8a, 0x8f, 0xbd, 0x79, 0x78, 0x88, 0xbd, 0x0c, 0x54, 0x1c,
+ 0xbc, 0x82, 0xa3, 0xa7, 0x3b, 0x1f, 0xcf, 0x76, 0xbd, 0x71, 0x23, 0x8b, 0x3c,
+ 0x01, 0xc3, 0x87, 0x3d, 0x54, 0xb5, 0xe5, 0x3c, 0x3e, 0x2f, 0x17, 0xbd, 0x99,
+ 0xb5, 0x13, 0x3d, 0x69, 0xf7, 0xad, 0x3c, 0xb1, 0x19, 0x13, 0xbc, 0x0e, 0xf8,
+ 0x5b, 0xbd, 0x74, 0x52, 0x82, 0x3d, 0x7a, 0x5f, 0xfd, 0xbb, 0x2b, 0x17, 0x15,
+ 0xbd, 0x05, 0x3c, 0x72, 0xbd, 0x18, 0xbd, 0xb9, 0xba, 0xaf, 0x8e, 0xc5, 0xbc,
+ 0x7a, 0x8f, 0xc3, 0xbb, 0xd9, 0x64, 0x14, 0xbd, 0x97, 0xdf, 0x55, 0x3d, 0x99,
+ 0x96, 0xac, 0xba, 0x4f, 0x5c, 0x84, 0x3d, 0xa4, 0x57, 0x27, 0x3d, 0xf8, 0x8e,
+ 0x81, 0xbd, 0xf8, 0xef, 0x55, 0x3c, 0x0e, 0x2d, 0x59, 0xbd, 0xf1, 0xeb, 0x52,
+ 0x3a, 0x06, 0xde, 0x94, 0x3c, 0x53, 0x8e, 0x17, 0xbd, 0x5d, 0x25, 0x86, 0x3c,
+ 0x1c, 0x8c, 0x8b, 0xbc, 0x32, 0xa0, 0x1c, 0x3d, 0x2e, 0xb3, 0x53, 0x3d, 0x2e,
+ 0x1c, 0x3f, 0x3d, 0x38, 0xb0, 0xf1, 0x3c, 0x95, 0xc2, 0x55, 0xbb, 0x74, 0x05,
+ 0x39, 0xbd, 0x4a, 0xa6, 0x27, 0x3b, 0xb3, 0x63, 0xd8, 0x3c, 0xd6, 0x03, 0x83,
+ 0x3d, 0x24, 0x65, 0x49, 0xbd, 0x18, 0x9e, 0xee, 0x3c, 0x26, 0xf0, 0x85, 0xbd,
+ 0xfc, 0xd0, 0x67, 0xbd, 0x43, 0xca, 0x12, 0xbd, 0xb1, 0xec, 0x03, 0x3d, 0x00,
+ 0x1e, 0x74, 0x3c, 0xb5, 0x32, 0xa6, 0xbc, 0x3d, 0x56, 0x65, 0x3d, 0x8b, 0x0e,
+ 0xa9, 0xbc, 0x03, 0x1e, 0x91, 0x3d, 0x64, 0x8f, 0x88, 0x3d, 0x1c, 0x50, 0xb5,
+ 0xbc, 0xe4, 0xb3, 0x05, 0xbd, 0x2c, 0x4f, 0x59, 0xbd, 0x29, 0x30, 0x23, 0xbd,
+ 0x0c, 0x23, 0x56, 0xbd, 0x7d, 0x77, 0x82, 0xbc, 0x45, 0x1a, 0xa4, 0x3c, 0xb7,
+ 0x9c, 0x0f, 0xbc, 0xc5, 0x76, 0xd8, 0xbc, 0x7f, 0x4f, 0x78, 0xbd, 0xb4, 0x07,
+ 0x82, 0x3c, 0x56, 0xcc, 0x6a, 0xbd, 0xc3, 0x11, 0x29, 0x3c, 0xa5, 0xf6, 0x7a,
+ 0x3d, 0x8a, 0x88, 0xc4, 0x3c, 0x00, 0xf8, 0xa2, 0xbc, 0x30, 0x08, 0x50, 0xbd,
+ 0x59, 0xcf, 0xb1, 0xbc, 0xd1, 0xba, 0x52, 0xbd, 0xc0, 0xe8, 0xbe, 0x3b, 0xc3,
+ 0xb8, 0xfe, 0xbc, 0x22, 0xc5, 0x84, 0xbd, 0xef, 0x51, 0xbd, 0x3a, 0x75, 0x42,
+ 0xc8, 0xbc, 0x1a, 0x32, 0x88, 0x3d, 0x2a, 0x26, 0xc2, 0xbc, 0x66, 0x17, 0x2a,
+ 0xbd, 0x1d, 0x0f, 0x7f, 0x3d, 0x55, 0x2f, 0x8f, 0x3b, 0x01, 0x47, 0x8c, 0x3d,
+ 0x3a, 0x01, 0x18, 0x3d, 0xca, 0xa0, 0xea, 0xbc, 0x3e, 0x16, 0x34, 0xbd, 0xe8,
+ 0xf7, 0x75, 0x3c, 0x20, 0xee, 0x49, 0x3c, 0x6a, 0xc1, 0x3b, 0xbd, 0xa0, 0x98,
+ 0x5c, 0xbd, 0x60, 0x8e, 0x94, 0x3b, 0xa2, 0x9b, 0x8a, 0x3d, 0x10, 0x4d, 0x4f,
+ 0x3d, 0x87, 0xe4, 0x45, 0xbd, 0xb6, 0x17, 0xdd, 0x3b, 0xee, 0x06, 0x71, 0xbd,
+ 0xca, 0xb4, 0xe0, 0x3c, 0xd4, 0x9d, 0x0b, 0xbd, 0xba, 0x3a, 0x21, 0x3d, 0x6c,
+ 0xfd, 0xaa, 0x3c, 0x35, 0x20, 0x61, 0xbd, 0x20, 0x51, 0x52, 0x3d, 0x96, 0xcc,
+ 0x29, 0xbd, 0x9f, 0x99, 0x22, 0x3d, 0x06, 0x2d, 0xdb, 0xba, 0xdb, 0xf1, 0x90,
+ 0x3c, 0xf9, 0x05, 0x06, 0x3d, 0xdf, 0x02, 0xcb, 0x3c, 0x02, 0xb8, 0xf8, 0xbc,
+ 0x70, 0x14, 0x50, 0xbd, 0x51, 0xdc, 0x88, 0x3d, 0xa8, 0xa5, 0xd6, 0xbc, 0x69,
+ 0xd7, 0x8e, 0x3d, 0xbe, 0x91, 0x86, 0xbd, 0x5d, 0x93, 0x12, 0xbd, 0x7c, 0x23,
+ 0x60, 0xbd, 0xb2, 0x55, 0xb7, 0x3c, 0x38, 0xb8, 0x0e, 0x3d, 0x88, 0x86, 0x0e,
+ 0x3c, 0x9a, 0x4b, 0x0d, 0x3d, 0x00, 0xfa, 0x1a, 0x3b, 0xb8, 0x59, 0xbf, 0x3c,
+ 0xbe, 0xa8, 0xea, 0x3c, 0xfc, 0xf4, 0xf3, 0x3c, 0xbf, 0x69, 0x17, 0x3d, 0x82,
+ 0xe6, 0x84, 0xbd, 0x9d, 0xde, 0x3e, 0xbd, 0x3a, 0x02, 0x5b, 0xbd, 0x04, 0x34,
+ 0x8b, 0xbd, 0x83, 0x26, 0xc5, 0x3c, 0x71, 0x0c, 0x17, 0x3d, 0x44, 0x33, 0x5a,
+ 0xbd, 0xe0, 0x15, 0xe4, 0x3b, 0xd9, 0x25, 0x80, 0xbd, 0xbb, 0xac, 0x56, 0xbd,
+ 0x54, 0x26, 0x6f, 0xbd, 0x30, 0x23, 0xa2, 0x3b, 0x08, 0x7c, 0x27, 0xbd, 0xba,
+ 0x00, 0xde, 0xbc, 0x80, 0x47, 0x8f, 0xbd, 0xca, 0x52, 0x17, 0xbd, 0xf0, 0x9a,
+ 0x0a, 0x3d, 0xe9, 0x6a, 0xea, 0x3b, 0x12, 0xaa, 0x65, 0x3d, 0x3e, 0x1a, 0x49,
+ 0x3d, 0x3b, 0x68, 0x30, 0xbd, 0xfb, 0x34, 0x3d, 0x3d, 0x0c, 0x21, 0xe3, 0x3c,
+ 0x13, 0x68, 0x67, 0xbb, 0xe5, 0xaf, 0x8b, 0xbd, 0xfe, 0x2b, 0x00, 0xbd, 0x5e,
+ 0x1e, 0x4a, 0xbd, 0xb2, 0x94, 0x70, 0x3d, 0xa0, 0x7e, 0x47, 0x3b, 0xde, 0xa9,
+ 0xef, 0xbc, 0x84, 0x2f, 0x1a, 0x3a, 0x26, 0xb6, 0xf8, 0x3c, 0xe4, 0xab, 0xd9,
+ 0xbc, 0xa8, 0x0b, 0x87, 0xbd, 0x70, 0x2c, 0xbd, 0x3c, 0x32, 0xb2, 0x8c, 0x3c,
+ 0xce, 0x0f, 0x34, 0xba, 0xc7, 0xc9, 0x3b, 0xbd, 0x22, 0xdb, 0xf3, 0xbc, 0x8d,
+ 0x4e, 0x48, 0xbd, 0xf0, 0x63, 0x53, 0x3d, 0x04, 0xd6, 0xc7, 0x3b, 0xfa, 0x40,
+ 0x6c, 0xbd, 0x22, 0xfb, 0x80, 0x38, 0xe9, 0x8c, 0x0e, 0x3c, 0xc4, 0x60, 0x27,
+ 0x3d, 0xaa, 0xcf, 0x60, 0x3d, 0xfe, 0x59, 0x08, 0x3d, 0x6e, 0x69, 0x43, 0xbd,
+ 0xcb, 0xa1, 0x03, 0xbd, 0x16, 0x47, 0x72, 0x3d, 0xc1, 0x37, 0x5d, 0x3d, 0x53,
+ 0x6f, 0x8b, 0xbd, 0x50, 0x99, 0x18, 0x3d, 0x65, 0x92, 0x89, 0x3d, 0x12, 0x80,
+ 0x94, 0xbd, 0x8d, 0x1d, 0x21, 0xbd, 0x6e, 0xc6, 0x69, 0x3d, 0x18, 0x1d, 0x23,
+ 0x3d, 0x3e, 0x2b, 0x00, 0x3d, 0xe4, 0x71, 0x4f, 0xbd, 0xfb, 0xc5, 0x0e, 0xbd,
+ 0x6e, 0x24, 0x47, 0x3d, 0x34, 0xf0, 0x50, 0x3c, 0x3f, 0x38, 0x89, 0x3d, 0xb5,
+ 0x84, 0x41, 0xbc, 0xb8, 0xdc, 0x56, 0x3d, 0x3b, 0x56, 0x60, 0xbc, 0x5a, 0x3b,
+ 0x58, 0x3d, 0x86, 0x56, 0x6d, 0xbd, 0x4f, 0x33, 0x43, 0x3d, 0x7e, 0x6c, 0x7d,
+ 0x3c, 0xb9, 0x4c, 0x8b, 0x3d, 0x00, 0x88, 0x3f, 0x3a, 0x3a, 0xb8, 0xc1, 0x3c,
+ 0x02, 0x18, 0x30, 0x3d, 0x6b, 0xb4, 0x4c, 0xbd, 0x0d, 0xd8, 0x3c, 0x3d, 0x9a,
+ 0x25, 0x61, 0xbd, 0x87, 0x7b, 0xa7, 0xbc, 0x76, 0x8e, 0x06, 0xbb, 0x47, 0xf9,
+ 0x73, 0xbd, 0x80, 0xfa, 0x28, 0xbb, 0xd4, 0xd1, 0x76, 0xbd, 0x9a, 0xcb, 0x29,
+ 0xbd, 0xf6, 0x0f, 0xe5, 0xbc, 0x6d, 0xeb, 0x4f, 0xbd, 0x46, 0xe8, 0x69, 0xbc,
+ 0x9a, 0x72, 0x69, 0x3d, 0x55, 0x19, 0x86, 0xbd, 0xba, 0x77, 0x0f, 0x3d, 0x4d,
+ 0xf6, 0x64, 0x3d, 0xf4, 0xf6, 0x19, 0x3d, 0xc3, 0x53, 0x4a, 0x3d, 0x83, 0xc4,
+ 0x7f, 0x3c, 0xb6, 0xcb, 0x53, 0xbd, 0xc5, 0x99, 0x83, 0xbd, 0xa9, 0xcb, 0x4e,
+ 0xbd, 0xbc, 0xc0, 0xf3, 0x3c, 0xc3, 0x45, 0x2c, 0x3d, 0x6a, 0x2f, 0x93, 0xbd,
+ 0x8d, 0x05, 0x67, 0x3d, 0xec, 0x6f, 0x3a, 0x3d, 0xf5, 0x47, 0x5a, 0x3d, 0xca,
+ 0xa6, 0x79, 0x3d, 0x16, 0x97, 0x7d, 0xbd, 0x53, 0x30, 0x52, 0x3d, 0x07, 0x81,
+ 0x52, 0x3d, 0xf7, 0xae, 0xa6, 0xbc, 0xa3, 0xc2, 0xa4, 0xbc, 0x5c, 0xd8, 0x23,
+ 0xbd, 0xc5, 0x77, 0x50, 0x3d, 0x28, 0x78, 0x47, 0x3c, 0xe7, 0xe2, 0x04, 0xbd,
+ 0xcc, 0x6f, 0x83, 0xbd, 0x4c, 0x2b, 0xfc, 0xbc, 0x42, 0xf8, 0xf6, 0x3c, 0x03,
+ 0x7c, 0x87, 0x3d, 0x2d, 0x4d, 0x80, 0xbd, 0x08, 0x59, 0x65, 0x3d, 0x2b, 0x4a,
+ 0x3a, 0xbd, 0xae, 0xec, 0x68, 0x3d, 0x1e, 0x42, 0x85, 0xbd, 0xd6, 0x06, 0x6a,
+ 0x3d, 0x6e, 0xfe, 0x65, 0xbd, 0x77, 0xef, 0xb0, 0x3c, 0x81, 0xb1, 0x48, 0x3c,
+ 0x86, 0x4b, 0x57, 0xbd, 0x1e, 0x45, 0x82, 0x3c, 0x9b, 0x6c, 0x0f, 0xbd, 0xeb,
+ 0x5f, 0x1c, 0xbd, 0xc3, 0x49, 0x3b, 0x3d, 0x5b, 0x31, 0x7b, 0xbd, 0xee, 0xcb,
+ 0x0c, 0xbd, 0x49, 0xa6, 0xa7, 0x3c, 0x89, 0x96, 0x73, 0xbd, 0x4d, 0xcf, 0x89,
+ 0x3d, 0xec, 0x73, 0xe1, 0x3b, 0x0e, 0x74, 0x0b, 0x3c, 0xc4, 0x52, 0xe1, 0xbc,
+ 0xf9, 0x15, 0x5f, 0x3d, 0x4a, 0x6c, 0x6c, 0xbd, 0x1d, 0x1d, 0xc7, 0xbb, 0xa2,
+ 0x11, 0x26, 0x3d, 0x92, 0xa6, 0x00, 0xbd, 0xe8, 0x29, 0x52, 0x3d, 0x6c, 0x9f,
+ 0xc3, 0x3c, 0xa9, 0xf6, 0xea, 0xbc, 0x0b, 0xce, 0x84, 0x3d, 0x3a, 0x7a, 0x83,
+ 0x3d, 0x95, 0x99, 0xff, 0x3c, 0x26, 0xc1, 0xae, 0xbc, 0x4c, 0x73, 0xab, 0x3c,
+ 0x10, 0x47, 0x5f, 0xbd, 0x6c, 0x99, 0xab, 0x3c, 0x40, 0x91, 0xee, 0x3a, 0x30,
+ 0xe9, 0x43, 0xbd, 0xd8, 0xdf, 0xed, 0x3c, 0x93, 0xd4, 0x98, 0xbc, 0x05, 0xf8,
+ 0x8c, 0x3d, 0x8d, 0x54, 0x89, 0xbd, 0x29, 0x6a, 0x5a, 0xbd, 0x54, 0x2f, 0x2d,
+ 0xbd, 0x11, 0x76, 0x90, 0xbd, 0x62, 0x24, 0xdf, 0x3c, 0x1f, 0x0c, 0x92, 0xbd,
+ 0x87, 0xb7, 0x06, 0xbd, 0x28, 0x1b, 0x92, 0xbd, 0x41, 0xb6, 0x19, 0xbd, 0x90,
+ 0xa9, 0xc8, 0xbc, 0x10, 0x06, 0xa2, 0x3c, 0x9b, 0x59, 0x72, 0x3d, 0x9f, 0x9b,
+ 0xc4, 0x3c, 0xc2, 0x44, 0xb9, 0xbb, 0xe4, 0x46, 0x90, 0x3d, 0xe9, 0x54, 0x40,
+ 0xbd, 0x18, 0xdd, 0xc8, 0xbc, 0xff, 0x78, 0x44, 0xbd, 0x6e, 0xaa, 0x92, 0xbc,
+ 0x76, 0xaa, 0x31, 0x3c, 0x37, 0x94, 0xe8, 0xbc, 0x2b, 0x84, 0xf6, 0x3c, 0xce,
+ 0x29, 0x8f, 0xbc, 0x37, 0xdc, 0xaf, 0x3c, 0x40, 0x76, 0xbd, 0x3c, 0xd6, 0x49,
+ 0x50, 0x3d, 0x48, 0x72, 0x36, 0xbd, 0xc7, 0x51, 0x63, 0xbd, 0x04, 0x47, 0x70,
+ 0xbc, 0x02, 0x99, 0x7c, 0xbc, 0x83, 0xb4, 0x44, 0xbd, 0x1d, 0x3b, 0x83, 0xbd,
+ 0x55, 0xe3, 0x41, 0x3d, 0x2c, 0x05, 0xcf, 0x3a, 0x52, 0x65, 0x2f, 0x3d, 0x8e,
+ 0x0d, 0x2d, 0x3d, 0x59, 0x13, 0x43, 0xbd, 0xe6, 0x6e, 0xf3, 0x3c, 0xc3, 0xfc,
+ 0xac, 0x3c, 0x82, 0x9e, 0x5f, 0xbc, 0x07, 0xd9, 0x6f, 0xbd, 0xf0, 0xf1, 0x9d,
+ 0x3b, 0x09, 0xcd, 0x07, 0xbd, 0x99, 0xc1, 0x87, 0x3d, 0xfa, 0xef, 0x73, 0x3d,
+ 0xe5, 0x18, 0xfc, 0x3c, 0xbc, 0x08, 0x06, 0x3d, 0x5e, 0x91, 0x90, 0xbd, 0x9c,
+ 0x69, 0xf7, 0x3b, 0x71, 0x14, 0xef, 0xbc, 0x90, 0x77, 0xf9, 0x3c, 0x4c, 0x17,
+ 0x6e, 0xbd, 0x59, 0x66, 0xe5, 0xbb, 0x6d, 0x0b, 0x5f, 0xbc, 0x8a, 0xde, 0x57,
+ 0x3d, 0xdf, 0x37, 0x84, 0xbd, 0x6a, 0x62, 0x7b, 0x3d, 0x19, 0x4c, 0xc5, 0xbc,
+ 0xf0, 0x81, 0x2b, 0x3d, 0x0c, 0xe8, 0x3f, 0xbd, 0x2c, 0xac, 0x36, 0xbd, 0x2a,
+ 0x6a, 0x2e, 0x3d, 0x90, 0xcc, 0x94, 0xbb, 0x07, 0xfd, 0x28, 0xbd, 0x5e, 0x9f,
+ 0xb7, 0x3b, 0xcc, 0xf7, 0x83, 0xbd, 0x2e, 0x4f, 0xa0, 0xbc, 0x06, 0x60, 0xcc,
+ 0x3c, 0xc6, 0xbf, 0x5d, 0x3c, 0x48, 0x40, 0x6b, 0xbd, 0x69, 0x48, 0x03, 0x3d,
+ 0x75, 0x47, 0x48, 0x3d, 0xc4, 0x2f, 0x0f, 0x3d, 0x2d, 0xa5, 0x6e, 0xbd, 0x5a,
+ 0x05, 0x41, 0xbd, 0x7c, 0x10, 0xff, 0x3c, 0x2c, 0x2e, 0x78, 0xbd, 0x16, 0x4f,
+ 0x7d, 0x3d, 0xcf, 0x20, 0x5f, 0x3d, 0xd7, 0x5c, 0x87, 0xbd, 0x96, 0x63, 0x1e,
+ 0xbc, 0x2b, 0xf3, 0x8c, 0xbc, 0x6e, 0x52, 0x00, 0xbd, 0xb0, 0xb0, 0x47, 0x3d,
+ 0x6e, 0x8c, 0xa2, 0xbc, 0x26, 0xa4, 0xbd, 0x3c, 0x50, 0xfb, 0xc4, 0xbc, 0x16,
+ 0xc5, 0xe2, 0x3c, 0x34, 0xbe, 0xba, 0xbc, 0x58, 0x77, 0x06, 0xbc, 0xb6, 0x0f,
+ 0x02, 0x3d, 0x00, 0xc0, 0x67, 0xbd, 0x19, 0x7b, 0x0f, 0xbd, 0xdf, 0xca, 0x42,
+ 0xbd, 0x28, 0x6b, 0x5d, 0xbd, 0xe8, 0x7b, 0x0b, 0x3d, 0x0f, 0xd3, 0x9b, 0xbc,
+ 0x0e, 0x94, 0x3c, 0x3d, 0x56, 0xcd, 0x32, 0xbd, 0x39, 0x73, 0x82, 0xbd, 0x32,
+ 0x4b, 0x06, 0xbd, 0x77, 0xbe, 0x35, 0xbd, 0x4f, 0x03, 0x0b, 0x3d, 0x40, 0x14,
+ 0x8b, 0x3d, 0xe0, 0x32, 0x60, 0xbd, 0x4f, 0xd0, 0x85, 0x3d, 0x0f, 0xfc, 0x74,
+ 0xbc, 0xa1, 0xfc, 0xfa, 0xbb, 0x83, 0x11, 0x49, 0x3b, 0x48, 0x21, 0x1b, 0xbc,
+ 0x4d, 0x36, 0xe6, 0xbc, 0x27, 0x47, 0x6c, 0xbc, 0x6f, 0x04, 0x37, 0xbd, 0xc6,
+ 0x57, 0x6a, 0x3d, 0xa0, 0x16, 0x4d, 0x3b, 0x1a, 0xeb, 0x55, 0x3d, 0x6e, 0x5f,
+ 0x2d, 0xbd, 0xde, 0xff, 0x65, 0xbd, 0x68, 0x46, 0x49, 0x3c, 0x3c, 0x27, 0x3c,
+ 0xbd, 0xfd, 0xdc, 0x0e, 0xbd, 0xb9, 0xff, 0x24, 0xbd, 0xf0, 0x8f, 0x5c, 0xbd,
+ 0xa8, 0x9d, 0x32, 0x3d, 0x5c, 0x6d, 0x4d, 0xbd, 0x0d, 0xc2, 0x47, 0x3d, 0xf5,
+ 0xe0, 0x8b, 0x3c, 0x4e, 0xd4, 0xfb, 0xbc, 0x2f, 0xef, 0x7d, 0x3d, 0x0d, 0xbf,
+ 0x03, 0x3d, 0x54, 0x6e, 0x16, 0x3d, 0x51, 0x8b, 0x85, 0xbd, 0xac, 0x6b, 0x19,
+ 0xbb, 0x2e, 0x99, 0x9e, 0x3c, 0xd9, 0xa5, 0x35, 0x3d, 0x90, 0x56, 0x59, 0x3d,
+ 0xda, 0xee, 0x7c, 0x3d, 0x63, 0x87, 0x1b, 0xbb, 0x12, 0x90, 0x39, 0xbd, 0x4b,
+ 0xb8, 0x39, 0x3d, 0x3f, 0x49, 0x94, 0xbc, 0xeb, 0x8f, 0x80, 0x3d, 0x8a, 0x9f,
+ 0x81, 0xbd, 0xdb, 0x11, 0x0c, 0x3d, 0x13, 0x28, 0x29, 0x3d, 0x70, 0x84, 0xfc,
+ 0xbc, 0x48, 0x74, 0x10, 0x3c, 0xcc, 0xb3, 0x30, 0xbd, 0x48, 0x07, 0x16, 0x3c,
+ 0x5d, 0x4f, 0x19, 0xbd, 0x2b, 0x80, 0xf7, 0xbb, 0x16, 0x87, 0x08, 0xbd, 0x07,
+ 0x00, 0x88, 0x3d, 0x12, 0x69, 0x44, 0x3d, 0x18, 0x31, 0x0d, 0x3c, 0x57, 0xd3,
+ 0x06, 0x3d, 0x24, 0x3d, 0x07, 0x3d, 0xcc, 0x07, 0x7f, 0x3d, 0xab, 0x2a, 0x79,
+ 0xbd, 0x7e, 0x3c, 0x79, 0xbd, 0xa9, 0x22, 0xfb, 0xbc, 0x3d, 0xa3, 0x3f, 0x3d,
+ 0x9b, 0x63, 0x40, 0x3c, 0x8f, 0xd5, 0x9b, 0x3c, 0x38, 0x24, 0x2b, 0x3d, 0x73,
+ 0x53, 0x02, 0x3d, 0xf4, 0xe3, 0xfb, 0x3c, 0xab, 0x4b, 0x81, 0x3d, 0x6c, 0x44,
+ 0x17, 0x3d, 0xe9, 0xbe, 0x8e, 0x3d, 0x79, 0xc1, 0x23, 0x3c, 0x19, 0xfd, 0x91,
+ 0x3c, 0xf9, 0xea, 0x83, 0x3c, 0x5a, 0xee, 0x86, 0x3c, 0xa7, 0x51, 0x2f, 0xbd,
+ 0x4a, 0xa1, 0x43, 0x3d, 0xf7, 0xc3, 0xdd, 0x3b, 0x41, 0x5d, 0x48, 0xbd, 0x91,
+ 0x94, 0x92, 0xbd, 0x76, 0xb0, 0x87, 0x3d, 0xad, 0x39, 0x8e, 0x3d, 0xa0, 0x5a,
+ 0xc3, 0xbb, 0x13, 0xd2, 0x42, 0xbd, 0x93, 0x32, 0x41, 0xbc, 0x02, 0x56, 0x91,
+ 0xbd, 0x6e, 0x37, 0x12, 0xbd, 0x70, 0x73, 0xe7, 0x3b, 0x85, 0xd7, 0x78, 0x3b,
+ 0xb0, 0xfb, 0x3f, 0xbd, 0x44, 0xb8, 0x2e, 0xbd, 0xcd, 0x1c, 0x92, 0xbd, 0x78,
+ 0xee, 0xe1, 0xbc, 0xb4, 0x56, 0x52, 0xbd, 0xa6, 0xbd, 0x62, 0x3d, 0xdc, 0x38,
+ 0xe8, 0xbc, 0x30, 0xaf, 0x68, 0x3c, 0xe0, 0x72, 0x05, 0xbc, 0x06, 0xad, 0xd5,
+ 0x3b, 0xd9, 0x62, 0x23, 0x3d, 0xf8, 0xa2, 0xee, 0xbc, 0x44, 0x13, 0x07, 0x3d,
+ 0x04, 0xcc, 0xf2, 0x3a, 0xce, 0x3f, 0x2c, 0x3d, 0x25, 0x8b, 0x28, 0x3c, 0x55,
+ 0xd2, 0x7a, 0xbc, 0x19, 0x6f, 0x83, 0x3d, 0x62, 0xaa, 0x32, 0xbd, 0xf2, 0x19,
+ 0x1c, 0xbc, 0x54, 0xc3, 0x8b, 0xbd, 0xdd, 0xeb, 0x52, 0x3c, 0x2a, 0xc7, 0x7c,
+ 0x3d, 0x04, 0xf0, 0xb9, 0x3b, 0xe8, 0x91, 0x84, 0x3d, 0x8d, 0xa2, 0xa3, 0x3c,
+ 0x01, 0xde, 0x7d, 0xbd, 0x14, 0xf3, 0x25, 0xbd, 0xde, 0x87, 0x8e, 0xbd, 0x6b,
+ 0x3b, 0x85, 0x3d, 0x02, 0x85, 0x84, 0xbd, 0x6b, 0x77, 0x6d, 0xbc, 0xb6, 0x9a,
+ 0x53, 0x3d, 0x0f, 0xb3, 0xaa, 0xbb, 0x13, 0x69, 0x55, 0xbd, 0x65, 0x98, 0x57,
+ 0xbd, 0xef, 0x9c, 0xb2, 0xbc, 0xd2, 0x02, 0xd4, 0x3c, 0x8e, 0xca, 0x27, 0x3d,
+ 0x64, 0xc8, 0x42, 0xbd, 0xca, 0x34, 0x39, 0xbd, 0xec, 0x45, 0x78, 0xbc, 0xe3,
+ 0xe3, 0x15, 0xbd, 0xad, 0x80, 0x30, 0x3d, 0xa3, 0xc8, 0x12, 0xbd, 0x11, 0x8e,
+ 0x40, 0x3d, 0x9a, 0x5f, 0x29, 0xbc, 0xbe, 0xc0, 0x8e, 0xbd, 0x2e, 0x01, 0x05,
+ 0xba, 0xde, 0x16, 0x2d, 0x3d, 0xce, 0xc7, 0x68, 0x3d, 0x08, 0x78, 0x4b, 0x3d,
+ 0xb9, 0xc7, 0x8f, 0xbd, 0x99, 0x7d, 0x71, 0x3d, 0x20, 0x52, 0x85, 0x3b, 0x8e,
+ 0x86, 0xcc, 0xbc, 0x18, 0x1e, 0x1e, 0x3d, 0x06, 0x84, 0x35, 0x3d, 0xd8, 0x65,
+ 0x71, 0xbd, 0xb1, 0x95, 0x1e, 0x3d, 0xa8, 0x12, 0x4f, 0x3d, 0xf0, 0x82, 0x6b,
+ 0x3c, 0x82, 0x05, 0x05, 0xbd, 0x78, 0x40, 0xef, 0x3c, 0xea, 0xf1, 0x91, 0xbd,
+ 0x06, 0x99, 0x82, 0x3d, 0x65, 0x80, 0x81, 0xbc, 0xc7, 0xd2, 0x98, 0xbc, 0x1b,
+ 0xab, 0x8c, 0x3b, 0x8d, 0xe6, 0xa2, 0x3c, 0x5a, 0xb0, 0xe8, 0xbc, 0x74, 0x5c,
+ 0x65, 0x3c, 0x53, 0x81, 0x88, 0x3d, 0x77, 0xe4, 0x83, 0xbd, 0x05, 0x68, 0x3f,
+ 0xbd, 0x7f, 0xa0, 0x34, 0xbd, 0x23, 0xc6, 0x57, 0xbd, 0xe8, 0x03, 0x4c, 0xbd,
+ 0xef, 0x5a, 0x91, 0x3c, 0x85, 0x78, 0x46, 0xbd, 0xc3, 0x5f, 0x2e, 0xbd, 0x38,
+ 0x74, 0x09, 0x3d, 0x71, 0x8d, 0x2a, 0xbd, 0x7c, 0xb3, 0x40, 0x3d, 0x26, 0xf6,
+ 0x72, 0xbd, 0x84, 0xfa, 0x4f, 0xbd, 0x34, 0x53, 0xa7, 0x3c, 0x2c, 0x63, 0x6f,
+ 0x3d, 0xe4, 0xa4, 0x29, 0xbd, 0x00, 0x17, 0x21, 0xbb, 0x82, 0x9e, 0x6f, 0x3d,
+ 0x8a, 0x61, 0x8d, 0xbd, 0xc4, 0xd7, 0x45, 0x3d, 0x20, 0x1a, 0xce, 0x3c, 0x86,
+ 0x39, 0x27, 0xbd, 0xf1, 0x45, 0x1f, 0xbd, 0xe0, 0x3e, 0xd4, 0x3c, 0x8a, 0x80,
+ 0x70, 0xbc, 0x80, 0xae, 0xd4, 0x3c, 0x04, 0x93, 0x0a, 0x3d, 0xff, 0x3c, 0x78,
+ 0x3d, 0x31, 0x0e, 0x48, 0x3c, 0x20, 0xa8, 0x89, 0xbd, 0x98, 0x75, 0x07, 0xbc,
+ 0x68, 0xa1, 0x71, 0x3d, 0xe0, 0xe8, 0x8e, 0xbc, 0xe9, 0x29, 0x19, 0x3d, 0x79,
+ 0x7c, 0x4f, 0xbc, 0x90, 0x98, 0xd5, 0x3c, 0x3b, 0xec, 0x1c, 0xbd, 0x36, 0x46,
+ 0x84, 0xb9, 0x18, 0x09, 0x8a, 0xbc, 0x84, 0xce, 0x0d, 0xbc, 0xb8, 0x2c, 0xa8,
+ 0x3c, 0x20, 0x84, 0x18, 0xbc, 0xa0, 0x54, 0x72, 0xbd, 0x5f, 0xd9, 0x82, 0xbd,
+ 0xe7, 0x32, 0x69, 0xbc, 0x58, 0xf3, 0x30, 0xbc, 0x12, 0xff, 0x89, 0x3b, 0x38,
+ 0xb3, 0x50, 0x3c, 0x5c, 0xf7, 0x48, 0x3c, 0x40, 0xb3, 0xb9, 0x3c, 0x08, 0x01,
+ 0x2b, 0x3d, 0xcb, 0x34, 0xc0, 0xbc, 0x9c, 0x64, 0x51, 0xbd, 0x58, 0x1a, 0x2f,
+ 0xbd, 0x4a, 0x45, 0x8a, 0xbc, 0x6a, 0x88, 0xe3, 0x3b, 0xf2, 0xe0, 0x74, 0x3d,
+ 0x08, 0xa7, 0x2d, 0xbd, 0x73, 0x61, 0x17, 0xbd, 0xf0, 0xee, 0xce, 0xbc, 0xda,
+ 0xbc, 0x20, 0xbd, 0x57, 0x27, 0xc6, 0x3c, 0x3c, 0xfc, 0xb2, 0x3d, 0xf9, 0x52,
+ 0x72, 0x3d, 0x98, 0x21, 0x23, 0x3a, 0x64, 0x0e, 0x39, 0xbd, 0x3c, 0x50, 0xff,
+ 0xbd, 0xf0, 0xb9, 0x36, 0xbd, 0xff, 0xe2, 0xa3, 0x3d, 0x1c, 0xad, 0x24, 0xbd,
+ 0x17, 0x26, 0x4b, 0x3d, 0x32, 0xdb, 0xca, 0x3b, 0xc6, 0x04, 0x3c, 0x3d, 0x3c,
+ 0x98, 0x9c, 0x3d, 0xd7, 0xd3, 0x80, 0xbc, 0x30, 0x4e, 0xd9, 0x3c, 0xff, 0xc1,
+ 0x21, 0x3d, 0x66, 0xcc, 0xa5, 0xbc, 0x61, 0x87, 0x98, 0x3d, 0x98, 0x20, 0x32,
+ 0x3d, 0xec, 0xf1, 0x87, 0xbd, 0x40, 0x73, 0xb9, 0xbd, 0xed, 0x67, 0x98, 0x3d,
+ 0x82, 0xde, 0x83, 0x3c, 0xef, 0xb3, 0xe9, 0x3c, 0xf6, 0xd1, 0x2f, 0x3d, 0xb6,
+ 0xa2, 0x6c, 0xbd, 0xfa, 0x55, 0x87, 0xbd, 0x5e, 0x0d, 0x4b, 0xbd, 0x52, 0x83,
+ 0x1b, 0x3d, 0x38, 0xa3, 0x32, 0xbd, 0x68, 0xa3, 0xd0, 0x3c, 0x6b, 0x9b, 0x0e,
+ 0xbd, 0xe8, 0x58, 0x83, 0x3b, 0xac, 0xf2, 0x1d, 0x3d, 0xdc, 0x01, 0xfe, 0xbb,
+ 0x45, 0xd1, 0x37, 0x3d, 0x7d, 0x74, 0x10, 0x3d, 0x39, 0x6f, 0x42, 0xbd, 0x1f,
+ 0x11, 0xd3, 0xbc, 0x58, 0x36, 0x98, 0x3d, 0xe6, 0x99, 0x19, 0xbd, 0x2e, 0x3f,
+ 0x44, 0x3c, 0x04, 0xd0, 0x08, 0xbd, 0x9e, 0x8c, 0x74, 0xbc, 0x73, 0x43, 0xeb,
+ 0xbc, 0xa2, 0x01, 0x9b, 0xbd, 0x30, 0x8a, 0x29, 0xbd, 0x4d, 0xe1, 0x50, 0xbd,
+ 0xc8, 0x2a, 0x1d, 0x3d, 0x2d, 0x12, 0x7d, 0x3d, 0xdd, 0x75, 0x24, 0xbc, 0xd7,
+ 0x2b, 0x48, 0x3c, 0x84, 0x77, 0xf0, 0x3c, 0xf8, 0x69, 0x8a, 0x3d, 0x0d, 0x62,
+ 0x23, 0x3d, 0x8d, 0x2a, 0x65, 0x3d, 0x33, 0xc6, 0xce, 0x3b, 0x34, 0xb9, 0x97,
+ 0x3b, 0xf3, 0x86, 0xe2, 0xbb, 0x5d, 0x2a, 0x53, 0xbd, 0xea, 0x2b, 0x9a, 0xba,
+ 0xbf, 0xd8, 0x91, 0xbc, 0x3d, 0x5f, 0xfa, 0xbc, 0x04, 0x71, 0x82, 0x3d, 0x02,
+ 0x09, 0xbe, 0x3d, 0xa2, 0xb3, 0xad, 0x3c, 0x6c, 0x47, 0x28, 0xbd, 0xce, 0xd6,
+ 0x16, 0xbd, 0x95, 0x44, 0xff, 0x3c, 0x6c, 0x62, 0x82, 0x3d, 0x2a, 0x15, 0xba,
+ 0xbc, 0xc1, 0xa7, 0x83, 0xbb, 0x69, 0x42, 0x7c, 0xbd, 0x03, 0x6e, 0x01, 0x3d,
+ 0xd9, 0x8c, 0x1b, 0xbd, 0xc7, 0x85, 0xdc, 0x3c, 0x76, 0x04, 0x4d, 0x3d, 0x99,
+ 0x3b, 0x69, 0x3c, 0xee, 0x8a, 0x6f, 0x3d, 0x2c, 0xb5, 0x34, 0xbd, 0x95, 0xc2,
+ 0x32, 0xbd, 0x34, 0x5b, 0x8a, 0x3c, 0x0d, 0x52, 0x44, 0xbb, 0xe8, 0xfd, 0xe3,
+ 0xbc, 0x6c, 0x8f, 0x6c, 0x3d, 0x22, 0xe9, 0xce, 0xbc, 0x38, 0x1d, 0xa4, 0x3d,
+ 0x37, 0xb9, 0xcc, 0xbb, 0x58, 0x8e, 0xbb, 0xbc, 0x13, 0x85, 0x8d, 0x3d, 0x7b,
+ 0x10, 0x9d, 0xbd, 0xb0, 0x74, 0x20, 0xbd, 0xbf, 0x6b, 0x24, 0xbc, 0x0b, 0xb2,
+ 0x6f, 0xbd, 0xbe, 0x9c, 0xae, 0x3d, 0x64, 0xfc, 0x34, 0x3d, 0x84, 0x44, 0x59,
+ 0x3b, 0xc5, 0x97, 0xb6, 0xbc, 0x25, 0x1b, 0x42, 0xbd, 0x1c, 0x64, 0x59, 0x3d,
+ 0x00, 0x12, 0x82, 0x3d, 0x64, 0xac, 0x91, 0x3b, 0x3b, 0xae, 0x6b, 0xbd, 0x18,
+ 0x6c, 0xd0, 0x3d, 0x9e, 0xea, 0x60, 0x3d, 0xf3, 0xf6, 0x49, 0xbd, 0xd3, 0xfc,
+ 0x5b, 0xbc, 0xe5, 0x37, 0x64, 0x3c, 0xbe, 0x33, 0x9c, 0xbc, 0x0e, 0x7a, 0x70,
+ 0xbd, 0xf7, 0x19, 0x32, 0xbd, 0x7a, 0x54, 0xac, 0xbd, 0x94, 0x9a, 0x45, 0xbc,
+ 0xb6, 0xa0, 0x55, 0x3d, 0x72, 0x8b, 0x81, 0x3d, 0xec, 0xf7, 0x1d, 0x3c, 0x7c,
+ 0xc0, 0x65, 0xbd, 0x21, 0x3d, 0xa8, 0x3d, 0xfe, 0x98, 0x91, 0xbc, 0xfc, 0x4e,
+ 0x99, 0xbd, 0xd5, 0x77, 0xa0, 0xbd, 0x9a, 0xec, 0x0b, 0x3d, 0xc2, 0xc5, 0x2e,
+ 0xbd, 0x58, 0x39, 0x9b, 0x3d, 0x1a, 0x19, 0x4e, 0xbd, 0x32, 0x1e, 0x11, 0xbd,
+ 0xe2, 0x81, 0x2f, 0xbd, 0x72, 0x93, 0x82, 0x3d, 0xb5, 0x33, 0x96, 0x3d, 0xfd,
+ 0x32, 0x31, 0xbd, 0xf0, 0x5e, 0x7b, 0xbd, 0x37, 0x76, 0x4d, 0xbd, 0x5e, 0xa1,
+ 0x9a, 0x3d, 0x58, 0xb2, 0x89, 0xbd, 0xc0, 0x61, 0x93, 0x3a, 0x12, 0xf4, 0x7a,
+ 0x3d, 0xad, 0xe5, 0x32, 0xba, 0xf3, 0xfe, 0x75, 0x3d, 0xbd, 0xec, 0x57, 0xbd,
+ 0x4d, 0x5b, 0x09, 0x3d, 0x27, 0x1d, 0x1b, 0xbd, 0x26, 0x5e, 0x77, 0xbc, 0x33,
+ 0xd7, 0x30, 0xbd, 0x93, 0xde, 0x6d, 0xbd, 0xfe, 0xdd, 0x6f, 0x3d, 0x07, 0x21,
+ 0xad, 0x3d, 0xb6, 0xfb, 0x77, 0x3d, 0xc7, 0xd4, 0x12, 0x3d, 0xee, 0xd1, 0x1a,
+ 0x3b, 0x57, 0x6a, 0xdf, 0xbc, 0x9a, 0x69, 0x98, 0xbd, 0x18, 0xb5, 0x8b, 0xbd,
+ 0x3f, 0x2a, 0x1b, 0xbc, 0xba, 0x61, 0x4e, 0x3d, 0xf7, 0xfc, 0x15, 0x3d, 0x15,
+ 0x6a, 0x89, 0x3d, 0x0c, 0x26, 0x12, 0xbd, 0x3c, 0x56, 0x75, 0x3d, 0x31, 0x95,
+ 0x49, 0x3c, 0x80, 0x89, 0x27, 0xbd, 0xc5, 0xc8, 0x2d, 0xba, 0xd4, 0xb2, 0x99,
+ 0x3d, 0xbd, 0xfe, 0x19, 0xbd, 0x88, 0x62, 0x88, 0x3d, 0x1a, 0xea, 0xb6, 0x3d,
+ 0x06, 0xc5, 0x95, 0xbd, 0xbe, 0x0c, 0x2d, 0xbd, 0x09, 0x1b, 0x59, 0x3d, 0xf7,
+ 0xd4, 0xbe, 0xba, 0x23, 0x7e, 0x0d, 0xbd, 0x3f, 0x6a, 0x9f, 0x3c, 0x29, 0x6c,
+ 0x86, 0x3c, 0x50, 0x53, 0xad, 0xbc, 0x4d, 0x7e, 0xd5, 0xbd, 0xd2, 0xac, 0x6b,
+ 0x3d, 0xfd, 0xc0, 0x8d, 0xbd, 0x96, 0xc2, 0x3f, 0x3d, 0xc7, 0x50, 0x9d, 0xbc,
+ 0xf8, 0x74, 0xa7, 0xbc, 0x20, 0xcb, 0xbe, 0xbd, 0x39, 0xaa, 0x5d, 0x3d, 0x53,
+ 0x49, 0x99, 0xbc, 0xfe, 0x92, 0xca, 0xbd, 0xf2, 0x46, 0x75, 0xbd, 0x71, 0xfe,
+ 0x6e, 0xbd, 0x9f, 0x2f, 0x59, 0xbd, 0x0b, 0xe7, 0x3f, 0xbc, 0xad, 0x3f, 0x80,
+ 0x3d, 0xec, 0x4d, 0x81, 0xbd, 0x53, 0x8f, 0x8a, 0x3d, 0xfb, 0x2c, 0x54, 0x3d,
+ 0x20, 0x2c, 0x57, 0xbd, 0xc1, 0xeb, 0xe2, 0xba, 0x98, 0xed, 0x46, 0x3d, 0x6a,
+ 0x20, 0xc1, 0x3c, 0x54, 0x95, 0x2c, 0xbd, 0xac, 0xc1, 0x2b, 0x3c, 0x29, 0x2a,
+ 0xf8, 0xbd, 0x4e, 0x69, 0x7f, 0x3d, 0x17, 0x04, 0x29, 0xbd, 0xf2, 0xbb, 0xeb,
+ 0xbb, 0xf1, 0x49, 0x40, 0x3d, 0x00, 0x69, 0x01, 0x3d, 0x8d, 0x53, 0x64, 0x3d,
+ 0xb7, 0x21, 0x0b, 0xbd, 0x43, 0xc5, 0xc7, 0xbd, 0x1b, 0xa3, 0x48, 0x3d, 0xcb,
+ 0x7c, 0x09, 0xbd, 0x20, 0xcb, 0x6e, 0xbb, 0x94, 0x3f, 0x2e, 0x3d, 0xf7, 0x32,
+ 0x72, 0xbd, 0x9a, 0x1e, 0x40, 0xbd, 0x5b, 0xf3, 0x47, 0x3d, 0x02, 0xea, 0x77,
+ 0xba, 0x63, 0xf3, 0xe8, 0x3c, 0xac, 0x35, 0x06, 0xbd, 0xbd, 0x03, 0x4c, 0xbd,
+ 0x11, 0xf6, 0x92, 0x3d, 0x1b, 0x1a, 0x64, 0x3d, 0x51, 0x88, 0x58, 0xbc, 0x61,
+ 0xbf, 0x83, 0xbd, 0xdd, 0x44, 0x73, 0xbd, 0xe7, 0xe5, 0xd0, 0x3c, 0xc9, 0x5f,
+ 0x87, 0x3d, 0xec, 0x20, 0xbe, 0x3d, 0xd9, 0x21, 0x0f, 0x3d, 0xf9, 0xdd, 0xe7,
+ 0xbc, 0xf3, 0x32, 0x91, 0xbd, 0x71, 0xb6, 0x4a, 0x3d, 0x29, 0x35, 0x86, 0x3d,
+ 0xba, 0xf4, 0x40, 0xbd, 0x1c, 0x2b, 0x17, 0xbd, 0x70, 0xfb, 0x3c, 0xbd, 0xed,
+ 0x3e, 0xdf, 0xbc, 0x60, 0xf1, 0x3d, 0x3d, 0x53, 0x6e, 0x87, 0xbd, 0x0f, 0x52,
+ 0x3d, 0x3d, 0x58, 0xd1, 0x47, 0xbd, 0xab, 0x7f, 0xc3, 0x3c, 0x3d, 0x5d, 0xa8,
+ 0xbd, 0xe9, 0x7f, 0x11, 0xbd, 0x88, 0x93, 0x50, 0xbd, 0xf2, 0xd2, 0x0f, 0x3d,
+ 0x24, 0x59, 0x90, 0x3a, 0x99, 0x86, 0x8b, 0xbd, 0x27, 0x21, 0x5f, 0xbd, 0xf4,
+ 0xa1, 0x80, 0x3d, 0x0b, 0xbb, 0x89, 0x3c, 0xbc, 0xda, 0x79, 0x3d, 0xe8, 0x9b,
+ 0x56, 0xbc, 0x42, 0xca, 0xf1, 0x3c, 0x74, 0xe2, 0x86, 0x3c, 0xe4, 0x85, 0x0f,
+ 0x3d, 0x07, 0x57, 0x2e, 0x3d, 0x41, 0x24, 0x85, 0x3d, 0x48, 0x7e, 0x08, 0xbd,
+ 0x91, 0xa8, 0xdd, 0x3c, 0x8c, 0xe1, 0xb7, 0xbc, 0x04, 0xae, 0x2f, 0x3d, 0xe4,
+ 0x63, 0xa2, 0x3c, 0x6e, 0x28, 0x06, 0xbc, 0x8d, 0xd9, 0x67, 0xbd, 0x88, 0x14,
+ 0x43, 0x3d, 0xe5, 0x9a, 0xde, 0x3c, 0x45, 0x3e, 0x9d, 0x3d, 0x03, 0x22, 0xcb,
+ 0xbc, 0x71, 0x92, 0x7c, 0x3d, 0xf7, 0xc6, 0x0d, 0x3d, 0xfb, 0x47, 0xa4, 0x3d,
+ 0x45, 0x18, 0x91, 0xbd, 0xda, 0x0b, 0x79, 0xbc, 0x18, 0x17, 0x71, 0xbd, 0xa2,
+ 0x74, 0x4e, 0xbd, 0xd7, 0xdb, 0x46, 0x3d, 0x35, 0x53, 0xbb, 0x3c, 0x0c, 0x62,
+ 0x0f, 0xbc, 0xe9, 0x2d, 0xdf, 0xbd, 0x33, 0xc7, 0x60, 0x3c, 0x18, 0x74, 0xa8,
+ 0x3c, 0xa3, 0x75, 0x87, 0xbd, 0x7b, 0x58, 0xf3, 0xbd, 0x30, 0xcd, 0xfa, 0x3c,
+ 0x35, 0xbd, 0x9c, 0xbd, 0x93, 0xcf, 0xdb, 0xbc, 0xc2, 0x35, 0xd9, 0xbc, 0x5e,
+ 0x5a, 0x06, 0x3d, 0x3d, 0x8b, 0x39, 0xbd, 0xb7, 0x5d, 0x33, 0xbc, 0x50, 0xca,
+ 0xb8, 0x3c, 0x8b, 0x71, 0xfb, 0x3c, 0x80, 0x8e, 0x2a, 0x3d, 0xa0, 0x72, 0x80,
+ 0xbc, 0x08, 0x4a, 0x00, 0xbd, 0x9b, 0x6f, 0xd2, 0x3b, 0xda, 0x83, 0xf9, 0xbc,
+ 0xed, 0x0c, 0x0b, 0x3c, 0x5d, 0x80, 0x40, 0xbc, 0x84, 0x40, 0x25, 0xbd, 0x52,
+ 0x1e, 0x03, 0x3d, 0x53, 0xd4, 0x54, 0x3c, 0x0b, 0x6b, 0xda, 0x3c, 0xcc, 0x67,
+ 0x17, 0x3b, 0x58, 0x05, 0xe5, 0xba, 0x63, 0x8d, 0x95, 0x3c, 0xc6, 0xa5, 0x5a,
+ 0x3d, 0xdf, 0x29, 0x23, 0xbd, 0x4b, 0x72, 0x9b, 0x3d, 0xef, 0x78, 0x4b, 0xbd,
+ 0xa5, 0x08, 0xb7, 0xbd, 0x9c, 0xb5, 0x78, 0xbc, 0xdf, 0x0c, 0x88, 0x3d, 0x07,
+ 0xab, 0x19, 0x3d, 0xdc, 0xad, 0xc9, 0xbd, 0x5e, 0x37, 0x4f, 0x3d, 0xe6, 0x99,
+ 0x77, 0xbd, 0x12, 0x5f, 0x48, 0xbc, 0x89, 0x82, 0xf2, 0x3b, 0x86, 0x89, 0x44,
+ 0x3c, 0x66, 0x1b, 0xb7, 0xbc, 0x2f, 0x07, 0xd0, 0x3b, 0xb5, 0x85, 0x76, 0xb9,
+ 0xb2, 0xc4, 0x11, 0xbd, 0x5b, 0x02, 0x30, 0xbd, 0xed, 0xed, 0xee, 0x3c, 0x77,
+ 0xbd, 0x24, 0xbb, 0x36, 0xe9, 0x97, 0xbd, 0x2a, 0xe1, 0x6d, 0x3d, 0x75, 0x29,
+ 0xaf, 0x3d, 0xff, 0x38, 0xac, 0xbb, 0x76, 0x6d, 0xe4, 0xbc, 0xf8, 0x03, 0x15,
+ 0xbd, 0x6f, 0x3d, 0x9a, 0xbc, 0x6b, 0x64, 0x1f, 0x3d, 0xa6, 0x7c, 0x6f, 0xbd,
+ 0xa7, 0x60, 0x83, 0x3c, 0xe1, 0xa5, 0x53, 0xbd, 0x04, 0x4f, 0xb6, 0xbc, 0xe7,
+ 0x0b, 0x28, 0x3d, 0x4c, 0x15, 0xa9, 0xbc, 0x68, 0x90, 0x73, 0xbb, 0x77, 0x3e,
+ 0x8e, 0x3c, 0xdd, 0x42, 0x0c, 0xbd, 0x07, 0x7d, 0x22, 0xbd, 0x35, 0x15, 0x82,
+ 0xbd, 0xed, 0x56, 0xe0, 0x3c, 0xfa, 0x8d, 0x7e, 0x3d, 0xab, 0xb5, 0x85, 0xbd,
+ 0x8c, 0x4b, 0xa4, 0xbc, 0xe5, 0xee, 0x53, 0xbc, 0x9e, 0x26, 0x4f, 0xbd, 0xaa,
+ 0xdf, 0x63, 0xbd, 0xd2, 0x48, 0x11, 0x3c, 0xd6, 0x9c, 0x58, 0x3d, 0xa9, 0x90,
+ 0x00, 0x3d, 0x9b, 0xfa, 0x8c, 0x3b, 0x2a, 0x97, 0x1d, 0x3d, 0x37, 0xe9, 0x3e,
+ 0xbd, 0x51, 0xd8, 0xf0, 0xbd, 0x92, 0x65, 0x2b, 0xbd, 0x06, 0x73, 0x21, 0x3c,
+ 0x85, 0x89, 0xad, 0x3d, 0x50, 0x07, 0x60, 0x3d, 0x01, 0x61, 0x9a, 0x3d, 0xcf,
+ 0xba, 0x9c, 0x3d, 0x7c, 0x6f, 0x69, 0x3d, 0x20, 0x79, 0x71, 0xbd, 0xc8, 0x59,
+ 0xd1, 0xbc, 0x2f, 0x68, 0x1e, 0xbd, 0xb2, 0xed, 0x87, 0xbd, 0x3e, 0xe7, 0xa0,
+ 0xba, 0xb1, 0xf0, 0xd0, 0x3c, 0x1c, 0xf1, 0xdd, 0xbc, 0xb0, 0x4a, 0x83, 0xbb,
+ 0xb5, 0x00, 0x55, 0xbc, 0xc6, 0x63, 0x0b, 0x3d, 0xa8, 0x88, 0x2f, 0x3d, 0x3c,
+ 0x6e, 0xd7, 0x3c, 0x68, 0x1d, 0x14, 0xbc, 0xac, 0xd1, 0x37, 0x3d, 0x7f, 0xb7,
+ 0x66, 0x3d, 0xca, 0xd0, 0xc7, 0xbb, 0x72, 0x5a, 0x91, 0x3d, 0x64, 0x09, 0xaf,
+ 0x3c, 0xea, 0x7a, 0x0d, 0xbb, 0x87, 0xd8, 0x4f, 0xbb, 0x88, 0xdf, 0xa5, 0x3c,
+ 0x1a, 0xd5, 0x73, 0xbc, 0x55, 0x5b, 0xce, 0x3a, 0xff, 0x62, 0x16, 0x3d, 0xb9,
+ 0x06, 0xa8, 0xbd, 0xbc, 0x96, 0xc0, 0xbc, 0x77, 0x06, 0x17, 0xbc, 0xe9, 0xdf,
+ 0x7e, 0xba, 0x94, 0x5f, 0xcd, 0x3b, 0x7b, 0x66, 0xf2, 0xbc, 0xc3, 0xdf, 0x7d,
+ 0xbd, 0x9c, 0x07, 0x0e, 0xbd, 0xaa, 0x4e, 0x0a, 0xbd, 0x42, 0x2d, 0x7f, 0x3c,
+ 0x6f, 0x45, 0xb9, 0x3c, 0x6a, 0xf4, 0x2c, 0xbd, 0x66, 0x01, 0x23, 0xbd, 0x5a,
+ 0x2e, 0x12, 0xbc, 0x00, 0x0c, 0xc4, 0xbd, 0x56, 0xf3, 0xd9, 0xbc, 0x57, 0x20,
+ 0x14, 0xbd, 0x8f, 0xae, 0xbd, 0x3c, 0x0a, 0x85, 0xbb, 0xbd, 0x51, 0x63, 0x28,
+ 0xbd, 0xc3, 0x45, 0x19, 0xbd, 0x1a, 0xc0, 0x66, 0x3d, 0x58, 0xac, 0x77, 0xbd,
+ 0x2e, 0xb6, 0xdc, 0xbc, 0xaa, 0x45, 0xe6, 0xbc, 0x06, 0xba, 0x43, 0xbd, 0x71,
+ 0x36, 0xac, 0x3d, 0xf5, 0xcb, 0x96, 0x3d, 0x5b, 0x32, 0x58, 0xba, 0x6a, 0xe8,
+ 0xe0, 0xb9, 0x39, 0xb6, 0xbe, 0x3c, 0x56, 0xcc, 0xc5, 0x3b, 0x6b, 0xde, 0xad,
+ 0xbc, 0x6c, 0xd9, 0xf4, 0xbc, 0xb2, 0xe9, 0x43, 0x3d, 0xf9, 0xd2, 0x1b, 0xbc,
+ 0xb1, 0x0f, 0x19, 0x3d, 0xb3, 0xe0, 0x05, 0x3b, 0xdd, 0x85, 0xa8, 0x3d, 0x92,
+ 0x70, 0xc0, 0xbc, 0xaf, 0xa0, 0x22, 0xbd, 0x9f, 0x05, 0x33, 0xbd, 0x4a, 0xe4,
+ 0xa8, 0x3c, 0x80, 0xf3, 0xc9, 0xba, 0x9f, 0x4c, 0x31, 0xbd, 0x5e, 0x75, 0xa4,
+ 0xbc, 0x4e, 0xa3, 0x73, 0xbd, 0x32, 0x14, 0x96, 0xbd, 0xf1, 0xc8, 0xb1, 0x3c,
+ 0xa6, 0x72, 0x15, 0xbd, 0x06, 0xbc, 0x4c, 0x3d, 0xd6, 0x84, 0x96, 0x3b, 0xbd,
+ 0x95, 0x27, 0x3d, 0x89, 0x66, 0xd8, 0x3c, 0x14, 0xc8, 0xf8, 0xbc, 0x48, 0xc6,
+ 0x2a, 0x3d, 0x68, 0x7c, 0xa4, 0x3d, 0x0b, 0xfe, 0x48, 0x3d, 0x03, 0x4e, 0xa0,
+ 0x3c, 0x14, 0xeb, 0x9e, 0x3d, 0x54, 0x79, 0x17, 0xbd, 0x8d, 0xe5, 0x44, 0x3c,
+ 0x89, 0xb2, 0x14, 0xbc, 0x37, 0x64, 0x98, 0x3d, 0xd5, 0x7d, 0x54, 0xbd, 0x82,
+ 0x97, 0x92, 0xbd, 0x97, 0x4c, 0x7c, 0x3b, 0xf8, 0x3f, 0x2b, 0x3d, 0xa2, 0x52,
+ 0xc8, 0x3c, 0x67, 0x7b, 0x49, 0xbd, 0x8b, 0xdc, 0x84, 0xbc, 0xfc, 0xd2, 0x1c,
+ 0xbd, 0x50, 0x53, 0x8d, 0xbb, 0xa7, 0x93, 0xfe, 0xbc, 0xab, 0xb3, 0xff, 0xbc,
+ 0xb0, 0x0d, 0x12, 0x3c, 0x90, 0xde, 0x69, 0x3d, 0x19, 0x4a, 0x31, 0x3d, 0xba,
+ 0x86, 0xbe, 0xbd, 0xf0, 0xd1, 0x6f, 0xbd, 0x2a, 0x37, 0xa2, 0x3c, 0xba, 0x72,
+ 0x91, 0xbc, 0x69, 0xfe, 0x8f, 0xbb, 0xb4, 0xe0, 0x26, 0x3d, 0x9e, 0x8e, 0x6f,
+ 0x3d, 0x28, 0x1c, 0xa4, 0xbc, 0xeb, 0x11, 0x0b, 0x3d, 0xd3, 0x1a, 0x27, 0x3c,
+ 0x89, 0x93, 0xa3, 0x3d, 0x22, 0xbf, 0x46, 0x3d, 0xe2, 0x27, 0xe5, 0xbc, 0xa1,
+ 0x10, 0x8a, 0xbc, 0xe9, 0x93, 0x65, 0xbd, 0xef, 0x81, 0xce, 0x3c, 0x0c, 0x10,
+ 0x44, 0x3c, 0xdc, 0x0d, 0x15, 0xbd, 0x8d, 0x3b, 0x09, 0x3d, 0xc2, 0xe2, 0x35,
+ 0xbd, 0xc3, 0xde, 0x09, 0x3c, 0x68, 0xc5, 0x8f, 0x3d, 0xa2, 0xb3, 0x38, 0x3d,
+ 0x94, 0xa6, 0x66, 0x3c, 0x5f, 0x15, 0x79, 0x3d, 0x74, 0x80, 0x7e, 0x3d, 0x00,
+ 0xb6, 0xb0, 0xbb, 0xdb, 0xb6, 0x98, 0xbb, 0x8c, 0x1a, 0xb7, 0xbc, 0xa0, 0xf9,
+ 0x7e, 0x3c, 0x66, 0x95, 0x47, 0x3d, 0xca, 0x33, 0xf0, 0xbc, 0xde, 0x00, 0xfa,
+ 0x3b, 0x57, 0x05, 0xfb, 0xbb, 0xfc, 0x7f, 0xcb, 0xbc, 0x31, 0x1c, 0x11, 0x3d,
+ 0x16, 0xe4, 0xfd, 0x3b, 0x3d, 0xd5, 0xb5, 0x3c, 0x8c, 0xd4, 0x69, 0xbd, 0x40,
+ 0x7f, 0x87, 0xbb, 0x26, 0x9d, 0x77, 0xbc, 0x6b, 0xa7, 0xde, 0x3c, 0xf4, 0xd2,
+ 0x00, 0x3c, 0xff, 0x0d, 0xbc, 0x3c, 0xab, 0xfb, 0x6f, 0x3d, 0x5a, 0x15, 0x8b,
+ 0x3b, 0x05, 0x27, 0x77, 0x3d, 0xd8, 0xa8, 0x54, 0x3d, 0xa7, 0xf2, 0x01, 0x3d,
+ 0x20, 0x41, 0x70, 0x3c, 0x19, 0x99, 0xfd, 0xbc, 0xc0, 0xea, 0x48, 0x3d, 0xd7,
+ 0x09, 0x26, 0x3b, 0x79, 0x58, 0x6b, 0x3d, 0x2b, 0x43, 0x2e, 0xbd, 0x58, 0x06,
+ 0x76, 0x3c, 0xc3, 0x4a, 0x8c, 0x3d, 0x4b, 0x5b, 0x62, 0x3d, 0xb2, 0xff, 0x1f,
+ 0xbd, 0xeb, 0x73, 0x08, 0x3d, 0x39, 0xd4, 0x77, 0xbd, 0xfc, 0x94, 0x83, 0xbc,
+ 0x0e, 0x0d, 0x6c, 0x3d, 0x5c, 0x29, 0x73, 0x3d, 0x96, 0xc4, 0x92, 0xba, 0x00,
+ 0x64, 0x97, 0xbd, 0x3b, 0x52, 0x3a, 0xbd, 0x3a, 0x2d, 0x91, 0xbd, 0x62, 0x65,
+ 0x97, 0xbd, 0x72, 0xde, 0xd2, 0xbd, 0x1d, 0x30, 0x00, 0xbd, 0x74, 0x93, 0x95,
+ 0xbd, 0xae, 0x2c, 0xd7, 0xbc, 0xe3, 0xae, 0x27, 0x3d, 0x67, 0x7f, 0x0b, 0x3c,
+ 0xfc, 0xcf, 0x74, 0xbc, 0x7f, 0x2b, 0x74, 0x3d, 0x00, 0x49, 0xa2, 0xba, 0x13,
+ 0xfa, 0x0e, 0xbd, 0x7e, 0xfe, 0x9f, 0xbc, 0xa6, 0x05, 0xc7, 0xbb, 0xc2, 0xa7,
+ 0x2a, 0xbc, 0xb3, 0x63, 0x9b, 0x3a, 0x9c, 0x14, 0x0e, 0x3d, 0x82, 0xc6, 0xb0,
+ 0xbc, 0xc1, 0x25, 0xc0, 0x3c, 0x03, 0x95, 0x45, 0xbd, 0x61, 0xb6, 0x50, 0xbd,
+ 0xf8, 0x77, 0xea, 0x3a, 0x9d, 0xa7, 0xaa, 0x3a, 0xf2, 0x18, 0x1d, 0xbd, 0x42,
+ 0x15, 0x94, 0x3d, 0x7e, 0x0e, 0x47, 0xbd, 0xa5, 0x82, 0x84, 0x3d, 0xed, 0xbe,
+ 0x3b, 0x3d, 0x3b, 0xdc, 0x2e, 0xbd, 0x5c, 0x8c, 0x4b, 0xbd, 0x37, 0xbc, 0x99,
+ 0xbb, 0xb7, 0x55, 0x54, 0x3d, 0x8e, 0x6d, 0xa8, 0xbd, 0x09, 0x3c, 0x3f, 0x3d,
+ 0x83, 0x0e, 0x3a, 0xbd, 0x8f, 0x1f, 0x91, 0x3d, 0x8b, 0x2b, 0x33, 0xbd, 0x92,
+ 0x57, 0x58, 0x3d, 0x71, 0xcd, 0x27, 0xbd, 0xcf, 0x53, 0x30, 0x3d, 0x20, 0x81,
+ 0x64, 0x3d, 0x50, 0x82, 0x60, 0xbd, 0x98, 0x46, 0x2f, 0x3d, 0x32, 0x95, 0x28,
+ 0xbd, 0x70, 0xf5, 0x71, 0x3c, 0x9d, 0x96, 0xb0, 0xbc, 0x5b, 0x59, 0x56, 0xbd,
+ 0x10, 0x59, 0x90, 0x3d, 0xc0, 0x1e, 0xbb, 0x3c, 0x5c, 0x37, 0x9d, 0x3d, 0xbd,
+ 0x75, 0x61, 0x3d, 0xcf, 0x8b, 0x84, 0xbc, 0xb2, 0x23, 0x46, 0x3d, 0x0a, 0x82,
+ 0x02, 0x3d, 0xaf, 0xd4, 0x8e, 0xbb, 0x60, 0x87, 0xca, 0x3c, 0xdb, 0x73, 0x1a,
+ 0xbd, 0x52, 0xa2, 0x09, 0x3d, 0xa2, 0x5b, 0x4a, 0xbd, 0x1d, 0x5d, 0xa0, 0xbb,
+ 0x30, 0x20, 0x7e, 0xbd, 0x84, 0x2a, 0x78, 0xbd, 0x74, 0x5f, 0x6a, 0xbd, 0xa5,
+ 0x1a, 0xa5, 0xbd, 0xa8, 0x46, 0x92, 0x3c, 0xe5, 0x7e, 0x50, 0xbd, 0xc1, 0x19,
+ 0x4b, 0x3c, 0x1a, 0x20, 0x71, 0x3d, 0xa1, 0xa7, 0x48, 0xbc, 0xc3, 0xa7, 0xeb,
+ 0x3c, 0xd4, 0x58, 0x6c, 0xbd, 0x06, 0x40, 0x08, 0x3d, 0x07, 0x97, 0x93, 0x3d,
+ 0x36, 0xb8, 0x5c, 0xbd, 0x69, 0x31, 0xc4, 0x3d, 0x5d, 0x20, 0x62, 0xbc, 0x73,
+ 0x3a, 0xbf, 0xbc, 0xea, 0xff, 0x3f, 0x3d, 0x39, 0x07, 0xec, 0x3c, 0xeb, 0x30,
+ 0xb4, 0xbb, 0x0b, 0x38, 0x72, 0xbd, 0x12, 0x71, 0xfd, 0xbc, 0xc5, 0x09, 0x82,
+ 0x3b, 0x5d, 0x51, 0x84, 0xbd, 0xff, 0x16, 0x49, 0xbd, 0x5e, 0xd1, 0x13, 0xbd,
+ 0xd8, 0xaf, 0x96, 0x3c, 0xea, 0x7c, 0x7e, 0xbd, 0x9b, 0x71, 0x1c, 0x3d, 0xe0,
+ 0xff, 0xaf, 0xbc, 0xac, 0x24, 0x57, 0x3d, 0x8a, 0xf8, 0x49, 0x3d, 0x24, 0xfd,
+ 0xbc, 0xbc, 0x46, 0x2c, 0xac, 0xbd, 0xc8, 0xdf, 0x63, 0xbc, 0x61, 0xc6, 0x2e,
+ 0xbd, 0x9d, 0xec, 0xd9, 0xbc, 0xb1, 0x44, 0x86, 0xbd, 0x85, 0x38, 0x47, 0x3d,
+ 0x7b, 0x49, 0x5a, 0xbd, 0xb0, 0x9c, 0xee, 0xbc, 0x03, 0x6f, 0x33, 0xbd, 0x55,
+ 0x8c, 0x23, 0xbc, 0xd5, 0xcc, 0x82, 0xbc, 0x82, 0xc2, 0xcc, 0xbc, 0xac, 0x00,
+ 0x85, 0x3c, 0xf6, 0xf5, 0x70, 0x3d, 0xb0, 0x0f, 0x03, 0x37, 0xa3, 0xfd, 0x5a,
+ 0xbd, 0x13, 0x57, 0x38, 0x3c, 0x25, 0xe4, 0xea, 0xbc, 0x1a, 0xb8, 0x0e, 0x3c,
+ 0x80, 0x95, 0x20, 0xbb, 0x84, 0x35, 0x36, 0x3d, 0x27, 0x0c, 0x1f, 0xbd, 0x4e,
+ 0x46, 0x8d, 0x3d, 0xa4, 0xb0, 0xef, 0x3c, 0xe1, 0xf5, 0xce, 0xbc, 0x34, 0x54,
+ 0x9d, 0xbc, 0x9f, 0x03, 0xd9, 0x3b, 0x22, 0xe9, 0xed, 0xbc, 0xd3, 0x7d, 0x30,
+ 0xbd, 0xb8, 0x86, 0x1f, 0xbc, 0xed, 0xc3, 0x44, 0x3d, 0xbf, 0x32, 0xa1, 0x39,
+ 0x74, 0xe5, 0x38, 0xbd, 0xa3, 0xe4, 0x6c, 0xbd, 0x56, 0x19, 0x33, 0xbd, 0x17,
+ 0x60, 0xbd, 0xbc, 0xd5, 0xec, 0x4a, 0x3c, 0xa2, 0x27, 0xa4, 0x3d, 0x50, 0xea,
+ 0x77, 0xbd, 0x5a, 0xb3, 0x91, 0x39, 0xf3, 0xc2, 0x19, 0x3d, 0xd2, 0xb9, 0x4f,
+ 0xbd, 0x60, 0x90, 0x81, 0x3d, 0xbf, 0x14, 0x60, 0xbd, 0x7a, 0xdd, 0x62, 0x3c,
+ 0x43, 0x4c, 0xa5, 0xbb, 0xad, 0x1c, 0xe1, 0xbc, 0xc8, 0x0b, 0x15, 0x3d, 0xe1,
+ 0xbd, 0x0f, 0x3d, 0xc6, 0x1f, 0x92, 0x3d, 0xdf, 0x9a, 0x86, 0xbd, 0x08, 0x1a,
+ 0xed, 0x3c, 0xfa, 0x1f, 0x00, 0x3c, 0x90, 0x94, 0x1b, 0x3d, 0x4a, 0x1c, 0x25,
+ 0xbd, 0x79, 0xe4, 0xff, 0xbc, 0xdf, 0xeb, 0x91, 0x3d, 0x43, 0x22, 0x81, 0x3d,
+ 0x1f, 0x1c, 0xa2, 0xbd, 0x54, 0xaf, 0x48, 0xbd, 0xbb, 0x7d, 0x4a, 0x3c, 0x32,
+ 0xcd, 0x6a, 0x3d, 0xc0, 0x75, 0x8b, 0x3d, 0x9a, 0xad, 0x67, 0x3c, 0xd1, 0xe6,
+ 0x30, 0xbd, 0x85, 0x2b, 0x33, 0x3c, 0xee, 0x90, 0x69, 0x3b, 0x7b, 0xdc, 0x96,
+ 0xbd, 0x38, 0x29, 0xad, 0x3b, 0xd8, 0x2b, 0xff, 0xbb, 0x72, 0x62, 0x57, 0x3c,
+ 0x55, 0x29, 0x86, 0x3d, 0xc7, 0x7c, 0x90, 0xbd, 0xfa, 0xa6, 0x71, 0xbd, 0x7f,
+ 0x51, 0x15, 0x3c, 0x7a, 0x11, 0x61, 0xbd, 0xd8, 0xd1, 0x64, 0x3b, 0xbc, 0x7e,
+ 0x8e, 0x3c, 0x06, 0x60, 0xe6, 0x3b, 0x1a, 0xd8, 0x43, 0x3d, 0x9b, 0xa8, 0x99,
+ 0xbd, 0x30, 0x98, 0x17, 0x3d, 0x82, 0xd8, 0x7a, 0xbd, 0xca, 0x23, 0x14, 0x3d,
+ 0x45, 0x6d, 0x18, 0xbd, 0x0d, 0x33, 0x8d, 0x3c, 0xd9, 0x88, 0xb5, 0xbc, 0x9c,
+ 0x01, 0xc6, 0x3b, 0xc2, 0x52, 0xe5, 0x3c, 0xc6, 0xbf, 0x5a, 0x3d, 0xa8, 0x06,
+ 0x1f, 0xbd, 0x1f, 0xaf, 0x4e, 0x3d, 0x84, 0x35, 0xca, 0xbd, 0x50, 0xc8, 0xee,
+ 0x3c, 0x64, 0xe8, 0x35, 0xbd, 0xbc, 0x23, 0x31, 0x3d, 0x36, 0x1d, 0xbf, 0xbd,
+ 0x7c, 0x88, 0x94, 0xbc, 0x0f, 0x8f, 0x1b, 0x3d, 0x08, 0x54, 0x81, 0x3c, 0x12,
+ 0x2f, 0x8a, 0xbd, 0xd7, 0x70, 0x3c, 0xbc, 0xb8, 0x2a, 0x50, 0x3d, 0xc8, 0xed,
+ 0x0e, 0xbd, 0xb7, 0xa3, 0x54, 0x3d, 0xc9, 0x64, 0x6c, 0xbc, 0x89, 0x83, 0x25,
+ 0xbd, 0xef, 0x72, 0x3b, 0x3b, 0xeb, 0xf8, 0xec, 0x3b, 0xe6, 0x5e, 0x0b, 0xbc,
+ 0xd4, 0xc0, 0xf5, 0xbc, 0x8a, 0x04, 0x92, 0x3d, 0xe8, 0x04, 0x39, 0xbd, 0x0f,
+ 0x74, 0xea, 0x3c, 0xfc, 0x8b, 0x01, 0xbc, 0xb2, 0xe0, 0x73, 0x3d, 0xc8, 0xa1,
+ 0xea, 0x3c, 0x99, 0xfe, 0x4f, 0x3d, 0xde, 0x4f, 0x36, 0xbd, 0x73, 0xe5, 0x76,
+ 0xbd, 0x8b, 0xd2, 0xdb, 0x3b, 0x96, 0x72, 0x79, 0x3c, 0xd0, 0x9b, 0x14, 0x3d,
+ 0x3d, 0x6f, 0x6a, 0x3d, 0x21, 0x55, 0x16, 0x3d, 0xeb, 0x2a, 0x91, 0x3d, 0x8c,
+ 0xd0, 0x33, 0xbd, 0x45, 0xdd, 0x54, 0xbd, 0x7e, 0x94, 0x90, 0xbc, 0xd4, 0x4c,
+ 0x8b, 0x3c, 0x4a, 0x6b, 0x19, 0x3d, 0x9e, 0x42, 0xeb, 0x3c, 0x7d, 0xf2, 0x4f,
+ 0x3d, 0x17, 0x4f, 0xab, 0x3c, 0x28, 0x37, 0xa1, 0x3c, 0x6d, 0xb8, 0x88, 0xbd,
+ 0xc1, 0xe3, 0x1e, 0xbd, 0x8f, 0x8c, 0x60, 0x3d, 0xe9, 0x88, 0x93, 0x3c, 0x54,
+ 0x12, 0x8e, 0x3d, 0x04, 0x68, 0xcb, 0xbc, 0x6e, 0xbf, 0xb0, 0xb9, 0xba, 0x8b,
+ 0x16, 0x3d, 0x3a, 0x30, 0xd5, 0x39, 0x89, 0x43, 0x89, 0x3c, 0x89, 0x8c, 0xc0,
+ 0x3b, 0x93, 0x98, 0xd9, 0xbd, 0xc5, 0x26, 0x3e, 0xbd, 0x2a, 0x4f, 0xa9, 0xbb,
+ 0x35, 0xa6, 0xe6, 0xbc, 0xeb, 0x89, 0x1f, 0x3d, 0xea, 0x85, 0xb7, 0xbc, 0xa7,
+ 0x52, 0xbb, 0xbc, 0x02, 0xda, 0x86, 0x3d, 0x82, 0xad, 0xfd, 0xba, 0x01, 0x20,
+ 0x2f, 0xbd, 0xb8, 0x8c, 0x9d, 0xbd, 0x9c, 0xbd, 0x1b, 0x3d, 0x1d, 0xad, 0xe6,
+ 0x3c, 0xac, 0x48, 0x6b, 0x3c, 0xdd, 0x13, 0xcb, 0xbd, 0xee, 0xcd, 0x8a, 0xbd,
+ 0x8b, 0x33, 0x7c, 0x3d, 0xc5, 0x0a, 0x2a, 0x3d, 0x13, 0x49, 0x77, 0x3d, 0x7e,
+ 0x78, 0xd1, 0xbd, 0xd3, 0x18, 0x3c, 0x3c, 0xb7, 0xaa, 0xb1, 0xbc, 0x54, 0x3a,
+ 0xce, 0xbc, 0x86, 0x08, 0x97, 0xbd, 0x04, 0x21, 0x01, 0xbc, 0x72, 0xa8, 0x65,
+ 0x3d, 0x71, 0x0b, 0xf3, 0x3b, 0x14, 0x9e, 0x88, 0x3c, 0x9c, 0xc6, 0x90, 0x3d,
+ 0x1d, 0xdb, 0x37, 0xbd, 0x8e, 0x9e, 0x59, 0x3c, 0xf6, 0xa9, 0x1a, 0xbd, 0xfd,
+ 0xec, 0x19, 0x3d, 0xa3, 0x01, 0x5a, 0xbd, 0xcc, 0xe7, 0x15, 0xbd, 0x26, 0xe6,
+ 0x51, 0x3d, 0xeb, 0x5f, 0x8d, 0x3d, 0x93, 0x7a, 0x73, 0x3c, 0x94, 0x02, 0x10,
+ 0x3d, 0x5d, 0x7e, 0xa7, 0x3c, 0x52, 0x78, 0x12, 0xbd, 0xe2, 0xfb, 0x44, 0x3d,
+ 0xb8, 0xdf, 0xa4, 0x3c, 0x84, 0x3d, 0x0e, 0xbd, 0xad, 0xae, 0x0e, 0x3c, 0x52,
+ 0xda, 0x1e, 0x3d, 0xfe, 0x93, 0x92, 0xbd, 0xe8, 0xe3, 0xde, 0xbd, 0x7a, 0xdc,
+ 0xd9, 0xbc, 0xc3, 0xb0, 0x68, 0x3d, 0x58, 0x56, 0x25, 0xbd, 0x3a, 0x61, 0xdc,
+ 0xbc, 0x71, 0xa2, 0xbc, 0x3c, 0x1b, 0xab, 0x30, 0x3d, 0x2a, 0x68, 0xbd, 0xbb,
+ 0x5e, 0xaf, 0x8b, 0xbd, 0xb4, 0x4d, 0x30, 0x3d, 0xa0, 0x46, 0x72, 0x3d, 0x4e,
+ 0xd2, 0x10, 0x3d, 0x71, 0x47, 0x4e, 0xbd, 0xe5, 0xd4, 0xe6, 0xbc, 0x25, 0x05,
+ 0x87, 0x3c, 0x33, 0x85, 0xec, 0x3c, 0x84, 0x58, 0x5f, 0xbd, 0xb0, 0xfa, 0xc0,
+ 0xbd, 0xc0, 0xdb, 0x87, 0xba, 0xa0, 0x30, 0x13, 0x3d, 0x84, 0x01, 0xe2, 0xbc,
+ 0xee, 0x8d, 0xa1, 0x3c, 0xc8, 0x8c, 0x24, 0x3c, 0x2b, 0x33, 0xf0, 0x3c, 0xc5,
+ 0xdd, 0x55, 0x3c, 0x89, 0x7c, 0xa5, 0xbc, 0x3b, 0x39, 0x19, 0xbd, 0xed, 0x0d,
+ 0x74, 0x3d, 0x98, 0xdf, 0x24, 0xbc, 0xdd, 0xdc, 0x38, 0xbd, 0xab, 0x9f, 0x75,
+ 0x3b, 0xd7, 0x20, 0xf3, 0x3c, 0x96, 0xa3, 0x78, 0x3c, 0x58, 0x44, 0x90, 0xbd,
+ 0x21, 0xcb, 0xf2, 0x3b, 0x18, 0x22, 0x58, 0xbd, 0x7c, 0x1c, 0x1b, 0xbd, 0xdc,
+ 0x4d, 0x19, 0xbd, 0xff, 0x68, 0x35, 0xbb, 0x34, 0xc5, 0x5e, 0x3c, 0x48, 0x3a,
+ 0x90, 0xbd, 0xa1, 0x84, 0xa7, 0x3c, 0x96, 0xc6, 0x46, 0xbd, 0x20, 0x22, 0xb3,
+ 0xbc, 0x16, 0x95, 0x18, 0x3d, 0x84, 0xa2, 0x5e, 0x3d, 0x78, 0x3a, 0x29, 0xbd,
+ 0x37, 0x9a, 0x5a, 0xbd, 0x93, 0x8b, 0x80, 0x3d, 0x25, 0xff, 0x49, 0xbd, 0xf0,
+ 0x1e, 0x8c, 0xbb, 0xde, 0xa1, 0x48, 0x3d, 0x58, 0x67, 0x2d, 0x3d, 0x09, 0x18,
+ 0x26, 0x3d, 0x37, 0x68, 0x85, 0x3d, 0xa0, 0x28, 0x70, 0x3d, 0x33, 0xf5, 0x9f,
+ 0xbc, 0x81, 0xcc, 0x97, 0xbd, 0x75, 0x24, 0x45, 0xbd, 0x60, 0x45, 0x29, 0x3d,
+ 0x6b, 0x87, 0x25, 0xbd, 0x67, 0xd9, 0xb5, 0xbc, 0x15, 0xcb, 0x01, 0xbd, 0x39,
+ 0xa5, 0xc6, 0xbd, 0xd2, 0xbe, 0xb9, 0xbd, 0x7c, 0x53, 0x20, 0xbd, 0x1a, 0x64,
+ 0xb4, 0xbd, 0x5a, 0xc1, 0x1d, 0x3d, 0xdf, 0xdd, 0x50, 0xbc, 0x8e, 0x86, 0x2b,
+ 0x3d, 0x20, 0xeb, 0x4d, 0x3d, 0x9a, 0xf8, 0x88, 0x3d, 0x92, 0xf1, 0x5e, 0xbd,
+ 0x24, 0xb3, 0xd8, 0xbb, 0x19, 0xbc, 0xd9, 0xbc, 0x8d, 0x97, 0x8f, 0xbd, 0x6d,
+ 0xf5, 0x7b, 0x3c, 0xfe, 0x33, 0x66, 0xbc, 0x35, 0x64, 0xfa, 0x3b, 0xe6, 0x00,
+ 0x9d, 0xbc, 0xd6, 0x9c, 0x63, 0xbd, 0x02, 0xff, 0x8e, 0xbd, 0x10, 0xa1, 0x23,
+ 0xbd, 0x93, 0x33, 0x0f, 0xbd, 0x59, 0xfc, 0x1b, 0x3d, 0x43, 0x0c, 0x7f, 0x3d,
+ 0x06, 0xbd, 0x96, 0x3d, 0xe1, 0x5b, 0x9f, 0xbc, 0x44, 0x05, 0xf8, 0x3c, 0x1c,
+ 0x60, 0xec, 0xbd, 0x33, 0x7f, 0x8c, 0xbd, 0x93, 0xcb, 0x0c, 0xbc, 0xc0, 0x8d,
+ 0x0e, 0xbb, 0x16, 0x45, 0x65, 0xbd, 0x76, 0x93, 0x88, 0xbd, 0x49, 0xd0, 0xb3,
+ 0xbd, 0xeb, 0x0e, 0x56, 0xbd, 0x8f, 0x1a, 0xab, 0x3d, 0x30, 0xde, 0x72, 0xb8,
+ 0xcf, 0xc7, 0x1d, 0xbd, 0x12, 0xc3, 0x31, 0xbd, 0x6e, 0x1d, 0x47, 0xbd, 0xb3,
+ 0x0f, 0x8c, 0x3d, 0x31, 0x82, 0x80, 0x3d, 0x44, 0xc4, 0x6b, 0xbc, 0x07, 0x28,
+ 0x5a, 0x3d, 0xa3, 0x3c, 0x3d, 0xbd, 0x13, 0x5c, 0x6a, 0x3d, 0x1c, 0x3f, 0x11,
+ 0x3d, 0x50, 0xac, 0xb5, 0xbc, 0x9f, 0x0e, 0xd9, 0x3c, 0x55, 0xfb, 0xde, 0xbc,
+ 0x6b, 0x4f, 0x6a, 0xbd, 0x38, 0x5f, 0x3f, 0x3b, 0x5a, 0x26, 0x98, 0xbc, 0x32,
+ 0x8c, 0x36, 0x3d, 0x78, 0x0a, 0x73, 0x3c, 0x7f, 0xd4, 0x51, 0x3d, 0x69, 0xdb,
+ 0x97, 0x3d, 0x52, 0x37, 0x80, 0x3d, 0x9b, 0x10, 0x88, 0xbd, 0xc0, 0xbf, 0x90,
+ 0xbd, 0x43, 0x84, 0x44, 0x3d, 0x12, 0x73, 0xc8, 0xbc, 0x84, 0xe0, 0x42, 0x3d,
+ 0xf5, 0x79, 0xd2, 0xbc, 0x88, 0x3b, 0x05, 0x3d, 0xf6, 0x10, 0xf3, 0x3b, 0x73,
+ 0x77, 0x8d, 0x3d, 0x92, 0xf0, 0x77, 0x3d, 0xd4, 0xcd, 0x55, 0xbd, 0x44, 0x7c,
+ 0x88, 0xbd, 0x3b, 0xe3, 0x5f, 0xbd, 0x0c, 0x35, 0x87, 0x3c, 0x09, 0x68, 0xf0,
+ 0x3c, 0x60, 0x3e, 0x47, 0x3a, 0xf6, 0x12, 0xb2, 0xbd, 0x2b, 0xe9, 0x9d, 0x3d,
+ 0x8e, 0x7c, 0x97, 0xbc, 0xb1, 0x05, 0x2e, 0xbc, 0x99, 0x6b, 0x14, 0xbd, 0xb2,
+ 0xa1, 0x85, 0x3d, 0x1c, 0xd1, 0x31, 0x3d, 0x18, 0xe6, 0xf5, 0x3c, 0xa7, 0x25,
+ 0x5a, 0x3c, 0xe0, 0x75, 0x9e, 0xbd, 0x1b, 0xe1, 0x69, 0xbd, 0x1b, 0x22, 0xc0,
+ 0x3d, 0xc4, 0x04, 0x8e, 0x3d, 0x92, 0x7f, 0x9d, 0x3d, 0xd3, 0xf3, 0x80, 0xbb,
+ 0x69, 0x7a, 0x58, 0x3c, 0xd5, 0xc2, 0x92, 0xbc, 0x26, 0x08, 0xa2, 0xbd, 0x9f,
+ 0xe8, 0x45, 0x3d, 0x10, 0xc9, 0x44, 0x3d, 0x7e, 0xac, 0x61, 0x3d, 0x88, 0xa8,
+ 0xf1, 0x3c, 0xa2, 0xd1, 0x87, 0xbd, 0x8c, 0xa7, 0xd1, 0xbc, 0x77, 0x21, 0x86,
+ 0xbd, 0x3b, 0x5a, 0xaa, 0x3d, 0x27, 0x8b, 0xb7, 0x3d, 0xe2, 0x8c, 0x39, 0x3d,
+ 0x16, 0x70, 0xc0, 0xbc, 0x45, 0xcc, 0x81, 0xbd, 0xfd, 0x54, 0x09, 0x3d, 0x7f,
+ 0x19, 0x0d, 0x3c, 0x0a, 0xfe, 0x39, 0xbd, 0xaf, 0x91, 0x66, 0xbd, 0x1c, 0xf9,
+ 0xa3, 0x3d, 0x6d, 0xfa, 0xa7, 0x3b, 0x55, 0x1d, 0xa2, 0x3d, 0xd4, 0x1c, 0x8a,
+ 0x3d, 0x21, 0xeb, 0xbd, 0xbc, 0xd7, 0x77, 0x45, 0xbc, 0x2b, 0xb9, 0x37, 0xbd,
+ 0x7b, 0x7c, 0xbd, 0xbd, 0x59, 0xa0, 0x92, 0xbd, 0xb9, 0x28, 0x2f, 0xbd, 0x1c,
+ 0xb6, 0x8c, 0xbc, 0x48, 0x52, 0x58, 0xbd, 0x90, 0x67, 0xa3, 0x3b, 0x92, 0xff,
+ 0x79, 0x3d, 0x55, 0x80, 0x9d, 0x3c, 0x68, 0x54, 0x98, 0xbd, 0xc6, 0xff, 0xbc,
+ 0xbc, 0x76, 0xb5, 0x72, 0xbd, 0x00, 0x62, 0x86, 0xbd, 0x6b, 0x01, 0xe3, 0xbc,
+ 0x42, 0x03, 0x6e, 0xbd, 0xd6, 0xe1, 0x7d, 0xbd, 0xcd, 0xed, 0x8b, 0x3c, 0x67,
+ 0x9d, 0x49, 0x3d, 0x6a, 0xe8, 0x31, 0x3d, 0xfd, 0x25, 0x4c, 0x3d, 0x87, 0x12,
+ 0xe8, 0xbb, 0x31, 0x54, 0x92, 0xbc, 0xbe, 0xab, 0x98, 0xbb, 0x85, 0x6c, 0xf7,
+ 0x3b, 0xb8, 0x0e, 0xbc, 0xbc, 0xf8, 0xea, 0x9a, 0x3d, 0x36, 0x13, 0xe2, 0xbc,
+ 0x9f, 0xd7, 0x6d, 0x3d, 0x4f, 0x0a, 0xb1, 0x3d, 0xba, 0x5c, 0x6b, 0xbd, 0xae,
+ 0x73, 0x60, 0xbc, 0x61, 0xf2, 0x8b, 0x3c, 0x90, 0x4c, 0x7b, 0xbd, 0x50, 0xef,
+ 0xe9, 0xbd, 0x54, 0x83, 0x99, 0xbc, 0x8f, 0xd5, 0x4d, 0x3d, 0x6b, 0x02, 0x37,
+ 0x3d, 0xc8, 0xe7, 0x84, 0x3d, 0x4e, 0x73, 0x87, 0x3d, 0x7a, 0xcc, 0xaa, 0x3c,
+ 0x0e, 0xde, 0x26, 0xbd, 0xef, 0xfb, 0xc8, 0xbd, 0x96, 0xe9, 0x11, 0xbd, 0xd2,
+ 0xd6, 0x26, 0xbc, 0x01, 0xea, 0x72, 0xbd, 0xf4, 0xb7, 0xad, 0xbb, 0x5b, 0xe7,
+ 0x9e, 0x3d, 0xe6, 0xa1, 0x06, 0xbe, 0x4d, 0xa9, 0xd4, 0x3c, 0x83, 0xc9, 0xdf,
+ 0x3c, 0x31, 0x26, 0x85, 0x3c, 0x4d, 0x25, 0xcf, 0xbb, 0x6c, 0xea, 0x91, 0x3d,
+ 0xb3, 0x55, 0x5d, 0x3c, 0x7f, 0x1d, 0x70, 0xbd, 0x0d, 0x6f, 0x85, 0x3d, 0xbe,
+ 0xe6, 0x35, 0xbd, 0x0f, 0x5b, 0x02, 0xbc, 0x1e, 0xad, 0x60, 0xbd, 0xeb, 0x48,
+ 0x4c, 0x3d, 0x73, 0x67, 0xaf, 0x3c, 0xda, 0x33, 0x03, 0x3d, 0xd9, 0xa3, 0x0d,
+ 0xbb, 0x6e, 0x31, 0x11, 0x3d, 0xb3, 0x7e, 0xfc, 0x3c, 0xc4, 0x86, 0x49, 0x3c,
+ 0x0a, 0x52, 0x0b, 0x3d, 0x68, 0x25, 0xae, 0x3d, 0xe0, 0x16, 0x02, 0x3d, 0xc0,
+ 0x47, 0x3f, 0xbd, 0x98, 0x55, 0x70, 0x3c, 0x1a, 0xbb, 0x38, 0x3d, 0xcf, 0x31,
+ 0xe4, 0xbc, 0xe0, 0x45, 0x39, 0xbd, 0x7c, 0xa1, 0x3f, 0xbd, 0xcc, 0x5b, 0x91,
+ 0xbd, 0x55, 0x28, 0x59, 0x3a, 0x75, 0xdc, 0x02, 0xbd, 0xd8, 0x0d, 0xfe, 0xbb,
+ 0x38, 0x7f, 0x92, 0xbd, 0x0f, 0xeb, 0x83, 0xbc, 0xcf, 0xe7, 0x0c, 0xbd, 0xb5,
+ 0xf8, 0x59, 0x3d, 0xfc, 0xd4, 0xcf, 0xbb, 0xa3, 0x75, 0x8a, 0x3d, 0xac, 0xe9,
+ 0x8e, 0xbd, 0x4a, 0xf9, 0x71, 0x3d, 0xee, 0x83, 0x32, 0xbc, 0x7c, 0x78, 0xa0,
+ 0xbd, 0x87, 0x86, 0x6a, 0xbd, 0x1a, 0x3c, 0xe4, 0xbc, 0x89, 0x4a, 0xa1, 0x3d,
+ 0xa0, 0x39, 0xdd, 0x3c, 0x93, 0xa3, 0x93, 0x3c, 0xdd, 0x08, 0xa2, 0x3d, 0x9a,
+ 0x87, 0x98, 0xbd, 0xe6, 0x5a, 0x32, 0xbd, 0xeb, 0x4d, 0xea, 0xbb, 0x48, 0xda,
+ 0x6b, 0x3c, 0x36, 0x23, 0x82, 0x3d, 0x80, 0x78, 0x90, 0x3d, 0x0e, 0x4c, 0x1b,
+ 0xbd, 0xb9, 0x3c, 0x54, 0x3d, 0x5f, 0x8b, 0xf5, 0xbb, 0x54, 0x40, 0x54, 0xbd,
+ 0x35, 0x04, 0x8e, 0xbc, 0x38, 0xcf, 0xe0, 0x3b, 0x2f, 0xf6, 0x55, 0xbd, 0xe0,
+ 0xed, 0x7e, 0x3c, 0x84, 0x12, 0x9c, 0x3d, 0x74, 0x34, 0xfb, 0xbc, 0x02, 0xd9,
+ 0x93, 0xbd, 0xff, 0x27, 0xa8, 0xbd, 0x83, 0xf3, 0xaf, 0xbb, 0x99, 0x16, 0x7d,
+ 0x3d, 0xc6, 0xd9, 0x32, 0xbd, 0xb1, 0xa4, 0xbd, 0xbc, 0xd2, 0x1c, 0x5b, 0x3d,
+ 0xb3, 0xdb, 0x31, 0x3d, 0xe4, 0x10, 0x03, 0x3c, 0x29, 0xb0, 0x0b, 0xbd, 0x16,
+ 0x47, 0x9b, 0x3d, 0x75, 0x6b, 0xfd, 0xbc, 0x09, 0x92, 0xac, 0x3c, 0x12, 0x2c,
+ 0x07, 0x3d, 0x5a, 0xb3, 0xa0, 0x3c, 0xc9, 0x3d, 0x21, 0xbd, 0xc1, 0x80, 0x6d,
+ 0xbd, 0xa9, 0x20, 0x9c, 0x3d, 0xf5, 0x5b, 0x07, 0xbe, 0x9a, 0x76, 0x6f, 0xbd,
+ 0xd5, 0x11, 0xff, 0x3d, 0x58, 0xda, 0xd4, 0x3c, 0x18, 0x2f, 0xb9, 0x3d, 0xd4,
+ 0xa0, 0x6c, 0xbd, 0x4d, 0xe5, 0x2b, 0xbc, 0x97, 0x9d, 0x5f, 0xbc, 0x55, 0xe6,
+ 0x9b, 0xbd, 0x61, 0xee, 0xb3, 0x3c, 0x24, 0x06, 0xbf, 0x3c, 0xc2, 0x90, 0x09,
+ 0xbd, 0x91, 0xaf, 0x63, 0x3d, 0xde, 0x86, 0x7b, 0x3c, 0xca, 0x42, 0x0d, 0x3c,
+ 0x5f, 0xda, 0xcd, 0xbc, 0x7b, 0x27, 0x13, 0x3d, 0xf9, 0xd1, 0x14, 0x3c, 0xb6,
+ 0x83, 0x4a, 0x3d, 0x37, 0x74, 0x63, 0xbd, 0xbb, 0x85, 0x40, 0xbd, 0x3e, 0x15,
+ 0x13, 0x3d, 0x00, 0xe1, 0x22, 0xbd, 0xef, 0xdd, 0x63, 0xbd, 0x95, 0xdb, 0xa6,
+ 0x3c, 0xf4, 0xc1, 0x86, 0xbd, 0xfd, 0xf0, 0xe5, 0x3c, 0x84, 0xc1, 0x69, 0xbd,
+ 0xe4, 0x85, 0xf5, 0x3c, 0x18, 0xfa, 0x79, 0xbd, 0xe3, 0xd5, 0x2e, 0xbd, 0x32,
+ 0x90, 0x8f, 0xbc, 0x40, 0xfa, 0x08, 0xbc, 0xa4, 0x5f, 0xcb, 0xbc, 0x5a, 0xa7,
+ 0x3f, 0x3d, 0x09, 0x40, 0x23, 0x3d, 0x7b, 0x17, 0x0e, 0xbd, 0x6e, 0x70, 0xb9,
+ 0x3b, 0xc7, 0x3d, 0x4d, 0xbd, 0xe9, 0x57, 0x5d, 0x3d, 0x5c, 0x02, 0x91, 0x3c,
+ 0xc8, 0x08, 0x31, 0xbd, 0x09, 0xea, 0xe3, 0x3c, 0x14, 0x23, 0xf6, 0x3c, 0x95,
+ 0xd1, 0x22, 0xbd, 0xba, 0x27, 0xce, 0x3c, 0xb2, 0x59, 0x42, 0xbd, 0x29, 0x50,
+ 0x6d, 0x3d, 0x20, 0xe5, 0x10, 0xbd, 0xc2, 0x68, 0x5a, 0xbd, 0x04, 0x6e, 0x81,
+ 0xbd, 0xd6, 0xc7, 0xa4, 0xbc, 0x16, 0x22, 0x33, 0x3d, 0x80, 0xbf, 0x70, 0x3c,
+ 0xbf, 0x62, 0x02, 0xbd, 0xdd, 0x19, 0x28, 0xbd, 0x8d, 0x5c, 0x60, 0x3d, 0x96,
+ 0xb4, 0x24, 0xbd, 0x9a, 0xb5, 0x6e, 0xbd, 0x52, 0xb5, 0x81, 0x3d, 0xf3, 0x49,
+ 0x85, 0xbd, 0x4a, 0x65, 0xcc, 0x3c, 0x06, 0xca, 0x13, 0xbd, 0x18, 0x94, 0x07,
+ 0x3d, 0xde, 0x60, 0x45, 0x3c, 0x7a, 0x2d, 0x69, 0x3d, 0x7e, 0xc6, 0xba, 0xbc,
+ 0xff, 0xcf, 0x64, 0x3d, 0x3e, 0x22, 0x98, 0xbd, 0xe1, 0x87, 0xc8, 0x3c, 0xec,
+ 0x54, 0x90, 0xbd, 0x60, 0x0b, 0x09, 0x3d, 0x5e, 0xc7, 0x95, 0x3c, 0x54, 0x1c,
+ 0x5b, 0x3b, 0xac, 0x77, 0xfe, 0x3c, 0x4c, 0x43, 0xea, 0xbc, 0xe4, 0x4d, 0xb3,
+ 0x3c, 0xab, 0x96, 0x20, 0xbd, 0xf7, 0x8a, 0x48, 0xbd, 0xcc, 0xcb, 0x70, 0x3d,
+ 0x25, 0x01, 0x91, 0xbc, 0x9c, 0x9a, 0x96, 0x3c, 0x9c, 0x7d, 0x56, 0x3d, 0x3e,
+ 0x2b, 0x47, 0xbd, 0x44, 0x48, 0x15, 0xbd, 0x38, 0x4e, 0xc1, 0x3c, 0x9e, 0x72,
+ 0x05, 0x3d, 0xe9, 0xbd, 0x44, 0xbc, 0x96, 0xdd, 0x6f, 0x3d, 0x17, 0x2b, 0x4e,
+ 0x3c, 0x21, 0x91, 0x4c, 0x3d, 0x2f, 0x87, 0x8e, 0xbd, 0xf2, 0xd2, 0x31, 0x3d,
+ 0x47, 0x07, 0xad, 0xbc, 0x41, 0x54, 0x89, 0x3c, 0xee, 0xa9, 0x4d, 0x3d, 0xf2,
+ 0xb1, 0x80, 0x3d, 0x6a, 0xd9, 0x78, 0xbd, 0x55, 0x4a, 0x32, 0xbd, 0xd1, 0xd8,
+ 0x44, 0x3d, 0xda, 0x72, 0x7d, 0x3d, 0xa1, 0xd1, 0xbc, 0x3b, 0x7a, 0xf4, 0x32,
+ 0xbd, 0xf0, 0x44, 0x84, 0x3d, 0xd3, 0x0b, 0x8c, 0x3d, 0xd9, 0xc8, 0x58, 0xbd,
+ 0xdd, 0x2c, 0x7c, 0x3d, 0x49, 0x3e, 0x8f, 0x3d, 0x39, 0xbd, 0x95, 0xbd, 0x99,
+ 0x46, 0x25, 0x3d, 0x63, 0xfe, 0x20, 0xbd, 0x0a, 0x1d, 0x62, 0xbc, 0x4b, 0xae,
+ 0x3b, 0xbc, 0x3c, 0x28, 0x84, 0xbc, 0x79, 0x24, 0x25, 0xbd, 0x62, 0x6b, 0x56,
+ 0xbd, 0xe9, 0x9a, 0x88, 0x3d, 0xd6, 0x9f, 0x85, 0xbc, 0xad, 0xf6, 0x51, 0xbd,
+ 0xc2, 0x72, 0x85, 0x3d, 0xf6, 0x0d, 0x89, 0xbd, 0x3e, 0x76, 0xca, 0x39, 0x90,
+ 0x96, 0x89, 0x3d, 0xa1, 0x6e, 0x25, 0xbd, 0x4b, 0xbd, 0x18, 0x3c, 0x0e, 0x05,
+ 0x69, 0xbc, 0x03, 0x9e, 0x76, 0x3d, 0xa3, 0xae, 0x67, 0x3d, 0xc4, 0x38, 0x5a,
+ 0x3d, 0x8c, 0x9d, 0x53, 0xbd, 0x35, 0x24, 0x42, 0xbd, 0x36, 0xfa, 0xcf, 0x3c,
+ 0xe8, 0x09, 0x0f, 0xbd, 0xe9, 0x6e, 0x15, 0xbd, 0x51, 0x03, 0x1b, 0xbd, 0xf7,
+ 0x1d, 0x32, 0x3d, 0x08, 0xfc, 0x2f, 0xbd, 0x9d, 0x4c, 0x65, 0x3d, 0x9d, 0xf0,
+ 0x98, 0xbb, 0xb0, 0xba, 0x0d, 0xbc, 0x64, 0xee, 0x03, 0xbb, 0x92, 0x82, 0x16,
+ 0xbc, 0xa5, 0xa0, 0x94, 0xbd, 0xd0, 0x1f, 0xf1, 0x3c, 0xeb, 0x06, 0x8c, 0xbb,
+ 0xb5, 0xc2, 0x64, 0x3c, 0x7e, 0x30, 0x55, 0x3c, 0x68, 0x89, 0x64, 0x3c, 0xec,
+ 0x1e, 0x9e, 0x3c, 0xf0, 0xc9, 0x57, 0x3d, 0xfe, 0x25, 0x0c, 0xbd, 0x2f, 0xb4,
+ 0x0b, 0x3c, 0x32, 0x76, 0x7a, 0xbd, 0xd2, 0x15, 0xea, 0xba, 0xc0, 0xc9, 0x45,
+ 0xbd, 0xb7, 0xda, 0x48, 0xbc, 0x5e, 0x85, 0x6c, 0x3c, 0xbc, 0xda, 0x84, 0xbc,
+ 0xc6, 0x56, 0x35, 0xbd, 0x21, 0xfd, 0x7d, 0x3d, 0xbf, 0x0c, 0x0f, 0x3b, 0xc2,
+ 0x28, 0xa4, 0xbc, 0xad, 0xa3, 0xe7, 0xbb, 0x77, 0xd9, 0x55, 0x3d, 0x6d, 0x5a,
+ 0x21, 0xbc, 0x3f, 0xa0, 0xd9, 0xbc, 0x1b, 0x86, 0x85, 0x3d, 0x38, 0x2f, 0x1f,
+ 0xbd, 0xd5, 0xa5, 0x43, 0x3d, 0xdb, 0x04, 0x8d, 0xbd, 0xbc, 0x0d, 0x25, 0x3d,
+ 0xf5, 0x71, 0x86, 0x3d, 0xa8, 0x4e, 0x88, 0xbd, 0xca, 0xab, 0x24, 0x3c, 0x8d,
+ 0x03, 0xda, 0x3c, 0xad, 0x77, 0x19, 0xbc, 0x2e, 0x7c, 0xf5, 0x3c, 0x75, 0x45,
+ 0x6e, 0x3d, 0x9b, 0x9f, 0x80, 0xbd, 0x1d, 0xce, 0x85, 0x3d, 0xb6, 0xbe, 0x86,
+ 0xbc, 0xc0, 0x1c, 0x55, 0xbb, 0xd0, 0xc7, 0x5c, 0xbd, 0x1f, 0x60, 0x64, 0x3c,
+ 0x4f, 0x04, 0x60, 0xbd, 0x04, 0xc9, 0x64, 0x3d, 0x0a, 0xbb, 0x10, 0x3b, 0x08,
+ 0x41, 0x92, 0xbd, 0xac, 0x5b, 0x15, 0xbd, 0x44, 0xe8, 0x27, 0x3b, 0x9c, 0x98,
+ 0x0c, 0x3d, 0x09, 0x52, 0x7a, 0x3d, 0x33, 0xe4, 0xcd, 0xbc, 0xda, 0x48, 0x17,
+ 0xbd, 0x26, 0xe5, 0x5d, 0xbb, 0x2f, 0xfc, 0x69, 0xbd, 0x9f, 0xfd, 0x54, 0x3d,
+ 0x1d, 0x45, 0x07, 0xbd, 0x86, 0x69, 0x91, 0x3c, 0x9e, 0x1a, 0xbe, 0xbc, 0xfa,
+ 0xf4, 0x5e, 0x3d, 0xb5, 0x9d, 0x00, 0xbd, 0xe0, 0xfd, 0x90, 0x3c, 0x3a, 0xac,
+ 0xc9, 0xbc, 0x11, 0xa7, 0xb0, 0xbb, 0x3e, 0x18, 0xa8, 0x3c, 0x79, 0x2e, 0x55,
+ 0xbd, 0xe0, 0xb2, 0xfd, 0xbb, 0x72, 0xb0, 0x5d, 0xbc, 0xe1, 0xd9, 0x6f, 0x3d,
+ 0xd5, 0x3a, 0x9f, 0xbc, 0xc8, 0x8f, 0x1a, 0xbd, 0x18, 0x60, 0x3b, 0x3c, 0xc0,
+ 0x90, 0x24, 0xbc, 0x78, 0xb6, 0x50, 0x3d, 0x84, 0xc6, 0x81, 0xbd, 0x98, 0x2d,
+ 0x46, 0x3d, 0x7f, 0x8a, 0x3b, 0x3d, 0x03, 0xd9, 0x7f, 0x3d, 0x50, 0x04, 0xae,
+ 0x3c, 0xaf, 0xae, 0x6b, 0xbd, 0xcd, 0x34, 0x48, 0xbd, 0xbd, 0x05, 0xa8, 0x3c,
+ 0x84, 0xc8, 0x3f, 0xbd, 0xcb, 0x46, 0x89, 0x3d, 0x92, 0x2b, 0x16, 0x3d, 0x98,
+ 0xfb, 0xcd, 0xbc, 0x80, 0x5b, 0x43, 0xbd, 0xac, 0x5e, 0x78, 0x3c, 0xd6, 0xbf,
+ 0x7e, 0x3b, 0x32, 0xec, 0x81, 0x3b, 0xce, 0xab, 0xf1, 0x3b, 0xb2, 0xd7, 0x86,
+ 0xbc, 0xb1, 0xe3, 0x09, 0x3d, 0x4f, 0xc6, 0xa5, 0xbc, 0x4c, 0x1b, 0x89, 0x3c,
+ 0xd6, 0x09, 0x2b, 0x3d, 0x61, 0x67, 0x4a, 0xbc, 0x7a, 0x5e, 0x87, 0xbc, 0x6c,
+ 0x32, 0x55, 0x3c, 0x6b, 0xe0, 0xa7, 0xba, 0x41, 0xc8, 0xb5, 0xbc, 0x94, 0x54,
+ 0x64, 0xbc, 0x81, 0xb6, 0x33, 0x3d, 0x3a, 0x05, 0x59, 0x3d, 0x42, 0x25, 0x46,
+ 0xbd, 0xfc, 0xda, 0x8c, 0xbd, 0x17, 0x64, 0x87, 0x3d, 0x55, 0x39, 0x61, 0x3d,
+ 0x4f, 0xcf, 0x25, 0xbd, 0xfc, 0x4d, 0x26, 0x3c, 0x7c, 0x18, 0xd8, 0x3c, 0x4f,
+ 0x1b, 0x5c, 0x3d, 0x3a, 0x09, 0xcd, 0x3c, 0x27, 0x4a, 0x00, 0x3d, 0x1c, 0xb7,
+ 0xb7, 0xbc, 0x0a, 0x1b, 0x38, 0xbc, 0x88, 0x6d, 0x2f, 0x3d, 0x96, 0xdf, 0x6a,
+ 0xbd, 0x7e, 0x7e, 0xa0, 0xb9, 0x10, 0x23, 0x10, 0xbc, 0xec, 0x6b, 0xbf, 0x3c,
+ 0x1a, 0x8e, 0x7a, 0xbc, 0x68, 0xb1, 0x7c, 0x3d, 0xb0, 0xcc, 0x30, 0xbd, 0xec,
+ 0x59, 0xef, 0x3c, 0x8d, 0xd5, 0x41, 0x3b, 0x82, 0xa1, 0xec, 0xbc, 0x29, 0x35,
+ 0x51, 0xbd, 0x6e, 0x6e, 0x91, 0xbc, 0xf9, 0x6d, 0x2a, 0x3d, 0x5d, 0x97, 0x17,
+ 0x3d, 0xcb, 0xad, 0x29, 0x3c, 0xc4, 0x47, 0x41, 0x3d, 0x40, 0x7c, 0x6a, 0xbc,
+ 0xa6, 0x09, 0x1e, 0x3d, 0x14, 0x9c, 0xf2, 0xbc, 0x70, 0x31, 0x5d, 0x3c, 0xd1,
+ 0x54, 0x70, 0xbc, 0xd8, 0x58, 0xdd, 0x3a, 0x65, 0x21, 0x6a, 0xbd, 0x64, 0x81,
+ 0x99, 0xbd, 0x51, 0x5a, 0x64, 0x3c, 0x8c, 0xa6, 0x90, 0x3c, 0xe6, 0xb6, 0x2a,
+ 0xbd, 0x3d, 0x2a, 0x15, 0xbd, 0x82, 0xbe, 0x8d, 0xbc, 0x65, 0x32, 0x68, 0xbd,
+ 0x0a, 0x5d, 0x6d, 0xbc, 0x24, 0x8c, 0xd6, 0xbc, 0x70, 0x4d, 0xe7, 0x3c, 0x06,
+ 0x58, 0x01, 0x3c, 0x22, 0xd2, 0x58, 0x3d, 0x62, 0x60, 0x88, 0x3c, 0xfc, 0xe6,
+ 0x12, 0x3d, 0x31, 0x59, 0xdb, 0x3c, 0x5d, 0xfb, 0x96, 0xbc, 0xb6, 0x50, 0x7f,
+ 0x3b, 0xd7, 0x01, 0x37, 0x3d, 0x6a, 0x71, 0xc4, 0xbc, 0x8d, 0x28, 0xc9, 0x3c,
+ 0x33, 0x39, 0x4f, 0xbb, 0x14, 0x14, 0x1b, 0x3d, 0x32, 0x36, 0x62, 0xbd, 0xa7,
+ 0xf1, 0x89, 0x3d, 0xc4, 0x12, 0x13, 0x3d, 0xf3, 0x79, 0xde, 0x3c, 0xc0, 0x39,
+ 0xb3, 0xbb, 0x36, 0xb5, 0x54, 0xbd, 0x04, 0xf2, 0xcc, 0xbc, 0x45, 0x14, 0xf8,
+ 0x3a, 0x4b, 0x1d, 0x55, 0xbd, 0x13, 0x35, 0xc6, 0xbc, 0x7a, 0x92, 0x1b, 0xbd,
+ 0x71, 0xb0, 0x3b, 0xbd, 0xfe, 0x84, 0x2f, 0xbd, 0xd4, 0x64, 0x60, 0x3d, 0xa7,
+ 0x0b, 0xb7, 0xbb, 0xd1, 0xc7, 0x8a, 0xbd, 0x21, 0x20, 0x78, 0x3d, 0x1b, 0x25,
+ 0x77, 0x3d, 0x5e, 0x06, 0x20, 0xbd, 0x7d, 0xfa, 0xe0, 0xbc, 0x5b, 0x2b, 0x38,
+ 0x3d, 0x8c, 0x10, 0x90, 0xbd, 0xbe, 0xc0, 0xb2, 0x3c, 0x5a, 0x88, 0x94, 0xbd,
+ 0x80, 0x87, 0x94, 0x3c, 0x73, 0xed, 0x81, 0xbd, 0x73, 0x42, 0x3f, 0xba, 0xdc,
+ 0xf8, 0x4e, 0x3d, 0x9a, 0xd4, 0x8d, 0xbc, 0x3a, 0x6f, 0x72, 0xbc, 0x37, 0xe8,
+ 0x06, 0x3d, 0xbb, 0x35, 0x61, 0x3d, 0x64, 0xc6, 0x4a, 0x3d, 0xee, 0x94, 0x13,
+ 0xb9, 0xc0, 0x4b, 0xaf, 0xba, 0x60, 0x4b, 0x42, 0x3d, 0x40, 0x88, 0xb1, 0x3c,
+ 0xc6, 0x61, 0x6c, 0x3d, 0x92, 0xd0, 0x40, 0x3d, 0x32, 0xc0, 0x8d, 0xbd, 0x90,
+ 0x66, 0xc2, 0xbc, 0x52, 0x1f, 0x14, 0xbd, 0x03, 0x9d, 0x23, 0x3d, 0x81, 0x60,
+ 0xe1, 0x3c, 0xe3, 0x31, 0x5f, 0x3d, 0x38, 0xbc, 0x52, 0x3d, 0x23, 0x3e, 0x3b,
+ 0xbd, 0xf6, 0x53, 0x8e, 0xbd, 0xc9, 0xb1, 0x88, 0xbd, 0x02, 0x0c, 0xc6, 0xbc,
+ 0x2e, 0x6d, 0x26, 0xbd, 0xe2, 0x88, 0x87, 0xbd, 0x45, 0x45, 0x28, 0x3d, 0xbc,
+ 0x73, 0xd7, 0xba, 0x17, 0x1e, 0x15, 0xbc, 0xa6, 0x0c, 0x9c, 0xbc, 0x5a, 0x74,
+ 0x63, 0x3d, 0x05, 0x28, 0xf6, 0x3c, 0xe5, 0xda, 0x4d, 0xbd, 0x02, 0x69, 0x42,
+ 0xbd, 0x8a, 0xb0, 0x2c, 0x3d, 0x27, 0x22, 0x07, 0x3d, 0x6a, 0x7a, 0x08, 0x3b,
+ 0x88, 0xb6, 0x03, 0x3d, 0x80, 0xad, 0xac, 0xbb, 0xc9, 0x67, 0x6d, 0xbb, 0x80,
+ 0xf0, 0x8d, 0xbd, 0x53, 0x78, 0x85, 0x3d, 0x14, 0x99, 0x24, 0xbb, 0x86, 0x7c,
+ 0x0c, 0x3d, 0xbe, 0xff, 0x79, 0x3d, 0x01, 0x39, 0xb4, 0x3c, 0x19, 0x42, 0x52,
+ 0x3c, 0x4d, 0x8b, 0x73, 0x3d, 0xb4, 0x6b, 0xf1, 0x3a, 0x6e, 0x53, 0xb4, 0xbc,
+ 0x09, 0x88, 0x11, 0xbd, 0xdf, 0x5e, 0x86, 0xbd, 0x10, 0xdc, 0x5a, 0xbd, 0x6b,
+ 0xb3, 0x3a, 0xbd, 0x7e, 0x23, 0x84, 0xbd, 0x95, 0x50, 0x8c, 0xbd, 0xd1, 0x50,
+ 0x93, 0x3c, 0x5f, 0x43, 0x67, 0x3a, 0x92, 0xc2, 0x91, 0xbd, 0xbe, 0xb0, 0x4e,
+ 0xbd, 0x8c, 0xeb, 0x36, 0xbd, 0x4e, 0x0e, 0x82, 0xbd, 0xc5, 0x15, 0x0b, 0xbd,
+ 0x1c, 0x66, 0x5a, 0xbd, 0xf6, 0xe4, 0x19, 0x3b, 0x4d, 0x1c, 0x07, 0x3d, 0x70,
+ 0x1f, 0x24, 0x3d, 0x59, 0x80, 0x3b, 0xbd, 0x8e, 0x9e, 0xae, 0xbb, 0x11, 0x6f,
+ 0x8f, 0x3b, 0x5f, 0xc9, 0x74, 0xbd, 0x36, 0x65, 0x2b, 0x3c, 0x43, 0xb4, 0xcf,
+ 0x3c, 0x7f, 0xbf, 0x18, 0x3d, 0x91, 0x58, 0x16, 0xbd, 0x72, 0xc4, 0xf3, 0xbc,
+ 0x80, 0xd3, 0x8a, 0x3b, 0x95, 0x0e, 0xe7, 0x3c, 0xdd, 0x17, 0x1d, 0x3d, 0x55,
+ 0x74, 0x98, 0xbd, 0x5c, 0x6b, 0x1e, 0xbc, 0x02, 0x65, 0x61, 0xba, 0x01, 0x7f,
+ 0x81, 0xbc, 0x97, 0x95, 0x73, 0xbd, 0xd8, 0x60, 0xfd, 0xbc, 0xd4, 0x64, 0x8a,
+ 0x3a, 0xe5, 0x81, 0x24, 0x3c, 0xfd, 0x2b, 0x14, 0x3d, 0x60, 0x49, 0xff, 0x3b,
+ 0x6f, 0x63, 0x33, 0xbd, 0xe0, 0x83, 0x4b, 0xbd, 0xed, 0x7a, 0x10, 0x3d, 0x5b,
+ 0x26, 0x33, 0x3d, 0x03, 0xff, 0x2d, 0x3d, 0xcd, 0xca, 0x42, 0xbd, 0x4c, 0x09,
+ 0x3f, 0x3d, 0xcb, 0xcb, 0x95, 0xbc, 0xff, 0x04, 0x18, 0x3c, 0x99, 0x48, 0x6c,
+ 0xbd, 0xb6, 0x3f, 0x04, 0x3a, 0x68, 0x3d, 0x67, 0x3c, 0x71, 0xd9, 0x7a, 0xbc,
+ 0x88, 0x7d, 0x02, 0x3c, 0x0f, 0xfa, 0x3b, 0xbd, 0x78, 0x64, 0xfc, 0x3c, 0xab,
+ 0x8c, 0x37, 0x3d, 0x08, 0x19, 0xcf, 0xbc, 0x03, 0xe0, 0x85, 0xbd, 0x1b, 0xaf,
+ 0x79, 0xbd, 0x92, 0x9e, 0x67, 0x3d, 0x31, 0x3e, 0x94, 0xbd, 0xe8, 0xd1, 0x1f,
+ 0xbd, 0x4d, 0xa1, 0xcb, 0x3c, 0x9f, 0xc0, 0xf7, 0x3c, 0xa8, 0x88, 0xe1, 0xbc,
+ 0xf7, 0x13, 0x8b, 0x3c, 0x77, 0x1b, 0xfe, 0xbc, 0x11, 0xf0, 0x4d, 0x3d, 0x02,
+ 0x73, 0xff, 0xbc, 0x20, 0x4b, 0x2f, 0x3d, 0x50, 0x14, 0x28, 0x3c, 0xa2, 0x0a,
+ 0xc1, 0xbc, 0xb3, 0xf6, 0xe1, 0xbc, 0x32, 0x98, 0xa1, 0x3c, 0x3f, 0xef, 0xcc,
+ 0x3b, 0xd6, 0xbf, 0x37, 0xbd, 0x4e, 0x0a, 0x15, 0x3d, 0xfd, 0x81, 0x24, 0xbd,
+ 0x62, 0x05, 0x43, 0x3d, 0x4b, 0x8d, 0xb5, 0xbc, 0x0e, 0xe7, 0x7c, 0x3d, 0xd1,
+ 0x64, 0x88, 0xbd, 0xca, 0x03, 0xd3, 0xbb, 0xc9, 0xaa, 0x9f, 0xbb, 0xb5, 0x0e,
+ 0xbf, 0xbc, 0x48, 0x82, 0xe7, 0x3c, 0xa1, 0x4b, 0x10, 0x3d, 0x40, 0x51, 0x68,
+ 0xbb, 0xc0, 0x36, 0xc4, 0x3c, 0xcc, 0xd9, 0x37, 0xbc, 0xec, 0x40, 0xcf, 0x3c,
+ 0xb2, 0x38, 0x52, 0xbd, 0x15, 0xe7, 0x0c, 0xbd, 0x52, 0xea, 0x59, 0x3c, 0xcf,
+ 0xe3, 0xd1, 0xbc, 0x9e, 0xb7, 0x94, 0xbc, 0x1a, 0x13, 0xc8, 0x3c, 0x04, 0x51,
+ 0xa0, 0x3b, 0x7f, 0xb4, 0x32, 0x3d, 0x5e, 0x43, 0x5a, 0x3d, 0x8b, 0x6d, 0x98,
+ 0xba, 0xa4, 0x70, 0x47, 0x3d, 0xe6, 0x23, 0x60, 0x3d, 0x48, 0xf3, 0x8b, 0xbc,
+ 0x85, 0xfe, 0x60, 0x3d, 0x33, 0x94, 0xc7, 0xbc, 0xdd, 0xbf, 0x80, 0xbd, 0x31,
+ 0x98, 0xbb, 0x3b, 0x76, 0x70, 0x8a, 0x3c, 0x72, 0xc5, 0x4e, 0x3c, 0x31, 0x53,
+ 0x20, 0x3d, 0xcd, 0xda, 0x03, 0x3b, 0x8c, 0xc0, 0x3d, 0x3d, 0x9c, 0xaa, 0x90,
+ 0xbd, 0xb5, 0x9f, 0xab, 0x3c, 0x45, 0x77, 0x31, 0xbd, 0xea, 0x85, 0x8e, 0xbd,
+ 0x15, 0x6d, 0x8b, 0xbc, 0xb9, 0x98, 0xb1, 0xbc, 0x09, 0x9b, 0xff, 0x3c, 0x1e,
+ 0xcf, 0x3c, 0x3d, 0x3c, 0xe3, 0x2a, 0xbd, 0x2a, 0xff, 0x20, 0x3d, 0xbb, 0x1c,
+ 0x4a, 0x3b, 0x8f, 0x19, 0x83, 0xbd, 0xad, 0x9f, 0xe5, 0x3c, 0x43, 0x3d, 0x44,
+ 0x3d, 0xaa, 0xb9, 0xe3, 0x3c, 0x8c, 0xd1, 0x86, 0x3d, 0xfa, 0x93, 0x7c, 0x3d,
+ 0x31, 0xe5, 0x67, 0xbc, 0x3f, 0x25, 0x8a, 0xbd, 0x90, 0x91, 0x5e, 0x3b, 0xbf,
+ 0xd8, 0xfe, 0xbc, 0x68, 0xaa, 0x85, 0x3c, 0xb3, 0xb6, 0x07, 0xbd, 0x6f, 0x51,
+ 0x91, 0xbd, 0x3c, 0x5d, 0xc8, 0xbc, 0xba, 0xf5, 0xd3, 0xbb, 0x8d, 0x90, 0xd5,
+ 0xbc, 0x02, 0x78, 0x2f, 0xbc, 0x12, 0x94, 0x10, 0x3d, 0xb2, 0x26, 0x82, 0xbd,
+ 0x49, 0x2a, 0x70, 0x3d, 0x9c, 0xf4, 0x67, 0xbd, 0x8d, 0x33, 0xf3, 0xbc, 0x22,
+ 0xa0, 0xc3, 0x3c, 0x38, 0xb2, 0x31, 0x3d, 0x71, 0xe9, 0x87, 0xbd, 0x7c, 0xc5,
+ 0x96, 0xbd, 0x5b, 0x13, 0xa5, 0xbc, 0x2d, 0x8a, 0x8a, 0x3d, 0x80, 0xc2, 0x24,
+ 0x3d, 0x1e, 0xc5, 0x74, 0x3d, 0xec, 0x3a, 0xca, 0x3c, 0x37, 0xb4, 0x00, 0xbc,
+ 0x29, 0xe2, 0x0c, 0x3d, 0xbc, 0x36, 0x20, 0x3d, 0x58, 0x3a, 0x5f, 0x3d, 0x8a,
+ 0xe4, 0x24, 0xbd, 0x22, 0x99, 0x45, 0xbd, 0xbe, 0xef, 0x0d, 0xbd, 0xbe, 0xae,
+ 0x0f, 0xbc, 0xe1, 0xe9, 0x4e, 0x3c, 0xd2, 0xed, 0x54, 0xbd, 0x62, 0xcb, 0x7d,
+ 0x3c, 0xc8, 0xe4, 0x0d, 0xbc, 0x61, 0xaa, 0xa8, 0x3b, 0x68, 0x56, 0x92, 0xbb,
+ 0x83, 0xb3, 0x25, 0xbd, 0x0a, 0x28, 0x39, 0xbd, 0x9d, 0xd4, 0x13, 0x3c, 0x5c,
+ 0x3c, 0x27, 0x3d, 0x34, 0x21, 0x30, 0x3d, 0x9d, 0xac, 0x54, 0xbd, 0xaa, 0xe8,
+ 0x60, 0x3d, 0xb4, 0xaf, 0xe5, 0x3c, 0xb0, 0x22, 0x1d, 0x3d, 0x9c, 0x7e, 0x64,
+ 0x3d, 0x3e, 0xd9, 0x7b, 0x3d, 0x55, 0x9e, 0x46, 0x3d, 0x47, 0xf9, 0xfe, 0x3a,
+ 0x00, 0xf0, 0x79, 0xbc, 0x49, 0x93, 0xd5, 0xbb, 0x98, 0x75, 0x29, 0xbc, 0xfb,
+ 0xdc, 0x37, 0xbd, 0x9a, 0x0e, 0x65, 0x3d, 0x7a, 0x74, 0x93, 0xbd, 0x39, 0x83,
+ 0xba, 0x3c, 0x20, 0xa3, 0x94, 0xbd, 0xbf, 0x32, 0x18, 0xbc, 0xbd, 0x90, 0x19,
+ 0x3c, 0x31, 0xbe, 0x94, 0xbd, 0x1f, 0xd5, 0x9b, 0x3a, 0x09, 0xa3, 0x44, 0xbd,
+ 0xe4, 0x91, 0xae, 0xbc, 0x98, 0x84, 0x73, 0xbd, 0xe6, 0x64, 0x70, 0x3d, 0xcc,
+ 0x0d, 0x01, 0xbd, 0xb0, 0xd6, 0xce, 0x3c, 0x2a, 0x8b, 0x78, 0xbd, 0x51, 0x8a,
+ 0xcd, 0x3c, 0x76, 0x3b, 0x0b, 0x3b, 0x85, 0xe3, 0x76, 0xbd, 0xad, 0x98, 0x6f,
+ 0x3d, 0xf8, 0xa1, 0x92, 0xbd, 0x22, 0xb9, 0x24, 0xbd, 0x81, 0xf4, 0x62, 0xbd,
+ 0xeb, 0x97, 0x83, 0x3d, 0x0d, 0xa9, 0x91, 0x3a, 0x62, 0x88, 0x0c, 0xbc, 0x99,
+ 0x64, 0x48, 0x3d, 0x0b, 0x11, 0x80, 0xba, 0x94, 0xe3, 0x70, 0xbc, 0xa3, 0x42,
+ 0x56, 0x3c, 0x1c, 0x41, 0xec, 0x3c, 0x68, 0x56, 0x29, 0x3c, 0x50, 0x4a, 0x05,
+ 0x3d, 0xfa, 0x33, 0x37, 0x3d, 0x5d, 0x7c, 0x8d, 0x3d, 0xa8, 0x02, 0x3f, 0x3c,
+ 0xa6, 0x1d, 0x68, 0x3d, 0x41, 0x3b, 0x76, 0x3d, 0x29, 0xa1, 0x56, 0xbd, 0xbd,
+ 0x90, 0x7c, 0x3b, 0xd9, 0x96, 0x62, 0xbd, 0xf2, 0x15, 0xd8, 0xbc, 0xad, 0x62,
+ 0x38, 0x3d, 0x19, 0xc7, 0x0d, 0x3d, 0xda, 0xcc, 0xf8, 0x3b, 0x63, 0xaf, 0x84,
+ 0xbd, 0x42, 0x94, 0x3f, 0xbc, 0x60, 0x67, 0x83, 0x3d, 0x13, 0xdb, 0xa8, 0x3c,
+ 0x8f, 0xcb, 0x5e, 0x3d, 0x97, 0x69, 0x14, 0xbd, 0xd5, 0x52, 0x97, 0x3c, 0x28,
+ 0xb2, 0x09, 0xbb, 0xd0, 0x5c, 0x0f, 0x3d, 0x08, 0x01, 0x38, 0xbd, 0x2a, 0xd1,
+ 0x75, 0xbd, 0xb6, 0x48, 0x5e, 0xbd, 0xe6, 0x3a, 0x40, 0x3d, 0x91, 0x52, 0xb5,
+ 0x3c, 0xe6, 0xe6, 0x2f, 0x3d, 0x7b, 0x0a, 0x0b, 0x3d, 0x05, 0xa6, 0xf1, 0xbb,
+ 0xe5, 0x14, 0x12, 0x3c, 0x70, 0x4a, 0x61, 0xbd, 0xc0, 0xd5, 0x77, 0x3c, 0xea,
+ 0x92, 0x4e, 0x3d, 0xe8, 0xea, 0x7a, 0x3c, 0x85, 0xec, 0x8d, 0xbc, 0x1f, 0x06,
+ 0x3a, 0x3d, 0x24, 0x7d, 0x43, 0x3c, 0x3b, 0xfb, 0x4e, 0x3d, 0x10, 0xdb, 0x26,
+ 0xbc, 0x3c, 0xe4, 0x44, 0x3d, 0x5f, 0x54, 0xe6, 0x3c, 0x32, 0x15, 0xdf, 0xbc,
+ 0x07, 0x77, 0x1f, 0x3d, 0x68, 0x58, 0xea, 0x3c, 0xbe, 0x48, 0x90, 0xbc, 0x42,
+ 0x47, 0x35, 0x3d, 0x21, 0x06, 0x7d, 0xbd, 0x96, 0xd4, 0x67, 0x3c, 0x17, 0x5e,
+ 0x79, 0x3b, 0xd0, 0x09, 0x93, 0xbd, 0xaf, 0x34, 0x3d, 0x3d, 0xc6, 0xd3, 0x8f,
+ 0xbc, 0xae, 0x06, 0x0c, 0x3c, 0x84, 0xeb, 0x04, 0xbd, 0x44, 0xf4, 0x2e, 0xbd,
+ 0xad, 0x8d, 0x61, 0x3c, 0xb0, 0x1e, 0xaf, 0xb9, 0xb6, 0xd3, 0x57, 0xbc, 0x78,
+ 0x89, 0x97, 0x3c, 0x39, 0xa2, 0x41, 0xbd, 0x1c, 0xb3, 0x30, 0xbd, 0x44, 0xc4,
+ 0x90, 0x3c, 0xa3, 0x43, 0x03, 0xbd, 0xe0, 0xe2, 0xc4, 0xbb, 0xf0, 0xf3, 0x4d,
+ 0x3c, 0x6c, 0xf3, 0x85, 0x3d, 0x8f, 0xa9, 0x56, 0xbd, 0x36, 0x75, 0x5c, 0x3d,
+ 0x7e, 0x57, 0x89, 0x3c, 0x3a, 0xb8, 0x29, 0x3c, 0x2c, 0x10, 0x40, 0xbd, 0x5f,
+ 0x74, 0x32, 0xbd, 0xaf, 0x9e, 0x09, 0xbd, 0x60, 0xe4, 0x4b, 0xbd, 0x49, 0xb4,
+ 0xd7, 0x3c, 0xa0, 0x1f, 0x31, 0xbd, 0xd6, 0x5e, 0xde, 0x3c, 0x4e, 0xb1, 0xdb,
+ 0xbc, 0x98, 0x5a, 0x1e, 0x3d, 0x03, 0xe2, 0xa0, 0xba, 0x76, 0xc1, 0x63, 0xbd,
+ 0xbd, 0x03, 0xcf, 0x3c, 0xde, 0x4d, 0x22, 0x3d, 0x6a, 0x58, 0x5c, 0xbb, 0xc3,
+ 0xb8, 0x19, 0xbd, 0xf3, 0x01, 0x8f, 0x3d, 0x40, 0x62, 0xdc, 0x3b, 0x58, 0x64,
+ 0xa0, 0xbc, 0xdc, 0xd4, 0x6d, 0x3d, 0x62, 0x98, 0x1d, 0xbd, 0x96, 0x88, 0x4d,
+ 0x3b, 0x0e, 0xab, 0x46, 0x3d, 0xcb, 0xee, 0xce, 0x3b, 0xc5, 0x27, 0xe2, 0xbb,
+ 0xe4, 0xe4, 0x1c, 0x3d, 0x75, 0x86, 0x08, 0xbd, 0xf0, 0xce, 0x1c, 0x3d, 0xcb,
+ 0x9d, 0x7a, 0x3d, 0x24, 0x56, 0x42, 0xbc, 0x3a, 0x7f, 0xc4, 0xbc, 0x6e, 0xfd,
+ 0x6e, 0x3d, 0xa1, 0x3f, 0x80, 0x3d, 0xfb, 0x13, 0xc9, 0xbc, 0x5f, 0x8f, 0xb9,
+ 0x3c, 0xe3, 0xde, 0x94, 0xbd, 0x9f, 0x88, 0x88, 0xbd, 0x79, 0x27, 0x71, 0x3d,
+ 0xeb, 0xc8, 0x36, 0x3d, 0xe7, 0x2c, 0x9e, 0xbc, 0xb1, 0x19, 0x4d, 0xbd, 0x1e,
+ 0x82, 0x79, 0x3d, 0x75, 0xfe, 0x94, 0xbd, 0xdc, 0xd7, 0x96, 0xbd, 0x3a, 0x57,
+ 0x84, 0x3d, 0x70, 0xcd, 0x09, 0xbd, 0x08, 0xd9, 0x01, 0xbd, 0xa6, 0x1a, 0x85,
+ 0x3d, 0x5e, 0x34, 0xec, 0xbc, 0x3c, 0x0f, 0xa6, 0xbc, 0x0a, 0xc2, 0x6f, 0x3d,
+ 0x72, 0x1c, 0x89, 0x3d, 0xb0, 0x55, 0x12, 0xbd, 0x71, 0x87, 0x1f, 0x3d, 0x03,
+ 0xf0, 0x07, 0x3c, 0x52, 0x7d, 0x29, 0x3d, 0xe0, 0x13, 0x55, 0xbc, 0xe0, 0xac,
+ 0xbb, 0x3c, 0x36, 0x1f, 0x58, 0x3d, 0x34, 0x2f, 0xe3, 0x3c, 0xb5, 0xb7, 0x89,
+ 0xbc, 0x06, 0xfa, 0x93, 0xbd, 0xe7, 0x2e, 0x20, 0xbc, 0xc8, 0x71, 0x4c, 0x3d,
+ 0x03, 0x3b, 0xf6, 0xbb, 0x1c, 0xf7, 0x24, 0x3d, 0x88, 0x07, 0x09, 0x3d, 0xa6,
+ 0x16, 0xde, 0xbc, 0xd4, 0xfa, 0xf5, 0xbc, 0x2e, 0x35, 0x3f, 0x3d, 0x22, 0x36,
+ 0x5c, 0xbd, 0x99, 0xea, 0x90, 0x3d, 0x7c, 0xfd, 0xe6, 0x3c, 0xda, 0x89, 0x2e,
+ 0x3d, 0xea, 0x83, 0x39, 0x3c, 0xe2, 0x35, 0x12, 0x3d, 0xa6, 0xee, 0x46, 0x3d,
+ 0x7b, 0x4e, 0x36, 0xbd, 0x0a, 0x6d, 0xd1, 0x3b, 0x90, 0x59, 0x08, 0xbc, 0x3e,
+ 0xee, 0x86, 0x3b, 0x18, 0x92, 0x13, 0x3d, 0x71, 0xd5, 0x69, 0x3c, 0x5f, 0xc2,
+ 0x8d, 0xbd, 0xb0, 0x51, 0x81, 0x3c, 0x5a, 0x81, 0x9e, 0x3c, 0xcf, 0xae, 0x13,
+ 0x3d, 0xa4, 0x0d, 0x54, 0x3d, 0xb6, 0x82, 0x77, 0x3d, 0x6a, 0x20, 0xf7, 0xbc,
+ 0x60, 0xcc, 0x56, 0xbd, 0x45, 0x8f, 0x23, 0xbd, 0x92, 0x5c, 0x69, 0xbc, 0x8d,
+ 0xb5, 0x5d, 0xbd, 0x39, 0x60, 0x29, 0xbc, 0x06, 0x25, 0x6b, 0x3c, 0xad, 0x40,
+ 0x32, 0xbd, 0xcd, 0xbe, 0xf3, 0xbc, 0x7e, 0xd6, 0x74, 0x3d, 0x2e, 0x72, 0x63,
+ 0x3d, 0xc3, 0xaa, 0x0c, 0xbd, 0x74, 0xfc, 0x6a, 0xbd, 0xff, 0xa6, 0x7b, 0x3d,
+ 0xa8, 0x4f, 0xec, 0xbc, 0x8a, 0x91, 0x39, 0xbd, 0xd1, 0xa4, 0x7b, 0x3d, 0xff,
+ 0x3a, 0x99, 0x3b, 0xe9, 0xd2, 0x4e, 0xbd, 0xc6, 0x84, 0x1e, 0x3d, 0xe7, 0x73,
+ 0xdf, 0xbc, 0x88, 0xfb, 0x08, 0x3d, 0xf9, 0x98, 0xa2, 0xbc, 0x41, 0x1d, 0x8d,
+ 0x3d, 0xe6, 0x32, 0x38, 0x3d, 0x5f, 0xea, 0x1a, 0xbd, 0xce, 0x8f, 0x92, 0xbd,
+ 0xea, 0x1f, 0x69, 0x3d, 0x5b, 0x6e, 0x58, 0xbc, 0x6d, 0xfc, 0x2d, 0x3d, 0xa9,
+ 0x01, 0x83, 0x3d, 0xbc, 0xdb, 0x53, 0x3d, 0x70, 0xea, 0x72, 0xbd, 0xa4, 0xc0,
+ 0xae, 0xbc, 0x80, 0x8a, 0x54, 0x3a, 0x4a, 0x00, 0x80, 0xbc, 0x4a, 0x66, 0x78,
+ 0xbc, 0xbe, 0x62, 0x79, 0xbd, 0xe8, 0x24, 0x84, 0xbc, 0x0d, 0xef, 0x0f, 0x3d,
+ 0xa9, 0xa6, 0x26, 0x3d, 0xb8, 0x68, 0x83, 0xbd, 0xe2, 0x7b, 0x27, 0xbd, 0xdc,
+ 0xda, 0x80, 0xbd, 0x5e, 0x50, 0x88, 0xbd, 0x76, 0x41, 0x8d, 0x3d, 0xee, 0x0a,
+ 0x95, 0xbc, 0xc4, 0x0b, 0x41, 0x3c, 0x6e, 0x16, 0xe0, 0xbc, 0xb2, 0x34, 0x58,
+ 0x3d, 0x65, 0xd4, 0x06, 0x3d, 0x8a, 0x8a, 0x18, 0xbd, 0x99, 0xdd, 0x47, 0x3d,
+ 0x2b, 0xec, 0x00, 0x3d, 0xc3, 0xb1, 0xad, 0xb9, 0xf9, 0x57, 0x77, 0x3c, 0xae,
+ 0xc6, 0x8a, 0xbd, 0x55, 0x51, 0x43, 0x3d, 0x34, 0xd3, 0x1b, 0xbd, 0xda, 0x9e,
+ 0x47, 0x3d, 0xe5, 0x3a, 0x1f, 0x3d, 0x6d, 0xf2, 0x59, 0x3d, 0x14, 0x27, 0xb7,
+ 0xbc, 0xb0, 0x72, 0x8f, 0x3d, 0xbe, 0x91, 0x83, 0xbd, 0xbb, 0x8f, 0x39, 0xbd,
+ 0x40, 0x7f, 0x7e, 0xbd, 0x2d, 0x3e, 0x86, 0x3b, 0xca, 0x43, 0x29, 0xbc, 0xe2,
+ 0xb8, 0x4d, 0x3d, 0x48, 0x31, 0x85, 0xbd, 0xcb, 0x54, 0x1b, 0x3d, 0xb4, 0xc8,
+ 0x56, 0x3d, 0x09, 0x2f, 0x1d, 0x3d, 0xca, 0x8f, 0x10, 0x3d, 0xe1, 0x8d, 0x4c,
+ 0x3a, 0xdb, 0x4d, 0xd2, 0xbc, 0x4a, 0xc7, 0xd1, 0xbc, 0xc8, 0x03, 0xfa, 0x3c,
+ 0x4e, 0x3f, 0xa4, 0xbc, 0x5f, 0x9e, 0x90, 0xbd, 0x13, 0x82, 0xc0, 0x3c, 0x59,
+ 0x55, 0x54, 0x3c, 0xb6, 0x95, 0xa5, 0xbb, 0xef, 0x59, 0xa4, 0x3b, 0x7e, 0x93,
+ 0x1e, 0xbd, 0xaf, 0x49, 0x81, 0xbc, 0xe7, 0xd1, 0xc6, 0xbb, 0xc0, 0xa3, 0xc9,
+ 0x3b, 0x53, 0xa9, 0x77, 0xbb, 0xfa, 0x26, 0x74, 0xbc, 0x06, 0x1b, 0x63, 0x3d,
+ 0xe4, 0x90, 0x0a, 0xbd, 0x64, 0x50, 0x31, 0x3d, 0xff, 0x66, 0x82, 0x3d, 0x9d,
+ 0x1c, 0x06, 0xbd, 0x38, 0x29, 0x40, 0xbd, 0x6f, 0xea, 0x89, 0x3d, 0xdc, 0x8a,
+ 0x3f, 0xbd, 0xd1, 0x88, 0x02, 0x3d, 0x2f, 0x23, 0x27, 0x3c, 0x9c, 0x85, 0x56,
+ 0x3d, 0x41, 0xc7, 0x41, 0xbd, 0x67, 0x51, 0x49, 0x3c, 0x5f, 0x41, 0xf9, 0xbb,
+ 0x15, 0x37, 0xdb, 0xbc, 0x51, 0x7a, 0xd9, 0x3a, 0x05, 0xc0, 0x90, 0xbd, 0x8f,
+ 0xdb, 0x84, 0xbd, 0x3a, 0xc1, 0x48, 0xb9, 0x22, 0x3c, 0xfb, 0x3c, 0x7d, 0xf5,
+ 0x14, 0xbd, 0x26, 0xe6, 0x53, 0xbc, 0xde, 0x94, 0xa0, 0xbc, 0xd9, 0xc4, 0x5e,
+ 0x3d, 0xd4, 0xcf, 0xa6, 0xba, 0xfa, 0x43, 0x18, 0xbd, 0xee, 0x62, 0x19, 0xbd,
+ 0xfb, 0x61, 0x66, 0xbb, 0x1e, 0x8b, 0x82, 0xbd, 0x26, 0xec, 0x87, 0xbd, 0xc2,
+ 0xf6, 0x04, 0x3d, 0x2b, 0x2e, 0xe4, 0xbc, 0x60, 0xa6, 0x4e, 0x3d, 0x21, 0x99,
+ 0x5c, 0x3d, 0xdd, 0xde, 0x37, 0x3d, 0x8e, 0xfc, 0xf5, 0x3c, 0x6d, 0x33, 0xc2,
+ 0x39, 0x48, 0xea, 0x34, 0x3d, 0x79, 0x3e, 0x85, 0xbd, 0x20, 0xb1, 0x3d, 0xbb,
+ 0xdc, 0xe9, 0x64, 0xbc, 0xd2, 0xac, 0x4a, 0xbd, 0x1a, 0x4a, 0x8d, 0xbd, 0xb5,
+ 0xa2, 0xf3, 0x3c, 0xcd, 0x54, 0xb6, 0xbc, 0xc1, 0x9b, 0x2c, 0x3c, 0xd0, 0xea,
+ 0xad, 0xbc, 0x3f, 0xbc, 0x7f, 0x3c, 0xde, 0xe3, 0xe9, 0xbc, 0x1e, 0x28, 0x6f,
+ 0xbc, 0xd1, 0xce, 0xfe, 0xbc, 0xcc, 0x16, 0x21, 0x3d, 0x2a, 0x10, 0x18, 0xbd,
+ 0x5e, 0x73, 0xe9, 0xbb, 0xb3, 0x67, 0xa1, 0xbb, 0x94, 0x7d, 0x0d, 0x3c, 0x1d,
+ 0x67, 0x3b, 0xbd, 0xa9, 0xb9, 0x84, 0x3c, 0xe1, 0xc1, 0x89, 0xba, 0x49, 0x7f,
+ 0x91, 0xbd, 0x47, 0xf8, 0x57, 0xbc, 0x00, 0x6a, 0x24, 0x3d, 0x61, 0x71, 0x6f,
+ 0x3c, 0xd7, 0x6e, 0x4e, 0xbc, 0x07, 0xda, 0x60, 0xbb, 0x2d, 0xd9, 0x8e, 0x3d,
+ 0x0d, 0x9d, 0xc5, 0x3b, 0x50, 0x74, 0xe2, 0xbc, 0xaf, 0x90, 0x2d, 0xbd, 0xce,
+ 0x93, 0x2a, 0x3d, 0x56, 0xee, 0xee, 0xbc, 0x62, 0x58, 0x0a, 0x3d, 0x25, 0x7c,
+ 0x64, 0x3d, 0x23, 0x8d, 0x80, 0x3d, 0x3b, 0xfd, 0x55, 0xbd, 0x8f, 0x71, 0xe2,
+ 0xbc, 0x9c, 0xae, 0x07, 0x3d, 0x0e, 0xe4, 0xdd, 0xbc, 0x93, 0xc9, 0xd7, 0x3c,
+ 0x87, 0x9c, 0xe5, 0xbb, 0xa3, 0xd5, 0x5d, 0x3d, 0x23, 0xdb, 0x3a, 0xbd, 0x67,
+ 0xb3, 0x1a, 0x3d, 0x9e, 0xa1, 0x6b, 0x3d, 0x93, 0x17, 0xc2, 0xbc, 0x0c, 0xb7,
+ 0x33, 0xbd, 0xc0, 0xba, 0xeb, 0xbc, 0x16, 0x2c, 0x4d, 0xbd, 0xed, 0x60, 0x78,
+ 0x3c, 0x54, 0xa3, 0x93, 0xbd, 0x62, 0xa6, 0x8a, 0xbd, 0xdc, 0x16, 0x25, 0xbd,
+ 0xa9, 0xaf, 0x76, 0xbd, 0xab, 0x3c, 0x5d, 0xbd, 0xcf, 0x78, 0x9c, 0x3c, 0x74,
+ 0xf2, 0x97, 0x3c, 0xaa, 0x5d, 0x3b, 0x3d, 0x9c, 0xd2, 0xef, 0x3c, 0xd8, 0x6a,
+ 0x37, 0x3c, 0x44, 0xd2, 0xb9, 0xbc, 0x41, 0x5d, 0x7e, 0x3d, 0x74, 0x3c, 0x7d,
+ 0xbd, 0x40, 0x08, 0x0c, 0xbd, 0xbb, 0xc3, 0x04, 0xbd, 0xd7, 0xd3, 0x5d, 0xbd,
+ 0x41, 0xe7, 0x7c, 0x3d, 0x65, 0x20, 0x6f, 0x3b, 0x4e, 0xef, 0x81, 0x3a, 0xae,
+ 0xe0, 0x5d, 0xbd, 0x3f, 0xfb, 0x82, 0xbd, 0xf1, 0xc5, 0x58, 0xbd, 0x96, 0xab,
+ 0x45, 0x3b, 0x97, 0x5f, 0xcd, 0x3b, 0x39, 0x48, 0x5b, 0x3b, 0x6d, 0xf0, 0x28,
+ 0xbd, 0x08, 0xcc, 0x9f, 0x3c, 0x21, 0xd5, 0x2b, 0xbd, 0xc1, 0xe3, 0x1c, 0x3d,
+ 0x86, 0x52, 0xb4, 0x3c, 0x02, 0xd4, 0xc6, 0xbc, 0xbe, 0xab, 0x27, 0xbd, 0x18,
+ 0x8f, 0x84, 0x3c, 0x7d, 0x47, 0x2e, 0x3d, 0x0a, 0x58, 0x9c, 0x3b, 0x52, 0x72,
+ 0xe4, 0xbc, 0x98, 0x57, 0x5e, 0x3c, 0x24, 0xf1, 0x04, 0xbc, 0x3b, 0xec, 0x0f,
+ 0xbd, 0xf5, 0x54, 0x13, 0x3d, 0x6f, 0xf9, 0x80, 0x3c, 0x80, 0x19, 0xa2, 0xbc,
+ 0xfa, 0x89, 0x35, 0x3d, 0xd8, 0x61, 0x82, 0x3c, 0x21, 0x81, 0x8b, 0x3d, 0x40,
+ 0x2d, 0x65, 0xbc, 0xc6, 0x21, 0x61, 0x3d, 0x51, 0x3d, 0xa9, 0xbc, 0x47, 0x12,
+ 0x55, 0x3d, 0x7e, 0x85, 0x71, 0xbd, 0x22, 0x14, 0x05, 0x3d, 0x94, 0x35, 0x97,
+ 0xbd, 0x3c, 0x00, 0x86, 0xbd, 0x3a, 0x46, 0x5f, 0x3d, 0x18, 0x14, 0x06, 0xbd,
+ 0xb4, 0xea, 0x8c, 0xbd, 0xdc, 0x2e, 0xfe, 0x3b, 0x21, 0x96, 0x3d, 0xbd, 0x3a,
+ 0xf6, 0x8b, 0xbc, 0x3a, 0x3b, 0x6d, 0xbb, 0x39, 0x87, 0x13, 0x3c, 0x15, 0xbc,
+ 0x92, 0xbd, 0x24, 0xb7, 0x13, 0x3d, 0x9c, 0x66, 0x7a, 0xbd, 0x6b, 0xf2, 0x41,
+ 0xbd, 0x1d, 0x15, 0x6a, 0xbc, 0x20, 0x2a, 0x73, 0x3d, 0x25, 0x95, 0x40, 0x3d,
+ 0x23, 0x8f, 0x90, 0xbd, 0xd6, 0x95, 0xa7, 0xbc, 0xbe, 0xce, 0x4f, 0x3d, 0xaf,
+ 0xe0, 0x3f, 0x3d, 0x1b, 0x9f, 0x47, 0x3c, 0x57, 0x37, 0x14, 0x3d, 0x33, 0x06,
+ 0x86, 0x3d, 0xe5, 0x3c, 0x77, 0x3d, 0x60, 0x46, 0x95, 0x3b, 0xee, 0xd2, 0x97,
+ 0xbc, 0x38, 0x20, 0x9c, 0x3c, 0xe6, 0x90, 0xdf, 0xba, 0x77, 0x4f, 0x30, 0x3d,
+ 0x54, 0x87, 0x03, 0x3d, 0x86, 0x7c, 0x25, 0x3d, 0xdb, 0x5a, 0x18, 0x3d, 0x60,
+ 0x84, 0xf9, 0xbc, 0x84, 0x3c, 0xd0, 0xbc, 0xe9, 0x8c, 0x87, 0xbb, 0x39, 0xb9,
+ 0x81, 0x3d, 0x2e, 0x3e, 0x67, 0x3d, 0x5d, 0x57, 0xf8, 0xba, 0x60, 0x31, 0x38,
+ 0x3c, 0xf4, 0x31, 0x02, 0xbd, 0x31, 0x10, 0x98, 0x3c, 0x85, 0x28, 0x16, 0x3d,
+ 0xc5, 0xcd, 0xef, 0x3c, 0x92, 0x8d, 0x59, 0x3d, 0x6a, 0x54, 0x27, 0xbc, 0x72,
+ 0x4a, 0xf7, 0xbc, 0x0d, 0x8d, 0x81, 0x3d, 0xbd, 0x74, 0x8f, 0xbd, 0x80, 0xed,
+ 0x5c, 0x3b, 0xbe, 0x52, 0x7e, 0x3d, 0x49, 0x3f, 0x28, 0xbd, 0xcc, 0xc5, 0xea,
+ 0xbc, 0x2f, 0x46, 0x6b, 0xbd, 0x05, 0xd4, 0x0c, 0xbc, 0x41, 0x09, 0x02, 0x3d,
+ 0x2e, 0xa8, 0x53, 0xbc, 0xc7, 0x56, 0x56, 0xbd, 0xc2, 0x01, 0x88, 0xbd, 0x7a,
+ 0x9c, 0x6f, 0x3d, 0x3c, 0x49, 0x1c, 0x3d, 0x2b, 0x80, 0xe3, 0x3b, 0x43, 0x27,
+ 0x7d, 0x3d, 0x91, 0xa0, 0x58, 0x3d, 0xdb, 0x70, 0x76, 0xbc, 0xc4, 0xfa, 0x04,
+ 0xbd, 0x5e, 0x76, 0xcc, 0x3b, 0x0a, 0xcf, 0xc0, 0xbc, 0xfa, 0x3f, 0x08, 0xbd,
+ 0x26, 0x65, 0xaa, 0x3c, 0x2f, 0xec, 0x37, 0x3d, 0xa0, 0xae, 0x51, 0x3d, 0xbd,
+ 0x0e, 0x4e, 0x3d, 0x4d, 0x36, 0xae, 0xbc, 0xf1, 0xc8, 0x3f, 0xbd, 0x79, 0xe5,
+ 0x84, 0xbc, 0xac, 0x19, 0xf7, 0x3b, 0x5f, 0x52, 0x70, 0xbd, 0x46, 0x15, 0x01,
+ 0xbd, 0x17, 0xb1, 0xb1, 0x3c, 0x2e, 0x19, 0x87, 0xbd, 0x0c, 0xe6, 0x98, 0x3c,
+ 0x35, 0xd0, 0x22, 0xbd, 0xe3, 0x8f, 0x8a, 0xbd, 0x23, 0x8b, 0xfa, 0x3c, 0x01,
+ 0x67, 0x80, 0x3d, 0x6c, 0x9e, 0xb2, 0x3a, 0x6b, 0xbe, 0x8b, 0x3d, 0x74, 0x68,
+ 0xdb, 0x3c, 0x4c, 0x13, 0xae, 0xbc, 0x94, 0xfe, 0x50, 0xbd, 0xdc, 0x7e, 0x2f,
+ 0x3d, 0x78, 0x0a, 0x6e, 0xbc, 0x0e, 0x2b, 0xe9, 0xbc, 0x3b, 0x4b, 0x08, 0x3d,
+ 0x4d, 0x1a, 0x3d, 0xbd, 0x55, 0x7e, 0x51, 0xbb, 0x15, 0xa6, 0xb4, 0xbc, 0xac,
+ 0x1b, 0x86, 0xbb, 0x8a, 0x27, 0x22, 0x3d, 0x39, 0xc8, 0x34, 0xbc, 0x65, 0x0e,
+ 0x1a, 0xbb, 0x4c, 0x08, 0xdb, 0x3b, 0x60, 0x75, 0x2d, 0xbc, 0x25, 0xba, 0x64,
+ 0xbc, 0x8c, 0x05, 0x70, 0x3d, 0x0e, 0xdc, 0xaa, 0xbc, 0x63, 0x17, 0x03, 0x3d,
+ 0x03, 0x9d, 0x36, 0x3c, 0xe3, 0xf5, 0x6e, 0x3d, 0x01, 0xf8, 0x12, 0xbd, 0x15,
+ 0x62, 0xb3, 0x3c, 0xe1, 0x20, 0x1f, 0x3d, 0xbd, 0x41, 0x8d, 0x3d, 0x7b, 0x02,
+ 0x47, 0x3d, 0x8e, 0x9c, 0x93, 0xbc, 0x82, 0xa1, 0x81, 0xbd, 0xb9, 0x59, 0x6e,
+ 0x3c, 0xc6, 0x93, 0x07, 0xbd, 0x4c, 0x87, 0x44, 0x3d, 0x6a, 0x66, 0x49, 0xbd,
+ 0x80, 0xd5, 0x4b, 0xbb, 0x70, 0xd5, 0x09, 0x3c, 0x20, 0x85, 0x06, 0x3c, 0x7e,
+ 0xd6, 0x42, 0x3d, 0x5d, 0x10, 0x01, 0x3c, 0x71, 0xbe, 0x6c, 0xbc, 0xcc, 0xba,
+ 0x2d, 0xbd, 0xbf, 0xf6, 0x90, 0xbd, 0x59, 0xb8, 0x8c, 0x3d, 0x4a, 0xe8, 0x87,
+ 0xbc, 0xee, 0xd3, 0xd1, 0x3c, 0xde, 0xdd, 0xa6, 0xbb, 0x26, 0x06, 0x6a, 0xbc,
+ 0x1f, 0xa2, 0x88, 0xbd, 0x00, 0x6c, 0x24, 0xbb, 0x36, 0xf0, 0x00, 0x3c, 0x1e,
+ 0x54, 0x86, 0xbb, 0x55, 0x5e, 0x01, 0xbc, 0x3e, 0x0e, 0xe8, 0x3c, 0xbd, 0x02,
+ 0x70, 0xbb, 0x8e, 0xb9, 0x85, 0x3d, 0x8e, 0x8a, 0x5d, 0xbb, 0xa4, 0x21, 0x13,
+ 0x3d, 0xd1, 0x77, 0x16, 0xbc, 0x40, 0x95, 0x1d, 0x3c, 0x58, 0x2f, 0xbb, 0x3c,
+ 0xf5, 0x88, 0x86, 0xbb, 0xa0, 0x02, 0x83, 0xbd, 0x93, 0xb8, 0x0a, 0x3c, 0xfd,
+ 0x65, 0xe2, 0xbb, 0x24, 0x21, 0x11, 0x3d, 0xc6, 0x89, 0x8c, 0xbd, 0xc3, 0xa9,
+ 0x7a, 0xbd, 0x43, 0xcf, 0x81, 0xbd, 0xde, 0x81, 0x58, 0xbd, 0x3d, 0x35, 0x23,
+ 0x3d, 0xbe, 0x81, 0x90, 0xbd, 0xd3, 0xd2, 0xbb, 0x3c, 0x60, 0x68, 0xe5, 0xbc,
+ 0x25, 0x64, 0xa8, 0xbb, 0x8e, 0x5e, 0x4e, 0xbd, 0xc3, 0xa4, 0xd3, 0xbc, 0xb0,
+ 0x99, 0xf7, 0xbc, 0x2d, 0x56, 0x17, 0xbd, 0x44, 0x65, 0x2b, 0x3d, 0xa7, 0x80,
+ 0x05, 0xbd, 0xfc, 0xe1, 0x02, 0x3d, 0x65, 0xa7, 0x68, 0x3d, 0x52, 0x5d, 0x8b,
+ 0xbd, 0x6a, 0x9e, 0x83, 0xbd, 0xd4, 0xac, 0x1a, 0xbc, 0x3e, 0x6b, 0x7d, 0xbc,
+ 0xeb, 0xff, 0x40, 0xbd, 0xcd, 0xd2, 0x21, 0x3d, 0x7e, 0xf1, 0x70, 0xbd, 0x9b,
+ 0xc6, 0x6a, 0xbb, 0x1e, 0xb9, 0x20, 0x3d, 0xfd, 0x9b, 0x61, 0xbd, 0x57, 0xf3,
+ 0x5a, 0xbd, 0x5d, 0xbe, 0xbb, 0x3b, 0xd3, 0xc8, 0x50, 0xbd, 0x38, 0x8a, 0x5e,
+ 0xbd, 0x86, 0x65, 0x57, 0x3d, 0x02, 0xc7, 0x85, 0xbd, 0x95, 0x0a, 0x80, 0x3d,
+ 0x08, 0xcd, 0x66, 0x3c, 0x68, 0x38, 0x3d, 0x3c, 0xad, 0x64, 0x12, 0xbd, 0x20,
+ 0x0d, 0xcc, 0x3c, 0x63, 0x2c, 0x3f, 0x3d, 0xf6, 0xe1, 0xdc, 0x3c, 0x5f, 0xa6,
+ 0x35, 0x3d, 0x7b, 0xf6, 0x68, 0xbd, 0x9e, 0x65, 0xd2, 0x3c, 0x13, 0x63, 0x9d,
+ 0xbb, 0xd6, 0x42, 0x51, 0xbc, 0xa2, 0xc5, 0x52, 0xbc, 0x6a, 0x3d, 0x3f, 0x3d,
+ 0xa6, 0xde, 0xf8, 0xbc, 0x01, 0xa1, 0x5b, 0x3d, 0x8d, 0xdf, 0x16, 0xbd, 0x62,
+ 0x4d, 0x35, 0xba, 0x22, 0xca, 0x30, 0xbd, 0x50, 0x22, 0x72, 0xbc, 0xf1, 0xaa,
+ 0x96, 0xbd, 0x52, 0xf4, 0xd9, 0x3c, 0x08, 0x89, 0x6d, 0x3d, 0x90, 0x97, 0xa9,
+ 0x3c, 0x20, 0x9d, 0x0b, 0x3c, 0x47, 0x97, 0xf5, 0xbc, 0x7f, 0xc1, 0x3c, 0x3d,
+ 0x77, 0xa7, 0xeb, 0x3b, 0xe2, 0x0c, 0x77, 0x3d, 0xca, 0x57, 0x3e, 0x3d, 0x16,
+ 0x46, 0x38, 0xbd, 0x15, 0xde, 0x87, 0x3d, 0x10, 0x09, 0x0a, 0xbd, 0xa0, 0xfa,
+ 0x56, 0x3b, 0xba, 0x6c, 0x2f, 0x3d, 0x0f, 0xb9, 0x70, 0x3c, 0x35, 0xb8, 0x8c,
+ 0xbd, 0x88, 0xad, 0xc5, 0xbc, 0xb2, 0x0b, 0x40, 0xbd, 0x63, 0x62, 0x80, 0xbd,
+ 0xb4, 0xd9, 0x78, 0x3c, 0x91, 0x49, 0x8a, 0xbd, 0x59, 0x3c, 0x47, 0x3d, 0xb1,
+ 0xb7, 0x3a, 0xbd, 0x0f, 0x07, 0xea, 0x3b, 0xca, 0x89, 0x50, 0xbd, 0xf6, 0x2c,
+ 0x27, 0xbd, 0x3f, 0xf7, 0x37, 0x3c, 0x1c, 0x12, 0x23, 0x3c, 0x6d, 0x88, 0x97,
+ 0xbd, 0x06, 0x09, 0x66, 0x3d, 0x40, 0xac, 0x80, 0xbc, 0xac, 0xea, 0x7c, 0xbd,
+ 0x7e, 0xfb, 0x1a, 0x3d, 0x11, 0xd1, 0x65, 0x3d, 0x56, 0x13, 0xee, 0xbc, 0xa5,
+ 0xe1, 0x69, 0xbd, 0x47, 0xff, 0x45, 0xbc, 0x20, 0xba, 0x2e, 0xbd, 0xff, 0x15,
+ 0x48, 0xbc, 0x01, 0xd5, 0x8f, 0x3d, 0x42, 0x0f, 0x37, 0x3c, 0x68, 0xbc, 0xcc,
+ 0x3c, 0xf4, 0x1e, 0x39, 0xbd, 0x00, 0x6c, 0x07, 0xb9, 0xe4, 0x6e, 0xb2, 0x3c,
+ 0x9b, 0x53, 0x88, 0xbd, 0x20, 0xf2, 0xef, 0xbc, 0xd3, 0xf3, 0x8e, 0x3d, 0xbc,
+ 0xe9, 0xa6, 0xbc, 0xa3, 0xb6, 0x6b, 0xbc, 0x73, 0xeb, 0xdd, 0xbc, 0xdf, 0xa3,
+ 0x04, 0xbd, 0x1a, 0x9f, 0x21, 0x3c, 0x1d, 0xb7, 0x89, 0xbb, 0x28, 0x66, 0x85,
+ 0xbc, 0xf9, 0x7f, 0x95, 0xbd, 0x4c, 0x07, 0xfa, 0xbc, 0x52, 0x7d, 0x29, 0x3d,
+ 0x66, 0x78, 0x24, 0xbc, 0xd4, 0x70, 0xfa, 0xbc, 0x20, 0xdb, 0x02, 0xbd, 0x51,
+ 0x27, 0x09, 0xbd, 0xb6, 0xb6, 0x42, 0x3d, 0x37, 0xa4, 0x3f, 0xbd, 0xfc, 0x30,
+ 0xb2, 0xbb, 0x2b, 0xa7, 0xb7, 0x3c, 0x77, 0xf6, 0x2e, 0x3d, 0x4e, 0x18, 0x6c,
+ 0x3d, 0xb0, 0xb9, 0xe4, 0x3c, 0xa6, 0xce, 0x89, 0xbd, 0x18, 0x9a, 0xc2, 0x3c,
+ 0x8d, 0xdc, 0x51, 0xbd, 0x50, 0x09, 0x0a, 0x3d, 0xd8, 0x90, 0x6c, 0xbc, 0x28,
+ 0x48, 0x96, 0xbc, 0x50, 0x5f, 0x62, 0xbc, 0x8b, 0xbc, 0x82, 0xbd, 0xb0, 0x24,
+ 0xce, 0x3b, 0x54, 0xb0, 0x4b, 0x3c, 0xd8, 0x02, 0x59, 0x3c, 0x0b, 0x7d, 0xa0,
+ 0x3c, 0x2a, 0x6f, 0xfa, 0xbc, 0x51, 0xf4, 0x0a, 0xbd, 0xe5, 0xdd, 0x45, 0x3d,
+ 0x69, 0xcb, 0x5f, 0x3d, 0x59, 0xee, 0x1b, 0x3d, 0x15, 0x0c, 0x6d, 0x3d, 0xb4,
+ 0xe8, 0x3a, 0x3c, 0xd6, 0x4c, 0x71, 0x3d, 0x2c, 0x6c, 0x5f, 0xbc, 0x23, 0xc7,
+ 0x96, 0x3c, 0x90, 0xfd, 0xef, 0xb9, 0x80, 0x9a, 0xce, 0xbc, 0xc8, 0xa7, 0xfa,
+ 0xbc, 0x3f, 0x84, 0x4d, 0xbc, 0xb9, 0x1e, 0x63, 0x3d, 0x91, 0xff, 0x16, 0xbd,
+ 0xe4, 0x6d, 0x65, 0xbc, 0xbb, 0x19, 0x69, 0xbc, 0xf0, 0xba, 0xfe, 0xbc, 0xbb,
+ 0xe6, 0x30, 0x3d, 0x12, 0x3a, 0x4d, 0x3d, 0x08, 0xa7, 0x79, 0x3d, 0x37, 0x6c,
+ 0x88, 0x3d, 0xb4, 0x66, 0xf1, 0xba, 0xb8, 0x48, 0xcc, 0xbc, 0x61, 0xb9, 0x1d,
+ 0xbd, 0x8a, 0x51, 0x45, 0xbd, 0x2e, 0x8a, 0x59, 0x3d, 0x88, 0xe0, 0x7d, 0xbd,
+ 0x53, 0xc6, 0x8e, 0xbd, 0x0e, 0x7b, 0x5a, 0x3d, 0x13, 0xc2, 0xcb, 0xbc, 0x57,
+ 0xcd, 0x8b, 0xbd, 0x60, 0x8c, 0x4e, 0xbd, 0xe2, 0x03, 0x07, 0x3d, 0x5f, 0x0d,
+ 0x80, 0x3c, 0x5f, 0xc8, 0x3d, 0x3d, 0x89, 0x06, 0xc8, 0x3c, 0x17, 0x2b, 0x88,
+ 0x3d, 0xf6, 0x31, 0x63, 0x3d, 0x51, 0x2b, 0x60, 0xbd, 0xc9, 0x26, 0x67, 0xbd,
+ 0x02, 0x8e, 0x4f, 0xbd, 0xbd, 0x67, 0x20, 0x3d, 0x53, 0xfa, 0x64, 0xbb, 0x27,
+ 0x16, 0x28, 0xbd, 0x45, 0x52, 0xfb, 0xbb, 0x66, 0x53, 0x8d, 0x3c, 0x0c, 0x18,
+ 0x74, 0xbc, 0x60, 0x98, 0x19, 0x3d, 0xd2, 0x7c, 0x3c, 0x3d, 0x77, 0x65, 0x90,
+ 0xbc, 0x69, 0x1e, 0x3e, 0xbd, 0x04, 0x22, 0x7f, 0xbc, 0x7c, 0x5d, 0x2c, 0xbc,
+ 0x51, 0xb3, 0x1f, 0xbc, 0xc4, 0xaf, 0xbf, 0xbc, 0xa8, 0xc5, 0x59, 0x3c, 0xfe,
+ 0x08, 0x62, 0x3d, 0x7c, 0x3a, 0x56, 0x3d, 0x4a, 0xaf, 0x38, 0x3d, 0xd9, 0x9e,
+ 0x26, 0xbd, 0x48, 0xc2, 0x16, 0xbc, 0x6e, 0xcc, 0xec, 0xbc, 0x05, 0x78, 0x0e,
+ 0xbc, 0xd2, 0x5c, 0x51, 0xbd, 0x44, 0x63, 0x6b, 0x3d, 0x7c, 0xfd, 0xca, 0xbb,
+ 0x62, 0xda, 0x30, 0x3c, 0xc4, 0xcc, 0x61, 0x3d, 0xdc, 0xa6, 0x34, 0xbd, 0xff,
+ 0x8f, 0x24, 0xbc, 0x68, 0x37, 0xf6, 0xbc, 0xd1, 0x4d, 0x25, 0xbd, 0x33, 0x6e,
+ 0x91, 0x3c, 0x60, 0x57, 0x6b, 0x3d, 0x04, 0xf7, 0x34, 0xbd, 0x90, 0xe7, 0x30,
+ 0x3d, 0x8e, 0x22, 0x65, 0xbd, 0x62, 0xcf, 0xb6, 0x3c, 0xce, 0x5d, 0x9f, 0x3c,
+ 0xa0, 0x0a, 0x43, 0xbd, 0x1e, 0x7b, 0x56, 0xbd, 0x1f, 0x6a, 0x93, 0xbd, 0x60,
+ 0x5e, 0x39, 0x3d, 0x4d, 0x17, 0x8e, 0xbd, 0x28, 0x00, 0xad, 0x3c, 0x79, 0xd0,
+ 0xab, 0xbb, 0x15, 0xf3, 0x1a, 0xbd, 0x28, 0x13, 0x05, 0x3c, 0x90, 0x55, 0x20,
+ 0x3d, 0x98, 0x9b, 0xc4, 0x3c, 0x32, 0x5f, 0x86, 0xbd, 0x6d, 0xf8, 0x52, 0xbd,
+ 0xcc, 0x28, 0xae, 0x3c, 0x96, 0xc7, 0x81, 0x3d, 0x04, 0x2e, 0x5b, 0xbc, 0xdd,
+ 0xce, 0xb2, 0x3c, 0x14, 0x5d, 0x67, 0x3d, 0x74, 0xe8, 0x77, 0x3d, 0x2e, 0xf5,
+ 0x51, 0x3d, 0x21, 0x78, 0x7a, 0xbd, 0x62, 0xea, 0x6a, 0xbd, 0x36, 0x1c, 0xf4,
+ 0xbc, 0xd0, 0x98, 0xda, 0x3b, 0x26, 0x14, 0x8a, 0xbd, 0xf2, 0xa4, 0x67, 0xbd,
+ 0xb2, 0xa7, 0x39, 0xbd, 0x93, 0xa6, 0xd6, 0x3c, 0xe1, 0xa9, 0xe4, 0x3b, 0x49,
+ 0xca, 0x3f, 0x3d, 0x07, 0xe3, 0x64, 0x3d, 0x1e, 0xf5, 0x4d, 0xbd, 0x4e, 0xc3,
+ 0x8a, 0xbd, 0x88, 0xf9, 0xf8, 0x3c, 0xc6, 0x2a, 0xba, 0xbc, 0x56, 0xd7, 0xb1,
+ 0xbc, 0xbd, 0xff, 0x10, 0x3c, 0xfe, 0x3d, 0x16, 0xbd, 0x88, 0xdd, 0x5f, 0x3c,
+ 0x66, 0xd4, 0x50, 0xbd, 0xe2, 0x59, 0x62, 0x3d, 0x1c, 0xdf, 0xac, 0x3c, 0xc2,
+ 0x72, 0xb7, 0xbc, 0xe2, 0x19, 0x4d, 0xbd, 0xc1, 0xbb, 0xa1, 0x3c, 0xf2, 0x8f,
+ 0x24, 0x3d, 0x2f, 0xb1, 0xeb, 0xbc, 0xa7, 0xe6, 0x13, 0xbd, 0x4c, 0x51, 0x7c,
+ 0xbd, 0x23, 0x87, 0x3e, 0xbd, 0x65, 0x03, 0x86, 0x3b, 0x5d, 0x13, 0x15, 0x3d,
+ 0x44, 0x77, 0x96, 0xba, 0xe9, 0x74, 0x0a, 0x3d, 0xb4, 0xd0, 0x59, 0xbd, 0x4c,
+ 0x9a, 0x22, 0x3d, 0x82, 0x1b, 0x85, 0x3d, 0x09, 0x1e, 0xf9, 0x3c, 0x20, 0xcf,
+ 0x97, 0xbd, 0xf9, 0x46, 0x0e, 0xbd, 0xba, 0x0d, 0x82, 0x3d, 0xf6, 0xf1, 0xd7,
+ 0x3c, 0x8e, 0x08, 0xf8, 0xbc, 0x4d, 0xbf, 0x22, 0xbd, 0xd0, 0x25, 0x8a, 0x3c,
+ 0xa8, 0x71, 0x2e, 0xbd, 0xd9, 0xaa, 0x24, 0x3a, 0x48, 0x85, 0x6c, 0xbd, 0x90,
+ 0x0e, 0x8c, 0x3c, 0x3c, 0x45, 0x50, 0x3d, 0x71, 0xab, 0x65, 0x3d, 0x60, 0x38,
+ 0xdb, 0x3b, 0x9b, 0x94, 0x81, 0xbd, 0xc0, 0xaa, 0xb3, 0xbc, 0xc8, 0x46, 0x93,
+ 0xbc, 0x3a, 0x19, 0xea, 0xbc, 0x16, 0xab, 0x36, 0xbc, 0x20, 0x52, 0x74, 0xbd,
+ 0xbd, 0x3b, 0x75, 0x3d, 0xea, 0xef, 0xc3, 0xbc, 0x54, 0xbe, 0x26, 0xbd, 0x88,
+ 0x03, 0x6c, 0x3d, 0xa0, 0x3e, 0x4a, 0x3d, 0x46, 0x60, 0x0a, 0x3d, 0xf9, 0x88,
+ 0x59, 0x3d, 0xa2, 0x8a, 0x87, 0xbd, 0xde, 0x60, 0x48, 0x3d, 0xc6, 0x87, 0x60,
+ 0x3d, 0x05, 0x18, 0x3d, 0xbc, 0xa8, 0x15, 0x01, 0x3d, 0x68, 0x46, 0x41, 0xbd,
+ 0x7f, 0x8e, 0x58, 0x3d, 0xc6, 0xa4, 0xf6, 0x3c, 0x22, 0xbc, 0x73, 0x3d, 0xe8,
+ 0x2d, 0x83, 0x3c, 0x97, 0x7f, 0x8b, 0xbb, 0xe6, 0x83, 0x81, 0xbc, 0x42, 0x79,
+ 0x5b, 0x3d, 0x62, 0xfb, 0xd4, 0x3b, 0xf3, 0x51, 0x06, 0xbd, 0xb0, 0x65, 0x79,
+ 0x3d, 0xbc, 0x83, 0xdc, 0x3c, 0xbe, 0xbd, 0x8c, 0x3d, 0x64, 0xdf, 0x13, 0x3d,
+ 0x1f, 0xa8, 0x44, 0xbd, 0x1e, 0x7f, 0x87, 0xbc, 0x15, 0x05, 0x6c, 0xbd, 0x43,
+ 0x6b, 0x75, 0xbd, 0x38, 0x5a, 0x64, 0x3d, 0xb8, 0x35, 0x2c, 0x3c, 0x93, 0x41,
+ 0xd5, 0xb9, 0xf4, 0x66, 0x79, 0xbc, 0xd9, 0xda, 0xae, 0xbc, 0xd6, 0x82, 0xd4,
+ 0x3b, 0x48, 0x9e, 0x3e, 0xbd, 0x0c, 0x2c, 0xb7, 0xbc, 0xba, 0x9c, 0x2f, 0xbd,
+ 0x9c, 0x53, 0x4f, 0x3d, 0xf5, 0x5f, 0xe6, 0x3c, 0x60, 0x8e, 0x1f, 0x3b, 0xa6,
+ 0x27, 0x4a, 0xbd, 0xe5, 0x82, 0x9b, 0x3c, 0xb7, 0xe1, 0x84, 0x3d, 0x13, 0x34,
+ 0x34, 0xbc, 0x58, 0xca, 0x09, 0x3d, 0xe2, 0x9f, 0x70, 0x3d, 0x7b, 0x73, 0xa1,
+ 0xbc, 0xdb, 0x26, 0x08, 0xbd, 0xc0, 0x46, 0xce, 0xba, 0xfc, 0xde, 0xe1, 0x3c,
+ 0xf5, 0xd5, 0xbc, 0x3c, 0x03, 0x9b, 0x16, 0x3d, 0x61, 0xda, 0x16, 0xbd, 0x9c,
+ 0x34, 0x15, 0xbd, 0x6c, 0xae, 0x50, 0xbd, 0xc0, 0x47, 0x89, 0xbd, 0xf0, 0xff,
+ 0x52, 0x3d, 0xa2, 0xf2, 0x01, 0x3d, 0x7c, 0x68, 0x1a, 0x3d, 0x70, 0x77, 0x58,
+ 0xbd, 0x62, 0xb8, 0xb3, 0x3c, 0xd8, 0x2e, 0x07, 0xbc, 0xe6, 0x32, 0x8b, 0x3d,
+ 0x6b, 0xa2, 0x53, 0x3d, 0x12, 0xfa, 0x55, 0xbd, 0x7d, 0x83, 0x28, 0x3d, 0x92,
+ 0xa8, 0x73, 0xbd, 0xd5, 0xd5, 0x9c, 0x3c, 0xe5, 0x93, 0x83, 0x3c, 0xf9, 0xc8,
+ 0xb3, 0xbc, 0xfb, 0x27, 0x78, 0xbd, 0xa6, 0x7d, 0x5b, 0x3d, 0x9c, 0x51, 0x4d,
+ 0x3d, 0x25, 0x60, 0x4b, 0x3d, 0xba, 0x91, 0x96, 0xb9, 0xd7, 0xaf, 0xc3, 0x3c,
+ 0x34, 0x25, 0x3c, 0x3d, 0x3a, 0x04, 0x3a, 0x3d, 0x86, 0xb2, 0x30, 0x3c, 0x90,
+ 0xcf, 0x46, 0x3d, 0x96, 0xee, 0xe2, 0xbc, 0x9c, 0x30, 0xa7, 0x3c, 0x56, 0xe3,
+ 0x5a, 0xbd, 0x2f, 0xb6, 0x23, 0x3d, 0xda, 0x3e, 0x3c, 0xbd, 0x6e, 0xa0, 0x5c,
+ 0x3d, 0x28, 0xe0, 0x6e, 0xbd, 0x1a, 0x52, 0x34, 0x3d, 0xb8, 0xcd, 0x27, 0xbc,
+ 0x4a, 0xb4, 0x22, 0x3d, 0x1c, 0xd7, 0x64, 0xbc, 0x8f, 0xd9, 0x1d, 0xbd, 0xa2,
+ 0x1e, 0x17, 0x3d, 0x78, 0xed, 0xe2, 0x3c, 0x82, 0x5e, 0x0d, 0x3c, 0x93, 0x9d,
+ 0x58, 0xbd, 0x35, 0x43, 0x8a, 0xbd, 0xbd, 0xa6, 0xdf, 0x3c, 0x11, 0xc3, 0x3b,
+ 0x3d, 0x6c, 0xad, 0x58, 0xbd, 0x2e, 0x39, 0x1f, 0x3d, 0x45, 0x7d, 0x00, 0x3a,
+ 0xa9, 0xb2, 0x5b, 0x3d, 0x00, 0x38, 0x81, 0x38, 0xaa, 0x9f, 0xc9, 0x3a, 0xaa,
+ 0x79, 0x73, 0xbd, 0x39, 0x7b, 0xf7, 0x3b, 0xc4, 0x9f, 0x4e, 0xbd, 0xa1, 0x0c,
+ 0x64, 0x3a, 0x9b, 0x06, 0x5f, 0xbd, 0x32, 0x21, 0x6d, 0xbd, 0xbe, 0x94, 0x4e,
+ 0x3d, 0x7c, 0x40, 0xf9, 0x3c, 0xc8, 0xac, 0xca, 0x3c, 0x30, 0x76, 0x50, 0xbd,
+ 0x08, 0x66, 0x93, 0xbd, 0x0b, 0x4c, 0xb9, 0x3c, 0x8e, 0xef, 0x26, 0x3d, 0xe3,
+ 0x00, 0x68, 0x3d, 0x51, 0x3a, 0x84, 0xbd, 0x54, 0xac, 0xb3, 0xbc, 0x95, 0x17,
+ 0x91, 0xbd, 0x04, 0xf2, 0x31, 0x3d, 0x48, 0xbb, 0x20, 0x3c, 0xf3, 0x82, 0x88,
+ 0xbd, 0xdd, 0x5e, 0x4e, 0xbd, 0x95, 0x9e, 0x45, 0xbd, 0x62, 0xce, 0x51, 0xbd,
+ 0xa3, 0x8b, 0x3b, 0x3d, 0x40, 0xdb, 0x85, 0x3d, 0x33, 0xdc, 0xc1, 0xbc, 0xa7,
+ 0xb6, 0x7d, 0xbd, 0xd3, 0x99, 0x40, 0xbc, 0x6b, 0x63, 0x18, 0x3d, 0x73, 0x2f,
+ 0x63, 0xbc, 0xf8, 0xa2, 0x4a, 0xbc, 0xa5, 0x0b, 0x76, 0x3d, 0xd5, 0x88, 0x79,
+ 0x3d, 0x97, 0x41, 0x98, 0x3c, 0xe8, 0x20, 0x16, 0x3d, 0xcc, 0x47, 0x78, 0xbd,
+ 0xfd, 0x9a, 0xae, 0x3c, 0xf2, 0xe2, 0x8a, 0xbd, 0x07, 0xd1, 0x19, 0x3d, 0xd4,
+ 0xef, 0x68, 0xbc, 0x82, 0x5d, 0x51, 0x3d, 0x0c, 0x61, 0xc8, 0xba, 0xc1, 0xd5,
+ 0x36, 0xbd, 0xf2, 0x3c, 0x1d, 0x3d, 0x86, 0xdf, 0x65, 0x3d, 0x04, 0x4c, 0x87,
+ 0x3d, 0xe9, 0x46, 0x91, 0x3d, 0xc0, 0x63, 0x33, 0xbc, 0x7c, 0xd0, 0xbf, 0x3c,
+ 0xe8, 0xfe, 0x55, 0xbd, 0x18, 0x50, 0x53, 0x3c, 0x51, 0x99, 0xb0, 0xbb, 0x50,
+ 0x90, 0xec, 0x3b, 0x3d, 0x3a, 0x69, 0xbd, 0x6e, 0x49, 0x09, 0xbc, 0x74, 0x12,
+ 0xde, 0xbc, 0xad, 0x0c, 0x87, 0x3c, 0x35, 0x8f, 0x41, 0x3d, 0x5e, 0xa8, 0x3b,
+ 0xbd, 0x28, 0x85, 0x61, 0x3d, 0xfe, 0xb2, 0xe1, 0x3b, 0xec, 0xbb, 0x0e, 0x3d,
+ 0x04, 0xe3, 0x05, 0x3d, 0x10, 0xeb, 0x07, 0xbd, 0x63, 0x3a, 0x68, 0x3d, 0x55,
+ 0x9c, 0x49, 0x3b, 0x58, 0xdc, 0x62, 0x3d, 0x33, 0x78, 0x03, 0x3d, 0x0f, 0xc8,
+ 0x7a, 0xbd, 0xa3, 0x94, 0x83, 0xbd, 0xf7, 0x86, 0x5d, 0xbd, 0xcb, 0xd6, 0x82,
+ 0x3d, 0xcb, 0x78, 0x82, 0xbd, 0xcb, 0x8b, 0x46, 0xbc, 0x44, 0xff, 0x75, 0xbd,
+ 0x63, 0xc6, 0x48, 0x3d, 0x50, 0x1b, 0x14, 0xbc, 0x57, 0xd1, 0xe1, 0x3c, 0x60,
+ 0xa8, 0xe2, 0x3c, 0x00, 0xa0, 0xf8, 0xb9, 0x9c, 0x9f, 0x24, 0x3d, 0x10, 0x2c,
+ 0x4a, 0x3c, 0x90, 0xdf, 0xbc, 0xbc, 0x9e, 0xae, 0xa4, 0xbc, 0xf7, 0x31, 0x66,
+ 0xbd, 0x1e, 0x83, 0x14, 0x3c, 0x9b, 0xaa, 0x91, 0x3b, 0x91, 0x24, 0x11, 0xbd,
+ 0x54, 0x0b, 0x90, 0x3b, 0x30, 0xa4, 0x64, 0x3d, 0x69, 0xa8, 0x81, 0x3d, 0x5e,
+ 0x35, 0x03, 0xbb, 0xcc, 0xce, 0xa6, 0x3c, 0x2f, 0x18, 0xfd, 0xbc, 0x50, 0x81,
+ 0xe2, 0xbb, 0x40, 0x4b, 0x16, 0x3d, 0xc0, 0x66, 0x63, 0xbd, 0x5f, 0xcd, 0x9b,
+ 0xbc, 0x2f, 0xf8, 0x25, 0xbd, 0xa0, 0x4d, 0x7a, 0x3c, 0x81, 0x0c, 0x5a, 0xbd,
+ 0x54, 0xa9, 0x6a, 0x3d, 0xc0, 0x3b, 0x3c, 0xbd, 0xb4, 0x63, 0xfb, 0x3c, 0x26,
+ 0x9c, 0x11, 0x3d, 0x06, 0xea, 0xa3, 0xbc, 0x3f, 0x44, 0x92, 0xbc, 0x00, 0x88,
+ 0x6f, 0x3b, 0xd8, 0x6f, 0x36, 0xbd, 0xe0, 0xad, 0x89, 0x3d, 0x52, 0xfb, 0x72,
+ 0x3d, 0x64, 0x05, 0x64, 0xbc, 0xd7, 0x2a, 0x57, 0xbd, 0x02, 0x49, 0xad, 0xbc,
+ 0x38, 0xf1, 0x2d, 0xbd, 0x8a, 0x2e, 0x8b, 0x3d, 0x39, 0x44, 0x12, 0xbd, 0xfc,
+ 0xa0, 0xb8, 0xbc, 0x32, 0x17, 0x8a, 0xbd, 0x7e, 0xbf, 0x6b, 0x3d, 0x32, 0x76,
+ 0xad, 0xbc, 0xb0, 0x21, 0x58, 0x3d, 0x62, 0xf5, 0x59, 0x3d, 0xb3, 0x5f, 0x98,
+ 0x3c, 0xa4, 0x02, 0x2c, 0x3b, 0x59, 0x69, 0x97, 0xbd, 0x70, 0xcf, 0x91, 0x3b,
+ 0x6b, 0xc3, 0x47, 0xbd, 0x10, 0xfe, 0xd4, 0xbc, 0x08, 0x93, 0xd1, 0x3b, 0xf5,
+ 0xe9, 0x14, 0xbd, 0x9a, 0x9c, 0x7b, 0x3d, 0x15, 0x75, 0x54, 0x3d, 0x09, 0xbf,
+ 0x57, 0xbc, 0xbf, 0x09, 0x29, 0xbb, 0xf5, 0x6d, 0x91, 0xbd, 0xb8, 0x41, 0xbd,
+ 0x3c, 0x80, 0x60, 0x6e, 0x3c, 0xab, 0xf2, 0x4f, 0xbd, 0x81, 0x36, 0x79, 0x3d,
+ 0x6a, 0x5a, 0x85, 0xbd, 0xf2, 0xac, 0x36, 0x3d, 0x92, 0x7c, 0xc0, 0xbc, 0x00,
+ 0x12, 0x06, 0x3c, 0xfe, 0x9c, 0x66, 0x3d, 0xa0, 0xf3, 0xbb, 0xbb, 0x37, 0xb0,
+ 0x74, 0xbd, 0x18, 0xb1, 0x10, 0xbd, 0x82, 0xd7, 0xe2, 0xbc, 0x87, 0xee, 0x14,
+ 0x3d, 0xe9, 0x2a, 0x40, 0xbd, 0xe3, 0x0d, 0x53, 0x3c, 0x5c, 0x02, 0x93, 0x3c,
+ 0x25, 0x0f, 0x49, 0xbd, 0x88, 0xd8, 0x3f, 0x3d, 0x58, 0xf0, 0x39, 0xbd, 0xe3,
+ 0x0a, 0x3b, 0xbd, 0xeb, 0x61, 0x01, 0x3d, 0xb4, 0xa0, 0x6b, 0xbd, 0x1d, 0x4b,
+ 0x90, 0xbd, 0xb2, 0x31, 0x34, 0xbd, 0xaa, 0x20, 0xad, 0x3a, 0xd5, 0x1e, 0x3a,
+ 0xbd, 0xf4, 0x05, 0x38, 0x3d, 0x1b, 0xb2, 0x46, 0xbc, 0x2c, 0xd7, 0x3e, 0x3d,
+ 0xec, 0x98, 0xc7, 0x3c, 0xe7, 0xd3, 0x21, 0xbd, 0x07, 0x35, 0x60, 0xbd, 0x2b,
+ 0xb9, 0xfd, 0xbc, 0x9b, 0x69, 0x36, 0x3d, 0xdf, 0xdf, 0x6f, 0xbd, 0x5a, 0x80,
+ 0x81, 0xbd, 0x9b, 0x67, 0xf2, 0x3b, 0x20, 0x94, 0xde, 0xbb, 0xc5, 0xfc, 0x29,
+ 0xbd, 0x0c, 0x34, 0x30, 0xbd, 0x50, 0xbb, 0xc9, 0xbc, 0x92, 0x32, 0x93, 0xbc,
+ 0x12, 0xf9, 0x69, 0xbd, 0x1c, 0x84, 0x3a, 0xbc, 0x88, 0x93, 0x84, 0xbd, 0x07,
+ 0x7e, 0xb5, 0x3c, 0xe6, 0xb8, 0x4a, 0x3d, 0xde, 0x7c, 0x55, 0x3d, 0x16, 0x69,
+ 0xf0, 0xbc, 0x91, 0x57, 0x5b, 0xbd, 0xa2, 0x4a, 0x26, 0x3d, 0x5b, 0xdc, 0xaf,
+ 0xba, 0xe8, 0x30, 0xe1, 0xbc, 0xf8, 0x97, 0x21, 0x3d, 0x00, 0x3e, 0x11, 0x3c,
+ 0x92, 0x1c, 0xb1, 0xbc, 0xce, 0x5f, 0xa3, 0x3c, 0x2d, 0x13, 0x88, 0xbd, 0xbc,
+ 0x64, 0xbc, 0x3c, 0xd1, 0x47, 0x97, 0xbb, 0xf2, 0x46, 0x55, 0x3d, 0x70, 0x6e,
+ 0x09, 0x3d, 0x6b, 0x66, 0x93, 0xbd, 0x26, 0xf4, 0xcb, 0xbc, 0x59, 0xb5, 0x84,
+ 0xbc, 0x13, 0x19, 0x8d, 0x3d, 0x35, 0xf3, 0x3e, 0xbc, 0x9d, 0xf8, 0x78, 0x3d,
+ 0x75, 0x6d, 0x4f, 0x3d, 0xd4, 0x8a, 0xd7, 0x3c, 0x74, 0x49, 0x0d, 0xbd, 0x40,
+ 0x3d, 0xcd, 0x3a, 0xa2, 0xb6, 0x64, 0x3d, 0x73, 0xc5, 0x90, 0x3d, 0x5b, 0x4e,
+ 0x85, 0xbd, 0xf6, 0x1b, 0x64, 0x3d, 0x15, 0x44, 0xbf, 0xbc, 0x4c, 0xb6, 0x0e,
+ 0x3d, 0xaf, 0x91, 0x06, 0xbc, 0xa0, 0xc6, 0xdf, 0x3c, 0xb7, 0xb5, 0x66, 0x3d,
+ 0x23, 0x0d, 0x68, 0xbd, 0xcf, 0x9f, 0xe9, 0xbc, 0xcd, 0xa5, 0x1f, 0xbd, 0x92,
+ 0x3c, 0x5b, 0x3d, 0x0c, 0x92, 0x57, 0x3d, 0x73, 0xa2, 0x2e, 0xbd, 0x4a, 0xeb,
+ 0x23, 0xbc, 0x6b, 0xa1, 0x3c, 0xba, 0xd2, 0x19, 0xbb, 0xbc, 0x44, 0x55, 0x29,
+ 0xbd, 0xcd, 0x07, 0x34, 0xbd, 0xbf, 0xaa, 0xf9, 0xba, 0x18, 0x7b, 0x8a, 0xbc,
+ 0x4a, 0xe1, 0x5d, 0x3d, 0x28, 0x1b, 0x38, 0x3c, 0xfd, 0x1b, 0xd0, 0x3b, 0xdd,
+ 0x1c, 0x92, 0xbb, 0xf4, 0x64, 0x31, 0x3c, 0x82, 0x22, 0x44, 0x3d, 0x22, 0xd5,
+ 0x0c, 0xbd, 0x63, 0x1f, 0x24, 0xbd, 0xd0, 0xe3, 0x03, 0x3c, 0xfc, 0x32, 0x22,
+ 0xbc, 0x26, 0x4e, 0xba, 0xbc, 0xf2, 0x18, 0xa8, 0xbc, 0x1d, 0xb1, 0x43, 0xbc,
+ 0x4b, 0x52, 0x17, 0xbd, 0xe1, 0xf7, 0x05, 0x3d, 0xdb, 0xfb, 0xd9, 0x3c, 0x0b,
+ 0x58, 0x8e, 0xbc, 0xc1, 0x1f, 0x81, 0x3d, 0xa0, 0x6f, 0x36, 0xbd, 0x52, 0xec,
+ 0x57, 0xbd, 0x6a, 0x3b, 0x06, 0xbd, 0xb5, 0x5b, 0x9c, 0xbc, 0x08, 0xb1, 0x32,
+ 0xbc, 0xc0, 0xde, 0x85, 0xbd, 0x2d, 0xd5, 0xd2, 0x3c, 0xa6, 0x1d, 0x14, 0xbc,
+ 0x8d, 0x5e, 0xd8, 0x3c, 0x83, 0x8e, 0xcf, 0xbc, 0xa0, 0xc2, 0x83, 0xbd, 0xce,
+ 0x5f, 0x3b, 0xbd, 0x60, 0xbc, 0x7d, 0xbc, 0x8e, 0x9c, 0x7f, 0xbd, 0xb3, 0x61,
+ 0x0b, 0xbd, 0x1c, 0x2b, 0xc9, 0x3c, 0xbc, 0xb7, 0x6f, 0x3c, 0x61, 0x58, 0xda,
+ 0xbc, 0xcc, 0x72, 0x23, 0x3c, 0x28, 0x64, 0x61, 0x3c, 0x5a, 0x19, 0x42, 0x3d,
+ 0xb0, 0x39, 0x13, 0x3c, 0xe6, 0x3a, 0xf7, 0xbc, 0xc4, 0xaf, 0xc4, 0x3c, 0xd2,
+ 0x14, 0xd0, 0xbc, 0x1a, 0x00, 0xb8, 0xbc, 0xf9, 0x9e, 0x23, 0xbd, 0xdf, 0x82,
+ 0x6a, 0xbd, 0x7a, 0xc2, 0x18, 0xbc, 0xbf, 0xb0, 0x11, 0xbc, 0x2d, 0x48, 0x5b,
+ 0xbd, 0xff, 0xff, 0x46, 0x3c, 0x6c, 0x6c, 0x36, 0x3c, 0xec, 0x21, 0x8a, 0xbd,
+ 0x02, 0x85, 0xe0, 0x3c, 0xdf, 0x2e, 0x42, 0xbd, 0xf0, 0xa5, 0x24, 0x3d, 0x0a,
+ 0xd1, 0x00, 0x3d, 0x58, 0x44, 0xb3, 0x3c, 0xc9, 0xe4, 0x33, 0x39, 0xba, 0x0f,
+ 0xb9, 0xbc, 0xba, 0x18, 0x64, 0x3c, 0x9e, 0xc4, 0x50, 0xbc, 0x5f, 0x96, 0x4c,
+ 0x3d, 0xbc, 0xdc, 0x61, 0x3d, 0xba, 0xaf, 0x38, 0x3d, 0xf1, 0x21, 0x89, 0x3d,
+ 0x60, 0x95, 0x05, 0x3c, 0xc6, 0xb2, 0x6e, 0xbc, 0x5f, 0x2d, 0x21, 0xbd, 0xee,
+ 0x52, 0x23, 0x3d, 0x3c, 0xc0, 0x1d, 0xbc, 0x3e, 0xcd, 0x84, 0x3d, 0x00, 0xc5,
+ 0xa8, 0x39, 0x06, 0x5b, 0x4a, 0xbd, 0xec, 0x4b, 0x1b, 0xbd, 0x05, 0x4c, 0x17,
+ 0xbd, 0x18, 0x01, 0x56, 0x3c, 0xcd, 0x05, 0x87, 0xbd, 0xe4, 0x37, 0x41, 0xbc,
+ 0xdc, 0x36, 0x84, 0x3d, 0xa1, 0xd7, 0x09, 0x3d, 0x44, 0xf4, 0x63, 0xbd, 0x56,
+ 0x62, 0x78, 0xbd, 0x12, 0x57, 0x3b, 0xbd, 0x43, 0xcd, 0x71, 0xbb, 0xa3, 0xf6,
+ 0x10, 0x3d, 0x3a, 0x9f, 0xff, 0xbc, 0x6f, 0xdd, 0x8d, 0x3d, 0xb3, 0xd7, 0x08,
+ 0xbd, 0x3e, 0x97, 0x76, 0x3d, 0x99, 0x60, 0x02, 0xbd, 0x08, 0x27, 0x8d, 0x3d,
+ 0xf1, 0x51, 0x29, 0x3d, 0x48, 0x9d, 0xfe, 0x3c, 0x97, 0xb9, 0x72, 0xbd, 0x35,
+ 0x21, 0xab, 0xbc, 0xc3, 0x96, 0x69, 0x3c, 0x05, 0x44, 0x05, 0x3d, 0x80, 0x79,
+ 0x75, 0x3a, 0x94, 0x62, 0xfe, 0x3b, 0x47, 0xb4, 0x64, 0x3c, 0xbb, 0x50, 0x29,
+ 0xbd, 0xe9, 0xb8, 0x6e, 0xbd, 0x2e, 0xab, 0x26, 0xbc, 0x54, 0x42, 0xb6, 0xbc,
+ 0x08, 0xdb, 0x22, 0xbd, 0xae, 0x42, 0x78, 0x3d, 0x3c, 0xba, 0x2c, 0xbc, 0x46,
+ 0xf1, 0x6e, 0x3d, 0xed, 0xb1, 0x88, 0xbd, 0x96, 0x2c, 0x75, 0x3d, 0x26, 0x69,
+ 0x90, 0xbd, 0x9b, 0x7b, 0x77, 0xbc, 0x9a, 0xbc, 0x05, 0xbd, 0x85, 0xb1, 0x19,
+ 0xbd, 0xb8, 0x33, 0x8b, 0xbd, 0xfa, 0xa3, 0x8b, 0xbc, 0xc6, 0x36, 0xf2, 0x3c,
+ 0x4e, 0x81, 0xa2, 0xbc, 0xa7, 0x85, 0x73, 0xbd, 0xca, 0xe5, 0x93, 0xbc, 0xc8,
+ 0x3d, 0x0e, 0x3d, 0x75, 0x3c, 0x00, 0xbd, 0x28, 0x32, 0x0e, 0x3d, 0x8f, 0x29,
+ 0x04, 0xbc, 0x0c, 0x29, 0x37, 0xbd, 0x47, 0x11, 0x83, 0xbd, 0x82, 0x57, 0x2a,
+ 0xbd, 0x45, 0x1f, 0x6b, 0xbc, 0x66, 0xaf, 0x7d, 0xbd, 0xa8, 0x5a, 0x25, 0xbd,
+ 0x96, 0xc0, 0x14, 0x3b, 0xba, 0xf0, 0x1b, 0xbd, 0xe0, 0x71, 0x44, 0xbb, 0x9c,
+ 0x09, 0xb9, 0xbc, 0x45, 0xda, 0x77, 0x3c, 0x2b, 0x5d, 0x80, 0x3d, 0xaa, 0xf0,
+ 0x21, 0x3d, 0xa0, 0x25, 0x31, 0x3d, 0x34, 0xc8, 0x3b, 0xbd, 0x90, 0x50, 0xf6,
+ 0xbc, 0x53, 0xed, 0x04, 0x3a, 0x26, 0xf8, 0x6e, 0x3d, 0x6d, 0x73, 0x0f, 0x3d,
+ 0xe8, 0xac, 0x43, 0x3d, 0xf1, 0x03, 0x8a, 0x3c, 0xc4, 0x94, 0x3d, 0x3d, 0x3c,
+ 0x89, 0x8b, 0x3d, 0x62, 0x99, 0x0f, 0x3d, 0xb6, 0x30, 0x8d, 0x3c, 0xfa, 0x8f,
+ 0x25, 0x3c, 0x4c, 0x45, 0xd2, 0xbc, 0x00, 0x5d, 0xc0, 0x3c, 0xae, 0x8d, 0x6c,
+ 0xbd, 0xcb, 0xa3, 0x92, 0xbd, 0xc4, 0x1e, 0xbb, 0xbc, 0x63, 0xf8, 0xaa, 0x3c,
+ 0xd7, 0x7c, 0x81, 0x3d, 0xbf, 0x33, 0x41, 0x3c, 0x80, 0x59, 0x69, 0xbb, 0x0a,
+ 0x75, 0x37, 0xbd, 0x29, 0xdc, 0x1b, 0xbd, 0x10, 0x1f, 0x46, 0xbd, 0xee, 0xb4,
+ 0x5d, 0x3d, 0xfa, 0x40, 0x95, 0xbd, 0x02, 0xd8, 0x19, 0xbd, 0xa8, 0xd0, 0xf0,
+ 0xbc, 0x0a, 0xb8, 0xc4, 0x3c, 0x68, 0xa8, 0x11, 0xbd, 0x24, 0x4f, 0x3e, 0x3d,
+ 0x39, 0x99, 0x90, 0xbd, 0x7c, 0x43, 0x13, 0xbd, 0x86, 0xe5, 0x8f, 0xbd, 0xa4,
+ 0x16, 0xb4, 0xbc, 0xa0, 0xe9, 0xf2, 0x3c, 0x91, 0x68, 0x5d, 0xbd, 0x51, 0x92,
+ 0x85, 0x3d, 0xd2, 0x4d, 0x35, 0xbd, 0xc7, 0x44, 0x3e, 0xbd, 0x20, 0xf6, 0xe0,
+ 0x3c, 0x6b, 0x38, 0x35, 0x3d, 0xd2, 0x2b, 0x2a, 0xbb, 0xc8, 0xbf, 0x0c, 0xbd,
+ 0xec, 0xd6, 0xfc, 0x3b, 0x1c, 0xae, 0xa9, 0xbc, 0x28, 0x65, 0xb3, 0x3c, 0xdf,
+ 0x29, 0x98, 0xbc, 0x11, 0x52, 0xbd, 0x3c, 0x4d, 0x7d, 0xac, 0x3c, 0x95, 0xcb,
+ 0x09, 0xbc, 0xc5, 0xc5, 0xf8, 0xbc, 0xe6, 0x99, 0x3f, 0x3c, 0xb0, 0x51, 0xfd,
+ 0xbc, 0x88, 0x6b, 0xe0, 0xbc, 0xaa, 0x84, 0x83, 0xbd, 0x98, 0x79, 0x8d, 0x3c,
+ 0xda, 0x5f, 0xf2, 0x3c, 0xb3, 0xcc, 0x7a, 0x3d, 0xc9, 0x55, 0x08, 0x3d, 0xd1,
+ 0x83, 0x33, 0x3d, 0x6c, 0xc1, 0x66, 0xbc, 0x80, 0xf9, 0x62, 0xba, 0xe4, 0xd5,
+ 0x88, 0xbd, 0x60, 0x31, 0xd2, 0xbc, 0x2b, 0x89, 0x86, 0x3d, 0x1b, 0x1e, 0x53,
+ 0xbd, 0xfa, 0x0c, 0x07, 0xbd, 0x50, 0xe8, 0xb5, 0xbc, 0x4f, 0xc6, 0x65, 0xbd,
+ 0xef, 0x09, 0x75, 0xbd, 0xd5, 0x47, 0x0c, 0xbd, 0xcc, 0x4e, 0x89, 0xbd, 0x9c,
+ 0x69, 0xe3, 0x3c, 0x52, 0xea, 0x9d, 0xbc, 0x01, 0x0e, 0x86, 0xbc, 0x2a, 0x61,
+ 0x72, 0xbd, 0x85, 0xbc, 0x87, 0x3d, 0x21, 0xf7, 0x42, 0x3d, 0x0b, 0x60, 0x23,
+ 0xbd, 0x0f, 0x0f, 0xed, 0xbc, 0x7d, 0x05, 0xd2, 0xbc, 0x6e, 0x5e, 0x5f, 0xbd,
+ 0x36, 0x52, 0x92, 0xbd, 0x7e, 0x96, 0x05, 0xbb, 0x6e, 0x51, 0x98, 0x3a, 0xe5,
+ 0x11, 0x19, 0xbd, 0x00, 0xcf, 0x84, 0xbb, 0x61, 0x5e, 0xed, 0x3c, 0x60, 0xcf,
+ 0x50, 0xbb, 0xce, 0xbe, 0x07, 0x3c, 0x5c, 0x81, 0x20, 0x3d, 0x45, 0x85, 0xf6,
+ 0xbc, 0x1d, 0xb7, 0x91, 0x3d, 0x38, 0x08, 0x59, 0x3c, 0x28, 0x93, 0x4b, 0x3d,
+ 0x3a, 0xc4, 0x87, 0xbd, 0x44, 0x7f, 0x04, 0xbd, 0xdd, 0x17, 0x81, 0x3d, 0xbe,
+ 0x94, 0x48, 0x3d, 0x88, 0x6a, 0xce, 0xba, 0x93, 0x5b, 0x20, 0x3d, 0xab, 0x05,
+ 0x90, 0xbd, 0xf9, 0x71, 0xc4, 0x3c, 0x6c, 0xd4, 0x7a, 0x3d, 0x4a, 0x2d, 0x20,
+ 0x3d, 0x94, 0xd7, 0x88, 0x3d, 0x82, 0xb5, 0x87, 0xbd, 0x55, 0x15, 0xec, 0x3b,
+ 0xc0, 0x09, 0xe4, 0xba, 0x31, 0x50, 0xfc, 0x3c, 0x25, 0x49, 0x6e, 0x3c, 0x5c,
+ 0x79, 0x92, 0xbc, 0xed, 0xab, 0x14, 0xbd, 0x24, 0x3e, 0xaa, 0x3c, 0x98, 0x43,
+ 0x58, 0x3d, 0x2f, 0x00, 0x62, 0x3d, 0x3c, 0x09, 0x2d, 0x3d, 0xe3, 0x27, 0x85,
+ 0x3c, 0x7a, 0x37, 0x06, 0x3d, 0x49, 0xe6, 0x62, 0xbd, 0x71, 0x53, 0x94, 0xbd,
+ 0xc4, 0xeb, 0xd0, 0xbb, 0xd8, 0xed, 0x11, 0x3c, 0xfe, 0x75, 0x8c, 0xbc, 0xc4,
+ 0xeb, 0x16, 0xbd, 0xb8, 0xb8, 0xf7, 0x3c, 0x30, 0x85, 0xaa, 0xbb, 0xcb, 0x9f,
+ 0x16, 0xbd, 0x1d, 0xed, 0x8d, 0x3d, 0x0f, 0xf3, 0x08, 0xbd, 0x8e, 0x3c, 0x13,
+ 0x3d, 0xc4, 0x04, 0x74, 0x3d, 0x60, 0xeb, 0x35, 0xbd, 0xe7, 0xcf, 0x38, 0x3d,
+ 0x12, 0xde, 0xaf, 0x3c, 0xca, 0x71, 0x04, 0x3d, 0x1c, 0xd8, 0xeb, 0x3c, 0xc6,
+ 0xfc, 0xb3, 0x3c, 0xa0, 0x37, 0x5a, 0x3d, 0xbe, 0xcc, 0x59, 0x3c, 0x4c, 0x95,
+ 0x9a, 0xbc, 0xa6, 0xff, 0xa8, 0x3b, 0xcd, 0x7d, 0x7d, 0xbd, 0x5c, 0xe7, 0xba,
+ 0x3c, 0xf9, 0x97, 0x02, 0xbd, 0x3a, 0xd3, 0x80, 0xbd, 0xcd, 0xbe, 0x97, 0xbd,
+ 0x3b, 0x0d, 0x35, 0xba, 0x76, 0x27, 0x44, 0x3d, 0x63, 0xae, 0x8a, 0x3d, 0x03,
+ 0x4c, 0x68, 0xbd, 0xe5, 0x9d, 0x0f, 0xbc, 0x6f, 0x5d, 0x45, 0xbb, 0x48, 0x3a,
+ 0x74, 0x3d, 0x85, 0xfa, 0x37, 0xbd, 0x31, 0xf5, 0x1c, 0x3d, 0x0b, 0x19, 0x52,
+ 0xbd, 0x00, 0xcd, 0x9e, 0xb9, 0xdb, 0xe5, 0x84, 0xbd, 0x83, 0xf1, 0x7f, 0xbd,
+ 0xb7, 0x44, 0x63, 0xbd, 0x44, 0x0a, 0x98, 0xbd, 0x60, 0xd8, 0x23, 0xbb, 0xd1,
+ 0x69, 0x61, 0xbd, 0x71, 0x41, 0x5a, 0xbd, 0x2f, 0xd9, 0x70, 0xbd, 0xc3, 0xb8,
+ 0xd3, 0x3c, 0x38, 0xa7, 0x99, 0x3c, 0xe0, 0xa0, 0x21, 0xbd, 0xd2, 0x90, 0xa8,
+ 0xb8, 0xff, 0xae, 0x32, 0x3c, 0x65, 0x1a, 0x0d, 0x3d, 0xa6, 0xd0, 0x39, 0xbd,
+ 0xdd, 0xb4, 0x18, 0xbd, 0xb0, 0xa0, 0xbc, 0x3c, 0xa0, 0xe4, 0x8b, 0x3d, 0x90,
+ 0xe6, 0x25, 0x3d, 0x7c, 0x20, 0x5d, 0x3d, 0x74, 0x50, 0xda, 0xbb, 0x4a, 0xe0,
+ 0x70, 0x3d, 0x02, 0x36, 0x13, 0x3d, 0xaa, 0xab, 0x05, 0xbd, 0xec, 0xda, 0x10,
+ 0xbd, 0xd1, 0x40, 0x35, 0xbd, 0xd2, 0x14, 0x3a, 0xbd, 0xd6, 0x7f, 0x06, 0xbd,
+ 0x55, 0xf8, 0x31, 0x3d, 0xea, 0xc4, 0x5c, 0x3d, 0xd6, 0x89, 0x52, 0x3d, 0x68,
+ 0xe6, 0x44, 0x3d, 0xd5, 0x64, 0x20, 0xbd, 0x18, 0x41, 0xc8, 0x3c, 0x10, 0xfa,
+ 0x44, 0x3d, 0x30, 0x39, 0x20, 0xbc, 0x27, 0x26, 0x85, 0x3d, 0x9e, 0x02, 0x48,
+ 0x3d, 0x59, 0xbb, 0xad, 0xbc, 0x67, 0x3c, 0xe3, 0xbc, 0xcc, 0x6e, 0x4b, 0xbd,
+ 0x08, 0xf9, 0x1c, 0xbd, 0x50, 0x02, 0xa8, 0x3c, 0x77, 0x8c, 0x21, 0xbd, 0x1b,
+ 0x8e, 0x0c, 0x3c, 0x0a, 0xe3, 0x76, 0x3d, 0x60, 0xa0, 0xa6, 0xbc, 0x30, 0x1d,
+ 0x2c, 0x3d, 0x89, 0xab, 0x57, 0xbd, 0x39, 0xdf, 0x8e, 0x3b, 0x4e, 0xd0, 0x81,
+ 0x3d, 0x6f, 0xc7, 0x0c, 0x3d, 0xb8, 0x21, 0x12, 0x3d, 0x32, 0xe6, 0x5a, 0x3d,
+ 0x26, 0xbf, 0x64, 0x3c, 0xa8, 0xaf, 0x35, 0x3d, 0x0e, 0x6e, 0xb4, 0xbc, 0x78,
+ 0x59, 0xa8, 0x3c, 0xd1, 0xca, 0x5c, 0xbd, 0x3a, 0x40, 0x53, 0x3d, 0x30, 0x50,
+ 0x0c, 0xbc, 0x11, 0xd3, 0x35, 0xbd, 0x06, 0x5b, 0x89, 0xbd, 0x2e, 0xe3, 0x63,
+ 0x3d, 0xc5, 0xdc, 0x0e, 0xbd, 0x60, 0x04, 0x2d, 0xbb, 0xae, 0xfb, 0x42, 0x3d,
+ 0x83, 0x52, 0xcd, 0xbc, 0x20, 0x53, 0x06, 0x3d, 0xd5, 0xc6, 0x38, 0x3c, 0xa7,
+ 0xa9, 0xf4, 0xbc, 0x9b, 0x2d, 0x89, 0x3d, 0x70, 0x74, 0x83, 0x3c, 0x06, 0x87,
+ 0xe7, 0x3b, 0x97, 0xa3, 0x92, 0x3c, 0x38, 0x5f, 0xf7, 0x3c, 0xdf, 0x71, 0x3b,
+ 0xbd, 0xfe, 0x14, 0x4d, 0x3d, 0x0a, 0x42, 0xb8, 0xbc, 0xb4, 0xf6, 0x2f, 0x3c,
+ 0x33, 0xe6, 0x94, 0xbd, 0x26, 0x39, 0x71, 0xbd, 0x10, 0xf4, 0x6e, 0xbd, 0xe4,
+ 0x3f, 0x09, 0xbd, 0x35, 0xe6, 0xb7, 0x3c, 0x9b, 0x3a, 0x10, 0xbd, 0x4d, 0x58,
+ 0x43, 0xbd, 0x3e, 0x25, 0x2c, 0xbd, 0x38, 0xdc, 0x4f, 0x3c, 0x06, 0xf5, 0xff,
+ 0xbc, 0x33, 0x3e, 0x81, 0xbd, 0x27, 0x99, 0x8e, 0xbb, 0x27, 0xc9, 0x68, 0xbd,
+ 0xce, 0x6c, 0x81, 0x3c, 0x0e, 0xab, 0x67, 0xbd, 0x50, 0x8a, 0x2f, 0x3c, 0x30,
+ 0x32, 0x37, 0x3d, 0x49, 0xd1, 0x0e, 0xbd, 0x60, 0xe2, 0x38, 0x3d, 0xf8, 0xd0,
+ 0x9f, 0x3c, 0x3e, 0x8a, 0x0d, 0x3d, 0x7e, 0x2f, 0x6a, 0xbd, 0xe8, 0x0f, 0xab,
+ 0x3b, 0x6e, 0x3d, 0x49, 0xbd, 0xba, 0xdd, 0x00, 0x3d, 0x80, 0x40, 0xdc, 0x3b,
+ 0x18, 0x06, 0x76, 0x3d, 0x48, 0xe5, 0x6d, 0x3d, 0xca, 0xcf, 0xa9, 0xbc, 0x3c,
+ 0xb8, 0x50, 0xbc, 0x70, 0xbf, 0x76, 0x3c, 0x0c, 0xbc, 0x1c, 0x3d, 0x59, 0x70,
+ 0xf3, 0xbc, 0x21, 0xaa, 0x83, 0xbc, 0xf6, 0x67, 0x4f, 0xbd, 0x86, 0xa6, 0x71,
+ 0x3c, 0x69, 0xd6, 0x48, 0x3c, 0x50, 0x60, 0x56, 0x3d, 0x9c, 0x25, 0x50, 0xbd,
+ 0x10, 0x27, 0x76, 0x3c, 0x98, 0x24, 0x7b, 0xbd, 0x6c, 0xb9, 0x01, 0xbc, 0xe6,
+ 0xea, 0x85, 0x3d, 0x0e, 0xa0, 0xf5, 0x3b, 0xb4, 0xb3, 0x0e, 0x3d, 0xe2, 0xc0,
+ 0xa1, 0x3c, 0x4c, 0x2c, 0xf6, 0xbc, 0xc8, 0x58, 0x25, 0x3c, 0xd0, 0x2c, 0xeb,
+ 0x3c, 0xa8, 0x0f, 0xfa, 0x3c, 0x50, 0xc1, 0xd6, 0xbb, 0x42, 0x81, 0x4d, 0xbd,
+ 0x37, 0x4c, 0x88, 0xbd, 0xf4, 0x1a, 0xd2, 0xbc, 0x94, 0xb7, 0xaf, 0xbb, 0xaf,
+ 0xeb, 0x0f, 0x3d, 0xed, 0x56, 0xa3, 0x3c, 0x5e, 0x0a, 0x87, 0x3d, 0x5c, 0x4a,
+ 0x64, 0xbc, 0x37, 0x90, 0x62, 0x3c, 0x57, 0xcd, 0xbb, 0x3b, 0x50, 0x0c, 0x76,
+ 0xbd, 0x1c, 0x48, 0x87, 0xbc, 0x38, 0x8a, 0x4e, 0x3c, 0xda, 0x2b, 0x3a, 0x3d,
+ 0xba, 0x1a, 0x81, 0xbc, 0x29, 0xca, 0xba, 0x3c, 0x78, 0x39, 0x2b, 0xbd, 0xd4,
+ 0x80, 0xe2, 0xbb, 0x08, 0x96, 0x95, 0x3c, 0x55, 0x08, 0x50, 0x3c, 0xbd, 0xed,
+ 0x15, 0xbd, 0xd0, 0xeb, 0xe5, 0xbb, 0xa5, 0x5a, 0x22, 0xbc, 0x6c, 0xe7, 0x8f,
+ 0xbc, 0x63, 0x73, 0xb2, 0x3c, 0xc0, 0xae, 0x13, 0x3c, 0x54, 0xbd, 0x6f, 0xbd,
+ 0x9e, 0x5a, 0x60, 0x3d, 0x62, 0xe8, 0x34, 0x3d, 0x38, 0x91, 0x24, 0x3d, 0x10,
+ 0xac, 0x03, 0x3c, 0x04, 0xc0, 0x83, 0xbd, 0x16, 0x48, 0x7e, 0xbd, 0x64, 0x7a,
+ 0x40, 0xbc, 0x52, 0xcf, 0x4a, 0x3d, 0xa1, 0x54, 0x1f, 0xb9, 0x61, 0x19, 0x8c,
+ 0x3d, 0x08, 0xfa, 0x5a, 0xbd, 0x2a, 0xf5, 0x67, 0x3d, 0xb3, 0xcc, 0x12, 0xbd,
+ 0xc3, 0x2a, 0x65, 0x3d, 0x06, 0xbb, 0x41, 0xbd, 0xfc, 0xc0, 0x09, 0xbd, 0x2c,
+ 0xdf, 0xa7, 0xbc, 0xb7, 0xfe, 0x5d, 0xbd, 0xcb, 0x10, 0xa3, 0xbb, 0x75, 0xc3,
+ 0xcd, 0x3c, 0x2b, 0xd5, 0x0e, 0x3d, 0x11, 0x1c, 0x83, 0x3d, 0x71, 0xdc, 0xb2,
+ 0xbc, 0xda, 0xe1, 0x86, 0xbd, 0x39, 0xf2, 0x50, 0x3c, 0x40, 0x25, 0x50, 0x3b,
+ 0x18, 0x17, 0x43, 0xbc, 0x6b, 0xa6, 0x88, 0x3c, 0x60, 0x10, 0x5d, 0xbd, 0x0e,
+ 0x88, 0xa1, 0x3c, 0xa6, 0xd3, 0xe4, 0xbc, 0x11, 0x76, 0x88, 0xbc, 0x1e, 0x07,
+ 0x6c, 0x3d, 0xa6, 0x6e, 0x1b, 0x3d, 0xc0, 0x30, 0x30, 0x3d, 0xf2, 0x34, 0x8d,
+ 0xbd, 0xc0, 0xe2, 0x18, 0x3b, 0xce, 0xef, 0x83, 0xbc, 0xe7, 0x31, 0x0e, 0xbd,
+ 0xd1, 0xf1, 0x8b, 0xbd, 0xba, 0x6e, 0x3e, 0xbc, 0xc7, 0x45, 0x08, 0xbd, 0x57,
+ 0x7e, 0x56, 0x3d, 0x6d, 0xaf, 0x68, 0xbd, 0xef, 0x94, 0x28, 0xbd, 0x65, 0xf5,
+ 0xa5, 0x3c, 0xea, 0x2c, 0x43, 0xbd, 0x5c, 0xc6, 0x5d, 0x3c, 0x3e, 0x7e, 0x3f,
+ 0xbd, 0xd4, 0xa5, 0x7c, 0xbd, 0x14, 0x39, 0x35, 0xbd, 0xc5, 0x8a, 0x08, 0xbd,
+ 0x7e, 0xc0, 0x0c, 0x3d, 0x45, 0xbb, 0x84, 0x3c, 0x0d, 0x10, 0x6f, 0x39, 0x81,
+ 0x04, 0x4b, 0x3c, 0x5b, 0x45, 0xff, 0x3c, 0xab, 0xd1, 0x74, 0xbd, 0x98, 0x8a,
+ 0x38, 0x3c, 0xe3, 0xc7, 0xa9, 0x3c, 0x8b, 0x12, 0x7f, 0xbd, 0x6f, 0xb7, 0xc5,
+ 0x3a, 0x95, 0x7e, 0xaf, 0x3c, 0x50, 0xc8, 0xc5, 0x3b, 0xf9, 0x02, 0x89, 0xbd,
+ 0x6e, 0x63, 0xa2, 0xbc, 0x0c, 0x74, 0x32, 0x3d, 0xea, 0x32, 0x79, 0x3d, 0x0e,
+ 0x34, 0x91, 0xbd, 0xa1, 0x87, 0xec, 0xbc, 0x1c, 0xd4, 0x17, 0x3d, 0xe1, 0xb0,
+ 0x74, 0x3d, 0xe9, 0x8e, 0xc6, 0x3c, 0x8a, 0x62, 0x55, 0xbc, 0x51, 0x37, 0x95,
+ 0xbd, 0x2b, 0xc8, 0xbd, 0xbc, 0x8e, 0xe4, 0xef, 0xbc, 0x11, 0x49, 0x0d, 0x3d,
+ 0xe8, 0xcc, 0x16, 0x3d, 0xc6, 0xa8, 0xc8, 0x3c, 0x98, 0x01, 0x88, 0x3c, 0xbd,
+ 0x8e, 0x46, 0xbd, 0xab, 0x7d, 0xd4, 0xbc, 0x7a, 0xde, 0xb6, 0xbc, 0xf9, 0x44,
+ 0xcd, 0xbc, 0xad, 0xae, 0x13, 0xbc, 0x8d, 0xb5, 0x21, 0xbd, 0x48, 0xfb, 0x05,
+ 0xbc, 0x1d, 0x6d, 0x84, 0x3d, 0x4c, 0x32, 0x8a, 0x3c, 0xa8, 0xe9, 0x69, 0x3c,
+ 0xa6, 0xba, 0x1b, 0xbd, 0xe5, 0xfa, 0x12, 0x3d, 0xea, 0xea, 0x11, 0x3d, 0xa4,
+ 0xa1, 0x10, 0xbd, 0x0c, 0x0e, 0xad, 0x3d, 0x04, 0xeb, 0x1c, 0xbd, 0xe5, 0x6d,
+ 0x0f, 0xbd, 0x1e, 0x40, 0xea, 0x3d, 0xfa, 0xc5, 0x36, 0x3d, 0x7a, 0xd3, 0x34,
+ 0xbd, 0xe2, 0xe5, 0x4b, 0xbd, 0x27, 0x35, 0xf0, 0xbd, 0x60, 0x53, 0xc6, 0xbc,
+ 0xb4, 0x7c, 0x0b, 0xbd, 0x0c, 0xc1, 0xbd, 0x39, 0x4b, 0xfb, 0x67, 0x3c, 0x4c,
+ 0x65, 0xc4, 0x3c, 0x23, 0x9d, 0x88, 0x3c, 0x7c, 0x7e, 0xa0, 0x3b, 0x7f, 0xd2,
+ 0x94, 0x3b, 0x45, 0xd2, 0x24, 0x3d, 0x00, 0xd4, 0xf5, 0xbb, 0x13, 0xf0, 0x99,
+ 0x3d, 0xd6, 0x36, 0xa0, 0x3a, 0x28, 0xb0, 0x5d, 0x3d, 0x9f, 0xf9, 0x81, 0xbd,
+ 0x42, 0x4b, 0x98, 0x3d, 0x29, 0x10, 0x7d, 0x3d, 0x8e, 0xe9, 0xf5, 0xbc, 0xfb,
+ 0xc1, 0x91, 0xbc, 0x71, 0xda, 0xe2, 0xbc, 0x1e, 0x75, 0x3b, 0xbd, 0xbe, 0x22,
+ 0x2f, 0x3d, 0xfa, 0xb6, 0x27, 0xba, 0x8c, 0x36, 0x86, 0x3c, 0x45, 0x63, 0xcf,
+ 0xbc, 0x13, 0x05, 0x5e, 0xbc, 0xba, 0xc5, 0x24, 0xbd, 0xcd, 0x6d, 0x0b, 0x3c,
+ 0x5d, 0xe6, 0x00, 0x3b, 0x82, 0xbb, 0xcf, 0xbc, 0xdb, 0x1f, 0x31, 0xbd, 0x91,
+ 0x32, 0x95, 0xbc, 0x81, 0xff, 0x0b, 0xba, 0xa7, 0xe4, 0x0f, 0x3d, 0x50, 0xd4,
+ 0x2c, 0x3d, 0x4c, 0x82, 0x27, 0x3c, 0x54, 0x76, 0x69, 0x3c, 0xef, 0x41, 0x53,
+ 0xbb, 0x7b, 0x88, 0x26, 0xbd, 0xfa, 0x19, 0x51, 0x3d, 0x83, 0xe9, 0x89, 0xbd,
+ 0x96, 0xa7, 0x4a, 0x3d, 0x87, 0xf0, 0xe6, 0xbc, 0x2b, 0x59, 0x61, 0xbc, 0x4a,
+ 0x9a, 0x7d, 0x3d, 0x7c, 0x95, 0x54, 0x38, 0xa6, 0x6e, 0x69, 0x3d, 0xf3, 0x84,
+ 0x27, 0xbd, 0x84, 0x7f, 0x26, 0x3c, 0xc3, 0xe1, 0x58, 0x3b, 0xa7, 0x2d, 0xa5,
+ 0x3d, 0x13, 0x70, 0x2a, 0xbd, 0xae, 0x66, 0x1f, 0x3d, 0x6d, 0x44, 0xff, 0xbc,
+ 0x66, 0x10, 0xb2, 0x3c, 0x94, 0xd5, 0x98, 0xb9, 0x00, 0xc8, 0xef, 0x3d, 0x5c,
+ 0x00, 0x2f, 0xbc, 0xd7, 0xb1, 0xf6, 0x3c, 0x1b, 0xdb, 0xe1, 0x3c, 0xaa, 0x78,
+ 0xe0, 0x3c, 0xb5, 0xe8, 0xd1, 0x3c, 0xda, 0x9e, 0x39, 0xbc, 0xe4, 0x90, 0x84,
+ 0xbc, 0x42, 0x92, 0x6f, 0xbd, 0xdd, 0xd7, 0x8a, 0x3d, 0xd3, 0x62, 0x90, 0x3c,
+ 0x1c, 0x20, 0x52, 0x3d, 0x1e, 0x29, 0x72, 0xbd, 0xf4, 0x8e, 0x1c, 0x3d, 0xd9,
+ 0xda, 0xaf, 0xbc, 0x60, 0x11, 0x8e, 0xbb, 0x71, 0xc1, 0xbf, 0xbc, 0xec, 0x7f,
+ 0x3d, 0x3c, 0xe5, 0x10, 0x3d, 0xbd, 0x1a, 0xbf, 0x69, 0x3d, 0x3f, 0x56, 0x0b,
+ 0xbb, 0x19, 0x64, 0x9d, 0x3c, 0xe1, 0x00, 0x05, 0x3d, 0x4f, 0x77, 0x8e, 0x3d,
+ 0x0f, 0x4d, 0x35, 0x3d, 0xe5, 0x6d, 0x4d, 0xbd, 0x9d, 0xb6, 0x58, 0x3c, 0x64,
+ 0x44, 0x30, 0xba, 0x08, 0xe8, 0xaa, 0x3c, 0x73, 0xe7, 0x0b, 0x3d, 0x71, 0x00,
+ 0x8c, 0x3d, 0x1a, 0xd9, 0xeb, 0x3c, 0xde, 0x78, 0xf2, 0xbb, 0xe5, 0x50, 0xcb,
+ 0x3d, 0x03, 0x80, 0x7f, 0x3b, 0xb4, 0xf7, 0x1a, 0x3d, 0x32, 0xf5, 0xb0, 0x3d,
+ 0x1c, 0x38, 0xe5, 0x3c, 0xb1, 0x72, 0x05, 0x3d, 0xc3, 0x92, 0xcf, 0x3c, 0xdc,
+ 0x7b, 0x0c, 0xbe, 0x95, 0x0b, 0xfc, 0x3c, 0x5f, 0x34, 0x18, 0x3d, 0xc2, 0x08,
+ 0x19, 0xbd, 0x25, 0xd4, 0x7b, 0x3d, 0x1e, 0xca, 0x88, 0xbd, 0x57, 0x5f, 0x9a,
+ 0x3d, 0x57, 0x98, 0x80, 0x3d, 0x20, 0x7d, 0xdd, 0x3c, 0xdf, 0xb3, 0x65, 0x3d,
+ 0x88, 0xde, 0x8d, 0xbd, 0x45, 0x90, 0x9d, 0x3d, 0x8a, 0xf8, 0xfa, 0xbc, 0xdf,
+ 0xe2, 0xef, 0xb9, 0x21, 0x8d, 0x5a, 0xbc, 0x3e, 0x45, 0x17, 0x3c, 0x11, 0x8d,
+ 0x8d, 0xbd, 0xb9, 0xd3, 0x2b, 0xb9, 0xd1, 0x2b, 0x24, 0xbc, 0x7e, 0x0e, 0x00,
+ 0x3b, 0xfd, 0xc2, 0x2e, 0xbd, 0x80, 0x7d, 0x0d, 0x3d, 0x91, 0x8a, 0x49, 0x3d,
+ 0xba, 0x7e, 0x10, 0x3d, 0xc3, 0x56, 0x2a, 0x3d, 0x1a, 0x4d, 0x6e, 0x3d, 0x20,
+ 0x44, 0x90, 0x3c, 0x2f, 0xd8, 0x79, 0x3d, 0x7b, 0x5c, 0xab, 0x3d, 0x64, 0xa5,
+ 0xe1, 0x3c, 0x26, 0x94, 0x31, 0x3d, 0xcc, 0xaf, 0xec, 0xbd, 0xc0, 0x25, 0x4b,
+ 0xbd, 0xd1, 0x06, 0x87, 0x3d, 0x97, 0x3c, 0x44, 0xbd, 0x9c, 0x81, 0xc2, 0xbc,
+ 0x0a, 0xd3, 0x1a, 0xbd, 0x0d, 0xe3, 0x00, 0xbd, 0x08, 0x6e, 0x53, 0xbd, 0x67,
+ 0x84, 0x1a, 0x3d, 0xeb, 0xd0, 0x2f, 0x3d, 0x76, 0xea, 0x46, 0x3b, 0x3e, 0x6e,
+ 0xbe, 0xbc, 0xf3, 0x6a, 0x11, 0x3d, 0x13, 0xed, 0xb8, 0x3c, 0xc1, 0x4f, 0x9a,
+ 0x3d, 0xd6, 0x9a, 0x31, 0xbd, 0xcc, 0x51, 0x0e, 0x3d, 0x60, 0x8c, 0x89, 0x3d,
+ 0x66, 0xc1, 0x41, 0xbd, 0x75, 0x80, 0xa2, 0x3d, 0x40, 0xbb, 0x5c, 0x3b, 0x6f,
+ 0xb6, 0x90, 0x3d, 0xb7, 0x62, 0x02, 0x3c, 0x54, 0x75, 0x78, 0x3d, 0x3d, 0x29,
+ 0xaf, 0x3d, 0x53, 0x5f, 0x97, 0x3d, 0xaf, 0x83, 0x91, 0xbc, 0xc9, 0x29, 0x55,
+ 0x3d, 0xda, 0x00, 0x82, 0xbb, 0x8d, 0xcd, 0x2e, 0x3d, 0x9d, 0xcb, 0x88, 0xbd,
+ 0x4d, 0x93, 0x3d, 0xbd, 0x55, 0xb8, 0x66, 0xbd, 0x98, 0xf2, 0x4e, 0xbc, 0xf9,
+ 0xe0, 0x28, 0xbc, 0x6f, 0x30, 0x2d, 0x3d, 0xd8, 0xe6, 0x9e, 0x3d, 0x81, 0xcf,
+ 0x31, 0xbd, 0x31, 0x50, 0x45, 0xbd, 0x90, 0x9e, 0x2f, 0xbd, 0x4b, 0x9a, 0x9a,
+ 0x3d, 0x2f, 0x1a, 0xb3, 0xbc, 0x05, 0x59, 0x9b, 0xbc, 0xa6, 0x4f, 0x9b, 0xbc,
+ 0x24, 0x10, 0x9e, 0xbd, 0x91, 0x8e, 0xa5, 0x3c, 0x0c, 0x2a, 0x43, 0x3d, 0x85,
+ 0x85, 0x87, 0xbd, 0x00, 0x61, 0x36, 0xbd, 0x10, 0xb9, 0x43, 0xbc, 0x58, 0x2c,
+ 0x24, 0x3b, 0xb7, 0x4f, 0x80, 0x3d, 0x46, 0x0f, 0x29, 0xbd, 0x76, 0x68, 0x44,
+ 0xbd, 0x57, 0xcf, 0x18, 0xbd, 0x24, 0x15, 0x94, 0x3d, 0x13, 0x57, 0x98, 0x3d,
+ 0x5e, 0xd6, 0x9c, 0x3d, 0xa0, 0x16, 0x9e, 0x3d, 0x66, 0x87, 0x83, 0xbd, 0x19,
+ 0x6d, 0x8b, 0x3d, 0x24, 0x60, 0x9a, 0xbc, 0x00, 0x60, 0xea, 0xbb, 0xba, 0x09,
+ 0x5f, 0xbd, 0xdc, 0xdd, 0xaa, 0x3b, 0x95, 0x08, 0xe9, 0xbc, 0x82, 0x0c, 0xc6,
+ 0x3c, 0x19, 0xb1, 0xda, 0xbc, 0x80, 0x2e, 0x4b, 0x3c, 0xed, 0xab, 0x29, 0x3d,
+ 0x17, 0x38, 0x51, 0x3d, 0x52, 0xa3, 0xef, 0x3c, 0xfd, 0x1c, 0x88, 0xbc, 0x40,
+ 0x9f, 0x3a, 0x3c, 0x87, 0x8a, 0xbe, 0xbc, 0xe5, 0xf4, 0x2a, 0xbd, 0x01, 0x1f,
+ 0x32, 0x3d, 0x2c, 0xbf, 0x3d, 0xbc, 0x33, 0xd3, 0xf9, 0xbb, 0xc4, 0x58, 0x2d,
+ 0xbd, 0x5d, 0xa3, 0x8f, 0x3d, 0x27, 0x5d, 0x90, 0xbc, 0xcf, 0x00, 0x82, 0x3d,
+ 0x0b, 0x65, 0xa7, 0x3d, 0x52, 0x11, 0xff, 0xbc, 0x37, 0xca, 0x18, 0xbd, 0xb9,
+ 0x2f, 0x9d, 0x3c, 0x36, 0x90, 0x68, 0x3d, 0x85, 0x61, 0x6b, 0x3d, 0x27, 0xb0,
+ 0x89, 0xbc, 0xcb, 0xb5, 0xac, 0xbb, 0xf4, 0x4b, 0x79, 0xbc, 0x34, 0x73, 0xe7,
+ 0xbc, 0x81, 0x9b, 0x86, 0x3c, 0x58, 0xc2, 0xce, 0x3c, 0x0a, 0x63, 0x2c, 0xbd,
+ 0xf6, 0xd3, 0xcf, 0xbd, 0xea, 0xf1, 0x01, 0xbd, 0x7a, 0x64, 0xe0, 0xbc, 0x12,
+ 0x3a, 0x28, 0x3d, 0x98, 0xe9, 0x98, 0x3d, 0x95, 0xf1, 0xa8, 0xbc, 0x88, 0xb4,
+ 0x2a, 0x3d, 0x81, 0xdf, 0xc4, 0xbc, 0x62, 0xb8, 0xfb, 0xbc, 0x46, 0xd2, 0x90,
+ 0xbd, 0x74, 0x0a, 0xc4, 0x3c, 0x8e, 0x57, 0x6f, 0x3d, 0xf9, 0xea, 0x78, 0x3d,
+ 0xdc, 0x6e, 0x62, 0xbd, 0x46, 0xe2, 0x16, 0xbd, 0xa6, 0x36, 0x37, 0xbd, 0xf5,
+ 0x36, 0x35, 0xbd, 0x9a, 0x4f, 0xb8, 0xbc, 0xf2, 0xab, 0x15, 0x3c, 0xee, 0x55,
+ 0xd7, 0x3b, 0xfa, 0xd0, 0x1c, 0xbd, 0xd4, 0x6b, 0x97, 0xbc, 0x91, 0x57, 0x51,
+ 0xbd, 0x7c, 0xc9, 0x64, 0x3d, 0xf8, 0x29, 0xcd, 0xbc, 0x75, 0x65, 0x67, 0x3d,
+ 0xaa, 0xd9, 0xa3, 0x3c, 0x55, 0xff, 0x8f, 0x3c, 0x7c, 0x18, 0x46, 0xbd, 0x92,
+ 0x18, 0x2c, 0x3d, 0x3a, 0x9f, 0x8a, 0xbc, 0xee, 0xd4, 0x05, 0x3d, 0x37, 0x03,
+ 0xaa, 0xbd, 0xe9, 0x50, 0x07, 0xbe, 0x1a, 0x94, 0x18, 0x3d, 0x79, 0x69, 0x03,
+ 0xbd, 0x7f, 0xc8, 0xd4, 0xbc, 0x25, 0xa7, 0x86, 0x3a, 0x17, 0xf1, 0x00, 0x3c,
+ 0xfd, 0x40, 0x10, 0x3d, 0x6e, 0x29, 0xf7, 0x3c, 0x05, 0xb0, 0x38, 0xbd, 0x7e,
+ 0x44, 0x5a, 0xbc, 0x0e, 0xdf, 0x66, 0x3d, 0x08, 0x9d, 0x10, 0xbc, 0xff, 0x12,
+ 0x8e, 0xbb, 0x01, 0x3f, 0x67, 0xbc, 0x6e, 0xa6, 0x4f, 0x3d, 0xca, 0x07, 0x63,
+ 0xbd, 0x97, 0x61, 0x4b, 0x3d, 0x71, 0x21, 0x34, 0x3d, 0x4f, 0xa2, 0x6d, 0x3d,
+ 0x8f, 0xf5, 0xe8, 0xbd, 0x72, 0x55, 0x4b, 0xbd, 0xee, 0xb2, 0xe9, 0xbc, 0xf2,
+ 0x49, 0xa7, 0x3d, 0x89, 0x22, 0xf5, 0x3c, 0xd8, 0x73, 0xcb, 0x3d, 0xbb, 0x15,
+ 0x81, 0x3d, 0x33, 0xf1, 0x5c, 0x3d, 0xa7, 0x30, 0x96, 0xbd, 0x4b, 0x2c, 0x58,
+ 0xbd, 0x34, 0x05, 0x00, 0x3d, 0xbd, 0x81, 0x92, 0x3d, 0x67, 0x5b, 0x5f, 0xbc,
+ 0xb4, 0x1e, 0xe6, 0xbd, 0x7c, 0x56, 0x00, 0x3c, 0x7c, 0x6d, 0xa8, 0x3c, 0x9b,
+ 0x21, 0xbd, 0xbb, 0x71, 0xf4, 0x48, 0xbd, 0xf8, 0xe1, 0x87, 0xbd, 0xd7, 0x4f,
+ 0xaf, 0xbc, 0x08, 0xef, 0xd9, 0x3c, 0x3e, 0x7b, 0x24, 0x3c, 0xa8, 0xcc, 0xe7,
+ 0x3c, 0xf0, 0xa0, 0x4a, 0xbd, 0x45, 0xbf, 0x39, 0xbd, 0x4e, 0xb6, 0xd6, 0x3c,
+ 0xfb, 0xfb, 0x49, 0x3d, 0xdd, 0x90, 0x4e, 0x3c, 0x0c, 0xb0, 0x83, 0x3d, 0x2d,
+ 0x83, 0x42, 0x3c, 0x1f, 0x45, 0xeb, 0xbb, 0xd3, 0x7e, 0xf2, 0x3b, 0x4d, 0x22,
+ 0xa6, 0xbd, 0x40, 0x45, 0x5c, 0xbb, 0x8c, 0xa5, 0x1c, 0xbd, 0x57, 0xd9, 0x86,
+ 0x3d, 0x45, 0xfc, 0x4e, 0x3d, 0xc5, 0x64, 0x24, 0x3d, 0xc9, 0xf4, 0x27, 0x3c,
+ 0xc7, 0x86, 0x08, 0x3d, 0x9c, 0x3c, 0x13, 0x3b, 0xab, 0x69, 0x12, 0x3d, 0x0d,
+ 0xfa, 0x80, 0x3d, 0x6b, 0x86, 0x15, 0xbd, 0x93, 0x11, 0x1e, 0xbd, 0x70, 0x3b,
+ 0x02, 0x3b, 0x50, 0x75, 0x06, 0xbd, 0x61, 0xe8, 0x7b, 0xbc, 0x5a, 0x15, 0xa7,
+ 0x3d, 0x47, 0x26, 0x0b, 0x3c, 0xb8, 0x03, 0x98, 0x3c, 0xce, 0xcc, 0x8e, 0x3d,
+ 0x12, 0x6c, 0xba, 0xbc, 0xca, 0x74, 0x5f, 0xbd, 0x84, 0x45, 0xd6, 0x3d, 0x2a,
+ 0xc6, 0xb3, 0xbc, 0x75, 0x88, 0x53, 0x3d, 0x44, 0xc0, 0x37, 0x3c, 0x69, 0x7c,
+ 0x59, 0x3d, 0xc1, 0xa5, 0xe5, 0xbc, 0x61, 0xc0, 0x9f, 0x3c, 0xbc, 0x7d, 0x7e,
+ 0xbc, 0x9c, 0x18, 0x79, 0xbd, 0x09, 0x70, 0x16, 0x3d, 0xdd, 0x36, 0x0b, 0x3d,
+ 0xcc, 0xba, 0xc8, 0x3c, 0xe6, 0xae, 0x18, 0xbc, 0xd6, 0x1a, 0x20, 0xbd, 0x43,
+ 0x22, 0x24, 0xbc, 0xcc, 0x3e, 0xd4, 0x3c, 0xe2, 0x43, 0x1a, 0xbb, 0x02, 0x94,
+ 0xd5, 0x3c, 0x24, 0x73, 0x3d, 0x3d, 0x4d, 0x1c, 0xce, 0x3c, 0x94, 0xea, 0x4a,
+ 0x3d, 0x33, 0x7a, 0x09, 0x3d, 0xf4, 0xcc, 0x66, 0xbd, 0x13, 0xb9, 0x9e, 0xbd,
+ 0x98, 0xbe, 0xb4, 0xbc, 0x19, 0x14, 0x21, 0x3d, 0x97, 0xca, 0x50, 0x3d, 0x8f,
+ 0x3f, 0x2f, 0xbc, 0x69, 0x98, 0x25, 0x3d, 0x55, 0x13, 0x80, 0xbc, 0xef, 0x2e,
+ 0x82, 0x3d, 0x24, 0xea, 0x71, 0xbd, 0x84, 0x97, 0x32, 0xbd, 0xb0, 0xaa, 0xaf,
+ 0x3c, 0xfa, 0x13, 0x9b, 0x3d, 0x56, 0xa5, 0x2b, 0x3d, 0x03, 0x06, 0x2d, 0xbc,
+ 0x6c, 0x24, 0x39, 0xbd, 0x46, 0x80, 0x29, 0x3d, 0x64, 0xdb, 0x61, 0xbb, 0x85,
+ 0x2a, 0x22, 0xbd, 0x9f, 0x47, 0xc1, 0x3d, 0x71, 0xc5, 0x85, 0xbd, 0x00, 0x31,
+ 0x9c, 0xb9, 0xc4, 0xd0, 0x2e, 0xbd, 0x08, 0x5d, 0x36, 0x3d, 0x41, 0x70, 0x3f,
+ 0xbd, 0x01, 0xc0, 0x87, 0x3c, 0x05, 0xf1, 0x37, 0xbc, 0xaf, 0x5d, 0xd4, 0xbb,
+ 0x10, 0xa9, 0x1c, 0x3d, 0xb8, 0xa9, 0x62, 0xba, 0xae, 0x29, 0x71, 0x3d, 0x51,
+ 0x57, 0x73, 0xbc, 0x05, 0x0a, 0xb8, 0xbd, 0xe3, 0x38, 0xa1, 0xbd, 0x3d, 0x08,
+ 0x13, 0x3d, 0x54, 0x69, 0x80, 0xbd, 0xe9, 0x65, 0x60, 0xbd, 0x2e, 0x02, 0x88,
+ 0x3d, 0x00, 0xdf, 0x58, 0xbb, 0xde, 0x06, 0x35, 0xbd, 0x1e, 0x3f, 0x0a, 0xbd,
+ 0x35, 0xe2, 0x15, 0xbd, 0xa6, 0xe3, 0x99, 0x3d, 0x42, 0x8e, 0x2e, 0xbd, 0x9b,
+ 0x10, 0x97, 0xbd, 0xd9, 0x36, 0xca, 0x3b, 0x27, 0x9f, 0x5c, 0xbd, 0xb8, 0x0c,
+ 0x25, 0xbd, 0x61, 0xe3, 0x8e, 0x3d, 0x8b, 0x23, 0xa5, 0xbc, 0xf4, 0xda, 0x47,
+ 0xbd, 0x30, 0x95, 0xac, 0x3c, 0xe1, 0xb0, 0xab, 0xbd, 0xb0, 0x5a, 0x15, 0x3d,
+ 0x58, 0x7e, 0x35, 0x3d, 0x13, 0xeb, 0x48, 0xbc, 0x00, 0xe6, 0x80, 0x3c, 0x39,
+ 0x59, 0x21, 0xbb, 0xca, 0xf7, 0xbe, 0x3d, 0x2a, 0xb9, 0x37, 0x3d, 0x26, 0x13,
+ 0x80, 0x3d, 0x9e, 0xbd, 0xc7, 0x3c, 0xb6, 0xd6, 0x50, 0xbd, 0xa6, 0x52, 0x82,
+ 0x3d, 0x39, 0xa3, 0x81, 0xb9, 0xe3, 0xb2, 0xf8, 0xbd, 0xc5, 0x84, 0x54, 0xbd,
+ 0xba, 0xea, 0x27, 0x3d, 0x1e, 0xce, 0xcf, 0x3c, 0x0d, 0xd3, 0x6f, 0x3c, 0xa7,
+ 0xce, 0x87, 0xbc, 0x67, 0xe3, 0x5e, 0xbd, 0xf6, 0xdc, 0x3b, 0x3d, 0xca, 0x8f,
+ 0x23, 0xbd, 0x69, 0x20, 0x9e, 0x3b, 0x32, 0x59, 0x2e, 0x3d, 0x12, 0x32, 0x09,
+ 0xbd, 0xa1, 0xc3, 0x2a, 0x3c, 0x68, 0x2a, 0x6b, 0xbc, 0xf7, 0xbf, 0x92, 0xbc,
+ 0x97, 0x8c, 0x97, 0x3d, 0x8e, 0xc6, 0x74, 0x3c, 0x04, 0x01, 0x47, 0x3c, 0x6b,
+ 0x51, 0xf0, 0x3d, 0x0e, 0xf6, 0x3b, 0x3b, 0xee, 0xeb, 0x5d, 0x3d, 0x98, 0x69,
+ 0x9b, 0x3c, 0xb5, 0x47, 0xfc, 0xbc, 0x5e, 0x56, 0x40, 0xbc, 0x15, 0x4e, 0xad,
+ 0xbb, 0x84, 0xcf, 0x96, 0x3c, 0xe3, 0x32, 0xbe, 0xbc, 0x36, 0xcd, 0xc8, 0x3d,
+ 0x70, 0xb8, 0x97, 0x3d, 0xd9, 0xc3, 0x28, 0xbd, 0x6c, 0xec, 0x7b, 0x3d, 0xbf,
+ 0x32, 0xc6, 0xbd, 0x98, 0x0d, 0x0f, 0xbe, 0x32, 0xaa, 0x95, 0x3d, 0x6e, 0x2c,
+ 0xfd, 0xbc, 0x10, 0x45, 0xc1, 0xbb, 0x4d, 0x8b, 0x03, 0x3d, 0xe4, 0x05, 0xde,
+ 0xbc, 0x0d, 0x7c, 0xbe, 0x3c, 0x07, 0x24, 0x77, 0x3d, 0x98, 0xb0, 0x2a, 0x3c,
+ 0x21, 0xc9, 0xa3, 0x3c, 0x1a, 0x6d, 0x69, 0x3d, 0x33, 0xf6, 0xeb, 0xbc, 0x40,
+ 0x77, 0x90, 0x3d, 0x6c, 0xf5, 0x99, 0x3c, 0x42, 0x69, 0x08, 0x3d, 0x9b, 0x3f,
+ 0xde, 0xbc, 0xe0, 0x71, 0x04, 0xbd, 0x6a, 0xcd, 0xfe, 0xbb, 0x77, 0xd6, 0xb3,
+ 0x3d, 0xf9, 0xb4, 0xcc, 0x3b, 0x6a, 0x1c, 0x70, 0x3d, 0x10, 0x34, 0x15, 0xbc,
+ 0x82, 0x15, 0x3a, 0x3d, 0xa8, 0xa6, 0x02, 0x3d, 0x06, 0x03, 0xaa, 0x3d, 0x15,
+ 0x2c, 0xe6, 0xbc, 0xac, 0xf0, 0xdc, 0x3c, 0xa7, 0x3b, 0xef, 0xbc, 0x7a, 0xa7,
+ 0x93, 0x3d, 0xaf, 0x46, 0x87, 0x3c, 0xf9, 0x13, 0x76, 0xbb, 0x30, 0x99, 0x15,
+ 0xbd, 0x36, 0xd1, 0x8f, 0xbc, 0xc9, 0x26, 0xaf, 0x3d, 0xc0, 0xa3, 0x5b, 0x3c,
+ 0x69, 0x65, 0x84, 0xbd, 0x1e, 0x30, 0x81, 0x3d, 0xb4, 0xbc, 0x22, 0x3d, 0x16,
+ 0x60, 0x52, 0x3d, 0x5e, 0xfe, 0x6a, 0xbc, 0x16, 0x65, 0x34, 0xbd, 0xfe, 0xab,
+ 0xf0, 0x3c, 0xe1, 0xfd, 0x90, 0x3d, 0xd4, 0x61, 0x6a, 0xbd, 0x55, 0xd1, 0x85,
+ 0xbd, 0x87, 0x6f, 0x66, 0xbd, 0x29, 0x4a, 0x8d, 0x3a, 0xec, 0x8f, 0x91, 0x3d,
+ 0x07, 0x75, 0x5a, 0x3b, 0x95, 0x09, 0x27, 0x3b, 0x25, 0x10, 0xd3, 0x3d, 0xde,
+ 0xfe, 0x0b, 0xbd, 0xe8, 0xd4, 0xc4, 0x3c, 0x4e, 0xda, 0x7d, 0x3c, 0x54, 0xb5,
+ 0xe8, 0xba, 0x69, 0x46, 0x40, 0x3d, 0xd1, 0xd6, 0x48, 0x3c, 0xfa, 0xb9, 0x87,
+ 0x39, 0x5a, 0x17, 0x20, 0xbc, 0xd5, 0x9b, 0x66, 0x3d, 0x19, 0x23, 0xac, 0x3c,
+ 0x56, 0x76, 0x5a, 0xbd, 0x7e, 0x50, 0x3c, 0xbc, 0x02, 0x8b, 0x17, 0xbd, 0x42,
+ 0x85, 0xc6, 0xbd, 0x06, 0x12, 0x9f, 0x3d, 0xad, 0x96, 0xc7, 0xbb, 0xd9, 0xfc,
+ 0xff, 0xbb, 0xb9, 0x86, 0x71, 0x3c, 0xc7, 0xf6, 0x3f, 0xbd, 0xc2, 0x39, 0xf7,
+ 0x3a, 0x25, 0xcb, 0xf0, 0x3c, 0xfe, 0x25, 0xb0, 0xbb, 0xd3, 0x39, 0x02, 0x3d,
+ 0xf8, 0xa3, 0x08, 0xbd, 0xba, 0xf2, 0x4e, 0xbd, 0x53, 0x83, 0x46, 0xbd, 0xae,
+ 0x06, 0x06, 0x3d, 0x69, 0xf3, 0x8f, 0x3d, 0xd3, 0x57, 0x35, 0x3c, 0x05, 0x92,
+ 0xb9, 0x3c, 0x60, 0x8e, 0x5b, 0x3b, 0xab, 0x7a, 0x8d, 0xbc, 0xf6, 0xdf, 0x87,
+ 0xbd, 0x0d, 0xc5, 0x81, 0x3d, 0xec, 0x93, 0x5f, 0x3d, 0xf6, 0x54, 0x85, 0x3d,
+ 0x86, 0xb3, 0x16, 0xbc, 0x7d, 0x95, 0x97, 0x3d, 0xff, 0xd8, 0x0c, 0x3d, 0x21,
+ 0x38, 0x6e, 0xbd, 0x68, 0xfc, 0x83, 0x3d, 0x5c, 0x54, 0x1b, 0xbc, 0x26, 0x1d,
+ 0x03, 0x3d, 0xd8, 0xaa, 0x90, 0xbd, 0xa9, 0x58, 0x0b, 0x3b, 0x02, 0x4e, 0x40,
+ 0xbd, 0xdc, 0x76, 0xe0, 0xbb, 0x14, 0x2e, 0x24, 0x3d, 0xbb, 0x6b, 0xfe, 0x3b,
+ 0xfd, 0xb5, 0x99, 0xbd, 0x4b, 0x2b, 0x0e, 0xbd, 0x2f, 0xc8, 0x69, 0xbd, 0xff,
+ 0xf0, 0x04, 0x3d, 0x46, 0x9c, 0x13, 0x3c, 0x74, 0x89, 0x2e, 0x3d, 0xbe, 0x6e,
+ 0x52, 0xbd, 0x59, 0x23, 0x34, 0x3d, 0x72, 0x3a, 0x3e, 0xbd, 0xf8, 0x03, 0x7a,
+ 0x3d, 0x8e, 0xab, 0x74, 0x3c, 0x6e, 0x5e, 0x82, 0x3d, 0x16, 0x5b, 0x25, 0x3c,
+ 0x56, 0x2c, 0xe7, 0xbd, 0x19, 0x4d, 0xc0, 0x3d, 0x8a, 0xb3, 0xdb, 0xbd, 0x34,
+ 0xe5, 0x67, 0xbc, 0x0f, 0x5d, 0x35, 0x3d, 0xad, 0xad, 0x94, 0x3d, 0xa5, 0xc3,
+ 0xba, 0xba, 0xb4, 0x7f, 0x02, 0x3e, 0xde, 0xcd, 0x8d, 0x3d, 0xc3, 0xa4, 0xa4,
+ 0xbd, 0x7e, 0x1b, 0x37, 0x3d, 0xde, 0xb4, 0x91, 0xbd, 0x78, 0xf2, 0x62, 0xbd,
+ 0x25, 0x4f, 0x60, 0xbd, 0x4e, 0xd2, 0x25, 0xbd, 0xd3, 0xc3, 0xe8, 0xbb, 0x7f,
+ 0x00, 0x68, 0x3d, 0x7a, 0x9c, 0x1e, 0xbd, 0x17, 0x70, 0x81, 0x3c, 0xda, 0xb3,
+ 0x68, 0x3d, 0xab, 0xf3, 0xb4, 0xbc, 0x46, 0x70, 0x16, 0xbd, 0x22, 0xe5, 0x82,
+ 0x3d, 0x75, 0x02, 0x5a, 0x3d, 0xb5, 0xce, 0x86, 0xbd, 0x20, 0x29, 0xa8, 0xbb,
+ 0xe5, 0x29, 0x95, 0xbd, 0x63, 0x0c, 0x5f, 0xbd, 0x42, 0x39, 0x99, 0xbc, 0x27,
+ 0xd6, 0x82, 0xbb, 0x33, 0x1c, 0xda, 0xbc, 0x93, 0x96, 0x76, 0x3d, 0xd3, 0x8c,
+ 0xd3, 0xbd, 0x75, 0x39, 0xe1, 0x3d, 0x42, 0x5b, 0x98, 0xbd, 0x5a, 0xc4, 0x4f,
+ 0x3d, 0x3b, 0xb0, 0x14, 0xbd, 0xfc, 0x99, 0x4b, 0xbc, 0xd4, 0x88, 0x13, 0xbb,
+ 0x6c, 0xca, 0xc4, 0x3d, 0xd4, 0xdc, 0xb1, 0x3d, 0x62, 0x2a, 0x8d, 0x3c, 0xd8,
+ 0x1b, 0xb7, 0x3c, 0x0b, 0x8d, 0xba, 0xbb, 0x78, 0x25, 0x5c, 0xbd, 0xb9, 0xc6,
+ 0xbb, 0xba, 0x26, 0x58, 0xc5, 0xbd, 0x5d, 0x48, 0xb7, 0xbd, 0x71, 0x0d, 0x0e,
+ 0x3d, 0xa8, 0xa7, 0x54, 0xbd, 0x88, 0xfe, 0x84, 0xbc, 0x0b, 0x64, 0x1b, 0xbc,
+ 0xba, 0xaa, 0x8e, 0x3c, 0x89, 0x54, 0xa5, 0xbc, 0xde, 0x32, 0x9c, 0x3c, 0x90,
+ 0x13, 0x66, 0xbd, 0xb2, 0x5e, 0x11, 0xbd, 0xd0, 0x5e, 0xfb, 0xbb, 0x2e, 0x6c,
+ 0x8c, 0xbd, 0x09, 0x4b, 0x2f, 0xbc, 0xa8, 0x5d, 0x27, 0xbd, 0xad, 0xd8, 0x2e,
+ 0x3d, 0x78, 0x5e, 0xf0, 0x3c, 0x8e, 0xc0, 0x12, 0x3d, 0x49, 0xb5, 0xca, 0xbd,
+ 0x1b, 0x2e, 0xb0, 0x3d, 0xeb, 0x3c, 0x8b, 0xbd, 0xe2, 0x4b, 0xd6, 0xbc, 0x14,
+ 0xdf, 0xc3, 0x3c, 0x42, 0x9c, 0x87, 0x3c, 0xb7, 0x90, 0x18, 0x3d, 0xcb, 0x8a,
+ 0xd8, 0x3d, 0xc1, 0x0c, 0x97, 0x3d, 0x35, 0xe8, 0xd3, 0x3c, 0xb1, 0x05, 0x28,
+ 0x3d, 0x03, 0xd2, 0xbc, 0x3d, 0x56, 0xce, 0x44, 0x3d, 0x9f, 0xbf, 0x24, 0x3d,
+ 0x21, 0x81, 0x81, 0xbd, 0xc0, 0xa2, 0xda, 0xbd, 0x50, 0x42, 0x27, 0x3d, 0x5f,
+ 0xb2, 0xb9, 0x3c, 0x04, 0x67, 0x6c, 0x3d, 0xce, 0x89, 0x2c, 0xbd, 0x08, 0x2d,
+ 0x4b, 0x3c, 0x88, 0x86, 0xf7, 0x3c, 0xcd, 0x8e, 0x94, 0x3d, 0x5a, 0x47, 0x6f,
+ 0x3d, 0x67, 0xf4, 0xa2, 0xbd, 0xe3, 0x50, 0x91, 0xbd, 0xde, 0x9e, 0x84, 0x3d,
+ 0xb3, 0x05, 0xbf, 0x3c, 0x10, 0x17, 0x34, 0x3d, 0xf4, 0x1f, 0x0e, 0xbd, 0x47,
+ 0xb9, 0x49, 0x3d, 0xb1, 0x61, 0x10, 0x3d, 0x2a, 0x64, 0x90, 0xbd, 0x1e, 0xc9,
+ 0xb8, 0x3c, 0x7d, 0x23, 0xb8, 0xbd, 0x19, 0x60, 0x85, 0x3d, 0x44, 0xb5, 0x4d,
+ 0xbd, 0x05, 0x79, 0xec, 0x3b, 0xea, 0x1e, 0x21, 0xbd, 0xeb, 0x34, 0x59, 0x3d,
+ 0x50, 0xa9, 0x00, 0x3d, 0x72, 0xf1, 0x4c, 0xb9, 0x98, 0x35, 0xc1, 0x3d, 0xbb,
+ 0x18, 0x36, 0x3d, 0x19, 0x70, 0x62, 0xbd, 0xc5, 0xae, 0x75, 0x3d, 0x27, 0x77,
+ 0xec, 0xbc, 0xab, 0x6d, 0xe1, 0xbd, 0x75, 0x4a, 0xae, 0x3c, 0x2d, 0xea, 0x18,
+ 0xbb, 0xdc, 0x0e, 0x7b, 0x3d, 0xb2, 0x28, 0x24, 0xbd, 0x69, 0xd2, 0x78, 0xbd,
+ 0xed, 0x29, 0x5f, 0xbc, 0xd9, 0x6e, 0x44, 0x3d, 0x3c, 0x6c, 0x87, 0xbd, 0xa5,
+ 0xdf, 0x96, 0xbc, 0x1c, 0x4c, 0x35, 0x3d, 0x54, 0x97, 0x57, 0xbd, 0xe9, 0x88,
+ 0x40, 0xbd, 0x6d, 0x9d, 0x71, 0x3c, 0x3f, 0x74, 0xaf, 0xbb, 0x41, 0xfa, 0x4b,
+ 0x3d, 0x20, 0xe8, 0x7a, 0xbc, 0xe4, 0x37, 0xbe, 0xbd, 0xfa, 0xa2, 0x44, 0xbc,
+ 0x2a, 0x3c, 0x61, 0xbd, 0xec, 0x0f, 0x0c, 0x3d, 0xd7, 0xef, 0x82, 0xbd, 0x0b,
+ 0xe4, 0xd2, 0xbc, 0xd2, 0x57, 0x04, 0x3c, 0xa8, 0x6e, 0xce, 0x3d, 0x3c, 0xd8,
+ 0xa4, 0x3b, 0x1d, 0x19, 0x45, 0xbd, 0xd6, 0x4d, 0x70, 0x3c, 0xed, 0x12, 0xf0,
+ 0xbc, 0x1f, 0xc6, 0x4c, 0x3c, 0xeb, 0x27, 0x8e, 0xbc, 0x6a, 0xf8, 0x4f, 0x3d,
+ 0xcf, 0x2c, 0xe3, 0xbd, 0x3b, 0xc9, 0x05, 0xbb, 0xe0, 0xfa, 0xfd, 0x3c, 0xfe,
+ 0xb8, 0xfb, 0xbc, 0x84, 0xd9, 0x8b, 0x3d, 0xad, 0x88, 0x00, 0x3d, 0x21, 0xfa,
+ 0x47, 0x3d, 0xf6, 0x17, 0x0d, 0xbd, 0xc5, 0x0c, 0xf1, 0x3c, 0xec, 0x3c, 0x13,
+ 0xbd, 0x1a, 0x06, 0x4b, 0xbd, 0x76, 0x04, 0xa4, 0xbc, 0x89, 0x87, 0x92, 0x3d,
+ 0xd2, 0xc6, 0xaf, 0x3d, 0xb1, 0xb1, 0x12, 0x3d, 0x99, 0xa4, 0x23, 0x3d, 0x25,
+ 0x73, 0x75, 0x3b, 0x18, 0x34, 0xa1, 0xbd, 0xc0, 0x90, 0xa5, 0x3d, 0xaa, 0xa8,
+ 0x14, 0xbd, 0x6c, 0xbc, 0xf3, 0x3c, 0x8a, 0x47, 0x51, 0xbc, 0xab, 0xfc, 0x2a,
+ 0x3d, 0xc8, 0xb7, 0x68, 0x3d, 0xff, 0xbf, 0x72, 0x3d, 0x38, 0x39, 0x95, 0x3d,
+ 0xdc, 0x49, 0x94, 0xbc, 0xbd, 0xce, 0x90, 0x3c, 0xcd, 0x13, 0x35, 0x3d, 0xd4,
+ 0xd9, 0x51, 0xbd, 0x16, 0xde, 0xfb, 0xbc, 0xc7, 0x00, 0xb9, 0xbd, 0x38, 0x8e,
+ 0x2e, 0xbc, 0xcb, 0xce, 0x5e, 0x3d, 0x44, 0x22, 0x7a, 0x3c, 0x70, 0x0a, 0x93,
+ 0x3d, 0x9c, 0x88, 0x81, 0x3a, 0x02, 0x89, 0x01, 0xbd, 0x52, 0x9b, 0x50, 0xbc,
+ 0xc7, 0x6f, 0x46, 0x3c, 0x41, 0xb4, 0x57, 0x3d, 0x79, 0x89, 0xd2, 0x3b, 0x20,
+ 0xab, 0x75, 0x3b, 0x40, 0xf2, 0xea, 0x3c, 0x8f, 0x29, 0x8c, 0x3d, 0xb0, 0x20,
+ 0x45, 0xbd, 0xf4, 0x67, 0x8c, 0x3d, 0xbf, 0x3f, 0x9d, 0x3c, 0xa7, 0x71, 0x01,
+ 0xbd, 0x37, 0x6b, 0x02, 0xbc, 0x68, 0xc4, 0x2a, 0x3d, 0x43, 0x60, 0x9b, 0xbc,
+ 0x72, 0xb9, 0x73, 0xbd, 0x90, 0xc4, 0x13, 0x3c, 0xba, 0xbf, 0x50, 0xbb, 0x86,
+ 0x75, 0x78, 0xbd, 0x2e, 0xaf, 0x69, 0xbc, 0xdb, 0x89, 0xbc, 0x3d, 0x05, 0x7f,
+ 0xa8, 0xbd, 0x42, 0x5f, 0x02, 0x3d, 0xe1, 0x3c, 0x12, 0xbd, 0xfd, 0xdf, 0x41,
+ 0x3d, 0x2e, 0xda, 0xe3, 0xbb, 0x80, 0x3c, 0x5f, 0xbd, 0x26, 0x2b, 0x1f, 0xbd,
+ 0xa8, 0xed, 0xd5, 0x3c, 0xa6, 0x84, 0xf1, 0x3c, 0xbe, 0xd2, 0x9a, 0xbb, 0x5b,
+ 0x04, 0x61, 0x3d, 0x2b, 0xe5, 0x06, 0xbd, 0xc9, 0xb8, 0x85, 0x3c, 0x64, 0x7a,
+ 0xc7, 0x3d, 0x4c, 0x12, 0xc9, 0x3c, 0x69, 0x12, 0x63, 0xbd, 0x88, 0x73, 0xbf,
+ 0x3c, 0xfc, 0x66, 0x50, 0xbb, 0x64, 0x31, 0x9a, 0xbd, 0xeb, 0x81, 0x8d, 0x3d,
+ 0x7e, 0x4e, 0xc5, 0x3c, 0x15, 0x80, 0x96, 0x3d, 0xb9, 0x1f, 0x65, 0xbd, 0xe3,
+ 0x99, 0xda, 0xbd, 0x94, 0x02, 0x4a, 0x3c, 0xbf, 0x7b, 0x26, 0x3d, 0x20, 0xae,
+ 0x9d, 0xbb, 0x84, 0x49, 0x1e, 0x3d, 0x88, 0x11, 0x17, 0x3d, 0x45, 0x77, 0x73,
+ 0x3c, 0x76, 0x33, 0xaa, 0x3c, 0x28, 0x4d, 0x4b, 0x3d, 0x49, 0x89, 0x37, 0x3c,
+ 0x3f, 0xe6, 0x92, 0xbd, 0xc8, 0x39, 0xa0, 0x3c, 0xd6, 0xff, 0x0a, 0x3b, 0xb4,
+ 0xef, 0xad, 0xbd, 0xdb, 0x17, 0x19, 0x3c, 0x9a, 0x54, 0x7c, 0xbd, 0xe7, 0x50,
+ 0xcc, 0x3c, 0x91, 0xeb, 0x75, 0xbd, 0x9a, 0x45, 0xac, 0x3d, 0xd3, 0x80, 0x4d,
+ 0xbd, 0x17, 0x6c, 0x19, 0x3c, 0x47, 0xb1, 0x1f, 0xbd, 0xef, 0x17, 0x1d, 0xbd,
+ 0xa2, 0xc8, 0x58, 0xbc, 0xf9, 0xc6, 0x81, 0xbb, 0x70, 0xfc, 0xa1, 0x3b, 0x70,
+ 0x74, 0x38, 0x3d, 0xb9, 0x93, 0x6c, 0x3d, 0xb5, 0x22, 0x89, 0x3d, 0xa8, 0x15,
+ 0xed, 0xbb, 0xee, 0x0c, 0xac, 0xbc, 0xbf, 0xca, 0xbe, 0xbc, 0x8e, 0x0d, 0xbf,
+ 0xbd, 0xfb, 0x0c, 0x92, 0x3c, 0x3d, 0x1e, 0x61, 0xbd, 0xe1, 0xb2, 0x08, 0xbd,
+ 0xcd, 0xab, 0x75, 0xbb, 0xc5, 0x1a, 0x2f, 0x3d, 0x4f, 0x02, 0x92, 0x3c, 0x8f,
+ 0x47, 0x20, 0x3d, 0x33, 0xac, 0xc3, 0x3d, 0xc9, 0xdc, 0xbd, 0xbc, 0x68, 0x6e,
+ 0xb4, 0x3b, 0x32, 0x32, 0xdc, 0x3d, 0xd8, 0xff, 0x92, 0x3d, 0xb3, 0xa4, 0x6f,
+ 0xbd, 0xf0, 0xbe, 0x13, 0xbd, 0xff, 0xf5, 0xdf, 0xbd, 0x67, 0xeb, 0x94, 0x3c,
+ 0xb2, 0xe8, 0x57, 0xbb, 0x92, 0x3f, 0xdc, 0xbb, 0xe3, 0x5f, 0x6b, 0x3c, 0x02,
+ 0xcc, 0x6c, 0xbd, 0x25, 0xa1, 0x57, 0xbd, 0x22, 0x01, 0x82, 0x3d, 0xc3, 0xcf,
+ 0xb2, 0x3c, 0xed, 0x35, 0x56, 0xbb, 0xe3, 0xf0, 0x8c, 0x3d, 0xdb, 0xf1, 0xb1,
+ 0xbc, 0xaa, 0xe4, 0xc2, 0x3b, 0x53, 0x9c, 0xf6, 0xbc, 0x15, 0x86, 0x92, 0x3d,
+ 0xe4, 0xf9, 0x39, 0x3d, 0x09, 0xa5, 0xa8, 0xbc, 0x6e, 0x89, 0xd1, 0xbc, 0x47,
+ 0xd4, 0x7b, 0x3c, 0x7b, 0xff, 0xab, 0x3c, 0x15, 0x58, 0x8d, 0xbd, 0x7b, 0x21,
+ 0xac, 0x3c, 0xda, 0xe5, 0xad, 0xbc, 0x8b, 0xfc, 0xd8, 0xbc, 0x8c, 0xe1, 0x0e,
+ 0xbc, 0x36, 0x43, 0xc6, 0x3d, 0xfa, 0x15, 0x8b, 0xbc, 0xb8, 0xd0, 0x07, 0x3d,
+ 0xd9, 0x12, 0x9c, 0x3c, 0x81, 0x20, 0x4f, 0xbd, 0xd8, 0x7f, 0x18, 0x3b, 0x38,
+ 0xd4, 0x33, 0xbc, 0x00, 0x0f, 0xe2, 0xbd, 0x25, 0xa8, 0xf2, 0x3c, 0x87, 0xa6,
+ 0x96, 0xbd, 0x84, 0xc3, 0xa8, 0x3c, 0xf4, 0x7a, 0x8b, 0x3c, 0xfd, 0xbd, 0x55,
+ 0xbc, 0x45, 0x00, 0x97, 0xbd, 0x81, 0x3a, 0xbd, 0x3b, 0x21, 0x43, 0x30, 0xbd,
+ 0x94, 0x58, 0xa5, 0x3b, 0x30, 0x2f, 0x12, 0xbd, 0xcb, 0xd3, 0x32, 0x3d, 0x36,
+ 0xd2, 0x7c, 0xbd, 0xf2, 0x77, 0x49, 0x3d, 0x87, 0xdd, 0x87, 0xbc, 0x3d, 0x1a,
+ 0x02, 0x3d, 0x5a, 0x1b, 0xc1, 0x3c, 0x04, 0xaf, 0x33, 0xbd, 0x84, 0x02, 0x1d,
+ 0x3d, 0x47, 0x7d, 0x21, 0xbd, 0x46, 0xc4, 0x24, 0x3d, 0x8f, 0x16, 0x27, 0x3d,
+ 0xce, 0x48, 0x22, 0x3d, 0xd9, 0x6b, 0xa3, 0x3c, 0x31, 0x91, 0xbb, 0x3c, 0xef,
+ 0x24, 0x88, 0xbb, 0x1e, 0x6e, 0x41, 0xbd, 0x81, 0xea, 0x80, 0x3d, 0xa6, 0xa7,
+ 0xf2, 0x3d, 0x74, 0xcf, 0xd7, 0x3c, 0x4c, 0x85, 0xf6, 0xbc, 0x57, 0xac, 0x0f,
+ 0x3c, 0x1c, 0x44, 0x53, 0xbd, 0x44, 0x55, 0x35, 0x3d, 0x14, 0x45, 0x11, 0x3d,
+ 0x0d, 0xfa, 0xff, 0xbc, 0xe0, 0xef, 0x32, 0x3d, 0x6c, 0x60, 0xac, 0x3b, 0xd2,
+ 0xe0, 0xab, 0xbb, 0x77, 0x02, 0x3f, 0xbd, 0xcd, 0x77, 0x44, 0x3d, 0x4f, 0x8c,
+ 0x3e, 0xbd, 0x74, 0xd6, 0x5a, 0xbd, 0x33, 0xb6, 0xf2, 0xbc, 0x94, 0xe4, 0x0e,
+ 0x3b, 0x6c, 0x9b, 0xa9, 0x3a, 0x61, 0xd7, 0xea, 0xbc, 0xf6, 0x70, 0xe9, 0x3c,
+ 0x06, 0x81, 0xeb, 0xbc, 0x51, 0x88, 0x47, 0xbb, 0x6c, 0xfb, 0x6d, 0x3d, 0x0a,
+ 0x9d, 0x29, 0xbb, 0xa0, 0x45, 0x36, 0x3c, 0xe5, 0xd9, 0xb8, 0x3c, 0x09, 0xf4,
+ 0x09, 0xbd, 0x2a, 0x13, 0x54, 0xbc, 0xad, 0xb0, 0xa3, 0x3d, 0x5a, 0x07, 0xff,
+ 0x3c, 0x18, 0x10, 0xc9, 0x3c, 0x15, 0xf6, 0x07, 0xbd, 0x05, 0x70, 0x60, 0x3d,
+ 0xb5, 0xbd, 0x50, 0x3d, 0xeb, 0xe1, 0x11, 0x3d, 0xdf, 0x70, 0x40, 0xbd, 0x51,
+ 0x6f, 0x67, 0xbd, 0x61, 0xbf, 0xd0, 0x3c, 0x39, 0x5e, 0x14, 0xbd, 0xae, 0x58,
+ 0xa1, 0x3d, 0xa2, 0x03, 0x88, 0x3d, 0x85, 0x40, 0x89, 0xbd, 0x3e, 0x4f, 0x21,
+ 0x3c, 0x8b, 0x40, 0xcf, 0x3c, 0xa8, 0x0d, 0x76, 0x3d, 0x2f, 0x57, 0xf4, 0x3b,
+ 0x78, 0x71, 0x8f, 0x3c, 0x15, 0x80, 0x72, 0x3d, 0x35, 0xc6, 0xe6, 0xbc, 0x1e,
+ 0xdb, 0x8d, 0x3d, 0xc1, 0x52, 0x58, 0x3d, 0x1e, 0x0c, 0x37, 0x3d, 0x68, 0xdd,
+ 0x25, 0x3d, 0x1a, 0x65, 0x59, 0xbc, 0x22, 0xe3, 0x8b, 0x3d, 0x29, 0xb2, 0x44,
+ 0xbd, 0x56, 0x71, 0x34, 0xbd, 0x1c, 0x3f, 0x7c, 0xbb, 0x88, 0x17, 0x72, 0xbc,
+ 0xbb, 0xb5, 0xae, 0x3c, 0xdd, 0x7b, 0xd5, 0x3c, 0xd3, 0x2f, 0x93, 0x3d, 0x07,
+ 0x46, 0x38, 0x3d, 0x55, 0x2b, 0x47, 0x3d, 0xd2, 0x5c, 0xda, 0x3d, 0xa4, 0x8e,
+ 0x80, 0x3d, 0xe6, 0xdb, 0xc9, 0x3c, 0xf3, 0x2d, 0x3f, 0xbd, 0x66, 0x10, 0xd1,
+ 0xbd, 0xde, 0xa5, 0xda, 0x3c, 0xab, 0x8c, 0xe4, 0x3c, 0x85, 0x1c, 0xc0, 0x3c,
+ 0xba, 0xe5, 0x95, 0xbd, 0x25, 0x50, 0x92, 0x3c, 0x25, 0x15, 0xc9, 0xba, 0x43,
+ 0xdc, 0x63, 0xbc, 0x65, 0xd6, 0x07, 0x3d, 0x87, 0x8c, 0x0e, 0xbc, 0x0d, 0x90,
+ 0x87, 0x3d, 0x9a, 0x0e, 0x4a, 0x3d, 0x67, 0x54, 0x4a, 0x3d, 0x63, 0x8b, 0x24,
+ 0xbd, 0x56, 0x2c, 0xcf, 0xbc, 0x28, 0x2a, 0x23, 0x3d, 0xc6, 0x80, 0xa3, 0xbc,
+ 0x66, 0xe5, 0x09, 0xbd, 0x69, 0xdb, 0x93, 0x3d, 0x00, 0xc7, 0x7e, 0xbd, 0xe0,
+ 0x18, 0x06, 0x3d, 0x02, 0xb9, 0x77, 0xbd, 0x43, 0x60, 0x55, 0x3c, 0x46, 0x45,
+ 0xa4, 0x3d, 0xb1, 0x0a, 0xac, 0x3c, 0x8a, 0xc5, 0x8e, 0x3d, 0xf6, 0x60, 0x31,
+ 0xbc, 0x9b, 0x2d, 0xb0, 0x3a, 0xc3, 0xc4, 0x4a, 0xbd, 0x96, 0x31, 0x82, 0xbd,
+ 0x4e, 0x50, 0x59, 0x3c, 0x2f, 0xf7, 0xd4, 0xbd, 0x18, 0xc1, 0x2b, 0xbd, 0xb8,
+ 0x26, 0x9d, 0x3c, 0xd6, 0x9c, 0x3b, 0xbd, 0xb6, 0xdd, 0x11, 0xbd, 0x4e, 0x51,
+ 0xd9, 0x3b, 0xbd, 0xfd, 0x3b, 0xbd, 0xe2, 0xe9, 0x35, 0xbc, 0x0d, 0xb1, 0x9c,
+ 0x3c, 0x02, 0x6e, 0xab, 0x3c, 0xc9, 0x70, 0x25, 0x3c, 0xae, 0xe4, 0x60, 0xbd,
+ 0x11, 0xc2, 0x49, 0x3d, 0x9b, 0x09, 0xaf, 0xbc, 0xbc, 0x74, 0x75, 0x3c, 0x38,
+ 0x61, 0x16, 0x3d, 0x0c, 0x99, 0x94, 0x3d, 0x01, 0x83, 0x03, 0xbb, 0xc5, 0x45,
+ 0x1b, 0x3d, 0x82, 0xab, 0x6f, 0x3c, 0xe1, 0x41, 0xce, 0x3c, 0x86, 0xd5, 0x79,
+ 0xbd, 0x0e, 0x6c, 0x69, 0x3d, 0xcf, 0xbb, 0x87, 0x3d, 0x65, 0x17, 0xb4, 0xbc,
+ 0xca, 0x64, 0x07, 0x3e, 0x7d, 0x34, 0xca, 0x3d, 0x40, 0x0d, 0xfb, 0x3c, 0x0e,
+ 0xea, 0xc2, 0x3c, 0x06, 0x26, 0x88, 0xbc, 0xed, 0x76, 0x84, 0x3d, 0xca, 0x92,
+ 0xa4, 0xbc, 0x4c, 0x98, 0x74, 0xbd, 0x62, 0x77, 0xdb, 0xbd, 0x97, 0xba, 0x87,
+ 0x3d, 0xe9, 0x05, 0x95, 0xbd, 0xcc, 0xfd, 0x99, 0x3d, 0x36, 0x01, 0x0b, 0xbd,
+ 0x23, 0x33, 0x7d, 0x3d, 0x2f, 0xba, 0x5c, 0x3d, 0xaa, 0xed, 0xb2, 0xbc, 0xfc,
+ 0xe7, 0x97, 0x3d, 0xaa, 0x40, 0x7d, 0x3d, 0x2a, 0x5f, 0x5e, 0x3d, 0x51, 0x91,
+ 0x7d, 0xbd, 0xc8, 0xf8, 0x2a, 0x3d, 0x7b, 0x8c, 0x2f, 0x3d, 0x35, 0xe0, 0xb9,
+ 0xbb, 0xc4, 0x0b, 0x56, 0xbd, 0xcf, 0xd0, 0xb8, 0x3c, 0xf7, 0xef, 0x61, 0x3d,
+ 0xf5, 0x33, 0x9a, 0x3d, 0x07, 0xd8, 0xf0, 0xbc, 0x34, 0x49, 0x61, 0xbd, 0x7c,
+ 0x0c, 0x74, 0xbd, 0x0c, 0x85, 0xf7, 0xbc, 0xeb, 0x13, 0xdd, 0xbc, 0x70, 0x3a,
+ 0xd1, 0x3c, 0xd0, 0x31, 0xe1, 0x3d, 0xbf, 0xb4, 0x90, 0xbd, 0x6c, 0x8a, 0x4f,
+ 0xbc, 0x89, 0x66, 0x29, 0xbc, 0x5d, 0x8a, 0x18, 0xbd, 0xa4, 0x2b, 0x91, 0xbd,
+ 0x6a, 0x8d, 0x2b, 0xb9, 0x44, 0x9f, 0xf1, 0xbd, 0xe3, 0x9a, 0x87, 0x3c, 0x3c,
+ 0x77, 0x5c, 0x3d, 0x1b, 0x6f, 0x50, 0xbd, 0x43, 0x9e, 0x41, 0xbd, 0x13, 0x6f,
+ 0x5d, 0x3d, 0x44, 0x7f, 0x67, 0x3c, 0xf5, 0x9e, 0x31, 0x3c, 0xc0, 0x48, 0x8b,
+ 0x3d, 0x48, 0xc4, 0xd0, 0xbc, 0x80, 0x20, 0x17, 0x3a, 0x4c, 0x44, 0x42, 0x3b,
+ 0xcd, 0x50, 0x0e, 0x3d, 0xf8, 0xdd, 0x6a, 0x3d, 0xa7, 0xa4, 0x57, 0x3c, 0x5c,
+ 0x60, 0x94, 0x3c, 0xd4, 0x6e, 0x34, 0xbc, 0xa3, 0xa2, 0x8e, 0xbd, 0x88, 0xe0,
+ 0xad, 0x3d, 0xdb, 0xd6, 0x9f, 0xbd, 0x14, 0xcb, 0x61, 0xbd, 0x02, 0x50, 0x7f,
+ 0xbd, 0xb9, 0x4c, 0x9d, 0x3d, 0x0d, 0x5a, 0x88, 0x3d, 0x8b, 0x0a, 0x06, 0x3c,
+ 0xdf, 0x17, 0x8e, 0x3d, 0x75, 0x07, 0x0c, 0x3d, 0x5d, 0xd3, 0x52, 0xbd, 0x22,
+ 0x56, 0x0b, 0x3a, 0x62, 0x34, 0xcb, 0xbc, 0x55, 0x58, 0xaa, 0x3c, 0x72, 0x28,
+ 0xa3, 0xbd, 0x60, 0x8d, 0x3f, 0xbc, 0x5b, 0xaa, 0x51, 0xbb, 0xa8, 0x60, 0x31,
+ 0xbd, 0x8c, 0xc5, 0xfb, 0x3c, 0x90, 0x97, 0x3f, 0xbc, 0x94, 0x3a, 0x45, 0xbd,
+ 0xb5, 0xc1, 0x8d, 0xbd, 0x07, 0xd0, 0x08, 0x3d, 0x47, 0x05, 0xe2, 0xbb, 0x69,
+ 0x2e, 0x16, 0x3d, 0xd0, 0x2d, 0x50, 0xbd, 0xd3, 0x88, 0x9e, 0x3d, 0x2f, 0x19,
+ 0xbb, 0xbc, 0x20, 0x1f, 0xa4, 0x3d, 0x38, 0x4e, 0x9c, 0xbc, 0x71, 0x5a, 0x6e,
+ 0x3c, 0x47, 0x9a, 0x49, 0x3d, 0x7a, 0x7b, 0x07, 0x3a, 0x54, 0xf5, 0xcd, 0x3d,
+ 0x54, 0xb0, 0xde, 0x3c, 0xb0, 0xbd, 0x1b, 0x3c, 0x31, 0x85, 0x2c, 0xbd, 0xda,
+ 0x03, 0xe4, 0xbb, 0x9e, 0xf5, 0x87, 0x3d, 0xef, 0x15, 0x41, 0x3d, 0x82, 0x56,
+ 0xa3, 0x3d, 0xfa, 0x31, 0x5e, 0xbd, 0xf2, 0x5e, 0x5f, 0xbb, 0x1c, 0xda, 0x9f,
+ 0x3d, 0x45, 0x09, 0x71, 0xbc, 0x37, 0x80, 0x9a, 0x3b, 0x5a, 0x7a, 0xfd, 0xbc,
+ 0x37, 0x4f, 0x1a, 0xbe, 0xfa, 0x30, 0xeb, 0xbc, 0xa9, 0xd5, 0x74, 0xbd, 0x18,
+ 0xad, 0x9b, 0xbc, 0x00, 0xc4, 0xce, 0x3a, 0x98, 0x58, 0x19, 0x3c, 0xf0, 0x22,
+ 0xa1, 0x3b, 0x84, 0xfa, 0x08, 0xbd, 0x6f, 0xfe, 0x96, 0x3d, 0xe3, 0xc4, 0x90,
+ 0x3d, 0xa0, 0xc8, 0x5a, 0xbc, 0x97, 0x7f, 0xc2, 0xbc, 0xea, 0xcc, 0xcc, 0x3c,
+ 0xae, 0xb0, 0x9c, 0xbc, 0x49, 0xdf, 0x97, 0xbc, 0xdd, 0x01, 0x18, 0xbd, 0x66,
+ 0x26, 0xa7, 0xbc, 0x2a, 0x3d, 0x59, 0xbd, 0x93, 0x1b, 0x1a, 0x3d, 0xd9, 0x46,
+ 0xcc, 0x3c, 0x00, 0xf0, 0x34, 0x3a, 0x99, 0x3d, 0xc0, 0xbc, 0x08, 0xb1, 0x09,
+ 0x3c, 0xbe, 0xfb, 0x79, 0x3d, 0xa9, 0x90, 0x86, 0xbd, 0xa2, 0x17, 0x8f, 0xbd,
+ 0x30, 0x94, 0x8a, 0xbb, 0xd9, 0xd7, 0x82, 0x3d, 0xe4, 0xea, 0x2f, 0xbd, 0x7e,
+ 0x59, 0x73, 0xbd, 0x46, 0x73, 0xe2, 0xbc, 0xe0, 0xd4, 0x42, 0xbc, 0x3c, 0x6c,
+ 0xdf, 0x3c, 0x08, 0xce, 0xf9, 0x3c, 0xfc, 0xe4, 0x79, 0xbd, 0xac, 0x5c, 0x4f,
+ 0xbd, 0x60, 0x67, 0x12, 0xbb, 0xb2, 0xcf, 0xbf, 0xbc, 0xe2, 0x7c, 0x31, 0xbd,
+ 0xb6, 0xc7, 0x18, 0x3d, 0xdc, 0x89, 0x90, 0xbd, 0x0c, 0xf7, 0x99, 0xbc, 0xa0,
+ 0x2a, 0x3c, 0xbd, 0x92, 0x1b, 0x38, 0x3d, 0x34, 0xe9, 0x86, 0xbd, 0x69, 0x76,
+ 0x6d, 0xbd, 0x76, 0x2b, 0x6e, 0x3d, 0x70, 0x53, 0x3f, 0x3d, 0x22, 0xe5, 0x4c,
+ 0x3d, 0x52, 0x57, 0xfc, 0xbc, 0xf8, 0x6b, 0x31, 0xbd, 0xb4, 0xb1, 0xa3, 0x3c,
+ 0x10, 0x0c, 0x60, 0x3c, 0xbc, 0x80, 0x85, 0xbd, 0xe6, 0x9f, 0x78, 0xbd, 0x00,
+ 0x20, 0x90, 0xba, 0xbc, 0x54, 0x5d, 0xbd, 0x6c, 0xd7, 0xc5, 0xbc, 0x87, 0x6b,
+ 0x87, 0x3d, 0x0a, 0x34, 0x0c, 0x3d, 0x44, 0xe5, 0x47, 0xbd, 0xe0, 0xd3, 0x05,
+ 0x3b, 0x23, 0x83, 0x11, 0xbd, 0xab, 0x22, 0x8c, 0xbd, 0x48, 0x17, 0xe9, 0x3c,
+ 0xbd, 0x8a, 0x89, 0x3d, 0xc0, 0x3a, 0x71, 0x3b, 0x08, 0x52, 0x61, 0x3c, 0x40,
+ 0xb4, 0x6d, 0x3c, 0xa0, 0x6a, 0xa0, 0x3b, 0x00, 0xc4, 0xb9, 0x39, 0x74, 0x71,
+ 0xa8, 0x3c, 0x13, 0xa7, 0x90, 0xbd, 0x04, 0xb5, 0xb4, 0xbc, 0x70, 0x36, 0x31,
+ 0x3c, 0x28, 0x25, 0x0f, 0x3c, 0xfc, 0x08, 0x46, 0xbd, 0x80, 0xa0, 0xa5, 0xba,
+ 0xe2, 0x11, 0x6f, 0xbd, 0x39, 0xf0, 0x31, 0xbd, 0xd8, 0xbe, 0x2f, 0xbd, 0x68,
+ 0x21, 0x4d, 0xbd, 0x64, 0x1b, 0x8e, 0xbd, 0x80, 0xd4, 0x78, 0xba, 0x92, 0x81,
+ 0x5a, 0xbd, 0xf4, 0xf9, 0x57, 0xbd, 0x80, 0x59, 0xa2, 0x3c, 0x22, 0xe6, 0xde,
+ 0xbc, 0x91, 0xdf, 0x87, 0xbd, 0x3a, 0xea, 0x22, 0xbd, 0xba, 0xf7, 0x75, 0x3d,
+ 0xba, 0x8a, 0x0c, 0x3d, 0x81, 0xa7, 0x8d, 0xbd, 0x90, 0xee, 0x50, 0xbd, 0x14,
+ 0xa3, 0x90, 0xbd, 0xdc, 0xdf, 0x81, 0x3c, 0x4a, 0xb5, 0x66, 0xbd, 0x10, 0xa0,
+ 0x94, 0x3b, 0x9a, 0x12, 0x2d, 0xbd, 0xda, 0x60, 0x42, 0xbd, 0xea, 0x9f, 0xb0,
+ 0xbc, 0x38, 0xfc, 0x02, 0x3d, 0xa6, 0x08, 0x04, 0x3d, 0x23, 0xf6, 0x03, 0xbd,
+ 0xa2, 0x7a, 0x63, 0x3d, 0x26, 0xca, 0x36, 0x3d, 0x96, 0xd3, 0x0d, 0x3d, 0x3f,
+ 0xfd, 0x89, 0x3d, 0x08, 0xa3, 0x24, 0xbd, 0x28, 0x10, 0x57, 0xbc, 0xbb, 0xb9,
+ 0x83, 0x3d, 0x50, 0x2b, 0xb5, 0x3b, 0x9c, 0x94, 0x19, 0xbc, 0xc4, 0x4d, 0x9a,
+ 0xbc, 0x91, 0xf8, 0x0d, 0xbd, 0x63, 0x13, 0x7d, 0xbd, 0xed, 0xd0, 0x02, 0xbd,
+ 0x1c, 0x10, 0x85, 0xbd, 0x00, 0xca, 0x36, 0x3c, 0xc8, 0x17, 0x7a, 0x3c, 0x24,
+ 0x32, 0xc7, 0xbc, 0x88, 0x75, 0xa5, 0x3c, 0x2e, 0x18, 0x39, 0xbd, 0xd4, 0xa9,
+ 0xfb, 0x3c, 0x8c, 0x61, 0x48, 0x3d, 0x40, 0x34, 0xb1, 0xba, 0xb7, 0xec, 0x83,
+ 0x3d, 0x7c, 0x1d, 0x5a, 0x3d, 0x30, 0x5c, 0x91, 0x3c, 0xcb, 0x9d, 0x85, 0x3d,
+ 0x74, 0xa8, 0x35, 0x3d, 0x93, 0x54, 0x76, 0xbd, 0xa3, 0xb8, 0x8c, 0xbd, 0xf3,
+ 0x38, 0x8d, 0xbd, 0x45, 0x41, 0x8d, 0xbd, 0xb0, 0x35, 0x2c, 0x3d, 0x79, 0x2f,
+ 0x91, 0x3d, 0x1c, 0xa0, 0xde, 0xbc, 0x26, 0xd7, 0x53, 0xbd, 0xec, 0x6e, 0x11,
+ 0x3d, 0x1c, 0x44, 0x8f, 0x3c, 0x2b, 0x97, 0x2b, 0xbd, 0x78, 0x4e, 0x62, 0xbc,
+ 0x4a, 0x20, 0xe3, 0xbc, 0x2e, 0x7e, 0xd5, 0xbc, 0x34, 0xe0, 0xcc, 0xbc, 0x00,
+ 0xd9, 0x05, 0x3d, 0x6e, 0xe3, 0xd8, 0xbc, 0x32, 0x01, 0x51, 0x3d, 0x57, 0x4a,
+ 0x83, 0x3d, 0x98, 0x90, 0x4c, 0xbd, 0x0d, 0x8e, 0x8b, 0x3d, 0x76, 0x2c, 0x32,
+ 0x3d, 0x6a, 0x76, 0x91, 0xbd, 0xc8, 0xf9, 0x85, 0x3c, 0x40, 0x2b, 0x80, 0x3a,
+ 0xe0, 0x00, 0xe3, 0xbb, 0x00, 0x06, 0x79, 0xb9, 0x27, 0xbd, 0x8f, 0x3d, 0xce,
+ 0x76, 0x2c, 0x3d, 0x56, 0x63, 0xd7, 0xbc, 0x30, 0x52, 0xf0, 0xbb, 0x69, 0x1f,
+ 0x85, 0xbd, 0x7e, 0xdb, 0x64, 0xbd, 0x85, 0xd6, 0x87, 0x3d, 0x92, 0xc0, 0x70,
+ 0x3d, 0x4c, 0x7a, 0x78, 0xbc, 0x6c, 0x7d, 0x2b, 0xbd, 0x6f, 0x2b, 0x85, 0x3d,
+ 0x98, 0x48, 0x39, 0xbd, 0x8c, 0x9d, 0xce, 0x3c, 0x08, 0xf9, 0x5c, 0xbc, 0xe8,
+ 0x5a, 0xcd, 0x3c, 0x88, 0xb0, 0x3c, 0x3d, 0xf8, 0x88, 0x4e, 0xbd, 0x30, 0x8f,
+ 0x38, 0x3c, 0xba, 0xa1, 0xc9, 0xbc, 0xba, 0xdc, 0x6d, 0x3d, 0xc0, 0x39, 0x5a,
+ 0xbb, 0xa6, 0x2d, 0x1d, 0x3d, 0x04, 0xde, 0xe4, 0x3c, 0x24, 0x67, 0x4f, 0xbd,
+ 0xde, 0xc0, 0x7c, 0x3d, 0x31, 0x68, 0x09, 0xbd, 0x01, 0x59, 0x80, 0xbd, 0x13,
+ 0x09, 0x91, 0x3d, 0xc8, 0xdd, 0x18, 0x3d, 0x2b, 0x88, 0x91, 0x3d, 0x50, 0xef,
+ 0x80, 0x3c, 0xec, 0x4a, 0x65, 0xbc, 0xb0, 0xca, 0x0a, 0x3d, 0x48, 0x1f, 0x29,
+ 0xbd, 0x56, 0xe9, 0x3a, 0x3d, 0xd0, 0x9c, 0x67, 0xbc, 0xe0, 0x47, 0xdb, 0xbc,
+ 0xd8, 0x70, 0x4a, 0xbd, 0x86, 0x63, 0x39, 0xbd, 0xfb, 0x2a, 0x10, 0xbd, 0xbc,
+ 0xfb, 0x42, 0xbd, 0xdc, 0x59, 0xe4, 0xbc, 0x2e, 0x08, 0x5f, 0xbd, 0x34, 0xb6,
+ 0xe1, 0x3c, 0x76, 0x68, 0x22, 0x3d, 0x18, 0x3d, 0x14, 0x3c, 0xa5, 0xa2, 0x8b,
+ 0xbd, 0x9c, 0x97, 0x87, 0xbd, 0xbd, 0x22, 0x87, 0x3d, 0x20, 0x18, 0x57, 0x3c,
+ 0xb6, 0x45, 0x5e, 0x3d, 0xa4, 0x1e, 0x63, 0xbd, 0x88, 0x1f, 0x68, 0x3c, 0xe0,
+ 0x00, 0x4f, 0x3d, 0x34, 0xe0, 0x5a, 0xbc, 0xd4, 0xd3, 0x61, 0xbc, 0x40, 0x8f,
+ 0x14, 0xbb, 0xae, 0x4e, 0x94, 0xbc, 0x8d, 0x80, 0x61, 0xbd, 0x11, 0xcc, 0x85,
+ 0x3d, 0xb4, 0x7b, 0x24, 0xbd, 0x3e, 0x81, 0x15, 0x3d, 0xaa, 0xe5, 0x85, 0xbd,
+ 0xa0, 0xa4, 0x2c, 0xbb, 0x02, 0x5e, 0x25, 0x3d, 0x5d, 0x8b, 0x37, 0xbd, 0xa1,
+ 0xb0, 0x25, 0xbd, 0x4a, 0xa5, 0x6b, 0x3d, 0xd3, 0x4a, 0x92, 0x3d, 0x40, 0x57,
+ 0x06, 0x3d, 0x20, 0xdd, 0x30, 0x3b, 0xb0, 0x9e, 0xd3, 0x3c, 0x62, 0xb5, 0xd8,
+ 0xbc, 0xa0, 0xec, 0x93, 0xbb, 0x20, 0xc4, 0x7a, 0x3b, 0xc0, 0x64, 0xfe, 0x3b,
+ 0xcb, 0xb4, 0x90, 0x3d, 0x3f, 0x87, 0x8c, 0x3d, 0xfa, 0x94, 0x21, 0x3d, 0x9c,
+ 0xc3, 0x03, 0x3d, 0xc2, 0x4f, 0x8d, 0xbc, 0x22, 0x1e, 0xd2, 0xbc, 0xa0, 0xd5,
+ 0x66, 0xbc, 0xba, 0xf8, 0xcd, 0xbc, 0x7f, 0x26, 0x60, 0xbd, 0x6c, 0x27, 0x90,
+ 0x3c, 0xf4, 0xd5, 0x85, 0x3c, 0xc0, 0x88, 0x3c, 0xbb, 0x8e, 0x17, 0x9d, 0xbc,
+ 0x34, 0xb8, 0xef, 0x3c, 0x78, 0x16, 0xbd, 0x3c, 0x41, 0x5e, 0x90, 0xbd, 0x3e,
+ 0x1c, 0x40, 0x3d, 0xeb, 0xf2, 0x8c, 0x3d, 0xd4, 0xb2, 0xa8, 0xbc, 0x0a, 0xae,
+ 0x29, 0x3d, 0x40, 0x78, 0x1c, 0xbb, 0x60, 0xfb, 0xd1, 0x3c, 0x9d, 0xd0, 0x84,
+ 0x3d, 0x8a, 0xcc, 0x08, 0x3d, 0x72, 0x4d, 0x41, 0x3d, 0xa9, 0x49, 0x50, 0xbd,
+ 0x92, 0x44, 0x1c, 0x3d, 0xc8, 0x15, 0x5f, 0xbd, 0x1a, 0xda, 0xb6, 0xbc, 0xb4,
+ 0x03, 0xd1, 0x3c, 0xdc, 0x8e, 0xb0, 0x3c, 0x88, 0x61, 0x7a, 0xbc, 0xb0, 0xab,
+ 0xc4, 0xbb, 0xa2, 0x9f, 0x35, 0xbd, 0xac, 0xc1, 0x1e, 0xbd, 0x78, 0xd0, 0x54,
+ 0x3d, 0x22, 0x03, 0xa9, 0xbc, 0x00, 0x71, 0x30, 0xbb, 0x30, 0xaa, 0xc8, 0x3b,
+ 0xa9, 0x9c, 0x35, 0xbd, 0x00, 0xb3, 0x09, 0xbb, 0x40, 0x51, 0x2e, 0x3c, 0xc8,
+ 0xb4, 0x23, 0x3c, 0x6d, 0xf4, 0x06, 0xbd, 0xaa, 0x77, 0x6f, 0x3d, 0xce, 0xc4,
+ 0xb1, 0xbc, 0x6f, 0x91, 0x8b, 0x3d, 0x5f, 0xc4, 0x8a, 0x3d, 0xe4, 0x1f, 0xac,
+ 0x3c, 0x4c, 0xc1, 0x89, 0x3c, 0x4c, 0x09, 0x5d, 0xbd, 0x38, 0x91, 0x3e, 0x3c,
+ 0xe0, 0x15, 0x30, 0xbd, 0x60, 0x09, 0xd2, 0x3c, 0xe0, 0x4f, 0x35, 0xbb, 0xe8,
+ 0xf2, 0xdf, 0xbc, 0x40, 0xa5, 0xcc, 0xba, 0x28, 0xaa, 0x04, 0xbc, 0xb4, 0x3b,
+ 0x3d, 0xbc, 0xa8, 0xbc, 0x9d, 0x3c, 0x22, 0x77, 0x51, 0x3d, 0xd3, 0x53, 0x48,
+ 0xbd, 0x80, 0x2a, 0x2c, 0x3b, 0x4e, 0x95, 0x79, 0x3d, 0x9c, 0x2c, 0x52, 0xbd,
+ 0xac, 0x7e, 0xd9, 0x3c, 0x76, 0xd7, 0x78, 0x3d, 0x00, 0xe8, 0x78, 0xbd, 0x2e,
+ 0x63, 0x0f, 0x3d, 0xeb, 0x59, 0x14, 0xbd, 0x84, 0xd4, 0x1c, 0xbc, 0x1d, 0x54,
+ 0x1a, 0xbd, 0xe0, 0x16, 0x5c, 0xbb, 0x5c, 0xf1, 0x48, 0x3d, 0x94, 0x95, 0x59,
+ 0xbc, 0x48, 0x14, 0x37, 0xbd, 0x3e, 0x60, 0x76, 0x3d, 0xb4, 0x88, 0xdb, 0x3c,
+ 0x24, 0xf3, 0x8b, 0xbc, 0xb8, 0x6e, 0x0f, 0x3d, 0x00, 0x2c, 0xda, 0x3a, 0x79,
+ 0x80, 0x88, 0x3d, 0x58, 0xf7, 0x26, 0x3c, 0x10, 0x19, 0x45, 0x3d, 0xf9, 0xba,
+ 0x6a, 0xbd, 0x0e, 0x30, 0x43, 0x3d, 0xe0, 0x09, 0x68, 0x3b, 0x51, 0x84, 0x8f,
+ 0xbd, 0x6a, 0xa1, 0x7a, 0xbd, 0xbc, 0x1c, 0x72, 0xbd, 0x94, 0xf7, 0x75, 0xbd,
+ 0xc8, 0x32, 0x69, 0xbd, 0xf5, 0x29, 0x1e, 0xbd, 0x00, 0xe7, 0x59, 0x3a, 0x90,
+ 0x9c, 0x84, 0xbd, 0x5c, 0x5f, 0x2f, 0xbd, 0x50, 0x8c, 0x95, 0xbb, 0x00, 0x13,
+ 0x85, 0xbd, 0x26, 0xab, 0x7f, 0xbd, 0xc8, 0x91, 0x2a, 0xbc, 0x34, 0xda, 0xd2,
+ 0xbc, 0x2c, 0xb7, 0x4b, 0x3d, 0x73, 0xe4, 0x2b, 0xbd, 0x48, 0x46, 0x8f, 0xbd,
+ 0x0c, 0xa7, 0x36, 0xbd, 0x58, 0x23, 0x9f, 0x3c, 0xec, 0x5b, 0x2e, 0x3d, 0x28,
+ 0xde, 0x34, 0xbd, 0x00, 0xd5, 0x8e, 0x3b, 0x76, 0xa2, 0x76, 0x3d, 0x64, 0xe8,
+ 0x4d, 0x3d, 0x47, 0xc2, 0x82, 0xbd, 0x90, 0x0c, 0x8b, 0xbd, 0x9c, 0x98, 0x1a,
+ 0x3d, 0x74, 0xd4, 0xd1, 0xbc, 0xd6, 0x3b, 0x78, 0x3d, 0x88, 0xad, 0x04, 0xbd,
+ 0x5c, 0x4e, 0xbf, 0x3c, 0x20, 0xd8, 0x5b, 0x3c, 0x68, 0x77, 0x0e, 0xbc, 0xc0,
+ 0x8a, 0xc8, 0x3b, 0x00, 0x68, 0x5d, 0xba, 0x4c, 0x05, 0x30, 0x3d, 0x20, 0xb7,
+ 0x56, 0x3d, 0xa0, 0x6e, 0xef, 0x3c, 0xb4, 0x50, 0x1c, 0x3d, 0x5c, 0x0f, 0x68,
+ 0xbd, 0xf7, 0x3c, 0x53, 0xbd, 0x96, 0xa5, 0x0c, 0x3d, 0x3a, 0x6c, 0x07, 0x3d,
+ 0xa0, 0x60, 0x2c, 0xbd, 0x20, 0xaf, 0xbf, 0xbc, 0x00, 0x2d, 0x05, 0xbb, 0xe0,
+ 0x97, 0x4b, 0x3b, 0x32, 0xdc, 0x37, 0x3d, 0xe2, 0x39, 0x54, 0xbd, 0x2a, 0xde,
+ 0xeb, 0xbc, 0x1e, 0x8b, 0x6d, 0x3d, 0x0c, 0x92, 0xd6, 0xbc, 0xec, 0x48, 0x19,
+ 0xbc, 0x23, 0xd9, 0x90, 0xbd, 0x84, 0x8b, 0x83, 0xbd, 0xc8, 0x8c, 0x7c, 0x3c,
+ 0xfe, 0xca, 0x7d, 0xbd, 0x06, 0xb7, 0x69, 0x3d, 0x34, 0x35, 0xb0, 0x3c, 0x52,
+ 0x14, 0x56, 0xbd, 0xf4, 0xf3, 0x43, 0xbd, 0x34, 0x5e, 0xbf, 0xbc, 0x9c, 0x32,
+ 0x1e, 0x3d, 0xa0, 0x4d, 0xe0, 0x3b, 0x00, 0x68, 0x5d, 0xb8, 0x9e, 0x47, 0x7b,
+ 0x3d, 0xe1, 0xcd, 0x8b, 0x3d, 0xb8, 0x10, 0x8f, 0xbc, 0xc8, 0x30, 0x28, 0x3c,
+ 0xec, 0x42, 0x28, 0x3d, 0xfe, 0xea, 0x8a, 0xbd, 0x36, 0x76, 0x1a, 0xbd, 0xfa,
+ 0x9c, 0xca, 0xbc, 0x10, 0xe9, 0x82, 0xbd, 0x72, 0x8b, 0x7b, 0x3d, 0x46, 0x75,
+ 0x1c, 0xbd, 0x5a, 0xb9, 0x06, 0xbd, 0x6c, 0xa7, 0x25, 0xbc, 0x6a, 0x37, 0xd3,
+ 0xbc, 0xbc, 0x78, 0x85, 0x3c, 0x98, 0xb7, 0x01, 0x3d, 0x3c, 0xb7, 0x0d, 0x3d,
+ 0x3c, 0x57, 0x21, 0xbc, 0x28, 0xfb, 0xa7, 0x3c, 0x18, 0x3f, 0x49, 0x3c, 0x81,
+ 0x34, 0x8d, 0xbd, 0xb4, 0xfb, 0x6e, 0xbd, 0x60, 0x97, 0x95, 0x3c, 0xac, 0xdd,
+ 0x86, 0xbc, 0xd8, 0x6e, 0xda, 0x3c, 0xd8, 0xd9, 0x3d, 0x3d, 0x90, 0xa6, 0xea,
+ 0x3c, 0x40, 0x67, 0x3f, 0x3d, 0x3a, 0x43, 0x69, 0x3d, 0x0a, 0x20, 0x5e, 0x3d,
+ 0x33, 0x91, 0x12, 0xbd, 0xb4, 0xc5, 0x31, 0xbd, 0x0e, 0x96, 0x45, 0x3d, 0xc6,
+ 0x22, 0x37, 0xbd, 0x7c, 0x12, 0x44, 0x3d, 0xc9, 0x61, 0x8a, 0x3d, 0x1c, 0x66,
+ 0x44, 0x3d, 0xa2, 0x51, 0x30, 0x3d, 0xc8, 0xdb, 0xd9, 0x3c, 0xd3, 0xfb, 0x8e,
+ 0xbd, 0x08, 0x6a, 0x91, 0xbd, 0xea, 0x2e, 0x48, 0xbd, 0x60, 0x5b, 0x22, 0xbb,
+ 0x06, 0x39, 0x53, 0x3d, 0x84, 0xb4, 0x0b, 0xbd, 0xa0, 0x77, 0xfa, 0x3b, 0x84,
+ 0xaf, 0xaa, 0x3c, 0x47, 0xd2, 0x86, 0xbd, 0xe3, 0xef, 0x43, 0xbd, 0x36, 0x8d,
+ 0x16, 0x3d, 0x85, 0xa6, 0x85, 0x3d, 0x8e, 0xda, 0xa0, 0xbc, 0xc3, 0x58, 0x80,
+ 0xbd, 0x93, 0x30, 0x0f, 0xbd, 0x0c, 0x85, 0xcf, 0xbc, 0xc0, 0x8c, 0x2a, 0x3c,
+ 0x02, 0xe2, 0x0d, 0xbd, 0xe9, 0xf8, 0x8c, 0xbd, 0x15, 0x8d, 0x8b, 0x3d, 0xf3,
+ 0x1f, 0x8b, 0xbd, 0x0f, 0xa0, 0x80, 0xbd, 0xee, 0x04, 0x63, 0x3d, 0xb4, 0x7a,
+ 0xf6, 0xbc, 0x60, 0x5b, 0x2e, 0xbc, 0x04, 0x6d, 0x42, 0x3d, 0x8a, 0xfc, 0x1c,
+ 0x3d, 0x52, 0xb0, 0x27, 0x3d, 0xe8, 0xf9, 0x35, 0xbd, 0xd4, 0xc2, 0x1b, 0x3d,
+ 0x00, 0x3a, 0x0b, 0xbb, 0x80, 0x7e, 0x4b, 0x3c, 0x06, 0xba, 0x3e, 0xbd, 0x70,
+ 0xc9, 0x35, 0xbd, 0xe0, 0x8b, 0x9d, 0xbb, 0x16, 0x05, 0x2f, 0xbd, 0xa0, 0xeb,
+ 0x03, 0x3c, 0x40, 0x3e, 0x95, 0xbc, 0xea, 0x76, 0x73, 0xbd, 0x90, 0xb0, 0xe8,
+ 0x3c, 0x3e, 0x61, 0x42, 0xbd, 0x17, 0x02, 0x8d, 0xbd, 0x42, 0x66, 0x1d, 0x3d,
+ 0xfe, 0x31, 0x68, 0x3d, 0x52, 0x8e, 0x30, 0xbd, 0x6b, 0xca, 0x10, 0xbd, 0xbd,
+ 0xcc, 0x80, 0xbd, 0x38, 0x91, 0x53, 0xbd, 0x90, 0xd7, 0xd3, 0x3c, 0x00, 0x0c,
+ 0xf4, 0x3b, 0x82, 0xf5, 0x3f, 0xbd, 0xb2, 0xa9, 0x04, 0x3d, 0x62, 0x67, 0x5c,
+ 0x3d, 0x86, 0xab, 0x91, 0xbc, 0xc2, 0x2b, 0xe8, 0xbc, 0x3a, 0x8a, 0x67, 0xbd,
+ 0xcc, 0x83, 0xdb, 0x3c, 0xf0, 0x8a, 0x03, 0x3c, 0x94, 0x78, 0x53, 0x3d, 0x9c,
+ 0x1b, 0xd4, 0x3c, 0xdb, 0xf9, 0x89, 0x3d, 0x40, 0xa5, 0x10, 0x3b, 0x89, 0xed,
+ 0x80, 0xbd, 0x6e, 0xb8, 0x57, 0xbd, 0x12, 0xc2, 0xcf, 0xbc, 0x44, 0x32, 0xb1,
+ 0x3c, 0xd5, 0xed, 0x34, 0xbd, 0x5e, 0x6c, 0x5c, 0xbd, 0x68, 0x69, 0x85, 0x3c,
+ 0x30, 0xdb, 0xb6, 0xbb, 0x00, 0x7f, 0xe0, 0x3c, 0x80, 0x24, 0x1e, 0x3b, 0x78,
+ 0x6f, 0x81, 0xbc, 0x3a, 0x27, 0x1b, 0x3d, 0x7f, 0xb5, 0x8a, 0xbd, 0xbb, 0xc1,
+ 0x8e, 0x3d, 0xa8, 0x7e, 0x69, 0x3c, 0x00, 0x80, 0x47, 0xbb, 0x21, 0xb9, 0x15,
+ 0xbd, 0x14, 0x0b, 0x8e, 0x3c, 0xa2, 0x1b, 0x55, 0x3d, 0x28, 0xea, 0x5b, 0xbd,
+ 0x10, 0x9a, 0x43, 0x3d, 0x40, 0xf6, 0x8a, 0x3a, 0x58, 0xb1, 0x92, 0xbc, 0x5c,
+ 0x0a, 0x4e, 0xbd, 0x10, 0xec, 0x1f, 0xbd, 0xa8, 0x31, 0xa7, 0x3c, 0x60, 0xfa,
+ 0x9f, 0xbb, 0xf0, 0x04, 0xa3, 0xbb, 0xc4, 0xd8, 0x5f, 0xbd, 0xba, 0x5f, 0x66,
+ 0xbd, 0x52, 0x94, 0x97, 0xbc, 0x1a, 0x9b, 0x22, 0xbd, 0xaa, 0x28, 0x59, 0x3d,
+ 0xaa, 0x06, 0x64, 0xbd, 0xe7, 0xc2, 0x83, 0xbd, 0xd0, 0x3d, 0xd0, 0xbc, 0x00,
+ 0x8c, 0xa3, 0x39, 0xd0, 0x27, 0x0c, 0xbc, 0x40, 0x8f, 0x79, 0xbc, 0x9e, 0x32,
+ 0x7f, 0x3d, 0xac, 0x9b, 0xfd, 0xbc, 0xb1, 0x17, 0x91, 0x3d, 0xa8, 0xca, 0x4e,
+ 0x3d, 0x40, 0xc3, 0xb7, 0x3a, 0xc0, 0x8e, 0x78, 0xbb, 0x3f, 0x3c, 0x83, 0x3d,
+ 0x47, 0xdc, 0x81, 0xbd, 0x5b, 0xe6, 0x1c, 0xbd, 0x70, 0xe3, 0xc8, 0xbc, 0x70,
+ 0x12, 0xd6, 0xbb, 0x0c, 0xb6, 0xe3, 0x3c, 0x88, 0x2a, 0x22, 0x3c, 0xd6, 0xbf,
+ 0x8d, 0xbd, 0xde, 0x15, 0x20, 0x3d, 0x76, 0x83, 0x3e, 0xbd, 0x85, 0x35, 0x80,
+ 0x3d, 0xc1, 0x0b, 0x87, 0x3d, 0xbf, 0x64, 0x18, 0xbd, 0x80, 0x22, 0x68, 0x3b,
+ 0xc4, 0xb0, 0xb0, 0x3c, 0xa2, 0xf2, 0x4f, 0xbd, 0xb6, 0x63, 0x04, 0x3d, 0xc0,
+ 0x4a, 0xc9, 0x3c, 0x36, 0x66, 0xc0, 0xbc, 0x64, 0x7a, 0x4c, 0x3d, 0xc1, 0x5b,
+ 0x8c, 0x3d, 0xae, 0xa2, 0x41, 0x3d, 0x66, 0x93, 0x01, 0x3d, 0x6c, 0xb7, 0x37,
+ 0xbd, 0x8c, 0x03, 0x28, 0xbd, 0x7c, 0xf6, 0x69, 0xbd, 0xa2, 0xe7, 0x0d, 0xbd,
+ 0xb0, 0xf3, 0x41, 0x3d, 0xc0, 0xbf, 0xc4, 0x3b, 0xe2, 0x58, 0x46, 0xbd, 0x02,
+ 0xb4, 0x60, 0x3d, 0xa2, 0xf8, 0x29, 0x3d, 0x90, 0xf7, 0xc8, 0x3b, 0xee, 0xad,
+ 0x43, 0x3d, 0x1b, 0x51, 0x12, 0xbd, 0xee, 0xc3, 0x91, 0xbd, 0x20, 0xad, 0x58,
+ 0x3c, 0xc6, 0x54, 0x3a, 0x3d, 0xea, 0xba, 0x60, 0xbd, 0x7e, 0x31, 0x22, 0x3d,
+ 0x98, 0xe6, 0x80, 0xbd, 0x00, 0x41, 0x29, 0x3b, 0x85, 0xec, 0x8c, 0x3d, 0x7a,
+ 0x8e, 0x3e, 0x3d, 0x42, 0x31, 0xfc, 0xbc, 0x58, 0x3c, 0x08, 0x3c, 0xdc, 0x04,
+ 0xb5, 0xbc, 0x9e, 0xbf, 0x0f, 0xbd, 0x70, 0xad, 0x2a, 0xbc, 0x6c, 0x83, 0x8c,
+ 0xbc, 0x6a, 0xd4, 0x6c, 0xbd, 0x62, 0x1b, 0x8e, 0xbc, 0x94, 0x48, 0x1f, 0xbd,
+ 0x35, 0xe0, 0x3d, 0xbd, 0x60, 0x91, 0x88, 0x3b, 0x6c, 0x16, 0x07, 0x3d, 0x30,
+ 0xa0, 0x93, 0x3b, 0x3c, 0xec, 0x5e, 0xbc, 0x66, 0xbf, 0x51, 0xbd, 0xfc, 0x42,
+ 0x47, 0x3d, 0x78, 0x73, 0x71, 0x3c, 0x62, 0x96, 0x89, 0xbd, 0x50, 0x2b, 0xca,
+ 0x3c, 0x98, 0xc5, 0x21, 0x3c, 0xbb, 0x4b, 0x19, 0xbd, 0x36, 0x22, 0x75, 0x3d,
+ 0x44, 0x6e, 0x7d, 0xbd, 0xec, 0x88, 0x8d, 0x3c, 0xa8, 0x57, 0x0e, 0x3c, 0x96,
+ 0x97, 0x01, 0x3d, 0x1c, 0x9c, 0x59, 0x3d, 0xc4, 0x0b, 0x31, 0x3d, 0x60, 0xf0,
+ 0x6c, 0xbc, 0xb8, 0xa9, 0xb4, 0x3c, 0xd8, 0xbb, 0x33, 0xbc, 0x98, 0x35, 0x99,
+ 0x3c, 0xd2, 0x49, 0x3d, 0xbd, 0xe6, 0xc9, 0x5b, 0x3d, 0x42, 0xf7, 0x41, 0x3d,
+ 0xda, 0x13, 0x37, 0xbd, 0x96, 0x91, 0x94, 0xbc, 0xb8, 0xde, 0x89, 0x3c, 0xda,
+ 0x37, 0x08, 0xbd, 0x20, 0xda, 0x3e, 0x3c, 0xda, 0xe8, 0x61, 0xbd, 0x70, 0x8a,
+ 0x29, 0x3d, 0x18, 0xa4, 0x8f, 0xbd, 0x20, 0xee, 0x56, 0x3c, 0x70, 0xc3, 0xc8,
+ 0xbc, 0x5c, 0xf4, 0x99, 0x3c, 0x54, 0xd5, 0x4b, 0xbd, 0x88, 0xcf, 0x6a, 0x3c,
+ 0xa5, 0xc7, 0x1c, 0xbd, 0x10, 0x98, 0xb3, 0xbb, 0x9a, 0xe0, 0x86, 0xbd, 0x3e,
+ 0x34, 0x87, 0xbd, 0xfa, 0x36, 0x7d, 0x3d, 0x40, 0x64, 0xfe, 0xbc, 0xd0, 0x4f,
+ 0x67, 0xbd, 0x21, 0xda, 0x72, 0xbd, 0x2e, 0x02, 0x38, 0xbd, 0xc6, 0xd9, 0xff,
+ 0xbc, 0x1a, 0x30, 0xb9, 0xbc, 0x58, 0xea, 0x58, 0x3c, 0xb1, 0xb7, 0x03, 0xbd,
+ 0x80, 0x5b, 0xfc, 0x3a, 0x43, 0x60, 0x80, 0x3d, 0xa8, 0x67, 0x4a, 0xbd, 0x68,
+ 0xd8, 0x3e, 0x3c, 0xf0, 0xe8, 0x2a, 0x3c, 0x68, 0x26, 0x3f, 0xbd, 0x28, 0x26,
+ 0x73, 0xbd, 0x38, 0xe5, 0x24, 0x3d, 0x00, 0xb0, 0xa1, 0xba, 0x7e, 0x0f, 0x18,
+ 0xbd, 0x35, 0x0d, 0x7c, 0xbd, 0x14, 0xa7, 0x3f, 0x3d, 0x16, 0x49, 0x0e, 0x3d,
+ 0x2e, 0xd8, 0x90, 0xbd, 0x50, 0xc3, 0x21, 0xbd, 0xd4, 0x13, 0x44, 0x3d, 0x70,
+ 0x10, 0xfd, 0x3b, 0x7b, 0x43, 0x87, 0x3d, 0x64, 0xb7, 0xf9, 0x3c, 0xd6, 0xc6,
+ 0xb7, 0xbc, 0x00, 0xd8, 0xbb, 0x3b, 0xe0, 0x1b, 0x42, 0xbb, 0x68, 0x5c, 0xcf,
+ 0xbc, 0xea, 0xfb, 0x8e, 0xbd, 0xdc, 0x09, 0x33, 0x3d, 0x80, 0xef, 0xb9, 0x3c,
+ 0x00, 0xde, 0x92, 0xb9, 0x31, 0x42, 0x08, 0xbd, 0x80, 0x6d, 0x40, 0x3b, 0x80,
+ 0xab, 0x20, 0x3d, 0xc0, 0x60, 0xc3, 0xba, 0x0b, 0xb6, 0x5e, 0xbd, 0xd4, 0x28,
+ 0x3e, 0xbd, 0x47, 0x7b, 0x87, 0x3d, 0x81, 0x52, 0x84, 0x3d, 0x90, 0x8e, 0xc2,
+ 0x3c, 0x04, 0x5b, 0xf3, 0xbc, 0x70, 0xa9, 0xea, 0x3c, 0x55, 0x55, 0x4d, 0xbd,
+ 0x52, 0x8b, 0x59, 0xbd, 0xf2, 0xeb, 0x56, 0x3d, 0x1e, 0xc7, 0x3f, 0x3d, 0xe0,
+ 0x52, 0xa3, 0x3b, 0x16, 0x93, 0x9d, 0xbc, 0x28, 0xeb, 0x36, 0x3d, 0x70, 0x4c,
+ 0x1d, 0x3d, 0x8d, 0x81, 0x14, 0xbd, 0xb0, 0x22, 0xa0, 0xbb, 0x50, 0xfa, 0x87,
+ 0x3c, 0x33, 0xc6, 0x2d, 0xbd, 0xd3, 0xd8, 0x85, 0x3d, 0xe8, 0xfd, 0x15, 0x3c,
+ 0x20, 0x79, 0xe4, 0x3b, 0xb0, 0xd4, 0x4f, 0xbd, 0x24, 0xe9, 0xb5, 0x3c, 0xba,
+ 0x47, 0x27, 0x3d, 0x23, 0xef, 0x02, 0xbd, 0xf0, 0xac, 0x31, 0x3d, 0x62, 0xde,
+ 0xdd, 0xbc, 0x2c, 0xa0, 0x29, 0x3d, 0xa5, 0xec, 0x85, 0x3d, 0xa9, 0x1b, 0x8d,
+ 0x3d, 0x2c, 0x6c, 0xa2, 0xbc, 0xf0, 0xc7, 0x37, 0xbc, 0x6c, 0xf7, 0xc5, 0xbc,
+ 0xf4, 0x1d, 0x1c, 0xbc, 0x20, 0x3c, 0xc9, 0x3b, 0x9d, 0xff, 0x0b, 0xbd, 0x10,
+ 0xa3, 0x53, 0x3d, 0x64, 0xbb, 0xc9, 0xbc, 0xfc, 0x8d, 0xe8, 0xbc, 0x20, 0x1f,
+ 0x5a, 0x3c, 0x11, 0xe2, 0x17, 0xbd, 0xe0, 0x37, 0x97, 0x3b, 0x88, 0x44, 0x2a,
+ 0xbd, 0x88, 0x79, 0x4c, 0xbd, 0xa8, 0x9e, 0x0d, 0x3c, 0x15, 0x54, 0x8c, 0x3d,
+ 0xcb, 0x9b, 0x87, 0x3d, 0x18, 0xdd, 0x07, 0xbd, 0x2b, 0x33, 0x81, 0xbd, 0xb2,
+ 0x57, 0x2e, 0xbd, 0x18, 0xc5, 0x2b, 0xbd, 0x88, 0x10, 0x91, 0xbd, 0x66, 0x69,
+ 0x15, 0x3d, 0x98, 0x6c, 0xf7, 0x3c, 0x10, 0x05, 0x07, 0xbc, 0x44, 0x3b, 0xc6,
+ 0xbc, 0x30, 0x43, 0xa8, 0x3b, 0x5b, 0xd8, 0x38, 0xbd, 0x66, 0x01, 0xe8, 0xbc,
+ 0x36, 0xef, 0xaf, 0xbc, 0x88, 0x76, 0x24, 0x3c, 0x3a, 0x71, 0x5d, 0x3d, 0x30,
+ 0xa0, 0x38, 0xbc, 0x04, 0x86, 0xf5, 0xbc, 0x30, 0xdc, 0x7c, 0x3c, 0x0c, 0x37,
+ 0x2f, 0xbd, 0x80, 0xa4, 0x1f, 0xba, 0x2c, 0xa1, 0x2f, 0xbd, 0xb0, 0xb7, 0xa0,
+ 0x3c, 0x37, 0xb1, 0x14, 0xbd, 0xb6, 0x07, 0x54, 0xbd, 0xb0, 0xbf, 0xd7, 0xbc,
+ 0x6c, 0xc8, 0x2c, 0x3d, 0x2c, 0x09, 0x31, 0x3d, 0x04, 0x69, 0xe4, 0xbc, 0xa0,
+ 0x5e, 0x7a, 0xbb, 0x90, 0x52, 0xb3, 0x3c, 0x4e, 0x6b, 0x84, 0xbd, 0xcc, 0x7e,
+ 0x25, 0x3d, 0x30, 0x08, 0x99, 0xbb, 0x00, 0x08, 0xfc, 0x3b, 0xaa, 0xf0, 0x66,
+ 0x3d, 0x13, 0xa5, 0x8a, 0x3d, 0xc8, 0x1c, 0xad, 0xbc, 0xf1, 0x48, 0x82, 0x3d,
+ 0x7d, 0x18, 0x80, 0xbd, 0x14, 0x52, 0xa6, 0x3c, 0x10, 0x21, 0x9c, 0xbb, 0xfc,
+ 0xda, 0x31, 0xbc, 0x0e, 0x65, 0xd2, 0xbc, 0x74, 0x2a, 0xcd, 0xbc, 0xb6, 0xb6,
+ 0x64, 0x3d, 0x24, 0x32, 0x55, 0x3d, 0x8e, 0xc7, 0xbc, 0xbc, 0x94, 0x15, 0x89,
+ 0x3c, 0x72, 0x1e, 0x3b, 0x3d, 0xb0, 0x0e, 0x25, 0x3c, 0xf8, 0x00, 0xad, 0x3c,
+ 0xc1, 0xb3, 0x92, 0xbd, 0xce, 0xcf, 0x33, 0x3d, 0xe8, 0xec, 0x6a, 0x3c, 0x9e,
+ 0x76, 0x9c, 0xbc, 0x4e, 0x5f, 0x29, 0xbd, 0x7c, 0xa7, 0x88, 0x3c, 0x00, 0xf3,
+ 0xbf, 0x3c, 0x10, 0x12, 0x26, 0x3c, 0xf4, 0x7c, 0x4b, 0x3d, 0x90, 0x83, 0xec,
+ 0xbb, 0xb6, 0x48, 0x92, 0xbd, 0x5c, 0x63, 0x47, 0x3d, 0x3f, 0xb2, 0x71, 0xbd,
+ 0x60, 0x1f, 0x7e, 0xbc, 0xbc, 0xff, 0x9a, 0xbc, 0x96, 0x17, 0xb2, 0xbc, 0x78,
+ 0x09, 0x0a, 0x3c, 0xa5, 0xbb, 0x8d, 0x3d, 0x80, 0x7e, 0xbd, 0x3a, 0x8c, 0x61,
+ 0x8f, 0xbd, 0x70, 0x44, 0x19, 0x3d, 0xde, 0x63, 0x4b, 0x3d, 0x00, 0x61, 0x0b,
+ 0xbb, 0x36, 0x70, 0x32, 0xbd, 0xc6, 0x8f, 0x71, 0x3d, 0xf0, 0xf7, 0xa0, 0xbc,
+ 0x00, 0x80, 0x01, 0xb8, 0xe4, 0xc6, 0x93, 0x3c, 0x08, 0xd4, 0x3b, 0x3c, 0x96,
+ 0x32, 0x40, 0x3d, 0xb8, 0x22, 0x31, 0x3d, 0x4a, 0xd9, 0x6f, 0x3d, 0x28, 0x10,
+ 0x2c, 0xbc, 0x94, 0x4b, 0x9c, 0xbc, 0x90, 0x38, 0x57, 0x3d, 0xa4, 0x0d, 0x81,
+ 0xbc, 0x90, 0xa5, 0xb6, 0x3c, 0x9d, 0xfe, 0x78, 0xbd, 0x3c, 0x24, 0x19, 0x3d,
+ 0xa8, 0x56, 0x0c, 0x3d, 0x6b, 0xec, 0x54, 0xbd, 0x10, 0x49, 0x94, 0xbb, 0x80,
+ 0x25, 0xe9, 0x3c, 0xe4, 0xb5, 0xe2, 0xbc, 0x68, 0xb2, 0x10, 0x3d, 0x6a, 0x13,
+ 0xe0, 0xbc, 0x3a, 0x69, 0x44, 0xbd, 0x18, 0x3f, 0xfc, 0x3c, 0x6e, 0x08, 0x60,
+ 0x3d, 0x5e, 0x5b, 0xa2, 0xbc, 0x7c, 0xbd, 0x81, 0xbd, 0xf0, 0xf9, 0xd6, 0x3b,
+ 0xfa, 0x80, 0x14, 0xbd, 0xdb, 0xb0, 0x8d, 0xbd, 0xb0, 0x41, 0xe5, 0x3b, 0xe0,
+ 0x03, 0xe3, 0x3c, 0xf4, 0x88, 0x07, 0xbd, 0x52, 0x89, 0xd0, 0xbc, 0x90, 0x90,
+ 0x10, 0x3d, 0x9c, 0xc3, 0x3e, 0x3d, 0x2f, 0x07, 0x09, 0xbd, 0x7e, 0x67, 0xf6,
+ 0xbc, 0xde, 0x88, 0xe1, 0xbc, 0xbe, 0x4b, 0x08, 0xbd, 0xac, 0xc1, 0x24, 0x3d,
+ 0x5e, 0xd5, 0x3c, 0x3d, 0x80, 0x9e, 0x01, 0xbc, 0xa6, 0xdb, 0xc7, 0xbc, 0xbb,
+ 0x37, 0x83, 0xbd, 0x34, 0x71, 0x50, 0x3d, 0x10, 0x46, 0x2d, 0xbd, 0x71, 0x50,
+ 0x67, 0xbd, 0x20, 0x2e, 0x15, 0xbb, 0xaa, 0x05, 0x74, 0x3d, 0xc1, 0xb5, 0x79,
+ 0xbd, 0x21, 0xaa, 0x44, 0xbd, 0xda, 0xbd, 0x0c, 0xbd, 0xb1, 0xee, 0x8c, 0x3d,
+ 0x54, 0x83, 0x83, 0xbd, 0x5e, 0xe5, 0x75, 0x3d, 0x52, 0x3d, 0x73, 0x3d, 0x40,
+ 0xf3, 0xd4, 0x3c, 0x9a, 0x1a, 0x78, 0x3d, 0x85, 0x49, 0x62, 0xbd, 0x6b, 0x57,
+ 0x91, 0x3d, 0x30, 0xd7, 0x3f, 0x3d, 0xed, 0x16, 0x3f, 0xbd, 0xd0, 0xf4, 0x85,
+ 0xbb, 0x47, 0x5e, 0x1e, 0xbd, 0x70, 0xe9, 0x87, 0x3c, 0x87, 0x5d, 0x80, 0xbd,
+ 0xa0, 0x7a, 0xb6, 0xbb, 0x03, 0x86, 0x84, 0xbd, 0x50, 0x4c, 0x74, 0x3c, 0x85,
+ 0x86, 0x80, 0x3d, 0x00, 0xe2, 0x56, 0xbb, 0x7e, 0xb0, 0x16, 0xbd, 0x10, 0xa9,
+ 0x80, 0xbd, 0xe0, 0x8b, 0x47, 0x3d, 0x19, 0x07, 0x68, 0xbd, 0x4e, 0xd8, 0x70,
+ 0x3d, 0xa8, 0x10, 0x2a, 0x3d, 0x22, 0x23, 0x96, 0xbc, 0x92, 0xe3, 0x72, 0xbd,
+ 0xb8, 0x0f, 0x13, 0x3d, 0x16, 0xc3, 0x53, 0x3d, 0xa4, 0x95, 0x41, 0x3d, 0x02,
+ 0xc3, 0x6f, 0x3d, 0x48, 0x02, 0xac, 0xbc, 0x40, 0x53, 0x6d, 0x3b, 0xf4, 0x2a,
+ 0x19, 0xbc, 0x10, 0x1f, 0xc2, 0xbb, 0x21, 0xb8, 0x69, 0xbd, 0x97, 0x8c, 0x8a,
+ 0x3d, 0x38, 0x13, 0xb4, 0x3c, 0xf1, 0x0d, 0x8d, 0x3d, 0x00, 0x69, 0x30, 0x3d,
+ 0x38, 0x92, 0xf9, 0x3c, 0xb5, 0xff, 0x8a, 0x3d, 0x15, 0x27, 0x91, 0x3d, 0x96,
+ 0xd4, 0x00, 0x3d, 0x66, 0xde, 0x1c, 0x3d, 0x7c, 0x48, 0x40, 0x3d, 0x08, 0x06,
+ 0xf2, 0x3c, 0x8e, 0xfe, 0x71, 0x3d, 0x90, 0xa1, 0xc6, 0xbb, 0x88, 0x57, 0x05,
+ 0x3c, 0x80, 0x92, 0x6d, 0x3a, 0x80, 0x99, 0xc9, 0xba, 0x0f, 0x0f, 0x33, 0xbd,
+ 0x76, 0xfc, 0x31, 0x3d, 0xd8, 0x9f, 0x23, 0xbd, 0x8c, 0x07, 0x07, 0xbd, 0x68,
+ 0x38, 0x5e, 0x3c, 0xf0, 0x39, 0xbf, 0xbc, 0x6c, 0x16, 0xfc, 0x3c, 0x94, 0xf2,
+ 0xb4, 0xbc, 0x20, 0x52, 0xc4, 0xbb, 0xb7, 0x3f, 0x02, 0xbd, 0x78, 0x48, 0x61,
+ 0xbd, 0x48, 0xad, 0x6b, 0xbd, 0xcd, 0xb1, 0x8c, 0x3d, 0x20, 0x28, 0xcd, 0x3c,
+ 0xb4, 0x49, 0x53, 0x3d, 0x30, 0x59, 0x06, 0x3c, 0xda, 0xea, 0x83, 0xbd, 0xf8,
+ 0xe2, 0x16, 0xbd, 0x96, 0xc3, 0x77, 0x3d, 0x2c, 0x90, 0xf6, 0x3c, 0x94, 0x78,
+ 0x4d, 0xbc, 0x75, 0x0d, 0x2f, 0xbd, 0xa2, 0x00, 0xa7, 0xbc, 0x32, 0xec, 0x7c,
+ 0x3d, 0x6c, 0x7a, 0x5a, 0xbc, 0x7e, 0x59, 0x58, 0x3d, 0x60, 0x65, 0x91, 0x3b,
+ 0x28, 0x8b, 0x75, 0xbd, 0x22, 0xa7, 0x7b, 0x3d, 0xc4, 0xdd, 0x39, 0x3d, 0xe4,
+ 0x54, 0xa3, 0xbc, 0xb6, 0x39, 0x30, 0x3d, 0x38, 0x91, 0x35, 0x3c, 0xd0, 0xb9,
+ 0x10, 0x3c, 0x4c, 0x8a, 0xab, 0x3c, 0x04, 0x8d, 0x0e, 0xbd, 0x20, 0xc2, 0xcb,
+ 0x3b, 0x32, 0xbe, 0x58, 0xbd, 0xec, 0x4e, 0x03, 0x3d, 0xf0, 0x59, 0xee, 0x3c,
+ 0x18, 0x48, 0x0d, 0xbc, 0xa0, 0xfd, 0xe6, 0xbb, 0x8c, 0x9c, 0x4b, 0x3d, 0xa8,
+ 0xe8, 0x13, 0x3c, 0x14, 0xb9, 0x4e, 0xbd, 0xe6, 0xbf, 0x03, 0x3d, 0xf0, 0x7a,
+ 0xdd, 0xbc, 0xc8, 0x1b, 0x91, 0xbc, 0x9b, 0x2a, 0x24, 0xbd, 0x98, 0x93, 0x01,
+ 0xbc, 0x1a, 0x0c, 0x34, 0x3d, 0xfe, 0xfa, 0xa3, 0xbc, 0x7c, 0x82, 0xbd, 0x3c,
+ 0x70, 0x96, 0xe8, 0x3c, 0xa6, 0x08, 0x67, 0x3d, 0x48, 0x11, 0x68, 0xbc, 0x90,
+ 0xfb, 0x58, 0xbd, 0x91, 0x9e, 0x8b, 0xbd, 0x4b, 0xd8, 0x87, 0xbd, 0x6a, 0x90,
+ 0x63, 0x3d, 0x36, 0xa5, 0x20, 0x3d, 0x30, 0x61, 0x3d, 0x3d, 0x56, 0x99, 0x11,
+ 0xbd, 0xce, 0xff, 0x70, 0x3d, 0xd5, 0x52, 0x3d, 0xbd, 0x44, 0x1e, 0x92, 0x3c,
+ 0x6e, 0xb4, 0x44, 0xbd, 0x42, 0xeb, 0xec, 0xbc, 0xa2, 0xea, 0x85, 0xbc, 0x40,
+ 0x48, 0x01, 0x3b, 0x52, 0xcd, 0x75, 0x3d, 0xe9, 0xa7, 0x08, 0xbd, 0x61, 0x2e,
+ 0x0c, 0xbd, 0x06, 0xda, 0x24, 0x3d, 0xce, 0xfc, 0xf7, 0xbc, 0x62, 0xab, 0x7d,
+ 0x3d, 0x2f, 0x02, 0x89, 0xbd, 0xea, 0x05, 0x48, 0xbd, 0xea, 0x7c, 0x7b, 0xbd,
+ 0x80, 0x05, 0x8c, 0xba, 0xba, 0x77, 0x3d, 0xbd, 0xfa, 0xee, 0x34, 0xbd, 0xd2,
+ 0x24, 0x28, 0x3d, 0x30, 0xb2, 0x40, 0xbd, 0x52, 0x8b, 0x18, 0x3d, 0xe3, 0xfc,
+ 0x8b, 0x3d, 0x58, 0x86, 0x65, 0xbc, 0x64, 0x1e, 0xa8, 0xbc, 0xba, 0xc7, 0x75,
+ 0x3d, 0xdb, 0xb4, 0x80, 0x3d, 0x07, 0x16, 0x67, 0xbd, 0x84, 0x95, 0x6d, 0xbc,
+ 0x11, 0xb3, 0x1e, 0xbd, 0x40, 0x9b, 0x56, 0xbb, 0x7e, 0x66, 0x57, 0x3d, 0xca,
+ 0x1c, 0x5e, 0x3d, 0x20, 0xef, 0xe5, 0x3b, 0xd3, 0x0f, 0x2e, 0xbd, 0x8a, 0xdf,
+ 0x81, 0xbd, 0x58, 0xc9, 0x0f, 0x3d, 0xbc, 0x54, 0x63, 0xbd, 0x60, 0x24, 0x85,
+ 0xbd, 0x5a, 0xa5, 0xda, 0xbc, 0x12, 0x87, 0x01, 0x3d, 0xf6, 0xc0, 0x96, 0xbc,
+ 0x78, 0x46, 0x1d, 0x3d, 0xb6, 0x90, 0x62, 0xbd, 0xc0, 0x43, 0x94, 0x3b, 0xf0,
+ 0xed, 0xce, 0xbb, 0xb8, 0x25, 0x14, 0xbc, 0xf4, 0x5c, 0x20, 0xbc, 0xd8, 0x5b,
+ 0x1c, 0x3d, 0x44, 0xcb, 0x4c, 0xbc, 0x2e, 0xf6, 0x36, 0x3d, 0x94, 0xa7, 0xe6,
+ 0xbc, 0xd8, 0xac, 0x4f, 0x3c, 0x06, 0x78, 0x11, 0x3d, 0xe6, 0x53, 0x14, 0x3d,
+ 0x3b, 0x4b, 0x25, 0xbd, 0x03, 0xb6, 0x88, 0xbd, 0xd0, 0xc2, 0x2b, 0x3c, 0xc5,
+ 0xf9, 0x12, 0xbd, 0x78, 0x6f, 0xf5, 0x3c, 0xc6, 0xc0, 0x63, 0x3d, 0x60, 0xd4,
+ 0xa9, 0x3c, 0x1b, 0x87, 0x92, 0x3d, 0x70, 0x70, 0x35, 0xbd, 0xb8, 0xaa, 0x17,
+ 0x3d, 0xec, 0x13, 0xde, 0xbc, 0x04, 0xc8, 0x8c, 0x3c, 0x3c, 0xcd, 0xf4, 0x3c,
+ 0x66, 0x81, 0x4b, 0x3d, 0x3e, 0x59, 0x8b, 0xbd, 0xb8, 0xab, 0x04, 0x3c, 0xdc,
+ 0x9a, 0xd8, 0x3c, 0x00, 0x22, 0x4d, 0x3d, 0x08, 0x10, 0x93, 0x3c, 0x64, 0x64,
+ 0x7e, 0xbc, 0x32, 0xd1, 0x00, 0x3d, 0xfc, 0x6a, 0x2a, 0xbd, 0x04, 0x05, 0xa8,
+ 0x3c, 0x4c, 0xb2, 0xc3, 0x3c, 0x57, 0x68, 0x0d, 0xbd, 0x18, 0x0f, 0x6e, 0xbd,
+ 0x31, 0x3c, 0x0d, 0xbd, 0xa0, 0xef, 0xe0, 0xbb, 0x5a, 0xa3, 0xf2, 0xbc, 0xb3,
+ 0xcd, 0x88, 0x3d, 0x0c, 0x86, 0x6e, 0xbc, 0x78, 0x6a, 0x14, 0xbc, 0x51, 0x9b,
+ 0x2e, 0xbd, 0x45, 0x0b, 0x22, 0xbd, 0xf0, 0x38, 0x9e, 0x3c, 0x53, 0x6c, 0x87,
+ 0x3d, 0x00, 0x20, 0x2d, 0x3a, 0x40, 0xea, 0xd2, 0xba, 0xcd, 0x35, 0x88, 0xbd,
+ 0xb2, 0xad, 0x62, 0x3d, 0xf6, 0x83, 0xb9, 0xbc, 0x92, 0xb4, 0x4b, 0x3d, 0xe6,
+ 0x0e, 0x86, 0xbc, 0x55, 0x4e, 0x85, 0x3d, 0x7e, 0x89, 0x05, 0x3d, 0xa1, 0xb1,
+ 0x83, 0x3d, 0x7c, 0x7c, 0xf5, 0x3c, 0xdb, 0x2e, 0x8c, 0xbd, 0x98, 0x94, 0x5c,
+ 0xbd, 0x0c, 0xfd, 0xb9, 0xbc, 0x40, 0x7e, 0xa5, 0x3c, 0xc0, 0x1e, 0xd6, 0x3a,
+ 0x88, 0x80, 0x1d, 0x3c, 0x48, 0x6f, 0xfe, 0x3c, 0x2a, 0x7a, 0xde, 0xbc, 0x9c,
+ 0x7d, 0x1a, 0xbd, 0x70, 0xd8, 0x1b, 0x3c, 0xa8, 0x27, 0x75, 0xbd, 0x92, 0x9a,
+ 0x53, 0x3d, 0xb3, 0x0a, 0x8b, 0x3d, 0xd0, 0xe2, 0x10, 0x3c, 0xb0, 0x82, 0x9d,
+ 0x3b, 0x38, 0x23, 0x10, 0x3c, 0xc0, 0xfb, 0xab, 0xbb, 0x7a, 0xff, 0x77, 0xbd,
+ 0x3f, 0x50, 0x91, 0x3d, 0x30, 0x33, 0x01, 0x3c, 0x48, 0x28, 0x43, 0x3d, 0xd4,
+ 0x59, 0xac, 0xbc, 0xa3, 0xa9, 0x0d, 0xbd, 0x1c, 0x90, 0x52, 0xbd, 0x40, 0xa7,
+ 0x57, 0x3c, 0x94, 0x79, 0x28, 0xbd, 0xf0, 0x27, 0x9b, 0x3c, 0x02, 0x37, 0x7d,
+ 0x3d, 0x14, 0x5b, 0x94, 0xbc, 0xde, 0x3f, 0x2c, 0xbd, 0x06, 0xe5, 0x2b, 0xbd,
+ 0x58, 0x3a, 0x01, 0xbd, 0xda, 0x88, 0xa5, 0xbc, 0x27, 0x42, 0x08, 0xbd, 0x30,
+ 0x39, 0xd1, 0x3b, 0xdc, 0xf2, 0xb6, 0xbc, 0x78, 0xe4, 0xe9, 0x3c, 0x56, 0xdd,
+ 0x8c, 0xbc, 0x20, 0xbf, 0x17, 0x3d, 0x8a, 0x7a, 0x5e, 0xbd, 0x6a, 0x3e, 0xac,
+ 0xbc, 0xb2, 0x0d, 0x7b, 0x3d, 0x02, 0x11, 0xae, 0xbc, 0x8c, 0x5a, 0x14, 0x3d,
+ 0xba, 0x7e, 0xa6, 0xbc, 0xdc, 0x76, 0x0c, 0x3d, 0xfc, 0x09, 0x5a, 0x3d, 0x4e,
+ 0x8d, 0x8b, 0xbd, 0xd4, 0x0c, 0xa3, 0xbc, 0x7f, 0x0e, 0x8f, 0xbd, 0x20, 0x38,
+ 0x62, 0xbb, 0xe0, 0x57, 0xf8, 0xbb, 0x00, 0x7b, 0x12, 0xba, 0x5c, 0x6f, 0xbe,
+ 0x3c, 0x40, 0xc3, 0x2a, 0x3b, 0xf4, 0xe3, 0xb4, 0x3c, 0xda, 0x17, 0x4d, 0x3d,
+ 0xd0, 0xca, 0x1e, 0x3d, 0x80, 0x09, 0xaa, 0x3c, 0xce, 0x89, 0x5d, 0x3d, 0x24,
+ 0x5d, 0x0f, 0x3d, 0xa0, 0x6d, 0x44, 0x3c, 0x0e, 0x09, 0x92, 0xbc, 0x00, 0xde,
+ 0x57, 0x3c, 0x91, 0x01, 0x73, 0xbd, 0x5e, 0x90, 0x1a, 0x3d, 0x4c, 0xf8, 0xd6,
+ 0x3c, 0xf8, 0x9a, 0x91, 0xbd, 0xe2, 0x1c, 0x5d, 0xbd, 0x80, 0xde, 0x76, 0x3b,
+ 0xd6, 0x26, 0x2c, 0x3d, 0x00, 0xd0, 0x39, 0xbc, 0xfc, 0x5d, 0xee, 0xbc, 0x7a,
+ 0xdc, 0x83, 0xbc, 0x3b, 0x14, 0x81, 0x3d, 0x30, 0x85, 0xf3, 0x3c, 0x0e, 0x0d,
+ 0x85, 0xbd, 0x86, 0x9f, 0xcf, 0xbc, 0x32, 0xf9, 0xfa, 0xbc, 0xdc, 0x92, 0x8e,
+ 0xbd, 0xf0, 0xf2, 0x45, 0x3c, 0xb2, 0xcd, 0x31, 0xbd, 0x40, 0x13, 0xcc, 0xba,
+ 0x81, 0x90, 0x0b, 0xbd, 0xf5, 0xd9, 0x7d, 0xbd, 0x74, 0xf2, 0xc1, 0xbc, 0x8e,
+ 0xb9, 0x2b, 0x3d, 0xb0, 0xef, 0x7e, 0xbd, 0x00, 0x57, 0x81, 0x3c, 0xc2, 0x40,
+ 0x76, 0xbd, 0xaf, 0xe7, 0x08, 0xbd, 0x02, 0x79, 0x26, 0x3d, 0x77, 0x1f, 0x2f,
+ 0xbd, 0x20, 0x66, 0x1c, 0x3c, 0x28, 0x56, 0xc2, 0x3c, 0xe8, 0x78, 0x0e, 0x3c,
+ 0xb8, 0x4e, 0x2c, 0xbc, 0xd0, 0x97, 0x26, 0xbc, 0x5e, 0x8f, 0x3b, 0x3d, 0x30,
+ 0xff, 0x28, 0x3c, 0x91, 0x25, 0x92, 0x3d, 0x20, 0xd1, 0x20, 0xbc, 0x24, 0xb8,
+ 0x23, 0xbd, 0xfc, 0xca, 0x55, 0xbc, 0xf8, 0x46, 0xf0, 0x3c, 0xf7, 0x15, 0x88,
+ 0x3d, 0x96, 0x4a, 0x78, 0x3d, 0x40, 0xdb, 0xce, 0xba, 0x50, 0x38, 0xed, 0x3b,
+ 0x3a, 0xfd, 0x00, 0x3d, 0x40, 0x1d, 0x3d, 0xbb, 0x8a, 0xd6, 0xae, 0xbc, 0x10,
+ 0x55, 0x7a, 0xbd, 0x91, 0x66, 0x59, 0x3d, 0x40, 0x74, 0xd5, 0xbc, 0x76, 0x92,
+ 0xb9, 0xbc, 0xa0, 0x5c, 0x4d, 0x3d, 0x59, 0xd0, 0x4a, 0x3d, 0x65, 0xa7, 0x5e,
+ 0xbd, 0x45, 0x6b, 0xea, 0x3d, 0x2b, 0x08, 0xdf, 0x3c, 0xb3, 0x37, 0x6e, 0x3d,
+ 0xfa, 0xad, 0xe0, 0xbc, 0xc3, 0xd2, 0x01, 0xbe, 0x24, 0x15, 0x90, 0x3d, 0x42,
+ 0xd3, 0xc4, 0x3c, 0x2b, 0xd6, 0x00, 0x3c, 0x9b, 0xf7, 0xcc, 0x3d, 0x7c, 0xc1,
+ 0x37, 0x3d, 0x4c, 0x98, 0xb6, 0x3d, 0x65, 0xac, 0x04, 0x3d, 0xbe, 0x0d, 0xf6,
+ 0x3c, 0x0a, 0x47, 0xb9, 0xbd, 0xa0, 0x2d, 0x4f, 0x3b, 0x44, 0x5d, 0xd1, 0xbc,
+ 0x3c, 0x8b, 0x82, 0x3d, 0xf8, 0xf9, 0x02, 0xbd, 0x21, 0xa7, 0x39, 0xbd, 0xa2,
+ 0x22, 0x82, 0x3d, 0xda, 0x8a, 0xb9, 0xbd, 0x6c, 0x42, 0x95, 0xbc, 0x98, 0x7b,
+ 0x9a, 0x3d, 0x1d, 0x34, 0x40, 0xbd, 0x68, 0xfa, 0x6f, 0x3c, 0xd6, 0x23, 0xa0,
+ 0x3d, 0x5a, 0xe0, 0x71, 0x3d, 0xda, 0xb5, 0x20, 0xbd, 0x0d, 0x43, 0xe0, 0x3c,
+ 0x77, 0xeb, 0x0c, 0x3d, 0x97, 0x10, 0xf9, 0x3c, 0xdb, 0xd9, 0xe6, 0x3a, 0xcb,
+ 0xff, 0x63, 0xbd, 0x75, 0x4f, 0xbf, 0xb9, 0x69, 0x4a, 0x20, 0xbd, 0xa2, 0xbf,
+ 0x56, 0x3d, 0xcc, 0xfe, 0x0e, 0xbe, 0xbe, 0xe9, 0x2e, 0x3d, 0x32, 0x25, 0x5d,
+ 0xbd, 0x77, 0x8a, 0x43, 0xbd, 0xc8, 0x8d, 0x4d, 0x3d, 0xd7, 0x87, 0xe4, 0x3c,
+ 0xc4, 0xf1, 0x50, 0x3d, 0x1a, 0xb6, 0x1a, 0x3d, 0x70, 0x13, 0x0f, 0x3c, 0xeb,
+ 0x1e, 0x6f, 0xbc, 0x4a, 0x22, 0x12, 0x3d, 0x7b, 0xe9, 0xcd, 0x3c, 0x1a, 0x2d,
+ 0x93, 0xbd, 0x21, 0xcd, 0x4b, 0xbd, 0x52, 0x94, 0x21, 0x3d, 0x1c, 0xb7, 0x0e,
+ 0xbd, 0x15, 0xea, 0x0c, 0xbd, 0x55, 0x60, 0xb0, 0x3b, 0xb4, 0x1d, 0xd0, 0x3d,
+ 0x43, 0xa2, 0x7b, 0xbd, 0xc9, 0x7b, 0x12, 0xbd, 0x64, 0x4f, 0x87, 0xbd, 0xea,
+ 0x0f, 0x8c, 0x3d, 0x07, 0x3a, 0xbb, 0xbd, 0xa8, 0xb6, 0x62, 0xbd, 0x74, 0xe8,
+ 0x84, 0x3d, 0xc2, 0x72, 0x6a, 0x3d, 0x58, 0xba, 0x67, 0xbb, 0x31, 0xf4, 0xb2,
+ 0x3d, 0x04, 0x0e, 0x92, 0xbd, 0xd4, 0x9f, 0x7a, 0x3d, 0x81, 0xd4, 0x89, 0xbc,
+ 0xe5, 0xe2, 0xe7, 0xbd, 0xb2, 0xd7, 0x51, 0xbd, 0x64, 0x57, 0x52, 0xbd, 0xb4,
+ 0x3f, 0x73, 0xbc, 0x22, 0x15, 0x4e, 0x3d, 0xe9, 0xf0, 0x4c, 0x3d, 0x05, 0x9b,
+ 0xfa, 0xbc, 0x28, 0xc4, 0xa1, 0x3d, 0xd2, 0x16, 0x51, 0x3d, 0xa0, 0x9f, 0x8f,
+ 0xbb, 0xc9, 0x02, 0x82, 0x3d, 0x13, 0x45, 0x84, 0x3c, 0x0a, 0x79, 0xc9, 0x3c,
+ 0xb9, 0x89, 0x19, 0xbd, 0x57, 0x1f, 0x86, 0xbb, 0xaa, 0xfa, 0xa0, 0x3d, 0x27,
+ 0x94, 0x00, 0xbd, 0x95, 0xf0, 0x86, 0xbd, 0x70, 0x37, 0x81, 0xbc, 0x0a, 0x32,
+ 0x09, 0x3d, 0x18, 0x6d, 0x18, 0xbd, 0x16, 0x40, 0x7e, 0x3d, 0x69, 0xfb, 0xaa,
+ 0xbc, 0x31, 0x93, 0x17, 0xbd, 0x3e, 0xc6, 0x59, 0xbc, 0x17, 0xc8, 0xe7, 0x3c,
+ 0x9e, 0x08, 0xc3, 0x3c, 0x79, 0x41, 0x12, 0x3d, 0xc8, 0xc2, 0x37, 0xbc, 0x3f,
+ 0xc1, 0x8f, 0xbd, 0xd9, 0x75, 0x94, 0xbd, 0x8c, 0xc3, 0x97, 0x3d, 0x36, 0xad,
+ 0x1b, 0xbe, 0x28, 0x9f, 0x80, 0xbc, 0x79, 0x5c, 0x84, 0xbc, 0x20, 0x29, 0x6b,
+ 0x3d, 0xe1, 0xad, 0xd1, 0xbb, 0xa4, 0x2c, 0x08, 0x3d, 0x6e, 0x13, 0x52, 0xbd,
+ 0x4c, 0x51, 0x60, 0x3d, 0xc0, 0xae, 0x92, 0x3d, 0xd3, 0x90, 0x35, 0xbd, 0x04,
+ 0x9e, 0x5f, 0xbd, 0x8c, 0xad, 0xee, 0xbc, 0x6f, 0x0b, 0x3e, 0x3d, 0xfb, 0x15,
+ 0x1c, 0x3c, 0x2f, 0x67, 0x98, 0xbb, 0x90, 0x7f, 0x9f, 0x3d, 0x21, 0x97, 0x2a,
+ 0xbc, 0xa0, 0x67, 0x9d, 0xbd, 0x5d, 0x64, 0x18, 0x3d, 0xaf, 0x36, 0xd9, 0x3b,
+ 0xe0, 0x06, 0xdc, 0x3c, 0xd0, 0x51, 0x8e, 0x3c, 0x48, 0x40, 0x56, 0x3d, 0xac,
+ 0x63, 0xb2, 0xbc, 0x63, 0x31, 0xf6, 0xbc, 0x48, 0x65, 0x07, 0x3d, 0x9c, 0x92,
+ 0x8d, 0xbd, 0x5c, 0xbb, 0x96, 0xbc, 0xa7, 0xdc, 0x07, 0x3c, 0xc4, 0xe5, 0xd8,
+ 0x3c, 0xb9, 0xea, 0x11, 0x3c, 0x10, 0x39, 0x13, 0x3a, 0x18, 0x34, 0x28, 0xbd,
+ 0xf4, 0x41, 0x6c, 0x3c, 0x25, 0x46, 0x12, 0xbd, 0xf9, 0x23, 0x3f, 0x3d, 0xfc,
+ 0x1d, 0xd9, 0x3d, 0x68, 0xc6, 0xa9, 0xbc, 0x97, 0x32, 0x1c, 0xbd, 0x3f, 0x51,
+ 0xbf, 0x3d, 0x7e, 0xd5, 0x3c, 0x3c, 0xda, 0x77, 0xcb, 0xbd, 0x10, 0x52, 0xb6,
+ 0xbc, 0xd8, 0xbd, 0x9b, 0x3d, 0x43, 0xd7, 0x7c, 0x3d, 0x4c, 0x78, 0xb2, 0xbc,
+ 0x7c, 0xda, 0xc9, 0xbc, 0x31, 0x8c, 0x4d, 0x3d, 0x82, 0x0e, 0xcb, 0xbc, 0xed,
+ 0xf9, 0xe8, 0x3b, 0xa8, 0x08, 0x4b, 0x3d, 0x38, 0x3c, 0x4a, 0xbd, 0x1d, 0xd9,
+ 0x0f, 0xbd, 0xd6, 0x17, 0x86, 0x3b, 0xa1, 0x90, 0xab, 0x3d, 0x91, 0xcc, 0x8f,
+ 0xbd, 0x07, 0xfa, 0x39, 0x3d, 0x11, 0x95, 0x03, 0x3d, 0x29, 0x0f, 0x31, 0xbc,
+ 0x87, 0xab, 0x3c, 0x3d, 0xc8, 0xe5, 0x5c, 0xb9, 0x44, 0x79, 0x44, 0xbd, 0x6d,
+ 0x4c, 0x90, 0xbc, 0x86, 0x90, 0xa5, 0xbc, 0x47, 0x61, 0x39, 0xbe, 0xf9, 0xeb,
+ 0x17, 0x3b, 0xea, 0x28, 0xe4, 0xbc, 0x79, 0x88, 0x12, 0xbc, 0x7a, 0x61, 0xdd,
+ 0x3d, 0x7f, 0xfe, 0x49, 0x3d, 0x78, 0x92, 0x5c, 0xbd, 0x6d, 0xe2, 0xa4, 0x3b,
+ 0x68, 0x57, 0x27, 0xbd, 0x61, 0x22, 0xaf, 0x3c, 0x02, 0x98, 0x6e, 0x3d, 0x74,
+ 0x02, 0xbb, 0x3d, 0x33, 0x4d, 0x24, 0xbd, 0x3e, 0x93, 0x81, 0xbc, 0xb2, 0x1e,
+ 0x1f, 0x3d, 0xb5, 0x79, 0x64, 0x3b, 0xbc, 0xfb, 0xf6, 0xbc, 0x61, 0x0c, 0xcd,
+ 0xbd, 0xc1, 0x64, 0x08, 0x3c, 0x6f, 0x3d, 0x27, 0xbd, 0x10, 0xd3, 0xdb, 0xbc,
+ 0xe4, 0xb6, 0xd2, 0x3b, 0x51, 0x12, 0x81, 0x3d, 0x37, 0xee, 0x87, 0xbc, 0xdd,
+ 0x80, 0xaf, 0x39, 0x90, 0x85, 0xaf, 0x3d, 0x80, 0x5f, 0x12, 0xbc, 0xcb, 0x3c,
+ 0x63, 0xbd, 0x81, 0x3c, 0x85, 0x3d, 0x10, 0xe7, 0x54, 0xbc, 0xa6, 0xb7, 0x98,
+ 0xbc, 0x07, 0x98, 0x2f, 0x3d, 0x70, 0x80, 0x28, 0xbe, 0x7a, 0xe5, 0x77, 0x3d,
+ 0x0b, 0x81, 0x51, 0xbd, 0xb1, 0xdf, 0x35, 0xbc, 0xd2, 0xf7, 0x0b, 0x3d, 0xbe,
+ 0x9e, 0x02, 0xbd, 0xa2, 0xc0, 0x03, 0x3d, 0x97, 0xf5, 0x2f, 0xbb, 0xc6, 0x6b,
+ 0x13, 0xbd, 0x81, 0xbc, 0xe8, 0xbb, 0x2a, 0x57, 0x63, 0x3d, 0x49, 0x18, 0x51,
+ 0xbc, 0xd7, 0x9e, 0x44, 0xbd, 0x51, 0x59, 0xb8, 0x3b, 0x5b, 0x9b, 0x86, 0x3c,
+ 0x1d, 0x63, 0x8a, 0x3d, 0x15, 0xc7, 0x94, 0xbd, 0x43, 0xc8, 0x05, 0xbd, 0x7b,
+ 0xc8, 0x26, 0x3d, 0xdc, 0x03, 0xbd, 0x3c, 0xa0, 0x16, 0x2b, 0xbd, 0x33, 0x15,
+ 0xfa, 0x3c, 0xfe, 0xce, 0x91, 0xbc, 0x0f, 0x1e, 0xe3, 0x3b, 0x01, 0x19, 0x2b,
+ 0xbd, 0x26, 0xff, 0x53, 0x3c, 0x4f, 0x22, 0x91, 0xbb, 0xf6, 0x4f, 0x84, 0xbd,
+ 0xc5, 0xf6, 0x8a, 0x3d, 0x76, 0xcf, 0x90, 0xbd, 0x4d, 0x0e, 0xb7, 0x3d, 0x90,
+ 0x1f, 0xd0, 0xbc, 0xd8, 0xa6, 0x7c, 0xbd, 0x39, 0xa0, 0x70, 0x3c, 0x33, 0x14,
+ 0x91, 0xbd, 0xa4, 0x66, 0x12, 0xbb, 0xfd, 0x3b, 0x4e, 0x3d, 0x87, 0x72, 0x0c,
+ 0x3d, 0xa1, 0x1b, 0x7b, 0xbc, 0xe0, 0x0f, 0xb5, 0xbc, 0x74, 0x49, 0x42, 0xbd,
+ 0x61, 0x8f, 0x34, 0x3d, 0x40, 0x4a, 0xb0, 0xbc, 0x19, 0xf3, 0x14, 0x3d, 0x5c,
+ 0xd5, 0x8a, 0x3d, 0x4e, 0xd1, 0x54, 0x3d, 0xd8, 0x0b, 0x0d, 0x3d, 0x04, 0x61,
+ 0x85, 0x3d, 0x7e, 0x9e, 0x33, 0x3d, 0xd7, 0x75, 0xcb, 0x3b, 0x71, 0x7a, 0x89,
+ 0xbb, 0xb5, 0x56, 0x62, 0xbd, 0x00, 0xe5, 0x87, 0xbc, 0x84, 0x92, 0xca, 0xbc,
+ 0xf4, 0x15, 0xbb, 0xbc, 0xe7, 0xae, 0xc5, 0x3a, 0x8a, 0x96, 0x98, 0x3c, 0x55,
+ 0xb6, 0x9a, 0xbc, 0x59, 0x6f, 0x2c, 0x3d, 0x5b, 0x3b, 0x14, 0x3c, 0xd7, 0xb4,
+ 0xa6, 0x3b, 0x3f, 0x09, 0x21, 0x3d, 0x64, 0xfc, 0x54, 0x3c, 0x03, 0xd5, 0xf4,
+ 0xbc, 0x06, 0x74, 0xb6, 0xbd, 0xd5, 0x70, 0x0b, 0xbd, 0xa6, 0xf8, 0x4b, 0x3c,
+ 0xea, 0x46, 0x32, 0xbd, 0xb4, 0x06, 0x3b, 0x3c, 0xc2, 0xa8, 0x0d, 0xbb, 0x12,
+ 0x60, 0x6f, 0x3c, 0x20, 0xca, 0x10, 0x3c, 0x05, 0xcc, 0xa6, 0xbc, 0x7a, 0xdd,
+ 0xdf, 0xbb, 0xcc, 0x65, 0x9e, 0x3c, 0x02, 0x81, 0xe3, 0x3c, 0x58, 0x15, 0x90,
+ 0x3d, 0x80, 0x4a, 0xb2, 0xbd, 0xd3, 0x92, 0x8d, 0x3d, 0xc8, 0x03, 0xd9, 0xbc,
+ 0xc9, 0xce, 0x49, 0xbd, 0x57, 0xb1, 0x87, 0xbc, 0xf8, 0xc8, 0xb9, 0x3d, 0xb5,
+ 0x6a, 0x02, 0xbd, 0x60, 0xe3, 0x24, 0x3d, 0xb3, 0xdd, 0x4d, 0x3d, 0x87, 0x6d,
+ 0x0e, 0xbd, 0xea, 0x2d, 0x67, 0xbd, 0x62, 0x3b, 0xa9, 0xbc, 0xd1, 0x23, 0x79,
+ 0x3d, 0x27, 0x90, 0x1a, 0x3d, 0xfa, 0xf4, 0xa3, 0x3c, 0x88, 0xf8, 0x76, 0xbd,
+ 0x48, 0x27, 0x4e, 0xbd, 0xad, 0xe7, 0x6d, 0x3c, 0xbd, 0x3f, 0xba, 0x3d, 0x6a,
+ 0x30, 0xb8, 0xbd, 0x2e, 0x5c, 0xc7, 0xbb, 0x76, 0x8f, 0x85, 0xbc, 0x9d, 0x0f,
+ 0x48, 0x3d, 0xae, 0x8b, 0xa4, 0x3d, 0x72, 0xca, 0x36, 0x3d, 0xcd, 0xab, 0xad,
+ 0xbc, 0xf4, 0x68, 0x11, 0xbd, 0xe4, 0xf0, 0x20, 0x39, 0x85, 0x8d, 0x52, 0xbd,
+ 0x73, 0x80, 0x89, 0x3d, 0x3e, 0x97, 0x11, 0xbd, 0x44, 0xe7, 0x13, 0x3d, 0x25,
+ 0xc3, 0x68, 0x3d, 0x4f, 0x88, 0x1c, 0x3d, 0x51, 0x5f, 0x86, 0xbc, 0xce, 0x97,
+ 0xfb, 0xbc, 0x0e, 0x5c, 0x11, 0xbd, 0x00, 0x0f, 0x05, 0x3d, 0x8c, 0x5a, 0xe2,
+ 0x3c, 0xdb, 0x30, 0x8c, 0x3d, 0x69, 0xac, 0xd6, 0x3c, 0xb6, 0x26, 0x22, 0x3d,
+ 0x11, 0x74, 0x72, 0xbd, 0x85, 0xc5, 0x4e, 0x3b, 0x9c, 0x72, 0x9e, 0x3d, 0xa6,
+ 0x49, 0x25, 0xbd, 0x9e, 0x77, 0x23, 0x3c, 0x01, 0xbf, 0x35, 0xbc, 0xf9, 0x0a,
+ 0x06, 0xbd, 0x66, 0xc8, 0x70, 0xbd, 0xb9, 0x54, 0x80, 0x3d, 0x70, 0x83, 0xd1,
+ 0xbc, 0x7b, 0x7a, 0xd5, 0xbc, 0x72, 0x5e, 0x1e, 0xbd, 0x7d, 0xb0, 0x24, 0x3d,
+ 0x88, 0x95, 0x3b, 0x3d, 0xb9, 0xc0, 0x4f, 0xbc, 0xf6, 0xf0, 0xcc, 0x3c, 0x6e,
+ 0x8d, 0x20, 0x3c, 0x0e, 0xe0, 0x8f, 0xbd, 0xfe, 0xd6, 0x2f, 0xbe, 0x40, 0x5e,
+ 0x05, 0x3c, 0x43, 0x3c, 0x1f, 0x3d, 0x2b, 0xfe, 0x63, 0xbd, 0xac, 0xfc, 0x78,
+ 0x3d, 0x89, 0xc7, 0x7b, 0xbd, 0xf8, 0x57, 0x38, 0xbd, 0x27, 0xf8, 0x9f, 0x3c,
+ 0xfe, 0xbe, 0x93, 0xbc, 0xa7, 0x0b, 0x52, 0xbc, 0xf9, 0xc1, 0xae, 0x3c, 0x84,
+ 0xf4, 0x6a, 0xbc, 0x3c, 0xcf, 0xf6, 0xba, 0x16, 0x08, 0x95, 0xbc, 0xcf, 0xf0,
+ 0x57, 0xbd, 0x5e, 0x93, 0x98, 0xbd, 0x84, 0x6a, 0xb4, 0x3d, 0xf6, 0x01, 0xe7,
+ 0xbc, 0x52, 0x9a, 0x85, 0xbc, 0x25, 0x22, 0x99, 0x3d, 0x00, 0xa0, 0x87, 0xbb,
+ 0xf8, 0xb5, 0x0e, 0xbc, 0xcd, 0xd6, 0x3d, 0x3d, 0x01, 0x80, 0x2d, 0xbe, 0xf5,
+ 0xcb, 0x94, 0x3d, 0x65, 0x93, 0x7f, 0xbc, 0x90, 0x42, 0x98, 0x3c, 0x1c, 0x10,
+ 0x13, 0x3d, 0xed, 0xb4, 0x8e, 0x3d, 0xdb, 0xd9, 0x01, 0xbd, 0x18, 0xe6, 0x8b,
+ 0x3c, 0x64, 0x69, 0x60, 0x3b, 0x63, 0x00, 0x1c, 0xbd, 0xe4, 0x57, 0x43, 0x3d,
+ 0xac, 0x16, 0xdc, 0x3d, 0x3d, 0x41, 0x3d, 0xbd, 0x18, 0xcb, 0x34, 0xbd, 0x28,
+ 0x93, 0x06, 0x3b, 0xf2, 0x17, 0x02, 0xbd, 0x2d, 0x29, 0x07, 0xbd, 0xde, 0xd1,
+ 0x88, 0xbc, 0xd8, 0x1e, 0x86, 0x3d, 0xda, 0xd2, 0xe3, 0xbb, 0xb6, 0xd8, 0x66,
+ 0xbd, 0xe9, 0xbd, 0x91, 0x3d, 0xd2, 0xf8, 0xa1, 0x3d, 0xce, 0x41, 0x1f, 0x3d,
+ 0x33, 0x84, 0xfa, 0xbc, 0xa7, 0x81, 0x8f, 0x3c, 0xe2, 0xf0, 0xda, 0xbc, 0x8d,
+ 0x67, 0x2a, 0x3d, 0xee, 0x5c, 0xef, 0x3d, 0x00, 0xf6, 0x3c, 0xbb, 0xcd, 0xa3,
+ 0x70, 0x3d, 0x3a, 0x58, 0x89, 0x3d, 0x03, 0xe3, 0x15, 0xbe, 0xfc, 0x75, 0x10,
+ 0x3c, 0xcc, 0xc4, 0x23, 0xbc, 0xd8, 0x48, 0x1f, 0x3c, 0xb2, 0x7c, 0xa1, 0x3a,
+ 0x7f, 0x0b, 0xda, 0x3d, 0x0d, 0xd0, 0x03, 0x3d, 0xf3, 0xca, 0xd9, 0x3b, 0x72,
+ 0x97, 0x1a, 0x3c, 0x5c, 0x19, 0xfa, 0xbd, 0xaa, 0x5d, 0x12, 0x3d, 0x75, 0xda,
+ 0x58, 0x3d, 0xec, 0x05, 0xb1, 0x3c, 0x6a, 0x21, 0xd9, 0xbc, 0x1d, 0x2c, 0x8c,
+ 0x3c, 0xfa, 0x2f, 0x1e, 0xbd, 0x93, 0x81, 0x98, 0xba, 0x42, 0x27, 0x62, 0xbd,
+ 0x1a, 0xe3, 0xa5, 0x3d, 0x17, 0x24, 0x18, 0xbc, 0x73, 0x8a, 0x24, 0xbd, 0xea,
+ 0x88, 0x92, 0xbc, 0x9d, 0x8d, 0xf7, 0xbc, 0xb4, 0xa6, 0xc8, 0xbd, 0xa0, 0xdd,
+ 0x8e, 0xbd, 0x4c, 0x81, 0x72, 0x3d, 0x59, 0x67, 0x48, 0xbd, 0x23, 0x21, 0xb3,
+ 0x3c, 0x6a, 0xc5, 0x43, 0x3d, 0x13, 0x50, 0x85, 0x3d, 0x0a, 0xd5, 0xb9, 0x3c,
+ 0xf3, 0xe6, 0x2b, 0xbd, 0x32, 0x6c, 0xe6, 0xbc, 0x11, 0x7c, 0x05, 0x3d, 0x99,
+ 0xeb, 0x48, 0xbc, 0x7d, 0x87, 0x35, 0xbd, 0x8b, 0x42, 0x5f, 0x3d, 0xae, 0x56,
+ 0x10, 0x3d, 0x02, 0x1e, 0x96, 0x3d, 0xf7, 0x64, 0xab, 0x3d, 0x66, 0xc3, 0xa2,
+ 0x3c, 0xe6, 0x36, 0xd8, 0xbc, 0x8c, 0xaa, 0x29, 0x3d, 0x52, 0x0b, 0x8b, 0xbc,
+ 0xce, 0x93, 0xef, 0xbc, 0xd9, 0x9b, 0x2c, 0xbd, 0x4a, 0x7a, 0xe6, 0x3c, 0xa1,
+ 0xdb, 0xaa, 0x3d, 0xfe, 0xac, 0x77, 0x3c, 0xd0, 0x02, 0xe2, 0xbc, 0x1c, 0xec,
+ 0xef, 0xbc, 0xe0, 0x92, 0xad, 0xbd, 0x46, 0xe8, 0x02, 0x3d, 0xd0, 0x99, 0x45,
+ 0x3b, 0x8a, 0xbc, 0x3f, 0xbd, 0x02, 0x86, 0x84, 0xbd, 0x34, 0xfb, 0xc3, 0xbd,
+ 0x71, 0xb4, 0xb7, 0x3d, 0xc0, 0x74, 0x42, 0xbb, 0xba, 0xef, 0x5d, 0xbc, 0x2b,
+ 0xd3, 0x21, 0x3c, 0x5a, 0xa2, 0xe4, 0xbc, 0x9f, 0xa9, 0x80, 0xbd, 0xa0, 0x48,
+ 0xb3, 0x3d, 0x39, 0xbb, 0xa4, 0xbd, 0xa9, 0x25, 0xb4, 0x3d, 0xb7, 0x12, 0xf3,
+ 0xbc, 0x25, 0x61, 0x37, 0xbd, 0xb9, 0x66, 0x80, 0x3d, 0xcd, 0xce, 0xcf, 0x3d,
+ 0x9f, 0xd0, 0x90, 0xbc, 0xd7, 0xbd, 0xf4, 0x3c, 0x20, 0x96, 0x8e, 0xbd, 0xd9,
+ 0xdf, 0x00, 0xbe, 0x8c, 0xf9, 0x5d, 0xbc, 0x58, 0xf0, 0x1e, 0x3d, 0xee, 0xec,
+ 0x2f, 0xbd, 0x32, 0x6b, 0x46, 0xbd, 0x72, 0x10, 0x2e, 0x3d, 0x33, 0x5a, 0x09,
+ 0xbd, 0x43, 0x78, 0x14, 0x3d, 0x33, 0xde, 0xa1, 0xbd, 0xcd, 0x6e, 0x35, 0x3c,
+ 0x05, 0x48, 0x22, 0xbd, 0x5b, 0x57, 0x80, 0x3d, 0x66, 0x64, 0xd7, 0x3b, 0x26,
+ 0xf1, 0x1a, 0x3c, 0x81, 0x24, 0x8a, 0xbd, 0x00, 0x84, 0x5e, 0xbd, 0xbc, 0xc0,
+ 0xdc, 0x3b, 0x74, 0x77, 0xa3, 0x3d, 0x8a, 0x55, 0xe3, 0x3c, 0x84, 0x75, 0x2e,
+ 0x3d, 0x45, 0x17, 0x3c, 0x3d, 0xcf, 0xd9, 0x62, 0xbd, 0x6e, 0x1c, 0xd2, 0x3c,
+ 0x6e, 0xe1, 0x21, 0xbe, 0x36, 0xf2, 0x95, 0x3d, 0x44, 0x50, 0x00, 0xba, 0x87,
+ 0x5b, 0xc8, 0xbc, 0xeb, 0xe0, 0xbd, 0x3d, 0x92, 0x7c, 0xff, 0x3c, 0x34, 0x97,
+ 0x32, 0x3d, 0x8f, 0x57, 0x73, 0x3d, 0x70, 0xfe, 0x5b, 0x3c, 0xba, 0x43, 0xee,
+ 0xbc, 0xa8, 0x7b, 0x06, 0x3c, 0xfc, 0x87, 0x8f, 0x3d, 0xf2, 0xd6, 0x43, 0xbd,
+ 0x18, 0x3c, 0x11, 0xbc, 0x1e, 0xc3, 0x62, 0x3c, 0x46, 0x98, 0x9e, 0x3c, 0x5a,
+ 0x90, 0xc4, 0xbc, 0xe6, 0x6b, 0x72, 0xbd, 0xce, 0x30, 0xa7, 0x3d, 0x81, 0xa2,
+ 0x10, 0xbd, 0x4e, 0x75, 0x24, 0x3d, 0xff, 0x9d, 0xea, 0xbc, 0x25, 0x08, 0x92,
+ 0x3c, 0x50, 0x0a, 0xf0, 0xbb, 0xf0, 0x91, 0x8d, 0xbc, 0x4c, 0xd8, 0xc8, 0x3c,
+ 0x16, 0xbb, 0x5d, 0xbd, 0x24, 0x8d, 0x32, 0x3d, 0x75, 0x67, 0x64, 0x3d, 0xe0,
+ 0x67, 0x46, 0x3b, 0xbc, 0x93, 0xbb, 0x3c, 0xd2, 0x74, 0x17, 0xbd, 0x45, 0x88,
+ 0x21, 0xbe, 0x4d, 0x15, 0x95, 0x3d, 0x41, 0x5c, 0xe7, 0xbb, 0xc9, 0x97, 0xfd,
+ 0xbc, 0x3b, 0xe2, 0x0f, 0xbd, 0x57, 0x38, 0xab, 0x3d, 0x13, 0x12, 0xeb, 0x3c,
+ 0x92, 0x5d, 0x4f, 0x3d, 0xf0, 0x1f, 0xbf, 0xbc, 0x37, 0x63, 0xf7, 0xbc, 0xa8,
+ 0x76, 0x32, 0x3c, 0x97, 0xd3, 0xc9, 0xbc, 0x28, 0x83, 0x5b, 0x3d, 0xe2, 0x0f,
+ 0x90, 0xbd, 0x31, 0x0b, 0x8a, 0xbd, 0x04, 0x7c, 0xd5, 0xbc, 0x16, 0x5d, 0xa7,
+ 0x3a, 0x54, 0x36, 0x4f, 0xbd, 0x4d, 0xae, 0x64, 0x3d, 0xfd, 0x4c, 0x94, 0xbc,
+ 0x72, 0x3f, 0x96, 0xbc, 0x41, 0xd7, 0xfa, 0x3b, 0x52, 0x45, 0x03, 0xbc, 0x1f,
+ 0x50, 0xa6, 0xbd, 0x28, 0xb9, 0x78, 0x3c, 0x16, 0xa5, 0x77, 0x3c, 0xf2, 0x4e,
+ 0xa1, 0x3c, 0x84, 0xb6, 0x84, 0xbd, 0xc5, 0x78, 0xdc, 0x3c, 0xb4, 0xd1, 0x27,
+ 0xbd, 0x04, 0x20, 0x8d, 0xbd, 0xa0, 0x12, 0x36, 0x3c, 0xce, 0xb5, 0x31, 0xbe,
+ 0x4b, 0xfd, 0x44, 0xbc, 0xe3, 0x38, 0x00, 0xbd, 0xca, 0x35, 0x60, 0x3c, 0xc6,
+ 0xe4, 0x93, 0xb6, 0xc9, 0x84, 0xc0, 0x3a, 0xb3, 0x53, 0x88, 0x3d, 0x08, 0x37,
+ 0x0b, 0x3c, 0xd9, 0x6d, 0x00, 0xbb, 0x54, 0x22, 0xcc, 0xbb, 0x3c, 0x72, 0xa7,
+ 0xbc, 0x39, 0xbd, 0xc0, 0x3d, 0xc7, 0xb5, 0x0a, 0x3b, 0xe3, 0xbc, 0x38, 0xbc,
+ 0x0d, 0x1c, 0x1f, 0xbc, 0xbc, 0x5b, 0x42, 0xbc, 0xf3, 0x43, 0xb2, 0x3c, 0x5e,
+ 0x7e, 0xc3, 0xbc, 0x40, 0xbf, 0x47, 0x3c, 0xe7, 0x7d, 0x3e, 0xbc, 0x30, 0xf4,
+ 0x13, 0xbc, 0x5f, 0x8d, 0xd1, 0x3c, 0xe1, 0x93, 0xe7, 0xbc, 0x73, 0x12, 0x87,
+ 0xbc, 0x52, 0xb6, 0x9d, 0x3b, 0xf6, 0xda, 0x8d, 0x3d, 0x6b, 0xb8, 0x03, 0x3c,
+ 0x58, 0x8e, 0x25, 0xbd, 0x7b, 0xaa, 0x8a, 0xbc, 0x75, 0xd1, 0x84, 0x3d, 0x0e,
+ 0x90, 0xcd, 0xbc, 0x17, 0x0e, 0x8b, 0x3d, 0x87, 0x5e, 0x04, 0xbd, 0xe5, 0x99,
+ 0x9b, 0xbc, 0x0a, 0xdd, 0x3b, 0x3d, 0x22, 0xc9, 0x83, 0xbc, 0xb8, 0x42, 0x3f,
+ 0x3d, 0x86, 0x99, 0x90, 0x3d, 0x41, 0x4e, 0xa2, 0x3d, 0xf0, 0x89, 0x4f, 0xbd,
+ 0xa6, 0x28, 0x75, 0xbd, 0xea, 0xf1, 0x56, 0xbd, 0x96, 0xb0, 0x9b, 0xbc, 0x01,
+ 0x85, 0xb5, 0x3d, 0xcf, 0x71, 0x4c, 0x3d, 0x98, 0xf9, 0x6d, 0xbc, 0xc8, 0x59,
+ 0x38, 0xbd, 0x12, 0x6f, 0x7b, 0x3d, 0x61, 0xac, 0xf1, 0xbb, 0xd4, 0x32, 0x4a,
+ 0x3d, 0x92, 0x25, 0x45, 0x3d, 0x53, 0x88, 0x6d, 0xbd, 0xa0, 0x69, 0xda, 0xbb,
+ 0xf2, 0xf2, 0xda, 0x3b, 0xf3, 0x4d, 0x84, 0xbc, 0x61, 0x96, 0xda, 0x3c, 0xa3,
+ 0x9c, 0x9a, 0x3b, 0x70, 0x04, 0x93, 0xbb, 0x11, 0x0f, 0xe7, 0xbc, 0x06, 0x52,
+ 0x86, 0xbd, 0x0f, 0xf5, 0x6c, 0xbd, 0xe1, 0x4c, 0x8d, 0x3d, 0x59, 0x20, 0xa0,
+ 0xbd, 0xf8, 0x29, 0x94, 0x3d, 0x3f, 0x89, 0x86, 0xbd, 0x15, 0x66, 0x15, 0xbd,
+ 0xad, 0x80, 0xdf, 0x3c, 0x5b, 0xd4, 0x6c, 0xbc, 0x2c, 0x5f, 0x60, 0x3c, 0x2b,
+ 0x82, 0xd5, 0x3c, 0x3f, 0x7e, 0x14, 0xbd, 0x6c, 0xe8, 0xaf, 0xbb, 0xee, 0x8b,
+ 0x27, 0xbd, 0xa0, 0xa8, 0x20, 0xbd, 0xe8, 0x39, 0x54, 0xbc, 0x9b, 0x57, 0xb7,
+ 0x3d, 0x6a, 0x42, 0x81, 0x3d, 0xd3, 0x09, 0x10, 0xbd, 0x95, 0xd4, 0x3a, 0x3d,
+ 0x48, 0xe1, 0xb8, 0xbc, 0xf4, 0x91, 0xa0, 0xbd, 0x8e, 0x67, 0x5e, 0xbd, 0x3b,
+ 0x3d, 0xa0, 0x3d, 0x82, 0x2e, 0x85, 0x3d, 0x10, 0x91, 0x8c, 0xbb, 0x63, 0xb7,
+ 0x75, 0xbd, 0xf5, 0xd8, 0x35, 0xbd, 0xea, 0x58, 0x11, 0xbb, 0xc4, 0x87, 0xe5,
+ 0xbc, 0xb4, 0x14, 0xce, 0x3d, 0x86, 0x00, 0x0b, 0x3c, 0x91, 0x4b, 0xb2, 0xbd,
+ 0xa9, 0x2e, 0x93, 0x3d, 0xc3, 0x3a, 0xc3, 0xbb, 0x7c, 0x8a, 0x83, 0xbd, 0xd2,
+ 0xb1, 0x2e, 0xbd, 0xbb, 0x27, 0xa9, 0xbd, 0xa7, 0x9f, 0x41, 0x3d, 0x0a, 0x47,
+ 0x15, 0xbd, 0xeb, 0x11, 0xca, 0x3c, 0xfe, 0x0d, 0xef, 0xbc, 0x71, 0x53, 0x52,
+ 0x3d, 0x0b, 0x4b, 0x44, 0x3c, 0x9d, 0xbf, 0x10, 0xbb, 0xf9, 0x31, 0xe6, 0x3c,
+ 0x97, 0x60, 0xbd, 0xbd, 0x8c, 0x40, 0x87, 0x3c, 0x30, 0x66, 0x18, 0x3d, 0x1a,
+ 0x2b, 0xcd, 0x3c, 0x52, 0x92, 0x7e, 0xbd, 0x58, 0xee, 0x02, 0x3d, 0x0a, 0x85,
+ 0xf7, 0xbc, 0x76, 0x75, 0x7f, 0xbd, 0xff, 0x11, 0xde, 0x3b, 0x5b, 0x43, 0x4b,
+ 0x3d, 0xa2, 0x53, 0x3f, 0xbd, 0x90, 0xf3, 0x42, 0xbd, 0x5b, 0xb9, 0x1e, 0x3d,
+ 0x43, 0x66, 0x46, 0xbc, 0x3e, 0x79, 0x7f, 0xbd, 0x24, 0xa8, 0xa0, 0xbd, 0xd5,
+ 0xb2, 0xd2, 0x3c, 0xf6, 0x82, 0x7d, 0x3b, 0x52, 0x09, 0x4e, 0xbd, 0x23, 0x30,
+ 0xfa, 0x3d, 0x62, 0xb4, 0x72, 0x3d, 0xa6, 0x3c, 0x98, 0x3c, 0x20, 0x3f, 0xdd,
+ 0xbb, 0xb0, 0xfa, 0x4f, 0xbd, 0x0f, 0x36, 0x24, 0xbb, 0x19, 0xbc, 0x7d, 0xbd,
+ 0x8d, 0xab, 0x2e, 0x3d, 0x1e, 0x67, 0x61, 0x3d, 0x8a, 0x39, 0x61, 0xbb, 0xb1,
+ 0xa0, 0x01, 0xbc, 0x0d, 0x75, 0x64, 0xbc, 0x89, 0xd7, 0x84, 0xbd, 0x1f, 0x26,
+ 0xa6, 0xbd, 0x7a, 0x67, 0x62, 0x3d, 0x3d, 0x4d, 0x06, 0xbb, 0xff, 0xe4, 0x92,
+ 0x3d, 0x32, 0x12, 0x95, 0xbc, 0x4b, 0x2e, 0x8b, 0xbc, 0x8b, 0x4a, 0x14, 0x3c,
+ 0xea, 0x08, 0x81, 0xbd, 0xb3, 0x3e, 0xb3, 0xbd, 0x96, 0x40, 0xef, 0x3c, 0xc6,
+ 0xf4, 0x83, 0xbd, 0x70, 0x8a, 0xad, 0xbc, 0x28, 0x6d, 0x26, 0xbd, 0x0e, 0x8f,
+ 0x89, 0x3a, 0xbc, 0x30, 0xc8, 0xbd, 0x81, 0x3c, 0x22, 0xbd, 0x19, 0x06, 0xb4,
+ 0x3d, 0x2a, 0xbf, 0x2a, 0x3d, 0xc9, 0xd4, 0x00, 0xbd, 0x74, 0x7d, 0x9b, 0x3b,
+ 0xc5, 0x7a, 0x13, 0xbd, 0xbf, 0x24, 0x18, 0xbc, 0x63, 0x21, 0xfd, 0x3c, 0x8f,
+ 0x45, 0xf6, 0xbd, 0xf6, 0xb7, 0x85, 0x3c, 0x49, 0xc7, 0xee, 0xbb, 0x31, 0x16,
+ 0x9c, 0x3d, 0x86, 0x9e, 0x44, 0x3d, 0x97, 0x25, 0x99, 0x3d, 0x33, 0x23, 0xa6,
+ 0x3d, 0x7f, 0x66, 0x2b, 0x3d, 0xbd, 0xe9, 0x43, 0x3d, 0x11, 0x56, 0x76, 0xbc,
+ 0x30, 0x7c, 0x87, 0xbb, 0xfe, 0xae, 0xfb, 0xb8, 0x4c, 0x48, 0x47, 0xbd, 0x74,
+ 0x13, 0x8b, 0xbd, 0x26, 0x22, 0x87, 0x3d, 0x22, 0xb0, 0x87, 0x3d, 0x9f, 0xc6,
+ 0x74, 0xbd, 0x7a, 0x47, 0x70, 0x3c, 0xe0, 0x41, 0x8b, 0x3d, 0xfb, 0xa2, 0x43,
+ 0xbc, 0x63, 0x0d, 0x21, 0xbd, 0x8a, 0x60, 0x36, 0xbb, 0x54, 0xe8, 0x59, 0x3c,
+ 0x21, 0xd4, 0xa9, 0x3b, 0x00, 0x5b, 0x20, 0x3d, 0x61, 0x25, 0x72, 0x3d, 0x39,
+ 0x8d, 0x3b, 0x3d, 0x5e, 0xcd, 0x4f, 0x3d, 0xa0, 0x47, 0x0c, 0xbd, 0x34, 0xc9,
+ 0x09, 0x3d, 0xb8, 0x59, 0xa2, 0xbc, 0x9a, 0xa3, 0x82, 0x3d, 0x1b, 0xd4, 0x1f,
+ 0xbe, 0xa4, 0x45, 0x9d, 0x3d, 0x9e, 0x03, 0xc6, 0x3c, 0x0c, 0x23, 0x30, 0x3d,
+ 0x9c, 0xb4, 0xec, 0xbb, 0xf8, 0x66, 0x9c, 0xbc, 0x6c, 0x32, 0x7e, 0x3d, 0x4b,
+ 0x32, 0x51, 0x3d, 0x64, 0x32, 0x75, 0x3d, 0x1b, 0xc9, 0xd1, 0x3c, 0x98, 0xac,
+ 0x05, 0x3d, 0x4a, 0x99, 0x74, 0x3b, 0x40, 0x86, 0x41, 0xbd, 0xf6, 0xa7, 0x03,
+ 0xbd, 0x95, 0x47, 0x23, 0x3c, 0x78, 0xf3, 0x0c, 0x3d, 0xf4, 0x66, 0xdc, 0x3b,
+ 0x4d, 0x45, 0xbf, 0xbb, 0x65, 0x4b, 0x73, 0xbc, 0x51, 0x10, 0x8c, 0x3c, 0x5e,
+ 0x5a, 0x67, 0x3d, 0xd7, 0x47, 0x82, 0x3d, 0xdc, 0x32, 0x9c, 0xbc, 0xe4, 0xa5,
+ 0x87, 0xbd, 0xc2, 0xd2, 0xc4, 0xbd, 0x08, 0xbe, 0x6e, 0x3d, 0xa8, 0x8b, 0xf1,
+ 0x3c, 0x10, 0xc0, 0xb1, 0xbc, 0x12, 0x09, 0x88, 0x3d, 0x3f, 0x54, 0x25, 0x3d,
+ 0x11, 0x70, 0x26, 0x3b, 0xdd, 0x48, 0x18, 0x3c, 0x01, 0x3c, 0xee, 0xbd, 0x4f,
+ 0x63, 0x36, 0xbc, 0xea, 0x7e, 0x3f, 0x3d, 0x86, 0x4d, 0x45, 0x3d, 0x4b, 0x63,
+ 0x70, 0xbc, 0x32, 0xdf, 0xc0, 0x3d, 0x50, 0x3c, 0x13, 0x3c, 0x0e, 0x61, 0xa3,
+ 0x3d, 0xe8, 0xc5, 0x37, 0xbd, 0x3b, 0xd7, 0x01, 0xbd, 0x20, 0x1b, 0x89, 0xbc,
+ 0x70, 0x18, 0xee, 0xbc, 0x3e, 0xeb, 0xfa, 0xbb, 0x18, 0xda, 0xda, 0x3c, 0xd6,
+ 0x82, 0x19, 0xbd, 0xf1, 0x7e, 0x88, 0xbd, 0x39, 0x1d, 0xb8, 0xbb, 0x67, 0x98,
+ 0x1c, 0x3d, 0x72, 0x83, 0x90, 0x3d, 0xd3, 0x17, 0x6b, 0xbd, 0xcc, 0x55, 0xa8,
+ 0x3c, 0x18, 0x2e, 0x2c, 0xbd, 0x08, 0xc4, 0x34, 0x3c, 0xf8, 0x8f, 0x51, 0xbd,
+ 0x88, 0x62, 0xfe, 0x3c, 0xbc, 0xe0, 0xb1, 0xbc, 0x09, 0x93, 0x88, 0xbb, 0x95,
+ 0x9c, 0xda, 0x3c, 0x83, 0xda, 0x3a, 0xbd, 0xb8, 0x82, 0x81, 0x3c, 0x39, 0xa8,
+ 0x8a, 0xbd, 0x8b, 0xb0, 0x31, 0xbb, 0x4a, 0x2c, 0x07, 0xbe, 0xec, 0x84, 0x9b,
+ 0x3c, 0xc9, 0x97, 0x56, 0x3d, 0x3d, 0xce, 0x97, 0xbd, 0xa6, 0xe3, 0xbc, 0x3d,
+ 0x91, 0xc4, 0x0f, 0x3d, 0x35, 0xe9, 0xd1, 0xbc, 0x10, 0x48, 0x17, 0x3c, 0x9a,
+ 0x86, 0x86, 0xbd, 0x08, 0x63, 0xf9, 0xbc, 0xb0, 0xb0, 0x98, 0x3c, 0x3e, 0x7e,
+ 0x4e, 0x3d, 0xe0, 0x6f, 0x73, 0xbc, 0xa5, 0x9e, 0x03, 0xbd, 0x7c, 0x39, 0x53,
+ 0x39, 0x6d, 0x86, 0x40, 0xba, 0x1d, 0x71, 0x86, 0x3d, 0x62, 0xec, 0x9d, 0x3c,
+ 0x03, 0x1e, 0x29, 0x3d, 0xbd, 0xbf, 0xd2, 0xbd, 0xce, 0x1c, 0x0c, 0x3d, 0x7f,
+ 0xb3, 0x9c, 0x3d, 0x93, 0xa6, 0xa1, 0xbc, 0xb9, 0xf4, 0x6b, 0xbd, 0x17, 0xce,
+ 0x40, 0xbd, 0x33, 0x15, 0x00, 0x3d, 0xd3, 0x33, 0x9c, 0x3d, 0x01, 0xc6, 0xec,
+ 0x3c, 0x65, 0x42, 0xba, 0x3c, 0x33, 0x73, 0xec, 0xbc, 0x47, 0xf8, 0x00, 0x3d,
+ 0xd1, 0x1b, 0x66, 0x3d, 0x10, 0x9b, 0x0b, 0xbe, 0xe6, 0x45, 0x48, 0xbd, 0x90,
+ 0x46, 0xbd, 0x3c, 0x29, 0xe0, 0xb5, 0xbc, 0x50, 0x42, 0x6a, 0x3d, 0x00, 0x37,
+ 0x9e, 0x3d, 0xc1, 0x54, 0xa0, 0x3c, 0x00, 0x3c, 0x2f, 0xbb, 0x05, 0x4f, 0xa7,
+ 0xbc, 0x3d, 0x86, 0x68, 0xbd, 0x24, 0x65, 0x51, 0xbc, 0xff, 0x74, 0x21, 0x3d,
+ 0x81, 0x5d, 0x25, 0x3d, 0x5d, 0xd0, 0x7a, 0xbd, 0x37, 0xb1, 0x40, 0xbd, 0xf0,
+ 0xfd, 0x3d, 0x3d, 0x1e, 0xb2, 0x2a, 0xbc, 0x62, 0x35, 0x9e, 0xbd, 0xeb, 0x65,
+ 0x51, 0xbc, 0x6f, 0xf6, 0x9a, 0xbd, 0x82, 0x5b, 0x81, 0xbc, 0xd7, 0x8a, 0x29,
+ 0x3d, 0x5a, 0x89, 0x81, 0xbb, 0x6d, 0xf8, 0xe0, 0x3c, 0xa6, 0x56, 0x3c, 0x3d,
+ 0x9d, 0xc6, 0x49, 0xbc, 0xdf, 0x38, 0x79, 0x3c, 0x51, 0x74, 0x4e, 0x3d, 0x02,
+ 0xb4, 0x2e, 0xbd, 0x6e, 0x2c, 0x52, 0xbd, 0x98, 0x05, 0x96, 0x3c, 0x5e, 0xef,
+ 0x12, 0x3d, 0xa9, 0x44, 0x29, 0xbd, 0x29, 0xcf, 0x47, 0x3d, 0x08, 0x33, 0xa3,
+ 0xbd, 0xc7, 0xe5, 0x26, 0x3c, 0x16, 0xf0, 0xc7, 0xbc, 0x89, 0xde, 0xa2, 0x3a,
+ 0x57, 0x77, 0xb9, 0x3b, 0xa0, 0x30, 0x9d, 0x3c, 0xd9, 0xf8, 0x91, 0xbc, 0xdc,
+ 0xac, 0x41, 0x3c, 0xc9, 0xe5, 0x1a, 0xbd, 0x66, 0xcc, 0x89, 0x3d, 0xae, 0x83,
+ 0x95, 0xbd, 0xf6, 0x92, 0xd3, 0x3c, 0x6a, 0x9a, 0xf7, 0x3c, 0xb4, 0xf9, 0x7c,
+ 0xbb, 0x79, 0xd8, 0x99, 0xbc, 0x82, 0x88, 0xb6, 0xbc, 0xf7, 0xdf, 0xb3, 0x3d,
+ 0x57, 0xa6, 0xa7, 0xbd, 0x2e, 0x22, 0xd9, 0xbc, 0xd6, 0x67, 0x91, 0xbc, 0x54,
+ 0x25, 0x32, 0x3d, 0xc3, 0x91, 0x93, 0xbd, 0x1d, 0x77, 0x33, 0x3b, 0x56, 0xc9,
+ 0x8b, 0x3d, 0xbf, 0xe2, 0x21, 0x3c, 0xf5, 0x88, 0x80, 0xbd, 0xee, 0x4f, 0xd8,
+ 0xbc, 0xbf, 0x1c, 0x83, 0xbd, 0xa4, 0x91, 0x61, 0x3d, 0xdc, 0xc1, 0x74, 0x3d,
+ 0xb4, 0x4d, 0x90, 0xbd, 0x80, 0x3d, 0xbb, 0x3c, 0x27, 0x03, 0xa2, 0xbb, 0x7e,
+ 0x7e, 0xd9, 0x3c, 0xf4, 0x18, 0x5f, 0xbc, 0xb1, 0xde, 0x83, 0x3d, 0xd5, 0xee,
+ 0x20, 0xbd, 0xbe, 0xa8, 0x7a, 0xbc, 0x01, 0x94, 0x03, 0xbd, 0x27, 0xa8, 0xfc,
+ 0xbd, 0x72, 0x14, 0x56, 0x3d, 0x79, 0x46, 0x0d, 0xbc, 0x69, 0x23, 0xd1, 0x3c,
+ 0x3b, 0x33, 0x49, 0x3d, 0x8d, 0xef, 0x18, 0x3b, 0xe9, 0xe1, 0x8f, 0xbd, 0x4f,
+ 0x45, 0x05, 0x3d, 0x28, 0x80, 0x49, 0x3c, 0xbd, 0x49, 0x18, 0x3d, 0xfd, 0xd4,
+ 0x86, 0x3c, 0xcc, 0x56, 0xa6, 0x3c, 0x37, 0x8e, 0xef, 0x3a, 0x57, 0x1e, 0x5f,
+ 0x3d, 0xc2, 0xef, 0x68, 0xbc, 0x24, 0xc0, 0xbe, 0xbd, 0x9c, 0xfd, 0xa0, 0x3b,
+ 0x48, 0x3b, 0x5d, 0x3d, 0xcf, 0xe0, 0x2c, 0xbd, 0x49, 0x51, 0xa7, 0x3d, 0x65,
+ 0xcf, 0x7a, 0xbc, 0x27, 0x68, 0x4c, 0xbd, 0x00, 0xed, 0x99, 0xbc, 0x2a, 0xac,
+ 0x5d, 0xbd, 0x6b, 0x5c, 0x9a, 0x3c, 0x71, 0xb7, 0x51, 0x3c, 0x1a, 0x04, 0x60,
+ 0xbd, 0x4b, 0xb8, 0x42, 0x3d, 0xf6, 0x92, 0x4f, 0x3d, 0xcb, 0x7a, 0xc4, 0x3c,
+ 0xc2, 0x1f, 0x85, 0x3d, 0xbf, 0x4c, 0x3b, 0x3b, 0x52, 0x04, 0x9a, 0xbd, 0x3a,
+ 0x5c, 0x29, 0x3d, 0x5f, 0x4e, 0xb1, 0x3d, 0xfc, 0x4e, 0x87, 0xbc, 0x59, 0x10,
+ 0xaa, 0x3d, 0x99, 0xff, 0x43, 0x3d, 0x20, 0x80, 0x8e, 0x3c, 0x79, 0x81, 0x3e,
+ 0xbd, 0xfe, 0x38, 0xab, 0xbd, 0x3d, 0x72, 0xad, 0x3d, 0x18, 0xa1, 0x64, 0xbd,
+ 0xa0, 0x6e, 0xb0, 0xbb, 0x19, 0x6b, 0x00, 0x3d, 0x6b, 0x7b, 0x15, 0xbc, 0x45,
+ 0xb5, 0xa6, 0xbd, 0xef, 0x81, 0x05, 0xbd, 0x9f, 0xe8, 0x37, 0x3d, 0x71, 0xbe,
+ 0xb6, 0xbc, 0x22, 0x55, 0xd6, 0xbc, 0x0d, 0x9b, 0xcf, 0x3c, 0x47, 0xa3, 0x92,
+ 0x3d, 0xfd, 0x13, 0x74, 0x3d, 0x4f, 0xef, 0x53, 0x3d, 0x8b, 0xeb, 0x0f, 0xbd,
+ 0xf9, 0x86, 0x00, 0x3d, 0xb8, 0xd1, 0x68, 0xbc, 0x68, 0xa4, 0x1c, 0xbd, 0x96,
+ 0x27, 0x01, 0x3d, 0x28, 0x65, 0x4a, 0x3d, 0xef, 0xa3, 0x41, 0xbd, 0xdd, 0xd4,
+ 0xac, 0x3c, 0x24, 0x42, 0x48, 0x3d, 0x55, 0x49, 0x99, 0x39, 0x7a, 0x2f, 0xde,
+ 0xbc, 0x7f, 0xff, 0x94, 0x3d, 0x76, 0x44, 0x14, 0xbd, 0xea, 0xa9, 0x05, 0x3d,
+ 0xd1, 0xa5, 0x2c, 0x3d, 0xfa, 0x4f, 0x0c, 0xbd, 0xda, 0x0a, 0x6d, 0xbd, 0x52,
+ 0x92, 0x47, 0x3d, 0x8b, 0x87, 0x8b, 0x3d, 0xd0, 0x89, 0x48, 0xbd, 0xaa, 0xbe,
+ 0x03, 0x3d, 0xa0, 0x14, 0x6d, 0xbd, 0x20, 0x3a, 0x80, 0x3d, 0x08, 0x2f, 0x86,
+ 0xbd, 0xf9, 0xfd, 0xa4, 0xbd, 0xde, 0xd5, 0x92, 0xbc, 0xcd, 0x8a, 0x64, 0x3d,
+ 0x48, 0xd0, 0x6c, 0x3d, 0x6a, 0xa3, 0xfa, 0xbc, 0xc3, 0xc7, 0x36, 0xbd, 0xb1,
+ 0x87, 0x2e, 0xbd, 0x3b, 0x6c, 0x9e, 0x3d, 0x56, 0x18, 0x1a, 0xbe, 0x9e, 0xd1,
+ 0xf5, 0x3c, 0xb9, 0xfe, 0xc3, 0xbc, 0x46, 0xbc, 0x40, 0xbd, 0x94, 0x3a, 0x48,
+ 0x3d, 0xbc, 0x4e, 0xbb, 0x3d, 0xa0, 0x7b, 0x94, 0xbc, 0xd8, 0xeb, 0x91, 0x3d,
+ 0x95, 0xa1, 0x99, 0xbd, 0xf4, 0x73, 0x9c, 0x3b, 0x23, 0x2d, 0x8e, 0x3d, 0x46,
+ 0x9c, 0xa5, 0xbb, 0x61, 0x13, 0x50, 0xbd, 0xad, 0x99, 0xf8, 0x3c, 0xd2, 0xac,
+ 0x7d, 0xbd, 0xc1, 0xb2, 0x6d, 0xbc, 0xf7, 0xde, 0x9f, 0xbd, 0x60, 0x72, 0x15,
+ 0x3d, 0x69, 0xaf, 0xa2, 0x3d, 0xfd, 0x72, 0x79, 0x3d, 0xd0, 0xc0, 0xa1, 0xbb,
+ 0x80, 0x21, 0x4f, 0x3d, 0xbc, 0x91, 0x0a, 0xbc, 0x23, 0xa3, 0xee, 0xbc, 0xd0,
+ 0x1a, 0xbb, 0xbd, 0x2a, 0x71, 0x35, 0x3d, 0x21, 0x26, 0x66, 0x3d, 0xb4, 0x17,
+ 0x89, 0xbb, 0x54, 0x4f, 0x80, 0xbc, 0x47, 0x10, 0xf3, 0xbc, 0x22, 0x75, 0x6c,
+ 0x3d, 0xb1, 0x75, 0x00, 0x3d, 0xe2, 0xf4, 0xf5, 0xbd, 0xbe, 0xbc, 0x7b, 0x3d,
+ 0xe3, 0x01, 0xc1, 0xbc, 0x05, 0x25, 0x82, 0xbb, 0x3f, 0x02, 0x5d, 0xbb, 0xa9,
+ 0xc1, 0x5a, 0x3d, 0xea, 0xe4, 0x5e, 0x3c, 0x96, 0xd6, 0xa5, 0x3c, 0xcb, 0x77,
+ 0xa4, 0x3c, 0xb2, 0x4f, 0x06, 0xbd, 0x84, 0xc3, 0x2c, 0xbd, 0x48, 0xdc, 0x9d,
+ 0x3b, 0xdb, 0xd6, 0xbb, 0xbc, 0xc8, 0xdf, 0x98, 0xbc, 0x29, 0x14, 0x31, 0x3d,
+ 0x6f, 0xfa, 0x4f, 0xbd, 0x7c, 0xb4, 0xaa, 0xbd, 0xe0, 0xeb, 0x2e, 0xbd, 0x53,
+ 0x3f, 0xc4, 0x3d, 0xbc, 0xcb, 0x38, 0x3d, 0x30, 0x45, 0x30, 0x3c, 0xf0, 0xc1,
+ 0x0c, 0xbd, 0xb3, 0x20, 0x39, 0xbd, 0x80, 0xe2, 0x8b, 0x3b, 0x35, 0x31, 0x05,
+ 0xbd, 0xf5, 0xaa, 0x49, 0xbc, 0x7d, 0x08, 0x0a, 0x3d, 0xdd, 0x96, 0x84, 0xbc,
+ 0x0f, 0xb9, 0x4c, 0x3d, 0x49, 0xea, 0x86, 0x3d, 0xc9, 0xd0, 0x75, 0xbb, 0xcd,
+ 0x9b, 0xd1, 0x3d, 0x7a, 0x5e, 0x6f, 0xbd, 0x4a, 0x2e, 0xc0, 0xba, 0x3b, 0x7d,
+ 0x7d, 0xbd, 0x2b, 0x8f, 0xfe, 0xbb, 0x2a, 0xf4, 0xce, 0x3d, 0xf6, 0xfc, 0x06,
+ 0xbc, 0xdd, 0x02, 0x4a, 0x3c, 0x71, 0x3c, 0x03, 0xbd, 0x03, 0x9a, 0x90, 0xbd,
+ 0x76, 0xb7, 0xb3, 0xbd, 0xa2, 0xd1, 0x47, 0xbd, 0xc1, 0x56, 0x6e, 0x3d, 0xff,
+ 0x97, 0x57, 0x3d, 0x50, 0x57, 0xe6, 0xbc, 0x8f, 0xb3, 0x3d, 0xbd, 0x75, 0x8e,
+ 0x80, 0xbd, 0xc7, 0x6c, 0x43, 0xbc, 0xaa, 0xe3, 0x9d, 0xbd, 0x6f, 0xe4, 0x1d,
+ 0x3d, 0x3a, 0x57, 0x98, 0x3c, 0x6c, 0x08, 0x5c, 0x3d, 0xeb, 0xd2, 0xa5, 0xbb,
+ 0xf7, 0x60, 0x08, 0xbc, 0x72, 0x03, 0x3b, 0xbd, 0xe7, 0xc1, 0x8f, 0x3d, 0xb6,
+ 0x1f, 0x98, 0x3d, 0x59, 0xff, 0x88, 0x3d, 0x51, 0xe9, 0x73, 0xbc, 0x1f, 0x91,
+ 0xa5, 0x3d, 0x3b, 0x64, 0x17, 0xbd, 0x5b, 0xa5, 0x80, 0x3d, 0x03, 0x38, 0x85,
+ 0x3d, 0xbe, 0x27, 0x90, 0xbd, 0x4e, 0x87, 0xa3, 0xbc, 0xc1, 0xbb, 0x22, 0xbc,
+ 0x8b, 0x25, 0xd0, 0xbb, 0x6a, 0x2f, 0x1d, 0x3d, 0x0a, 0xdd, 0x48, 0x3d, 0x0b,
+ 0x37, 0x37, 0x3d, 0x2a, 0x68, 0x1a, 0x3d, 0xc8, 0x85, 0x4a, 0x3d, 0x0a, 0xa5,
+ 0x03, 0x3c, 0xd2, 0x41, 0x12, 0x3d, 0x25, 0xc3, 0x24, 0x3b, 0x1a, 0x95, 0x33,
+ 0x3d, 0xbf, 0xfd, 0xd7, 0x3c, 0xce, 0xff, 0x6e, 0xbc, 0x91, 0xc5, 0x0f, 0x3c,
+ 0x7e, 0x5f, 0x64, 0xbd, 0x64, 0x7d, 0x1c, 0xbd, 0x42, 0x2d, 0xba, 0x3d, 0x99,
+ 0x69, 0xa5, 0x3c, 0x39, 0x7d, 0x72, 0xbd, 0x6a, 0xbf, 0x8f, 0x3b, 0xaa, 0x43,
+ 0x02, 0x3d, 0xb7, 0xb7, 0x35, 0xbd, 0x97, 0xaf, 0x6c, 0x3c, 0x62, 0x39, 0xd6,
+ 0xbc, 0x33, 0xd6, 0x85, 0x3d, 0x4c, 0x50, 0x47, 0x3d, 0x26, 0x4b, 0x57, 0x3d,
+ 0xf8, 0x80, 0x15, 0x3c, 0x9e, 0x69, 0x05, 0xbc, 0xa4, 0x13, 0xb5, 0x3d, 0x41,
+ 0x17, 0xda, 0xbd, 0x48, 0x79, 0x2b, 0xbb, 0xb4, 0x86, 0xcc, 0xbb, 0xad, 0x20,
+ 0x95, 0xbd, 0x20, 0xf5, 0x01, 0x3e, 0x23, 0x9e, 0x9b, 0x3d, 0xdb, 0xfe, 0x38,
+ 0x3b, 0x23, 0x42, 0x57, 0x3b, 0x42, 0x99, 0x59, 0x3d, 0xf2, 0x9d, 0xba, 0xbd,
+ 0x92, 0xe5, 0x5d, 0x3d, 0x20, 0x17, 0x07, 0xbb, 0xf0, 0x57, 0x08, 0x3d, 0x7d,
+ 0xed, 0x91, 0xbc, 0x2e, 0xc4, 0x8d, 0xbd, 0xdb, 0x15, 0xc2, 0x3c, 0xaa, 0xc3,
+ 0xe6, 0xbb, 0x90, 0x5d, 0xb4, 0xbc, 0xee, 0xaa, 0x9a, 0x3d, 0x74, 0x6d, 0x22,
+ 0xbb, 0x00, 0x65, 0xc2, 0xb9, 0x37, 0x30, 0x07, 0xbd, 0x85, 0xbd, 0x60, 0xbb,
+ 0x2b, 0x40, 0xd7, 0x3c, 0xca, 0x82, 0x33, 0xbd, 0x29, 0xb2, 0x81, 0x3d, 0x08,
+ 0xee, 0xd5, 0x3c, 0x28, 0x34, 0xdf, 0x3c, 0x3d, 0x41, 0x67, 0xbd, 0x0c, 0x1e,
+ 0xf7, 0x3c, 0x9c, 0x86, 0xe4, 0x3c, 0x36, 0x7c, 0x07, 0x3d, 0xc7, 0x27, 0x04,
+ 0xbd, 0x45, 0xcb, 0x77, 0x3d, 0xcf, 0x66, 0x14, 0xbd, 0x29, 0xae, 0x3f, 0xbd,
+ 0x70, 0x86, 0x25, 0xbc, 0x08, 0xc9, 0xa6, 0x3c, 0x70, 0xa3, 0xa8, 0xbb, 0xbe,
+ 0x82, 0x49, 0x3d, 0x13, 0xa1, 0x73, 0xbd, 0xd5, 0x6c, 0x35, 0xbd, 0x98, 0xfa,
+ 0x3a, 0x3c, 0xff, 0x0c, 0xe2, 0xb9, 0x37, 0xe9, 0xf2, 0xbb, 0x78, 0x2d, 0x89,
+ 0xbd, 0xec, 0x2c, 0x88, 0xbc, 0x97, 0x7f, 0x2e, 0x3d, 0x9e, 0x32, 0x88, 0xbd,
+ 0x17, 0xdb, 0x20, 0xbd, 0xde, 0xbd, 0xc7, 0x3b, 0x30, 0x01, 0xf4, 0x3c, 0xf8,
+ 0x47, 0x05, 0xbd, 0xab, 0x0c, 0xdf, 0x3c, 0x8b, 0xdc, 0xa5, 0x3c, 0x62, 0x53,
+ 0x78, 0xbd, 0xf1, 0x6e, 0x56, 0x3d, 0x1e, 0xf2, 0x79, 0x3d, 0x0a, 0xce, 0x9b,
+ 0xbc, 0x18, 0xed, 0xaf, 0x3c, 0xd1, 0x1d, 0x8a, 0x3d, 0x78, 0xe8, 0x6e, 0x3c,
+ 0x1d, 0x2a, 0x84, 0x3d, 0x90, 0xb3, 0x80, 0x3d, 0x26, 0x1f, 0x74, 0x3d, 0x14,
+ 0xc6, 0x79, 0xbb, 0x37, 0x9d, 0x18, 0x3d, 0x1a, 0x28, 0x86, 0x3d, 0x8b, 0x8e,
+ 0x0f, 0xbd, 0x50, 0x3e, 0x82, 0xbc, 0x6f, 0x35, 0x70, 0xbd, 0xa5, 0xa6, 0x88,
+ 0x3d, 0xb6, 0xe7, 0x2a, 0xbd, 0x57, 0x46, 0x0a, 0x3d, 0xd6, 0xba, 0x34, 0xbd,
+ 0xc2, 0xf8, 0xc1, 0xbc, 0x2e, 0xe5, 0x30, 0xbd, 0xd5, 0x76, 0x85, 0x3d, 0xb4,
+ 0xeb, 0x88, 0xbd, 0xb5, 0x44, 0x40, 0x3d, 0x08, 0x9a, 0x8f, 0xbd, 0xe4, 0xa2,
+ 0xdf, 0x3c, 0x40, 0x83, 0xaf, 0x3a, 0xe0, 0xfb, 0x20, 0x3b, 0x84, 0xc3, 0xf1,
+ 0x3c, 0x13, 0x24, 0x88, 0xbd, 0x03, 0x21, 0x4a, 0xbd, 0xd6, 0x14, 0x39, 0x3d,
+ 0x10, 0x2c, 0x84, 0xbd, 0x47, 0xe0, 0xed, 0xbc, 0x8e, 0xfd, 0x91, 0xbc, 0x0e,
+ 0x42, 0x93, 0xbc, 0xe4, 0x43, 0x6b, 0x3d, 0x96, 0xc7, 0x36, 0x3d, 0xb0, 0xc2,
+ 0xac, 0xbb, 0x28, 0x29, 0x74, 0x3d, 0xf0, 0x10, 0xb5, 0xbb, 0x09, 0x5e, 0x6c,
+ 0x3d, 0xc3, 0xa9, 0x97, 0x3c, 0x4f, 0xc1, 0x9c, 0x3c, 0x4e, 0xc4, 0xf0, 0x3c,
+ 0x4e, 0x42, 0xfa, 0xbc, 0x9a, 0x53, 0x79, 0x3c, 0x9e, 0xc3, 0xd8, 0xbc, 0xfe,
+ 0x1e, 0x57, 0x3c, 0xa2, 0xec, 0x3f, 0xba, 0xfa, 0x34, 0x12, 0x3d, 0x43, 0x1c,
+ 0xd4, 0x3c, 0xf3, 0x3f, 0xa5, 0x3a, 0xda, 0xa7, 0x96, 0xbd, 0x6a, 0x5f, 0x2a,
+ 0x3d, 0xbd, 0x83, 0xd3, 0xbb, 0xb8, 0x9c, 0x5b, 0xbd, 0x67, 0xbb, 0x2d, 0x3c,
+ 0x44, 0x9a, 0xb0, 0xbc, 0x5c, 0x1b, 0xe6, 0x3c, 0x10, 0xfd, 0x67, 0xbd, 0x3b,
+ 0x8e, 0x94, 0xbd, 0xf3, 0x97, 0xca, 0xbb, 0x3a, 0xae, 0x3f, 0x3c, 0xd2, 0xbe,
+ 0x81, 0x3d, 0xd7, 0x2c, 0x86, 0xbd, 0x48, 0xc8, 0xbf, 0xbc, 0x00, 0x15, 0x5e,
+ 0xbc, 0x43, 0x09, 0x1d, 0x3d, 0x3d, 0xe7, 0x75, 0xbd, 0x38, 0xe4, 0x5f, 0x3c,
+ 0x8f, 0xe1, 0x09, 0x3d, 0xab, 0xa4, 0x16, 0xbd, 0x69, 0x15, 0x35, 0x3d, 0x6d,
+ 0x6a, 0x20, 0xbd, 0xa1, 0xd2, 0x9b, 0xbb, 0x89, 0xfb, 0xd1, 0x3c, 0x91, 0x05,
+ 0x82, 0x3d, 0x5c, 0x10, 0x3c, 0xbd, 0x7e, 0x4d, 0x5d, 0x3d, 0x5a, 0xac, 0x44,
+ 0xbc, 0xe5, 0x82, 0xfd, 0xbc, 0xd7, 0xc2, 0x82, 0xbd, 0xe7, 0xd3, 0x5f, 0x3d,
+ 0x3e, 0x16, 0x1e, 0x3d, 0x72, 0xcf, 0x9c, 0xbd, 0xf9, 0x44, 0xa2, 0xbc, 0x1c,
+ 0x64, 0x69, 0xba, 0x9e, 0xc1, 0x01, 0x3c, 0x07, 0xc9, 0x81, 0xbd, 0x18, 0x75,
+ 0x25, 0xbd, 0x12, 0x0b, 0xfd, 0xbc, 0x00, 0x54, 0xd5, 0x38, 0x73, 0x47, 0x85,
+ 0xbd, 0xaa, 0x08, 0x68, 0x3d, 0xa5, 0xf5, 0xa8, 0xbc, 0xd7, 0xea, 0x16, 0x3d,
+ 0x38, 0x81, 0x2a, 0xbd, 0xb0, 0x44, 0x45, 0x3d, 0xe6, 0x66, 0x71, 0x3d, 0x39,
+ 0x4d, 0x58, 0xbc, 0x6c, 0xd5, 0xbc, 0xbc, 0x40, 0x65, 0xab, 0x3c, 0x92, 0x4f,
+ 0x83, 0x3d, 0x46, 0xb4, 0x83, 0x3d, 0xf3, 0x7b, 0x5e, 0xbd, 0x8f, 0x77, 0x98,
+ 0xbc, 0x28, 0xd3, 0xe2, 0xbc, 0xa8, 0x94, 0xdc, 0xbc, 0xdc, 0x3a, 0x03, 0x39,
+ 0x6e, 0xd2, 0x81, 0x3c, 0x49, 0x64, 0xb8, 0xbc, 0xdb, 0x96, 0x03, 0xbd, 0xeb,
+ 0x90, 0x4c, 0x3d, 0xcc, 0xc7, 0x45, 0xbc, 0xca, 0xbc, 0x4a, 0xbd, 0xcc, 0xf4,
+ 0x90, 0x3c, 0x1e, 0x78, 0x93, 0x3b, 0xe8, 0x46, 0x68, 0xbd, 0x02, 0xe7, 0x78,
+ 0xbc, 0x95, 0x12, 0x48, 0xbd, 0x36, 0xd3, 0x60, 0xbd, 0x0b, 0x6a, 0x1c, 0x3d,
+ 0x9c, 0xa6, 0xb4, 0x3c, 0x20, 0xe6, 0xca, 0x3c, 0x52, 0x5e, 0x97, 0xbd, 0xe8,
+ 0x0f, 0x10, 0xbd, 0x01, 0xe8, 0x51, 0xbd, 0xf1, 0x2a, 0x0e, 0xbd, 0x1d, 0x03,
+ 0x85, 0x3a, 0x00, 0x7f, 0x50, 0x3d, 0x5a, 0x91, 0xd7, 0xbc, 0xc5, 0x55, 0x3b,
+ 0x3d, 0xd6, 0x47, 0x8a, 0xbd, 0x2d, 0x40, 0x80, 0x3d, 0x49, 0x84, 0xd9, 0xbb,
+ 0x2c, 0x7d, 0x5a, 0x3d, 0x94, 0x2d, 0xcd, 0x3c, 0x84, 0xe9, 0x90, 0xbd, 0x67,
+ 0xf2, 0x95, 0xbd, 0xf6, 0x29, 0x12, 0xbd, 0x7b, 0x2e, 0x64, 0x3d, 0xf5, 0x42,
+ 0x01, 0xbd, 0x42, 0x57, 0x2b, 0x3d, 0x0d, 0xd5, 0x99, 0xbd, 0xdf, 0xd5, 0x4b,
+ 0xbd, 0xc4, 0x97, 0x4a, 0xbd, 0xb1, 0xb5, 0xa0, 0x3c, 0x97, 0xa5, 0x13, 0xbb,
+ 0xda, 0x02, 0x11, 0x3d, 0x6e, 0x22, 0xce, 0xbb, 0x9f, 0x3e, 0xf0, 0x3c, 0x92,
+ 0x5d, 0xb5, 0xbc, 0xda, 0x5e, 0x45, 0x3d, 0x53, 0x93, 0x0a, 0x3d, 0xa4, 0xf0,
+ 0x8b, 0x3c, 0x4a, 0x4c, 0x04, 0x3d, 0x76, 0xc7, 0x8e, 0x3c, 0x55, 0xba, 0x39,
+ 0x3c, 0xa5, 0xed, 0x8c, 0xbd, 0x16, 0x33, 0x80, 0xbd, 0x32, 0xd7, 0x3b, 0x3d,
+ 0x07, 0xe9, 0x62, 0xbd, 0x6e, 0x01, 0x76, 0x3d, 0x42, 0x8b, 0x5e, 0xbd, 0x30,
+ 0x56, 0x07, 0x3d, 0x2c, 0x8b, 0xdb, 0xbc, 0xaf, 0xff, 0x8f, 0xbd, 0xf3, 0x4a,
+ 0x5d, 0xbd, 0xb0, 0x52, 0xb7, 0x3b, 0x29, 0x47, 0x9c, 0xbc, 0x5a, 0x8d, 0x30,
+ 0xbd, 0x71, 0xf8, 0x07, 0x3d, 0xc0, 0x46, 0x27, 0xbd, 0x93, 0x7d, 0x89, 0xbc,
+ 0xd2, 0x61, 0x39, 0x3d, 0x8d, 0x18, 0x69, 0x3c, 0x43, 0xd6, 0x18, 0xbc, 0x00,
+ 0x37, 0x0f, 0xba, 0x68, 0x4c, 0x4a, 0x3d, 0x4a, 0x6d, 0x6c, 0xbd, 0x63, 0x4a,
+ 0x7c, 0xbc, 0x0e, 0xed, 0x6b, 0xbd, 0x43, 0xc3, 0x97, 0xbd, 0xd0, 0x48, 0xa4,
+ 0xbb, 0xb4, 0x48, 0xa0, 0x3c, 0x89, 0x3c, 0x89, 0xbd, 0x00, 0xa7, 0xb4, 0x39,
+ 0xe2, 0xd3, 0x5e, 0x3d, 0x19, 0x2b, 0x10, 0xbc, 0x46, 0xef, 0x9a, 0xbd, 0x1c,
+ 0x32, 0xac, 0x3c, 0xe2, 0x57, 0x4b, 0x3d, 0xf7, 0x44, 0x41, 0x3d, 0x84, 0x06,
+ 0x89, 0xbc, 0x20, 0xf0, 0xb7, 0x3b, 0x3a, 0x7b, 0x50, 0x3d, 0xc0, 0xe4, 0x59,
+ 0xbd, 0x06, 0x58, 0x19, 0x3d, 0x80, 0x23, 0xe1, 0x3b, 0xe2, 0xdc, 0x8c, 0xbd,
+ 0xdc, 0x0a, 0x84, 0x3d, 0x96, 0xfe, 0x23, 0xbb, 0x45, 0x27, 0x40, 0xbd, 0x5d,
+ 0xc4, 0x0f, 0x3d, 0xcc, 0xe2, 0xab, 0xbc, 0x64, 0xec, 0xf8, 0xbc, 0x5e, 0x9d,
+ 0x1f, 0xbd, 0xa4, 0x84, 0x16, 0xbd, 0x26, 0x34, 0x99, 0xbd, 0xeb, 0x94, 0x91,
+ 0x3d, 0xae, 0x2b, 0x25, 0x3d, 0x7d, 0x8a, 0x2c, 0x3d, 0x65, 0xdb, 0xa1, 0xbc,
+ 0xb9, 0x5c, 0x2a, 0x3d, 0xe4, 0x06, 0x1d, 0xbb, 0xb6, 0xca, 0x17, 0x3d, 0xc8,
+ 0xd8, 0x12, 0x3d, 0x5c, 0xf3, 0x28, 0xbd, 0x44, 0x6b, 0x85, 0xbc, 0xa0, 0x1c,
+ 0x05, 0x3b, 0x1e, 0x13, 0x49, 0x3d, 0xd0, 0xbc, 0x07, 0x3d, 0xe4, 0xe8, 0x33,
+ 0x3c, 0xe1, 0xbe, 0x4c, 0x3d, 0xcf, 0xa9, 0x0d, 0x3c, 0x52, 0x61, 0x62, 0x3d,
+ 0x2e, 0x19, 0x63, 0x3d, 0xbe, 0x72, 0x86, 0x3d, 0x20, 0x7b, 0x34, 0x3c, 0xa0,
+ 0x1b, 0x6d, 0xbb, 0xbe, 0xdf, 0xd9, 0x3a, 0x6b, 0xae, 0x4e, 0x3d, 0x3b, 0x38,
+ 0x7d, 0xbd, 0xa1, 0xee, 0x3b, 0x3d, 0x51, 0x91, 0x37, 0x3b, 0x26, 0x34, 0xe4,
+ 0xbc, 0x13, 0x50, 0x8c, 0xbd, 0x5b, 0x2d, 0x52, 0xbd, 0xb3, 0xf6, 0x5d, 0xbc,
+ 0x82, 0x69, 0x3f, 0xbb, 0xf3, 0x6b, 0x14, 0x3d, 0xe8, 0x54, 0x9a, 0x3c, 0x42,
+ 0xa5, 0x35, 0x3d, 0x99, 0x10, 0x0b, 0xbc, 0x87, 0x55, 0x2d, 0xbd, 0x1f, 0x1a,
+ 0x16, 0xbd, 0x99, 0xaa, 0x16, 0xbc, 0x1a, 0x04, 0x3e, 0xbd, 0x62, 0x5f, 0x12,
+ 0x3d, 0xea, 0x90, 0x18, 0x3d, 0x32, 0x9f, 0x17, 0x3d, 0x1c, 0x6f, 0xba, 0x3c,
+ 0xce, 0xe2, 0x13, 0x3d, 0x47, 0xa2, 0xdb, 0xbc, 0xf7, 0x85, 0x4f, 0xbd, 0x24,
+ 0x60, 0xc8, 0xbc, 0xea, 0x00, 0x5e, 0xbd, 0x08, 0x73, 0x58, 0x3d, 0xf3, 0x42,
+ 0x85, 0xbd, 0x0e, 0xcd, 0x91, 0xbd, 0x3c, 0xba, 0xb1, 0xbc, 0x48, 0x41, 0x01,
+ 0x3d, 0xb1, 0xcf, 0x64, 0x3d, 0x6f, 0x25, 0x9a, 0xbc, 0xda, 0xaa, 0xce, 0x3c,
+ 0x22, 0x5f, 0x62, 0x3d, 0xf9, 0x36, 0x9b, 0xbd, 0x85, 0x6f, 0x81, 0x3d, 0x22,
+ 0xd8, 0x2e, 0xbd, 0x72, 0x49, 0x19, 0xbd, 0x21, 0x3c, 0xb9, 0xba, 0xc5, 0x69,
+ 0x8a, 0xbd, 0x68, 0xec, 0x08, 0xbd, 0xd9, 0x7e, 0x06, 0xbd, 0x0e, 0xa4, 0x36,
+ 0x3d, 0x9e, 0xbb, 0x65, 0xbd, 0xaf, 0x04, 0x81, 0x3d, 0x07, 0xa0, 0x7b, 0xbd,
+ 0xa7, 0x30, 0x51, 0xbd, 0x15, 0x8e, 0x05, 0x3c, 0xe0, 0x7a, 0x7c, 0x3c, 0x43,
+ 0x90, 0x04, 0x3d, 0x00, 0xf1, 0x4b, 0xbb, 0xe0, 0xe9, 0x29, 0x3b, 0x6f, 0x91,
+ 0x1d, 0xbd, 0xff, 0xc5, 0xd0, 0x3c, 0x6b, 0x02, 0xe3, 0x3c, 0xba, 0x1f, 0x53,
+ 0xbc, 0x0e, 0xd5, 0x7e, 0x3d, 0x54, 0xe0, 0x97, 0xbc, 0x00, 0x7a, 0xf2, 0xb9,
+ 0x66, 0x00, 0x84, 0x3d, 0x62, 0x17, 0x08, 0xbd, 0x5a, 0x30, 0x46, 0x3d, 0x75,
+ 0xb1, 0x37, 0xbd, 0x6f, 0x28, 0x55, 0x3c, 0xe0, 0xc4, 0x82, 0xbd, 0xfc, 0xf5,
+ 0xb2, 0xbc, 0x96, 0xdc, 0x0a, 0xbb, 0x83, 0x2a, 0x91, 0x3c, 0x29, 0x21, 0x40,
+ 0x3d, 0xff, 0x1f, 0x9c, 0xbd, 0x82, 0xb2, 0x5d, 0x3d, 0x8e, 0x14, 0x2c, 0x3d,
+ 0xec, 0xb2, 0xed, 0xbc, 0xb8, 0xa0, 0x3a, 0xbc, 0x66, 0x70, 0x11, 0xbc, 0x49,
+ 0xa6, 0xd0, 0xbc, 0x55, 0x34, 0x14, 0xbc, 0xb4, 0x65, 0x80, 0x3d, 0x76, 0x98,
+ 0x87, 0xbd, 0x23, 0x3d, 0xa2, 0x3c, 0xaa, 0xc5, 0x7e, 0x3d, 0xb7, 0x41, 0x91,
+ 0xbd, 0x9f, 0xe6, 0x80, 0xbd, 0x20, 0x0a, 0x13, 0x3c, 0xc8, 0xa0, 0xf3, 0x3c,
+ 0x51, 0xf3, 0x04, 0x3d, 0x61, 0x7e, 0x0c, 0x3d, 0xbe, 0x25, 0x47, 0x3d, 0x25,
+ 0x2b, 0x2b, 0x3d, 0xa9, 0x7a, 0x3f, 0xbd, 0xc2, 0xd4, 0xe3, 0xbc, 0x67, 0xc5,
+ 0x79, 0x3d, 0x10, 0x4b, 0xb0, 0x3c, 0xb8, 0xd1, 0x87, 0x3c, 0xd3, 0x7b, 0x54,
+ 0xbd, 0x81, 0x81, 0xcc, 0x3c, 0x85, 0x81, 0x15, 0x3d, 0xaa, 0xa8, 0xb0, 0x3b,
+ 0x4b, 0x90, 0xae, 0x3c, 0xaa, 0x38, 0x0f, 0x3d, 0x92, 0x82, 0x0a, 0xbd, 0xfd,
+ 0x99, 0x51, 0x3d, 0x90, 0x87, 0x0b, 0xbd, 0xc6, 0x71, 0x58, 0xbd, 0x4f, 0x17,
+ 0x86, 0x38, 0x03, 0x9a, 0x00, 0xbd, 0xeb, 0xae, 0x34, 0xbd, 0xab, 0x28, 0x19,
+ 0x3b, 0xc5, 0x48, 0x6c, 0xbd, 0x4a, 0xa3, 0x7c, 0xbd, 0x1f, 0xe7, 0x00, 0x3c,
+ 0xf4, 0xd8, 0xd8, 0x3c, 0xbc, 0x01, 0x59, 0xbd, 0xa9, 0x77, 0xb5, 0xbb, 0x67,
+ 0xc3, 0x82, 0x3d, 0x37, 0xd8, 0x8c, 0x3d, 0xea, 0x92, 0x59, 0x3d, 0x30, 0x97,
+ 0x31, 0x3d, 0x36, 0xb9, 0x23, 0xbb, 0x98, 0x99, 0x7f, 0xbd, 0x0b, 0xfd, 0x8e,
+ 0xbc, 0x80, 0xc6, 0x5c, 0xbd, 0xb2, 0xf0, 0x76, 0x3d, 0x7e, 0x01, 0xe5, 0xbc,
+ 0x0a, 0x94, 0x08, 0x3d, 0xb2, 0x9b, 0x7b, 0xbd, 0xdc, 0x27, 0x6b, 0xbd, 0x32,
+ 0x1e, 0x41, 0x3d, 0x4b, 0xd8, 0x8a, 0xbd, 0xe6, 0xdc, 0xd5, 0x3c, 0x72, 0xfd,
+ 0x09, 0xbd, 0x33, 0x80, 0xc5, 0xba, 0xbc, 0xdd, 0xc0, 0x3b, 0xf4, 0x31, 0x9a,
+ 0xbd, 0x29, 0x45, 0xd9, 0x3c, 0x02, 0x33, 0xd8, 0xbc, 0x97, 0x48, 0x73, 0x3d,
+ 0x7f, 0x13, 0x88, 0xbd, 0x9b, 0xed, 0x40, 0xbd, 0xae, 0x86, 0x7d, 0xbd, 0xea,
+ 0xa5, 0x4a, 0x3b, 0x8d, 0xd4, 0xd8, 0x3c, 0x57, 0xc1, 0x28, 0xbc, 0x6a, 0xb8,
+ 0x15, 0x3d, 0x30, 0xb0, 0xdc, 0xbb, 0x71, 0x34, 0x05, 0xbd, 0x39, 0x9c, 0x8a,
+ 0x3d, 0x98, 0xdd, 0x45, 0xbc, 0xf1, 0xcc, 0xcb, 0xbc, 0xe1, 0xf6, 0xd8, 0x3c,
+ 0xae, 0xb9, 0x18, 0xbb, 0x67, 0x50, 0x82, 0x3d, 0x20, 0x71, 0x82, 0x3d, 0x0e,
+ 0x45, 0x4a, 0xbd, 0x30, 0x86, 0xbe, 0xbb, 0x60, 0xc7, 0x07, 0x3d, 0xdb, 0xf7,
+ 0x04, 0xbd, 0x9a, 0xc3, 0xb2, 0xbc, 0xe0, 0x58, 0xf5, 0xbc, 0x12, 0x0a, 0x48,
+ 0x3d, 0xf7, 0x85, 0x2e, 0x3d, 0xab, 0x2b, 0xe6, 0x3b, 0xed, 0x4c, 0x15, 0xbc,
+ 0x99, 0x4b, 0xb1, 0xbc, 0xa1, 0x82, 0x09, 0x3d, 0x8b, 0x84, 0x09, 0xbd, 0x85,
+ 0x5a, 0x38, 0xbb, 0x83, 0xc7, 0x80, 0xbd, 0xfe, 0xf3, 0x67, 0xbd, 0x6e, 0x25,
+ 0x6f, 0x3d, 0x00, 0xa4, 0xf8, 0xbc, 0x3a, 0x24, 0x17, 0xbc, 0xb2, 0x0d, 0x8a,
+ 0x3c, 0x87, 0xac, 0x69, 0x3d, 0xcd, 0x5f, 0x89, 0xbc, 0x9e, 0x08, 0x7d, 0xbd,
+ 0x4c, 0xa4, 0xa0, 0xbc, 0x63, 0x21, 0x2c, 0x3d, 0x5a, 0x78, 0x71, 0xbd, 0xa2,
+ 0xe8, 0x71, 0x3d, 0x2b, 0xc9, 0xc1, 0xbb, 0x6f, 0x4f, 0x78, 0xbd, 0xa9, 0xee,
+ 0xdf, 0x3c, 0x3c, 0xe2, 0xb3, 0xbc, 0x64, 0xa2, 0x7d, 0xbc, 0xcc, 0x2c, 0x35,
+ 0x3d, 0xfd, 0x8c, 0x86, 0x3d, 0xe9, 0x57, 0xf3, 0x3c, 0xc1, 0x84, 0x82, 0x3d,
+ 0x8e, 0x7a, 0x6c, 0xbd, 0xf1, 0x40, 0x04, 0x3d, 0x7e, 0x17, 0x5b, 0x3d, 0x74,
+ 0xba, 0x83, 0x3a, 0x6f, 0x01, 0x86, 0xbd, 0x62, 0x58, 0x69, 0xbd, 0x33, 0xcd,
+ 0x07, 0x3d, 0x6e, 0xc5, 0x8c, 0xbd, 0x5a, 0x4c, 0x99, 0x3c, 0x87, 0xb8, 0xf0,
+ 0x3c, 0xc1, 0x64, 0x8a, 0x3c, 0x4c, 0x69, 0x23, 0xbd, 0x93, 0x75, 0x80, 0x3d,
+ 0x54, 0x27, 0x87, 0xbd, 0xdc, 0x3e, 0x62, 0x3d, 0x9e, 0xdb, 0x43, 0xbc, 0x03,
+ 0xd4, 0x65, 0xbd, 0x4c, 0xb6, 0x59, 0x3d, 0xc4, 0xa1, 0xe8, 0xbc, 0xf3, 0xdc,
+ 0x87, 0x3d, 0xf5, 0x34, 0x82, 0xbc, 0x4e, 0x2d, 0xe2, 0x3b, 0xd6, 0x1e, 0x3d,
+ 0xbd, 0xea, 0x0c, 0x83, 0x3d, 0x34, 0x3e, 0x20, 0xbd, 0xb6, 0x87, 0x77, 0x3c,
+ 0x9c, 0x9a, 0xe4, 0xba, 0x48, 0x21, 0xa5, 0xbc, 0xb3, 0x81, 0x89, 0x3d, 0xf4,
+ 0x2c, 0x49, 0x3d, 0x98, 0xb5, 0xd6, 0xbc, 0x88, 0xdb, 0x30, 0xbd, 0xa4, 0x2f,
+ 0x88, 0xbc, 0x67, 0xc1, 0xb6, 0xbc, 0x8e, 0xba, 0xb8, 0xbc, 0xdd, 0x22, 0xc2,
+ 0x3c, 0xaf, 0x08, 0x8f, 0x3b, 0xa5, 0x85, 0xcb, 0xbc, 0x26, 0x24, 0x2c, 0x3d,
+ 0x2c, 0x73, 0x35, 0x3c, 0xf9, 0xb2, 0xaf, 0xbb, 0xf2, 0x50, 0x2f, 0xbd, 0x15,
+ 0x10, 0x31, 0x3c, 0x75, 0xdb, 0x67, 0x3d, 0x5c, 0xe2, 0xfe, 0x3c, 0x51, 0xe0,
+ 0x8d, 0x3d, 0x1c, 0x25, 0xb9, 0x3c, 0xcf, 0x20, 0x80, 0x3d, 0x5c, 0x61, 0xdf,
+ 0x3c, 0x9a, 0x2e, 0x5d, 0x3d, 0x4d, 0x63, 0xd8, 0x3c, 0x23, 0x0e, 0x32, 0xbc,
+ 0x6a, 0xaa, 0x61, 0x3d, 0xa3, 0x74, 0x86, 0xbd, 0x60, 0x32, 0x73, 0x3b, 0xe3,
+ 0x8b, 0x73, 0xbc, 0x6d, 0x26, 0x40, 0x3d, 0x8c, 0xbb, 0xbf, 0xbb, 0x4f, 0x89,
+ 0xf9, 0x3c, 0x6a, 0xfe, 0x0b, 0x3d, 0x43, 0x89, 0x3f, 0xbd, 0xe6, 0x1f, 0xda,
+ 0xbc, 0xdf, 0x48, 0x36, 0xbd, 0xd8, 0x5a, 0x8f, 0xbd, 0x58, 0x20, 0xfc, 0x3c,
+ 0xec, 0xc0, 0x69, 0x3d, 0xc9, 0x17, 0x06, 0xbd, 0xc1, 0x2b, 0xd9, 0x3b, 0xba,
+ 0x7f, 0x73, 0x3a, 0xde, 0xd4, 0xbd, 0xbc, 0x9f, 0x94, 0xd6, 0x3c, 0xfe, 0xb3,
+ 0x56, 0x3c, 0xbd, 0xda, 0xd0, 0xbc, 0x9c, 0x13, 0x6c, 0xbc, 0x10, 0x12, 0xab,
+ 0x3c, 0x94, 0x9f, 0x1d, 0xbd, 0x78, 0xbb, 0x9d, 0x3c, 0x6c, 0xca, 0x00, 0xbd,
+ 0x4c, 0xb7, 0xb8, 0x3c, 0x09, 0x38, 0xd3, 0x3c, 0x4c, 0x70, 0x91, 0x3c, 0xe9,
+ 0x6b, 0x26, 0xbc, 0x57, 0x19, 0xa4, 0x3c, 0xd2, 0xf7, 0x54, 0x3d, 0x0f, 0x9a,
+ 0x48, 0x3d, 0xd0, 0xe2, 0x8f, 0x3b, 0x58, 0x63, 0x13, 0x3c, 0x81, 0xda, 0x1b,
+ 0xbd, 0x77, 0x24, 0x83, 0x3c, 0xd7, 0x64, 0xc7, 0x3b, 0xb0, 0xf6, 0x6b, 0xbc,
+ 0x8a, 0xaa, 0x62, 0x3d, 0xa4, 0x13, 0xbb, 0xbc, 0xe8, 0x06, 0xb3, 0x3c, 0xb1,
+ 0x41, 0x77, 0x3d, 0x1c, 0xac, 0xe0, 0x3c, 0x40, 0x0f, 0x25, 0x3c, 0x89, 0xc0,
+ 0x54, 0x3c, 0xec, 0x1d, 0x7a, 0x3d, 0x41, 0x1e, 0x31, 0x3d, 0x51, 0x3e, 0x26,
+ 0x3d, 0x00, 0x55, 0x39, 0xbd, 0x2e, 0x9d, 0x7f, 0x3d, 0x2f, 0xe9, 0x4d, 0xbd,
+ 0x46, 0x85, 0x35, 0xbd, 0xa2, 0x67, 0xf8, 0x3c, 0x16, 0x0f, 0x82, 0xbd, 0xcd,
+ 0x48, 0x9a, 0x3b, 0x62, 0xd9, 0x08, 0x3d, 0x67, 0x0f, 0x5a, 0xbc, 0xd0, 0x09,
+ 0x56, 0xbc, 0x31, 0x38, 0xda, 0xbc, 0x67, 0xf7, 0xa1, 0xbc, 0x8c, 0x2a, 0x79,
+ 0xbd, 0xb3, 0xf5, 0xb1, 0xbc, 0xe8, 0xf4, 0x8b, 0xbd, 0x5f, 0x45, 0x11, 0xbd,
+ 0x9f, 0x79, 0x1e, 0xbd, 0xf5, 0xbf, 0x86, 0x3d, 0x4e, 0xd8, 0xed, 0xbc, 0xcd,
+ 0x66, 0x5b, 0x3c, 0x4a, 0x74, 0x8f, 0x3b, 0xe3, 0x98, 0x4f, 0x3d, 0x0d, 0x54,
+ 0x91, 0xbb, 0x24, 0xb6, 0x1b, 0x3d, 0xd8, 0x0d, 0xb7, 0xbc, 0x04, 0x76, 0x31,
+ 0xbd, 0x10, 0x43, 0x11, 0xbd, 0x0e, 0xc2, 0x02, 0xbd, 0x88, 0x66, 0x43, 0x3c,
+ 0xb5, 0xda, 0x95, 0xbb, 0x07, 0x09, 0x28, 0xbd, 0x22, 0xcc, 0x19, 0xbd, 0xf0,
+ 0x47, 0xfe, 0x3c, 0x10, 0x43, 0xfb, 0xbc, 0x5f, 0x5f, 0x2c, 0x3d, 0xfb, 0xce,
+ 0x18, 0xbc, 0xcd, 0x87, 0x6a, 0x3d, 0xee, 0xf6, 0x61, 0xbd, 0x37, 0x86, 0x12,
+ 0x3d, 0x4c, 0x01, 0xb7, 0x3c, 0x8c, 0x44, 0x19, 0xbd, 0xc1, 0x3d, 0xa6, 0x3c,
+ 0xcd, 0xf1, 0x5e, 0xbb, 0x9e, 0xe0, 0x41, 0x3d, 0x8c, 0xfb, 0x95, 0xbd, 0xa7,
+ 0x04, 0xc1, 0xbb, 0xcc, 0xf0, 0x25, 0xbd, 0x1c, 0x72, 0x81, 0x3c, 0x76, 0xf2,
+ 0x6d, 0x3d, 0x3b, 0xf9, 0x86, 0x3d, 0xc2, 0xbe, 0x4a, 0x3d, 0x5d, 0x80, 0x5a,
+ 0xbd, 0x63, 0x28, 0x3b, 0xbd, 0xb4, 0xb7, 0x5e, 0x3d, 0x04, 0x5b, 0x57, 0x3d,
+ 0x64, 0xac, 0x56, 0xbd, 0xb6, 0x67, 0x35, 0xbd, 0xb1, 0xc7, 0x0b, 0x3d, 0x0c,
+ 0xae, 0x2d, 0x3d, 0xcc, 0x4c, 0x7d, 0xbc, 0x2f, 0x01, 0x34, 0x3d, 0xa8, 0x4e,
+ 0x63, 0x3d, 0xa3, 0xad, 0xb8, 0xbc, 0x32, 0x0c, 0x25, 0xbd, 0x66, 0x15, 0xab,
+ 0xbc, 0x8a, 0x1a, 0x10, 0x3d, 0xca, 0xcb, 0x46, 0x3d, 0x4a, 0xe5, 0xfe, 0x3c,
+ 0x4a, 0xcc, 0xa6, 0x3c, 0x2e, 0x05, 0x4f, 0xbb, 0x31, 0xef, 0x62, 0xbc, 0xa0,
+ 0xeb, 0x7c, 0xbd, 0x49, 0x9b, 0x13, 0x3d, 0x07, 0x55, 0x82, 0x3d, 0xca, 0x81,
+ 0x1d, 0xbd, 0x67, 0xc0, 0x52, 0x3b, 0xae, 0xd6, 0x0d, 0x3d, 0x53, 0x79, 0x70,
+ 0xbd, 0x9c, 0x93, 0xa8, 0xbc, 0x5b, 0xbb, 0x58, 0x3d, 0x73, 0x1d, 0x0b, 0xbd,
+ 0xe8, 0xe9, 0x0f, 0x3d, 0x3b, 0xda, 0xbd, 0xbb, 0x66, 0x91, 0x80, 0x3d, 0x46,
+ 0xcc, 0xe8, 0xbc, 0x86, 0xe3, 0x32, 0x3d, 0x37, 0x9f, 0x5f, 0xbc, 0x9a, 0x06,
+ 0x19, 0xbd, 0xec, 0xb6, 0x78, 0xbd, 0xd9, 0xd5, 0x49, 0xbd, 0xe8, 0xf9, 0x59,
+ 0x3c, 0x48, 0x30, 0x8c, 0x3c, 0x03, 0x1d, 0x8a, 0x3d, 0x4d, 0x47, 0xc6, 0x3c,
+ 0x77, 0x88, 0x9d, 0xbd, 0x3e, 0xf0, 0x63, 0xbd, 0x83, 0x92, 0x2b, 0xbd, 0x9a,
+ 0xb0, 0x05, 0x3d, 0xee, 0x10, 0x86, 0x3c, 0xf1, 0xb2, 0x92, 0xbd, 0x2a, 0x0e,
+ 0x3f, 0xbd, 0x6c, 0xfc, 0xbb, 0xbb, 0x62, 0xee, 0x16, 0x3a, 0xf8, 0xdb, 0xa1,
+ 0x3c, 0x1c, 0xce, 0x43, 0xbd, 0xd3, 0xbf, 0x64, 0xbd, 0xe6, 0xb9, 0xc4, 0x3c,
+ 0x43, 0x6b, 0x63, 0x3c, 0xe8, 0xbd, 0x87, 0x3c, 0x95, 0x2d, 0x29, 0x3d, 0x10,
+ 0xbd, 0x7a, 0xbc, 0x26, 0xe3, 0x8e, 0xbd, 0xa1, 0x64, 0x70, 0xbd, 0xf7, 0x22,
+ 0x8f, 0x3d, 0x68, 0x73, 0x95, 0xbc, 0x33, 0x1c, 0xdb, 0xbc, 0x95, 0x44, 0x11,
+ 0x3d, 0xc5, 0x6c, 0x86, 0xbd, 0xf8, 0x9b, 0x8a, 0xbd, 0x48, 0xba, 0x13, 0x3c,
+ 0x6a, 0x54, 0x28, 0xbd, 0xd0, 0xaa, 0x15, 0xbd, 0x32, 0x4e, 0x56, 0x3d, 0x8e,
+ 0x65, 0x4b, 0x3d, 0x62, 0x4d, 0x76, 0xbc, 0x65, 0x5f, 0x05, 0x3d, 0x40, 0xb5,
+ 0xb5, 0xbb, 0x1a, 0xd6, 0x83, 0x3d, 0x9d, 0xea, 0xa7, 0x3b, 0x73, 0x19, 0x59,
+ 0x3c, 0xb2, 0x83, 0x25, 0xbd, 0x38, 0x93, 0x9e, 0x3c, 0x95, 0xe2, 0x7a, 0x3c,
+ 0xc6, 0x09, 0x95, 0xbd, 0xfe, 0x8a, 0x84, 0x3d, 0x09, 0x99, 0x8c, 0x3d, 0x3d,
+ 0xb5, 0x0e, 0xbd, 0x1e, 0x91, 0x8c, 0xbd, 0xc1, 0x52, 0xce, 0x3c, 0xc2, 0xa5,
+ 0x88, 0xbd, 0x9c, 0x3f, 0x97, 0xbd, 0x79, 0x5b, 0xd3, 0x3c, 0x20, 0xf6, 0xfd,
+ 0x3c, 0xcf, 0x37, 0x5f, 0x3c, 0x41, 0xc8, 0x6e, 0xbd, 0xa4, 0xde, 0xf8, 0x3c,
+ 0xe6, 0x88, 0x19, 0xbc, 0xe3, 0x00, 0x01, 0x3d, 0xa7, 0x4e, 0x1e, 0xbd, 0xb8,
+ 0xa1, 0x65, 0xbd, 0xbf, 0xfd, 0x81, 0xbd, 0xf0, 0x80, 0xe8, 0xbb, 0x3c, 0x62,
+ 0xdc, 0x3c, 0x02, 0x96, 0x70, 0x3d, 0x05, 0x55, 0x7d, 0xbd, 0x66, 0xb3, 0x15,
+ 0x3d, 0xa7, 0x8e, 0x16, 0xbd, 0xf5, 0xcf, 0x06, 0x3d, 0x5b, 0x78, 0xdf, 0xbc,
+ 0x54, 0xcc, 0x2c, 0xbd, 0xdc, 0x15, 0xc6, 0xbc, 0xeb, 0xaf, 0x87, 0x3d, 0x3b,
+ 0x65, 0x95, 0xbd, 0x52, 0x02, 0x65, 0x3d, 0x0a, 0x99, 0x0a, 0xbc, 0x6a, 0xfd,
+ 0x67, 0x3d, 0x00, 0x53, 0x3e, 0xbd, 0xa0, 0xbe, 0xe4, 0xbc, 0xaa, 0x76, 0xf4,
+ 0x3c, 0xd9, 0x22, 0x3c, 0xbd, 0x28, 0xa2, 0x3b, 0x3b, 0x44, 0x27, 0x7e, 0xbd,
+ 0xb3, 0xd4, 0xa8, 0x3c, 0xb3, 0x30, 0x29, 0x3b, 0xd0, 0x0f, 0x3b, 0x3b, 0x74,
+ 0x3e, 0x8a, 0xbd, 0x2f, 0x61, 0x1f, 0xbd, 0x58, 0x65, 0x4a, 0xbd, 0xd7, 0xb7,
+ 0xf8, 0xbc, 0xfd, 0x91, 0x25, 0xbd, 0xfd, 0xd2, 0x39, 0xbd, 0x49, 0xa6, 0x82,
+ 0x3d, 0xd8, 0x60, 0x04, 0x3d, 0xf8, 0x76, 0xac, 0x3c, 0x18, 0x61, 0x2d, 0xbc,
+ 0xd6, 0xf2, 0x0b, 0xbd, 0x18, 0x53, 0x01, 0x3c, 0xac, 0x10, 0xb7, 0x3c, 0x22,
+ 0xab, 0xd0, 0xbc, 0x40, 0x50, 0x3b, 0x3a, 0xf4, 0x70, 0x44, 0xbd, 0xb8, 0xaa,
+ 0x81, 0xbd, 0x09, 0x70, 0x8f, 0x3c, 0x51, 0x00, 0xc5, 0xbc, 0x41, 0x17, 0xb8,
+ 0xbc, 0xd2, 0xe1, 0x07, 0xbd, 0x58, 0xa0, 0x95, 0xbd, 0x7d, 0x24, 0x4b, 0xbd,
+ 0x47, 0x50, 0x5f, 0x3d, 0x4a, 0x41, 0x1e, 0x3d, 0xc1, 0x38, 0x21, 0xbd, 0xbd,
+ 0x82, 0x13, 0x3d, 0xdb, 0xe8, 0x4d, 0xbd, 0x76, 0x8d, 0x1d, 0xbc, 0x96, 0x2f,
+ 0x72, 0x3d, 0xa9, 0x4c, 0x56, 0xbd, 0xe3, 0x39, 0x79, 0x3d, 0xf2, 0xaa, 0x0e,
+ 0x3d, 0xee, 0xfa, 0x27, 0x3d, 0x70, 0x0c, 0x24, 0x3c, 0x3c, 0xf8, 0x7e, 0xbd,
+ 0xc2, 0x3b, 0x55, 0xbb, 0x83, 0x9c, 0xcc, 0x3b, 0x52, 0x0f, 0x5d, 0x3d, 0x86,
+ 0x3f, 0x3a, 0xbc, 0xf0, 0xbb, 0xbc, 0xbb, 0xe0, 0xff, 0xaf, 0x3c, 0x12, 0xca,
+ 0x22, 0x3c, 0xd4, 0x78, 0x41, 0xbc, 0xc9, 0xaa, 0x1f, 0xbd, 0x7c, 0x59, 0x9e,
+ 0x3a, 0x1a, 0x15, 0x4d, 0xbc, 0x25, 0x53, 0xfa, 0xbc, 0x6e, 0xbb, 0x82, 0xbc,
+ 0xc2, 0x7d, 0x8d, 0x3c, 0xa8, 0x73, 0x19, 0xbd, 0x04, 0x34, 0x4c, 0xbc, 0xbb,
+ 0x37, 0x5e, 0x3d, 0xb8, 0xc0, 0x30, 0x3d, 0xac, 0x71, 0x9d, 0xbd, 0xf8, 0x58,
+ 0x2a, 0x3b, 0xd0, 0x94, 0xa4, 0x3b, 0xeb, 0x76, 0x5a, 0xbc, 0xcf, 0x43, 0x94,
+ 0x3c, 0x48, 0x10, 0x66, 0x3d, 0x35, 0xee, 0x78, 0xbc, 0x29, 0x9a, 0x64, 0x3c,
+ 0x39, 0x2a, 0x27, 0x3d, 0xab, 0x94, 0x8a, 0x3d, 0xb2, 0x3c, 0x0f, 0xbd, 0x76,
+ 0x7f, 0x46, 0xbd, 0x68, 0xb2, 0x96, 0xbc, 0x98, 0xa2, 0x61, 0x3d, 0x97, 0x72,
+ 0x92, 0xbd, 0xde, 0xac, 0x51, 0xbd, 0x03, 0xb8, 0x74, 0x3d, 0xb5, 0x3b, 0x8a,
+ 0xbc, 0x70, 0xbf, 0x42, 0xbd, 0xf0, 0x0f, 0xf9, 0x3b, 0xb6, 0x4d, 0xc5, 0x3c,
+ 0x16, 0xeb, 0x72, 0x3d, 0x90, 0x81, 0xcd, 0xbb, 0x00, 0x8b, 0x0b, 0xbc, 0xb1,
+ 0x02, 0xa5, 0x3c, 0xee, 0xa7, 0x7d, 0xbd, 0xf0, 0x26, 0x0e, 0xbd, 0x1c, 0xb0,
+ 0x52, 0xbd, 0x80, 0xdd, 0x2f, 0xbd, 0x43, 0xbb, 0xeb, 0xbc, 0xf9, 0xa6, 0xd1,
+ 0xbc, 0xb1, 0x67, 0x29, 0xbd, 0xaa, 0xee, 0xf4, 0x3b, 0xc4, 0xab, 0x59, 0xbd,
+ 0xb8, 0x83, 0x36, 0x3d, 0x20, 0xfc, 0x60, 0x3b, 0x28, 0xdd, 0x59, 0xbd, 0x5c,
+ 0x16, 0xd1, 0xbc, 0x00, 0xbc, 0xcb, 0xbc, 0x9f, 0x8e, 0x62, 0xbc, 0x8e, 0xde,
+ 0x53, 0xbd, 0xec, 0x4f, 0x26, 0x3d, 0xde, 0x94, 0x46, 0xbd, 0x50, 0x30, 0x0e,
+ 0x3c, 0x20, 0xef, 0x7b, 0xbd, 0x83, 0x86, 0x38, 0x3c, 0x5a, 0xff, 0x1f, 0xbd,
+ 0x61, 0x3e, 0xd5, 0xbc, 0x0b, 0xac, 0x65, 0x3c, 0xfd, 0x06, 0xa5, 0x3c, 0x2c,
+ 0x94, 0x47, 0xbd, 0xe2, 0xc3, 0x7e, 0x3d, 0x40, 0xac, 0x67, 0x3d, 0xa4, 0x7a,
+ 0x77, 0xbc, 0xfc, 0x13, 0xe7, 0x3c, 0x56, 0x69, 0x80, 0x3d, 0x27, 0x58, 0x18,
+ 0x3d, 0x1e, 0x95, 0x0e, 0x3d, 0x3f, 0xa8, 0x41, 0x3d, 0x0f, 0xbb, 0x16, 0xbd,
+ 0x45, 0x72, 0x89, 0xbd, 0xf1, 0xd2, 0xfb, 0x3c, 0x8f, 0x6b, 0x65, 0x3d, 0x50,
+ 0x8a, 0x05, 0x3c, 0x99, 0x24, 0x90, 0xbd, 0xc8, 0x4d, 0x4f, 0x3d, 0x80, 0xb8,
+ 0xd2, 0x3b, 0xe5, 0x51, 0xae, 0x3b, 0x25, 0x33, 0x2a, 0xbd, 0x05, 0x12, 0xd7,
+ 0x3c, 0xc2, 0x1b, 0x33, 0x3c, 0x5f, 0x8d, 0x07, 0xbc, 0x79, 0x60, 0x26, 0x3d,
+ 0xf7, 0x63, 0x83, 0x3d, 0x88, 0xb4, 0xc7, 0xbc, 0x40, 0x5d, 0xb0, 0xba, 0x6e,
+ 0xaf, 0x39, 0xbd, 0x50, 0x93, 0xf3, 0x3c, 0xc4, 0x3b, 0x53, 0x3c, 0xf9, 0x8b,
+ 0x60, 0xbd, 0x74, 0x4e, 0xbd, 0x3c, 0x40, 0xe6, 0xdd, 0x3c, 0x30, 0x78, 0x18,
+ 0x3d, 0xaa, 0xed, 0x76, 0x3d, 0xd7, 0x20, 0x4b, 0x3d, 0x30, 0x08, 0xd1, 0x3c,
+ 0x52, 0xf0, 0x61, 0x3d, 0x75, 0xea, 0x6a, 0x3d, 0x93, 0xef, 0xeb, 0x3c, 0x35,
+ 0xad, 0x96, 0xbd, 0xca, 0x41, 0x21, 0x3d, 0x59, 0x18, 0x1e, 0x3d, 0x2c, 0xa8,
+ 0x81, 0xbd, 0x7e, 0xdb, 0xd7, 0x3c, 0xfc, 0x7e, 0x1b, 0xbd, 0x26, 0x25, 0x86,
+ 0x3d, 0xa9, 0x58, 0x9b, 0xbd, 0x0a, 0xef, 0xfa, 0xbc, 0xfe, 0x74, 0x74, 0x3d,
+ 0xb0, 0x51, 0x80, 0xbd, 0x29, 0x42, 0x88, 0x3a, 0x56, 0xe7, 0x8c, 0xbb, 0x16,
+ 0x5f, 0x43, 0x3d, 0x5b, 0x1d, 0x4c, 0x3c, 0xae, 0x9d, 0xbd, 0xbb, 0xbc, 0xcf,
+ 0x44, 0xbc, 0x78, 0x8d, 0x6c, 0x3d, 0x30, 0x99, 0x2c, 0x3d, 0x52, 0x17, 0x9e,
+ 0xbc, 0x3d, 0x52, 0x18, 0xbd, 0xfa, 0xcc, 0xb4, 0x3c, 0x9d, 0x56, 0x8d, 0x3d,
+ 0x7e, 0xa0, 0x18, 0x3d, 0x88, 0x7b, 0x94, 0xbd, 0xe8, 0x02, 0xc7, 0xbc, 0x08,
+ 0x22, 0x37, 0x3c, 0x18, 0x3b, 0x5d, 0xbd, 0xa4, 0xbb, 0xb4, 0x3c, 0xb0, 0x8d,
+ 0x06, 0x3d, 0xe8, 0xf4, 0xb0, 0xbb, 0xb4, 0x8b, 0x31, 0xbc, 0xf8, 0xdf, 0xf4,
+ 0x3c, 0x29, 0x19, 0x80, 0xbb, 0x29, 0x4c, 0x60, 0x3c, 0x4b, 0x11, 0x93, 0xbd,
+ 0x4b, 0xbd, 0x66, 0xbd, 0x62, 0x8e, 0x88, 0x3c, 0xfe, 0xa2, 0x37, 0x3d, 0x41,
+ 0xe1, 0x36, 0xbd, 0xbe, 0x7b, 0xc1, 0x3b, 0x6c, 0xff, 0xba, 0x3c, 0x8f, 0xae,
+ 0xab, 0xbc, 0x7b, 0x37, 0xd5, 0xbc, 0x0d, 0xac, 0x18, 0xbd, 0xf2, 0xcb, 0x1d,
+ 0x3d, 0xbb, 0xb0, 0x30, 0x3c, 0xbb, 0x1a, 0x41, 0x3b, 0x5b, 0x36, 0x11, 0xbd,
+ 0x96, 0xb3, 0x86, 0x3d, 0x0b, 0xcb, 0xf9, 0x3c, 0x5c, 0x23, 0x60, 0xbc, 0x62,
+ 0xe1, 0x33, 0xbd, 0x10, 0x91, 0x5e, 0x3d, 0xdf, 0xc8, 0x6c, 0xbd, 0xe7, 0x19,
+ 0x60, 0x3d, 0x87, 0xa0, 0x5b, 0x3c, 0x8a, 0xc5, 0x65, 0x3d, 0x6c, 0x2e, 0x31,
+ 0x3d, 0x99, 0xc7, 0x1a, 0x3d, 0xe8, 0xe6, 0x6f, 0x3c, 0x10, 0x95, 0xd9, 0x3b,
+ 0x1d, 0xdd, 0x19, 0xbd, 0xdc, 0xfe, 0x32, 0x3d, 0x83, 0x85, 0x05, 0x3d, 0xd8,
+ 0x24, 0x16, 0x3d, 0xf7, 0x73, 0x20, 0xbd, 0x77, 0x07, 0xc4, 0x3c, 0xdf, 0xd0,
+ 0x92, 0x3c, 0x1a, 0x7d, 0x2c, 0xba, 0xb0, 0x19, 0xe8, 0xbc, 0x9e, 0x97, 0xec,
+ 0xbb, 0x33, 0xb2, 0xb1, 0x3c, 0x89, 0xde, 0x81, 0xbd, 0x9d, 0xae, 0x57, 0xbc,
+ 0x31, 0xd9, 0xbb, 0x3c, 0xa0, 0x2d, 0x27, 0x3d, 0x00, 0x99, 0x43, 0x3c, 0x2e,
+ 0x32, 0x9d, 0xbc, 0xa2, 0x6d, 0x81, 0x3d, 0x38, 0xce, 0xc3, 0xbc, 0x8e, 0xd7,
+ 0x7a, 0x3d, 0x2a, 0x89, 0x00, 0xbc, 0x2e, 0x52, 0x9f, 0xbc, 0x20, 0x47, 0x4d,
+ 0xbd, 0xd9, 0x79, 0x5f, 0x3d, 0x09, 0x2c, 0x97, 0x3c, 0x9c, 0x28, 0x5f, 0x3b,
+ 0x9d, 0xd3, 0x65, 0x3d, 0x44, 0x63, 0xbb, 0xbc, 0x0c, 0xfe, 0xc0, 0x3c, 0x71,
+ 0xfa, 0x08, 0xbd, 0x40, 0x4a, 0xac, 0x3b, 0xca, 0x9d, 0x7a, 0x3d, 0xbd, 0x1c,
+ 0x52, 0xbd, 0xc8, 0x90, 0x0e, 0x3d, 0x6b, 0x89, 0xbd, 0xbc, 0xa0, 0x74, 0x77,
+ 0x3c, 0x8a, 0xe4, 0x44, 0xbd, 0x5f, 0x81, 0x56, 0x3c, 0x39, 0x9a, 0xc9, 0xbc,
+ 0x33, 0xf4, 0x07, 0xbd, 0x48, 0xe0, 0x94, 0xbd, 0x3f, 0xfc, 0xdf, 0xbc, 0x41,
+ 0x3e, 0xa9, 0x3c, 0x18, 0x06, 0x0e, 0x3c, 0xfb, 0xb9, 0xe2, 0x3c, 0x12, 0x14,
+ 0x26, 0xbc, 0x8b, 0x15, 0x97, 0xbd, 0x43, 0xc8, 0x23, 0xbd, 0x8e, 0x30, 0xf7,
+ 0x3a, 0x4c, 0xdc, 0x4f, 0xbd, 0x52, 0x50, 0x3c, 0xbc, 0xda, 0x70, 0x1b, 0x3d,
+ 0xfc, 0xbc, 0x3a, 0x3d, 0x76, 0x5a, 0x39, 0xbd, 0x48, 0xc3, 0x50, 0x3d, 0xf9,
+ 0xd3, 0x81, 0xbd, 0x1e, 0xdf, 0x09, 0xbd, 0xd3, 0xa3, 0x7a, 0x3d, 0x71, 0x42,
+ 0x6b, 0xbd, 0x7e, 0x3a, 0x4e, 0x3d, 0xd0, 0x26, 0xc5, 0xbb, 0xde, 0x7d, 0x2d,
+ 0x3d, 0xc0, 0xda, 0xd8, 0xba, 0x18, 0x43, 0x63, 0x3c, 0xb5, 0x93, 0xb6, 0x3c,
+ 0xc7, 0xee, 0x49, 0xbd, 0xb2, 0x73, 0x47, 0xbd, 0xa6, 0x66, 0x3b, 0x3d, 0xea,
+ 0xa2, 0x04, 0xbd, 0xde, 0x2b, 0x44, 0x3d, 0x41, 0x80, 0xee, 0x3c, 0x11, 0xbe,
+ 0x72, 0x3c, 0x46, 0xdf, 0x63, 0xbc, 0x4d, 0xc3, 0xfb, 0xbc, 0x3d, 0xbc, 0x86,
+ 0x3d, 0xf7, 0xad, 0x02, 0xbd, 0x7d, 0xb7, 0x0f, 0xbd, 0x99, 0x8c, 0x51, 0x3c,
+ 0x85, 0xce, 0x50, 0xbd, 0x0d, 0xe0, 0x41, 0x3d, 0x3a, 0xb3, 0x21, 0xbb, 0xd0,
+ 0x0b, 0xdd, 0xbb, 0x94, 0x62, 0x25, 0xbd, 0xc0, 0xab, 0xd1, 0xbc, 0xf0, 0xf6,
+ 0x89, 0xbb, 0xbe, 0x10, 0xb9, 0xbc, 0x68, 0x2e, 0x3a, 0x3c, 0x22, 0x34, 0x20,
+ 0xbd, 0x4d, 0xd9, 0x75, 0xbc, 0x74, 0x5d, 0x00, 0x3d, 0xf3, 0xd5, 0x5e, 0x3d,
+ 0x7c, 0x61, 0xcc, 0xbc, 0x56, 0x76, 0x13, 0x3d, 0xda, 0x68, 0xe3, 0x3b, 0xa3,
+ 0xa1, 0x89, 0x3d, 0xd0, 0xfa, 0x16, 0x3d, 0xf1, 0x86, 0x48, 0x3c, 0x71, 0x81,
+ 0x83, 0x3b, 0x31, 0x30, 0x2a, 0xbd, 0x4e, 0xc0, 0xd6, 0x3c, 0xe6, 0xf3, 0xfd,
+ 0xba, 0x6d, 0x46, 0x96, 0x3c, 0x60, 0xcc, 0x67, 0xbd, 0x11, 0x9c, 0xc6, 0x3c,
+ 0xa8, 0x63, 0x21, 0xbd, 0xdb, 0xb3, 0x70, 0xbc, 0x42, 0x46, 0x38, 0xbd, 0x88,
+ 0x73, 0x00, 0xbc, 0x48, 0x5e, 0x4e, 0x3d, 0x2d, 0x95, 0x26, 0xbd, 0xa0, 0x22,
+ 0xb3, 0x3c, 0x56, 0xfb, 0x91, 0xbd, 0x51, 0x13, 0x06, 0x3c, 0x85, 0x69, 0x8a,
+ 0x3d, 0x23, 0xf8, 0x89, 0xbd, 0x61, 0x24, 0xd3, 0xbc, 0x28, 0xd0, 0x0a, 0x3c,
+ 0xe9, 0x4e, 0x85, 0x3d, 0xde, 0x12, 0x93, 0xbb, 0x18, 0x55, 0xdd, 0x3b, 0x57,
+ 0xc2, 0x22, 0xbd, 0x85, 0x3f, 0x0a, 0xbd, 0x9d, 0x49, 0x86, 0x3d, 0x50, 0x01,
+ 0x8f, 0x3b, 0x2c, 0xbf, 0xf5, 0xbc, 0x6b, 0xec, 0x04, 0x3c, 0x92, 0x0e, 0x9b,
+ 0xbc, 0xfc, 0xe0, 0x28, 0xbd, 0x16, 0xeb, 0x9d, 0xbb, 0x20, 0xde, 0xf9, 0x3c,
+ 0x58, 0x77, 0x06, 0xbd, 0x5c, 0x2a, 0x92, 0xbc, 0x62, 0x8d, 0xf6, 0xbc, 0x88,
+ 0xcc, 0xa3, 0xbb, 0x60, 0xbf, 0xdb, 0x3c, 0x2c, 0xcb, 0x69, 0xbd, 0xe3, 0xcf,
+ 0x89, 0xbb, 0x35, 0xad, 0x81, 0xbd, 0xf1, 0x3d, 0x3d, 0xbd, 0x05, 0x62, 0x81,
+ 0x3d, 0x4e, 0xbe, 0x4d, 0x3c, 0x7e, 0xbf, 0x85, 0x3d, 0xfb, 0xc4, 0x23, 0xbb,
+ 0xd8, 0x1b, 0x78, 0x3d, 0x1d, 0xd7, 0x9d, 0xbd, 0x5d, 0x69, 0x15, 0x3d, 0xb6,
+ 0x7a, 0x93, 0xbc, 0x8c, 0xf1, 0xdf, 0xbc, 0xec, 0xfa, 0x2b, 0x3d, 0x40, 0xda,
+ 0x86, 0x3a, 0x1c, 0x0e, 0x2f, 0xbd, 0x38, 0x71, 0x4c, 0x3d, 0x68, 0x87, 0x9a,
+ 0xbd, 0x12, 0x86, 0x91, 0xbd, 0x60, 0x8f, 0x95, 0xbd, 0xd0, 0xe1, 0xf4, 0xbc,
+ 0xa2, 0x77, 0x3f, 0x3d, 0xc0, 0xcd, 0xa1, 0x3c, 0xa2, 0x69, 0x6e, 0xbd, 0xba,
+ 0xc9, 0x79, 0x3d, 0x6d, 0x05, 0xec, 0xbc, 0xb0, 0x63, 0x57, 0x3d, 0xfa, 0x05,
+ 0xd4, 0xbc, 0xb2, 0xd2, 0x93, 0x3b, 0x7e, 0x40, 0x09, 0xbd, 0xf0, 0x2e, 0xd6,
+ 0x3c, 0x00, 0x7b, 0x69, 0xbd, 0x6e, 0x10, 0x29, 0xbd, 0x69, 0x91, 0x92, 0xbb,
+ 0x90, 0x9e, 0x38, 0x3d, 0x99, 0x1b, 0x69, 0xbd, 0x32, 0xd2, 0x49, 0x3d, 0x9d,
+ 0xa4, 0x5d, 0xbd, 0x8b, 0x8e, 0x20, 0xbd, 0xcf, 0x0b, 0x92, 0xbd, 0x3c, 0xb7,
+ 0xfb, 0x3c, 0xdf, 0xf9, 0x58, 0x3d, 0xa7, 0xf0, 0x3e, 0xbb, 0x6c, 0x7e, 0xbd,
+ 0x3c, 0x83, 0xdf, 0x12, 0x3d, 0x37, 0x97, 0x84, 0x3d, 0xe0, 0x4e, 0x36, 0x3d,
+ 0xf6, 0x06, 0x90, 0xbd, 0x07, 0xc0, 0xce, 0x3c, 0xb1, 0xc0, 0x49, 0x3d, 0x7b,
+ 0x76, 0x02, 0x3c, 0x29, 0x97, 0x93, 0x3b, 0x16, 0x46, 0x45, 0xbd, 0x10, 0xb1,
+ 0x92, 0x3b, 0x26, 0x69, 0x45, 0x3d, 0x1e, 0x1a, 0x6d, 0x3d, 0x60, 0x9f, 0xe3,
+ 0x3b, 0x07, 0xab, 0x5f, 0x3d, 0x65, 0xce, 0x35, 0xbd, 0x61, 0x0d, 0x43, 0xbd,
+ 0x56, 0xa7, 0x79, 0x3d, 0x61, 0x67, 0x37, 0x3d, 0x26, 0xf4, 0x90, 0xbd, 0x73,
+ 0x2e, 0x1b, 0x3d, 0x39, 0x48, 0xe2, 0xb9, 0x57, 0x1e, 0x32, 0x3d, 0xaa, 0x2d,
+ 0x16, 0x3c, 0xae, 0x6a, 0x94, 0xbc, 0xc1, 0x8b, 0x1e, 0xbd, 0xf1, 0x42, 0x4f,
+ 0xbd, 0x6d, 0x34, 0x66, 0x3d, 0xc2, 0x39, 0x6a, 0xbd, 0x6e, 0x02, 0xab, 0x3c,
+ 0xa8, 0x60, 0x3d, 0xbd, 0x69, 0x24, 0x93, 0xbd, 0xd2, 0x91, 0x8a, 0xbd, 0xfe,
+ 0xa0, 0x30, 0xbd, 0xbd, 0x15, 0x28, 0xbd, 0x00, 0x1c, 0x02, 0x3a, 0x2e, 0xe2,
+ 0x5b, 0xbb, 0xda, 0x90, 0x4d, 0x3d, 0x56, 0xc4, 0xd3, 0xbc, 0x25, 0xb8, 0x6d,
+ 0x3d, 0x89, 0xe0, 0x47, 0x3d, 0x60, 0x4b, 0x04, 0xbb, 0x00, 0xd5, 0xdc, 0x39,
+ 0x33, 0xc0, 0x7e, 0x3d, 0xce, 0x0c, 0x51, 0xbd, 0xb2, 0x49, 0xf0, 0xbc, 0xc8,
+ 0x62, 0xa2, 0xbc, 0xdc, 0x45, 0x2a, 0x3d, 0x5e, 0xe2, 0x1b, 0xbd, 0xa6, 0x02,
+ 0x9a, 0xbd, 0xe2, 0xf0, 0x89, 0xbd, 0xff, 0x15, 0xa8, 0xbc, 0xc2, 0x94, 0xb9,
+ 0x3c, 0x8a, 0x28, 0x8b, 0xbc, 0x27, 0x32, 0x7d, 0x3d, 0x2b, 0x24, 0x75, 0xbd,
+ 0xc1, 0x7f, 0x05, 0xbd, 0x8b, 0x7f, 0x28, 0xbd, 0xa4, 0xd9, 0x9a, 0xbc, 0x03,
+ 0xc7, 0x23, 0xbc, 0xac, 0xd5, 0x6d, 0xbc, 0xfb, 0xf5, 0x70, 0xbc, 0x5c, 0x28,
+ 0x5c, 0xbd, 0xf5, 0xa5, 0x54, 0x3d, 0xc4, 0x5f, 0x87, 0xbd, 0x28, 0x92, 0x51,
+ 0x3c, 0x10, 0xc1, 0x87, 0x3d, 0x00, 0xeb, 0x1c, 0x3c, 0x9a, 0x6a, 0x52, 0x3d,
+ 0x95, 0xc5, 0x1a, 0x3d, 0x9d, 0x84, 0x9b, 0x3c, 0x56, 0x33, 0xda, 0xbc, 0x28,
+ 0x01, 0x64, 0x3d, 0xb1, 0x80, 0x4f, 0xbd, 0x50, 0x61, 0x89, 0xbd, 0xe0, 0x1f,
+ 0x30, 0xbb, 0x63, 0x5a, 0x86, 0x3d, 0x06, 0x30, 0x56, 0x3d, 0xc6, 0x8e, 0x4e,
+ 0xbd, 0xd1, 0xb8, 0xc6, 0xbc, 0xc6, 0x6c, 0xf4, 0xbc, 0x6c, 0x6f, 0x21, 0x3d,
+ 0xea, 0x45, 0x86, 0x3c, 0xe7, 0x7b, 0x1c, 0xbd, 0xba, 0x38, 0x54, 0xbd, 0xa4,
+ 0x78, 0x82, 0x3d, 0xdc, 0x98, 0x18, 0xbc, 0xa0, 0x85, 0x0d, 0x3d, 0x9e, 0xe7,
+ 0x55, 0xbd, 0x8e, 0x64, 0x30, 0x3d, 0xda, 0xf4, 0x48, 0x3d, 0x69, 0xdc, 0xe8,
+ 0x3c, 0x68, 0xc7, 0x0d, 0xbd, 0xdf, 0x7e, 0xb4, 0x3c, 0x3a, 0x30, 0x57, 0x3d,
+ 0xc5, 0x7a, 0x1a, 0xbc, 0x42, 0xa7, 0x8c, 0x3d, 0xb1, 0x9c, 0x4f, 0x3d, 0xa0,
+ 0x74, 0x36, 0xbc, 0x7e, 0x74, 0x25, 0x3d, 0xc8, 0x7c, 0x48, 0x3d, 0x7f, 0x68,
+ 0x55, 0x3c, 0xa6, 0x62, 0xf8, 0xbc, 0x16, 0x5b, 0x2d, 0x3d, 0x79, 0x57, 0x6a,
+ 0xbd, 0x86, 0xf0, 0x8b, 0xbc, 0x20, 0x1c, 0x3f, 0x3c, 0x92, 0x3d, 0x20, 0x3d,
+ 0x40, 0x29, 0x7b, 0xbd, 0x32, 0x88, 0x5b, 0x3d, 0x28, 0x79, 0x2c, 0x3c, 0xeb,
+ 0x80, 0xe3, 0x3c, 0xe5, 0x28, 0xa1, 0x3c, 0x95, 0xbb, 0x88, 0x3d, 0x1b, 0xa9,
+ 0x95, 0xbc, 0xb0, 0x35, 0x5b, 0x3d, 0x02, 0xbd, 0x8e, 0xbc, 0x62, 0xe7, 0x1d,
+ 0xbd, 0xad, 0xe5, 0xca, 0x3c, 0x6f, 0x93, 0x3f, 0xb9, 0x51, 0x7d, 0x48, 0xbd,
+ 0x06, 0x75, 0x68, 0x3d, 0xa7, 0x08, 0x7b, 0xbd, 0x5e, 0xeb, 0x73, 0xba, 0xa1,
+ 0x83, 0x31, 0x3d, 0xcd, 0x92, 0x55, 0x3c, 0x88, 0xdb, 0x3f, 0xbd, 0x67, 0x9c,
+ 0x35, 0x3d, 0xa9, 0x4b, 0x14, 0x3d, 0x94, 0x6b, 0x6c, 0xbc, 0x6c, 0xa8, 0xe7,
+ 0x3c, 0xc0, 0x02, 0xf7, 0xbb, 0xcb, 0xbc, 0x85, 0x3a, 0xf1, 0x91, 0xf0, 0xbc,
+ 0x72, 0x77, 0x83, 0x3d, 0x68, 0xab, 0x30, 0x3d, 0xa0, 0x17, 0x96, 0xbc, 0x7d,
+ 0xe6, 0x19, 0xbd, 0x18, 0x2c, 0x22, 0x3d, 0x88, 0x14, 0xaa, 0x3c, 0x40, 0x4d,
+ 0xb3, 0xbc, 0x4c, 0xc2, 0x7a, 0xbc, 0xf8, 0x68, 0x53, 0x3c, 0x16, 0x1d, 0xc6,
+ 0xbb, 0x2f, 0x2c, 0x71, 0xbd, 0xa3, 0x55, 0x80, 0x3d, 0x96, 0x18, 0x07, 0x3d,
+ 0x34, 0xa8, 0xa1, 0xbc, 0x2b, 0x39, 0x58, 0x3d, 0x23, 0xc6, 0x68, 0x3d, 0x46,
+ 0x84, 0x55, 0x3d, 0x0d, 0xd6, 0x3e, 0x3c, 0x2e, 0xc2, 0x0d, 0x3d, 0x88, 0x20,
+ 0x26, 0x3c, 0x44, 0x1b, 0x23, 0x3d, 0x7f, 0x54, 0x8b, 0xbd, 0xda, 0xa3, 0x54,
+ 0xbd, 0x9e, 0xad, 0x32, 0x3d, 0x17, 0x7c, 0x78, 0x3d, 0xcd, 0x11, 0x9f, 0xbc,
+ 0x2c, 0x53, 0x57, 0x3b, 0x1a, 0x5a, 0x0a, 0xbd, 0x6d, 0x40, 0x67, 0x3d, 0x52,
+ 0xb6, 0x56, 0x3d, 0x1c, 0x07, 0x96, 0xbd, 0xb0, 0x1c, 0x14, 0xbd, 0xc3, 0xda,
+ 0x2b, 0x3c, 0x7a, 0x02, 0x61, 0x3d, 0xbd, 0x9f, 0x2a, 0xbd, 0x72, 0xf9, 0xbf,
+ 0xbc, 0x79, 0xfe, 0xa3, 0x3c, 0xfc, 0x45, 0x43, 0xbd, 0x9e, 0xd3, 0x7b, 0x3d,
+ 0x70, 0x3a, 0x6e, 0xbd, 0x78, 0xdc, 0x30, 0x3c, 0x93, 0x36, 0x67, 0x3d, 0x63,
+ 0x08, 0x84, 0x3d, 0x5e, 0x4f, 0x40, 0x3a, 0xc5, 0xd9, 0xc1, 0x3c, 0xea, 0x6b,
+ 0x31, 0x3d, 0x1e, 0xf8, 0xdc, 0xbb, 0x0b, 0x30, 0xfd, 0xbc, 0xc6, 0xf2, 0x87,
+ 0x3d, 0xc5, 0xc9, 0xc7, 0x3c, 0x98, 0x0c, 0xba, 0x3b, 0xcf, 0x1a, 0x8d, 0xbd,
+ 0x90, 0xa5, 0xe1, 0xbb, 0x16, 0xc3, 0x64, 0x3d, 0x03, 0x3a, 0x95, 0x3c, 0xaa,
+ 0x98, 0x32, 0xbd, 0x95, 0xa5, 0x95, 0xbd, 0xde, 0x9e, 0x88, 0x3a, 0xbb, 0x39,
+ 0x8e, 0xbd, 0x3d, 0xf1, 0x30, 0x3d, 0x6e, 0x57, 0x8c, 0x3d, 0xf3, 0x90, 0x25,
+ 0xbd, 0xf8, 0x97, 0x2e, 0xbd, 0x21, 0xf3, 0x1b, 0x3d, 0x34, 0xd9, 0x5d, 0xbc,
+ 0x24, 0x60, 0x23, 0xbc, 0x32, 0x24, 0xa6, 0x3b, 0x01, 0xf1, 0x61, 0xbd, 0x69,
+ 0x3b, 0xaa, 0x3c, 0x54, 0xf0, 0x53, 0xbd, 0x40, 0x67, 0x64, 0x3b, 0x00, 0x84,
+ 0xa1, 0xbb, 0xda, 0xb5, 0x6e, 0x3d, 0x0f, 0xfb, 0x3d, 0xbc, 0xf9, 0xf3, 0x0c,
+ 0xbd, 0x5b, 0x52, 0xd1, 0xbb, 0x43, 0xf7, 0x04, 0xbd, 0xf9, 0x67, 0x7c, 0x3d,
+ 0x36, 0xed, 0x30, 0xbd, 0xcf, 0x53, 0x62, 0x3c, 0x03, 0xbb, 0x79, 0xbd, 0x6d,
+ 0xc8, 0x40, 0x3d, 0xc5, 0x5c, 0x19, 0x3d, 0x0e, 0xd5, 0x2d, 0xbd, 0x2d, 0x89,
+ 0x92, 0x3d, 0xf3, 0xcc, 0x15, 0x3d, 0xe2, 0x92, 0x9e, 0xbc, 0x44, 0x74, 0x8e,
+ 0xbd, 0x6b, 0x27, 0x96, 0xbd, 0x86, 0xcb, 0xe8, 0x3c, 0xab, 0xda, 0x99, 0xbb,
+ 0xf6, 0x99, 0x19, 0xbb, 0xe8, 0xb3, 0x49, 0x3d, 0xa4, 0x79, 0x85, 0x3c, 0x4f,
+ 0xb4, 0xf5, 0xbc, 0x5c, 0x1a, 0xa9, 0xbc, 0xa7, 0x63, 0x1f, 0xbd, 0x33, 0xff,
+ 0x46, 0xbd, 0x39, 0x7f, 0x97, 0xbd, 0xd8, 0x75, 0x85, 0xbd, 0x55, 0x97, 0x94,
+ 0xbc, 0x3e, 0x73, 0xb0, 0x3c, 0xf8, 0xb8, 0xee, 0x3c, 0xa0, 0xe4, 0x6e, 0x3b,
+ 0x00, 0xde, 0x54, 0x3b, 0x3b, 0x2d, 0x90, 0xbc, 0xae, 0xd9, 0x89, 0xbd, 0x65,
+ 0x3d, 0xf9, 0x3c, 0x5f, 0x64, 0x8a, 0xbd, 0x88, 0x25, 0x7c, 0xbb, 0x8c, 0x64,
+ 0x35, 0xbc, 0x63, 0x28, 0x0c, 0x3d, 0x2d, 0x9c, 0xde, 0xbb, 0x62, 0x5c, 0x96,
+ 0xbc, 0x12, 0x3c, 0x35, 0x3d, 0x50, 0x11, 0xcc, 0x3b, 0x56, 0x1a, 0x80, 0xbd,
+ 0xd0, 0x1a, 0x98, 0xba, 0x88, 0xe4, 0x58, 0x3d, 0x09, 0xc2, 0x9e, 0x3b, 0xce,
+ 0xc4, 0x3c, 0xbc, 0x88, 0x46, 0x09, 0xbd, 0xea, 0xde, 0x04, 0x3c, 0xd4, 0x45,
+ 0x5d, 0xbd, 0x18, 0x90, 0x7e, 0x3d, 0x99, 0x67, 0x91, 0x3d, 0x8d, 0x01, 0xd7,
+ 0xbc, 0x61, 0xdc, 0x6b, 0x3d, 0x36, 0x17, 0x96, 0x3c, 0x7e, 0x27, 0x6f, 0x3d,
+ 0x52, 0xcb, 0xf7, 0x3c, 0xfc, 0x54, 0x75, 0xbc, 0x36, 0xbd, 0x25, 0x3d, 0x86,
+ 0xd1, 0x7b, 0xbd, 0x5c, 0x19, 0x12, 0x3d, 0xda, 0xfb, 0x03, 0x3d, 0xee, 0x5f,
+ 0x37, 0xbd, 0xd4, 0x39, 0x34, 0xbd, 0xb4, 0x2f, 0x8b, 0xbd, 0x29, 0xd4, 0x99,
+ 0xbd, 0x4e, 0x31, 0x4a, 0x3c, 0x3a, 0x73, 0x7b, 0x3d, 0x97, 0x99, 0xac, 0xbb,
+ 0x77, 0xe4, 0xac, 0xbc, 0x0c, 0x31, 0xc3, 0xbb, 0xd7, 0xdb, 0x85, 0x3d, 0x31,
+ 0x4d, 0xd5, 0xbb, 0xb8, 0x71, 0xda, 0x3c, 0x7c, 0x01, 0x5a, 0x3d, 0x32, 0xe9,
+ 0x57, 0x3d, 0x6f, 0xd9, 0x7a, 0x3d, 0x38, 0x6a, 0x77, 0xbc, 0x7b, 0x63, 0x5c,
+ 0xbd, 0x8c, 0xe0, 0x02, 0xbd, 0xf2, 0x35, 0x47, 0x3d, 0x93, 0x0e, 0x59, 0xbd,
+ 0xf8, 0xfa, 0x63, 0x3d, 0x1c, 0x59, 0x49, 0xbd, 0x48, 0x00, 0x3c, 0xbc, 0x52,
+ 0xd8, 0x14, 0x3d, 0xc3, 0x56, 0x42, 0x3c, 0x7d, 0x74, 0xa9, 0x3c, 0x15, 0x40,
+ 0x83, 0x3d, 0x9c, 0x8d, 0xe2, 0xbc, 0x47, 0xdb, 0x86, 0x3d, 0xcc, 0x7f, 0x2d,
+ 0xbd, 0x39, 0xdd, 0x8f, 0x3d, 0xe8, 0xe7, 0x0c, 0x3c, 0xc0, 0xc6, 0xfa, 0x3a,
+ 0x5e, 0x6c, 0x85, 0xbd, 0xae, 0x8d, 0x79, 0x3d, 0x29, 0x90, 0xd8, 0x3c, 0x09,
+ 0x17, 0x85, 0xbc, 0x4d, 0xf9, 0x71, 0xbd, 0x74, 0xa6, 0xf3, 0xbb, 0xf0, 0x65,
+ 0xee, 0xbc, 0x42, 0x45, 0x7b, 0x3d, 0xdc, 0x2b, 0x5e, 0xbd, 0x35, 0x5f, 0x3f,
+ 0x3d, 0x10, 0x00, 0xdd, 0x3b, 0xb8, 0xd0, 0x94, 0xbc, 0xe8, 0xb4, 0xcc, 0xbc,
+ 0xb3, 0x71, 0x2d, 0x3c, 0x00, 0x36, 0xc0, 0x3c, 0x3e, 0x20, 0x1e, 0xbd, 0x0e,
+ 0xdf, 0x62, 0x3c, 0x55, 0xdc, 0x44, 0x3d, 0x27, 0x0e, 0x3a, 0xbc, 0x6b, 0xd4,
+ 0x8c, 0x3c, 0xcc, 0xcc, 0x7f, 0xbd, 0xd4, 0x43, 0x3d, 0xbd, 0x5b, 0xac, 0x58,
+ 0x3c, 0xf0, 0x58, 0xd2, 0xbc, 0x49, 0x1d, 0x38, 0x3d, 0x09, 0x7c, 0x1d, 0xbd,
+ 0x7a, 0x5b, 0x00, 0xbd, 0xe4, 0x6e, 0xf0, 0x3c, 0x4a, 0xd3, 0x56, 0x3d, 0x28,
+ 0x12, 0x8d, 0xbc, 0xbe, 0x44, 0x65, 0x3d, 0x0a, 0xd4, 0x16, 0xbc, 0xb0, 0x96,
+ 0x16, 0xbd, 0xfa, 0xf1, 0x8d, 0x3d, 0x41, 0xd6, 0x74, 0x3d, 0xb5, 0x79, 0x85,
+ 0xbd, 0x5d, 0xfb, 0x8e, 0xbc, 0xd8, 0x46, 0x86, 0xba, 0x2f, 0xa2, 0x8b, 0xbd,
+ 0xd8, 0x91, 0x90, 0xbc, 0xf7, 0x73, 0xe6, 0xbc, 0x6c, 0x45, 0xac, 0x3c, 0xe4,
+ 0xbe, 0x60, 0xbc, 0x4b, 0x18, 0x7f, 0x3d, 0x1f, 0xb0, 0x39, 0x3c, 0xc0, 0x64,
+ 0x71, 0x3d, 0x2f, 0x99, 0x3e, 0xbd, 0xa8, 0x87, 0x2f, 0x3d, 0xdc, 0xb3, 0x94,
+ 0xbd, 0xfa, 0xe2, 0x8c, 0xbd, 0x28, 0xb5, 0x2a, 0x3c, 0xa3, 0x13, 0x31, 0xbd,
+ 0xe6, 0xae, 0xfc, 0xbc, 0x98, 0xb6, 0x68, 0xbd, 0x41, 0xdf, 0x66, 0x3b, 0xde,
+ 0xc5, 0x2e, 0xbd, 0x24, 0x8c, 0x4c, 0xbd, 0xdb, 0x77, 0xe8, 0x3b, 0xc0, 0x23,
+ 0xc1, 0xbc, 0x50, 0xcb, 0x98, 0xbc, 0x44, 0x4b, 0x32, 0x3d, 0xd0, 0xd5, 0xf9,
+ 0xbc, 0x40, 0x77, 0xea, 0x3b, 0xaf, 0x97, 0xbc, 0x3c, 0x9f, 0x07, 0x8d, 0x3d,
+ 0x26, 0xc4, 0x87, 0xbc, 0x48, 0xff, 0x1b, 0x3d, 0x90, 0x07, 0xc0, 0x3b, 0xa0,
+ 0xeb, 0x61, 0xbb, 0x61, 0x90, 0x8c, 0x3d, 0x46, 0x0b, 0x89, 0xbd, 0x61, 0x99,
+ 0x09, 0xbd, 0x27, 0xb3, 0x3a, 0xbc, 0xad, 0x56, 0xff, 0xbc, 0xa6, 0xaf, 0x7f,
+ 0x3d, 0x50, 0x1d, 0x09, 0xbd, 0x82, 0xfd, 0xcd, 0xbc, 0x31, 0x6c, 0x4d, 0x3d,
+ 0x6d, 0xe8, 0x8c, 0x3c, 0x59, 0x5e, 0xb7, 0xbb, 0xa8, 0x14, 0x49, 0x3d, 0x86,
+ 0xe4, 0x89, 0xbc, 0x41, 0xc7, 0x0c, 0xbd, 0xf5, 0x84, 0x80, 0x3d, 0x31, 0x71,
+ 0x88, 0x3d, 0x3b, 0xcf, 0x84, 0xbd, 0x4f, 0xc3, 0x89, 0x3d, 0x24, 0x62, 0x21,
+ 0xbd, 0xb0, 0xc2, 0xdb, 0x3b, 0xf8, 0xc8, 0x46, 0xbd, 0xa5, 0xe0, 0x89, 0x3d,
+ 0x89, 0x41, 0x29, 0x3c, 0x90, 0xbd, 0xe7, 0x3c, 0x78, 0xc9, 0x42, 0xbc, 0x1f,
+ 0xd6, 0x82, 0x3d, 0xfb, 0xcd, 0x87, 0xbd, 0x2a, 0xd2, 0x24, 0xbd, 0x86, 0x49,
+ 0x6d, 0xbd, 0x62, 0x20, 0xc8, 0xba, 0xb0, 0xc4, 0xec, 0xbc, 0xdf, 0x68, 0xb4,
+ 0x3a, 0xe3, 0x0f, 0xe7, 0x3c, 0x41, 0xd5, 0x2e, 0xbd, 0xd4, 0xd6, 0x7c, 0xbd,
+ 0xb6, 0xd8, 0x2f, 0x3d, 0x2e, 0x95, 0xf2, 0xbc, 0x7c, 0xa4, 0xd0, 0xbc, 0x84,
+ 0x63, 0x61, 0x3d, 0xfe, 0x1c, 0x26, 0x3d, 0x29, 0x38, 0x6e, 0x3c, 0xff, 0xb9,
+ 0x12, 0xbd, 0xbc, 0xc6, 0x8d, 0x3d, 0xe1, 0xf5, 0x94, 0xbd, 0xd6, 0x91, 0x86,
+ 0xbd, 0x88, 0xb9, 0x58, 0xbc, 0x50, 0x18, 0xb0, 0xbb, 0x95, 0x6f, 0x84, 0x3d,
+ 0xd1, 0x02, 0x2c, 0xbd, 0xdd, 0xec, 0x00, 0x3d, 0x2c, 0x87, 0x33, 0x3c, 0x83,
+ 0xae, 0x83, 0xbd, 0xf9, 0xfc, 0xc7, 0x3b, 0x54, 0x47, 0x34, 0xbc, 0xdc, 0xeb,
+ 0x44, 0xbc, 0xc1, 0x33, 0x1f, 0xbd, 0x2e, 0xa0, 0xe7, 0xbc, 0x18, 0x92, 0x5b,
+ 0xbc, 0x75, 0xee, 0x48, 0x3d, 0xcf, 0xe5, 0x29, 0x3c, 0xdd, 0xfb, 0xcd, 0xbc,
+ 0x1e, 0xfe, 0x15, 0xbd, 0xfa, 0x83, 0x24, 0xbd, 0x74, 0xa7, 0x1b, 0x3d, 0x79,
+ 0x43, 0xf6, 0x3c, 0xc1, 0x09, 0xcc, 0xbb, 0x23, 0xce, 0x51, 0x3d, 0x90, 0xbd,
+ 0x6d, 0xbd, 0xd3, 0x87, 0xa9, 0x3c, 0xa6, 0x5c, 0x6b, 0x3d, 0x30, 0xbc, 0xd0,
+ 0xbb, 0x43, 0x24, 0x71, 0xbd, 0xf1, 0xc3, 0x69, 0xbc, 0xcc, 0x77, 0x5d, 0xbd,
+ 0xf5, 0x11, 0x95, 0xbd, 0x90, 0x17, 0xc7, 0xbc, 0x44, 0x6c, 0x85, 0xbd, 0xeb,
+ 0x43, 0xd6, 0x3c, 0xe3, 0x8d, 0x8b, 0x3d, 0xbf, 0x68, 0x3d, 0xbd, 0x6d, 0x69,
+ 0x86, 0xbd, 0xb5, 0x14, 0x8f, 0xbd, 0xe9, 0x70, 0x0c, 0xbc, 0x97, 0x30, 0x78,
+ 0x3d, 0xd2, 0x1f, 0x57, 0xbd, 0x08, 0xe4, 0x28, 0x3d, 0x34, 0x1f, 0xf3, 0xbc,
+ 0x18, 0xb7, 0x66, 0xbc, 0x00, 0x60, 0x30, 0x3c, 0xc1, 0x3d, 0x1f, 0xbd, 0x26,
+ 0x9a, 0x85, 0x3d, 0xc6, 0x32, 0x88, 0xbd, 0x36, 0x33, 0x5c, 0xbd, 0x81, 0xb7,
+ 0x89, 0xbd, 0x9f, 0x29, 0xeb, 0xbb, 0xe3, 0x50, 0x3d, 0x3d, 0x24, 0x66, 0x88,
+ 0xbd, 0xcc, 0xc0, 0x0d, 0x3d, 0xd2, 0xa9, 0x92, 0x3c, 0x54, 0x72, 0x02, 0x3d,
+ 0xd5, 0x3b, 0x90, 0xbb, 0x3d, 0x9f, 0x63, 0xbd, 0xed, 0xbe, 0x18, 0xbd, 0x59,
+ 0xec, 0x6e, 0x3b, 0x28, 0xf2, 0x29, 0xbc, 0xc7, 0xce, 0xab, 0x3c, 0xf4, 0xc8,
+ 0x79, 0xbd, 0x7c, 0x71, 0x30, 0x3d, 0x75, 0xbb, 0x80, 0xbc, 0x5c, 0xc6, 0x6b,
+ 0xbd, 0x61, 0x73, 0x3c, 0x3d, 0x74, 0x82, 0x33, 0xbd, 0xd2, 0x32, 0x79, 0x3c,
+ 0x9c, 0x80, 0xb6, 0xbb, 0xef, 0xee, 0x5f, 0x3d, 0xf8, 0x07, 0x30, 0xbd, 0xb1,
+ 0x7f, 0x2f, 0xbd, 0xc2, 0x76, 0x36, 0xbd, 0x9e, 0x38, 0xa3, 0x3c, 0x7c, 0x4e,
+ 0x47, 0xbc, 0x48, 0xce, 0x1a, 0x3d, 0xfc, 0xcd, 0xc2, 0x3c, 0x65, 0xb0, 0x07,
+ 0x3d, 0x51, 0x39, 0x1c, 0x3d, 0x27, 0x56, 0x87, 0x3d, 0x63, 0x07, 0xdd, 0x3c,
+ 0x2b, 0xd5, 0x82, 0x3d, 0xb0, 0x9d, 0x85, 0xbd, 0xc5, 0x43, 0xf0, 0x3c, 0x19,
+ 0x0c, 0x95, 0x3b, 0x28, 0x64, 0x6b, 0xbd, 0x8e, 0x23, 0x09, 0xbd, 0xfa, 0x58,
+ 0xfc, 0x3b, 0x40, 0xca, 0x5d, 0x3c, 0xa0, 0xbe, 0x58, 0xbd, 0xb1, 0x3b, 0x91,
+ 0xbd, 0xd1, 0x73, 0xf0, 0x3a, 0x1d, 0x07, 0x31, 0x3d, 0x7d, 0x80, 0x07, 0x3d,
+ 0xda, 0x52, 0x44, 0x3c, 0x78, 0x62, 0x58, 0x3c, 0x8d, 0x84, 0x01, 0x3d, 0x66,
+ 0x36, 0x76, 0xbd, 0x68, 0xd0, 0x03, 0xbc, 0x43, 0x54, 0x56, 0x3c, 0xae, 0xac,
+ 0x59, 0x3d, 0x36, 0xce, 0x48, 0xbd, 0xd4, 0xc1, 0x65, 0xbc, 0xd9, 0xee, 0x34,
+ 0x3c, 0x80, 0x4c, 0x66, 0xba, 0x88, 0xe1, 0x3c, 0x3c, 0xc8, 0xb7, 0x04, 0x3d,
+ 0x90, 0xdf, 0xdf, 0x3c, 0x20, 0x76, 0x1c, 0x3b, 0xfb, 0x80, 0x1e, 0x3d, 0x7e,
+ 0xbd, 0x19, 0x3d, 0x1f, 0x28, 0x96, 0xbb, 0x19, 0xa6, 0x3c, 0x3c, 0x3f, 0xc7,
+ 0xf9, 0xbc, 0x4a, 0xc2, 0x1a, 0xbd, 0xd5, 0xa0, 0x86, 0xbd, 0x3a, 0xc8, 0xd6,
+ 0x3c, 0xc3, 0x1a, 0x5a, 0x3d, 0x1a, 0x8c, 0x91, 0xbd, 0xd0, 0x10, 0x67, 0x3d,
+ 0x42, 0x5b, 0x16, 0x3d, 0xa3, 0xd2, 0x5b, 0xbc, 0x6c, 0xa0, 0xb6, 0x3c, 0x65,
+ 0xe2, 0x1d, 0xbd, 0x9a, 0xdf, 0x0e, 0xbd, 0xc0, 0x74, 0xcf, 0x3b, 0x84, 0xe1,
+ 0xc1, 0x3c, 0x2a, 0xed, 0x60, 0x3d, 0xe3, 0x10, 0xe4, 0xbc, 0x3f, 0xcc, 0x8b,
+ 0xbd, 0x95, 0xa5, 0x8b, 0x3d, 0xd8, 0xc3, 0x00, 0xbd, 0x85, 0x56, 0x75, 0x3d,
+ 0xac, 0x3a, 0x5b, 0x3d, 0x6a, 0x5d, 0xed, 0xbb, 0xbb, 0xd3, 0xd5, 0x3c, 0xac,
+ 0xb0, 0x3f, 0x3d, 0x70, 0x1a, 0x6b, 0x3c, 0x70, 0xca, 0x28, 0x3c, 0xa2, 0x71,
+ 0xde, 0xbc, 0x00, 0x22, 0x77, 0x3a, 0x43, 0x45, 0x21, 0xbd, 0x17, 0xa9, 0x34,
+ 0x3d, 0x4d, 0x49, 0x2d, 0xbd, 0xb5, 0xd6, 0x8b, 0x3d, 0x84, 0xa5, 0xbd, 0xbc,
+ 0x9d, 0x7f, 0x02, 0xbd, 0x85, 0x08, 0x80, 0xbd, 0xff, 0x2d, 0x8f, 0xbc, 0x04,
+ 0x5f, 0x3b, 0xbd, 0xba, 0xce, 0x17, 0xbd, 0xf3, 0xfc, 0x80, 0x3d, 0xe1, 0x9c,
+ 0x8c, 0xbd, 0xaf, 0x1c, 0xc6, 0x3c, 0x77, 0x31, 0x12, 0x3d, 0xde, 0x28, 0x49,
+ 0xbd, 0x0d, 0xe3, 0x1f, 0xbd, 0x2a, 0x71, 0x30, 0xbc, 0x1e, 0x04, 0x35, 0x3d,
+ 0x08, 0x0a, 0xad, 0x3b, 0xe9, 0x97, 0x98, 0xbc, 0x26, 0xe3, 0x00, 0x3c, 0xbe,
+ 0xf9, 0xbb, 0xbc, 0x77, 0x23, 0x34, 0xbd, 0x55, 0x69, 0x61, 0x3d, 0xc4, 0xb9,
+ 0x8d, 0xbd, 0x5f, 0x82, 0x81, 0x3d, 0x68, 0xff, 0x16, 0xbc, 0x2c, 0xa2, 0x91,
+ 0xbc, 0x67, 0x62, 0x78, 0xbd, 0x76, 0x32, 0x13, 0x3d, 0x68, 0x26, 0x2b, 0x3d,
+ 0x1a, 0xbb, 0xdc, 0xbc, 0xae, 0x91, 0x84, 0x3d, 0xc0, 0xfe, 0x8d, 0xbd, 0xfe,
+ 0x28, 0x88, 0xbc, 0x02, 0x43, 0x0e, 0xbc, 0x0b, 0x35, 0x69, 0xbb, 0xb4, 0xf8,
+ 0x8b, 0xbd, 0xad, 0x86, 0x6e, 0xbd, 0x5c, 0x92, 0x19, 0xbd, 0x03, 0x18, 0x59,
+ 0xbd, 0x58, 0x48, 0x55, 0xbc, 0x2e, 0xaf, 0x4d, 0x3d, 0x70, 0x1a, 0x59, 0xbc,
+ 0x63, 0xf3, 0x3d, 0xbd, 0x97, 0xcd, 0x8f, 0xbd, 0x4b, 0x2b, 0x75, 0x3d, 0x78,
+ 0xf6, 0x78, 0xbd, 0x40, 0x84, 0x01, 0xbd, 0x04, 0xb6, 0x05, 0xbd, 0x21, 0xa7,
+ 0xf7, 0x3c, 0x9e, 0x08, 0xc5, 0x3c, 0x3b, 0xde, 0xa8, 0xbc, 0x04, 0x81, 0x85,
+ 0x3c, 0x7d, 0x36, 0xd2, 0x3c, 0x02, 0xf0, 0xd0, 0xbc, 0xcb, 0xe0, 0x68, 0x3d,
+ 0xb3, 0x19, 0x89, 0xbd, 0x39, 0xf7, 0x5f, 0x3d, 0x6a, 0x8f, 0x05, 0xbc, 0x7c,
+ 0xc8, 0x91, 0xbc, 0xec, 0xc4, 0x93, 0x3c, 0xa0, 0x62, 0x3a, 0xbb, 0x59, 0xfc,
+ 0x1a, 0xbd, 0xc9, 0xcd, 0x95, 0xbd, 0x57, 0xc3, 0x5b, 0xbb, 0x67, 0x2f, 0xe4,
+ 0x3c, 0x13, 0xcc, 0xa5, 0x3c, 0x1d, 0x6c, 0x39, 0xbc, 0x50, 0x64, 0x83, 0x3c,
+ 0x50, 0x6d, 0x5b, 0xbc, 0xda, 0x2a, 0xcd, 0x3c, 0x09, 0xb3, 0x96, 0xbd, 0x91,
+ 0x4f, 0x34, 0x3d, 0x33, 0xd0, 0x17, 0xbd, 0x1d, 0x22, 0x86, 0xbd, 0x9c, 0x1e,
+ 0x0d, 0xbd, 0xd4, 0x2b, 0x9c, 0xba, 0x67, 0xb5, 0xa7, 0xbc, 0x0f, 0xe2, 0x76,
+ 0xbd, 0x4b, 0xb9, 0x71, 0x3d, 0x69, 0xa9, 0x9c, 0xbc, 0x30, 0x44, 0x47, 0x3d,
+ 0xf0, 0xdc, 0x95, 0x3c, 0xe2, 0x1d, 0x22, 0xbd, 0xaa, 0xb5, 0x58, 0xbd, 0x9d,
+ 0x59, 0x7d, 0xbd, 0xa4, 0x92, 0x95, 0x3c, 0x40, 0xaa, 0x8d, 0xbd, 0xf0, 0x3e,
+ 0xb4, 0x3c, 0xc2, 0x03, 0x2a, 0xbd, 0xb0, 0xc5, 0x29, 0xbd, 0xc0, 0x7c, 0x42,
+ 0xbd, 0xea, 0x99, 0x7e, 0x3d, 0xd6, 0xbc, 0x15, 0x3d, 0xb9, 0xda, 0x37, 0xbd,
+ 0xd0, 0x21, 0x9e, 0x3c, 0x79, 0x2e, 0xab, 0xbb, 0x73, 0x17, 0xcd, 0xbc, 0x7c,
+ 0x01, 0xe3, 0x3c, 0xb7, 0xb8, 0xf2, 0x3c, 0x11, 0x4b, 0x45, 0x3d, 0x87, 0x86,
+ 0x9a, 0x3c, 0x2c, 0x70, 0x57, 0xbd, 0x55, 0xdf, 0x1d, 0xbd, 0xf5, 0x86, 0xa6,
+ 0xbc, 0x21, 0x96, 0x49, 0xbd, 0x36, 0x4c, 0x75, 0xbd, 0xc9, 0x1c, 0xa0, 0x3c,
+ 0x5d, 0xba, 0x26, 0x3d, 0xd6, 0x56, 0x02, 0x3d, 0x69, 0x90, 0x12, 0xbc, 0x08,
+ 0x5b, 0x0f, 0xbd, 0x81, 0xce, 0x92, 0xbc, 0x3a, 0xb8, 0x5f, 0x3d, 0x7a, 0xaf,
+ 0xe7, 0x3c, 0x4d, 0x4b, 0x60, 0xbc, 0x78, 0xc0, 0x6c, 0xbd, 0x85, 0x6f, 0xe7,
+ 0x3c, 0xaa, 0xc1, 0xb3, 0x3c, 0x8b, 0xe4, 0xb7, 0x3c, 0xdd, 0xd0, 0x39, 0x3d,
+ 0x48, 0x49, 0x1b, 0x3d, 0xe2, 0x74, 0x28, 0xbd, 0x86, 0x4a, 0x47, 0x3d, 0x30,
+ 0x77, 0xad, 0x3b, 0xe0, 0xa8, 0x0e, 0xbc, 0xec, 0x36, 0xd1, 0x3c, 0xe3, 0x01,
+ 0x8f, 0xbd, 0x56, 0x6c, 0x34, 0xbd, 0x8a, 0x99, 0x20, 0xbb, 0xb1, 0x89, 0x12,
+ 0x3d, 0xea, 0x43, 0x39, 0xbd, 0x26, 0x16, 0xd2, 0x3c, 0xe2, 0x88, 0xc8, 0x3c,
+ 0x63, 0x15, 0xa0, 0x3c, 0x8d, 0x95, 0x3a, 0x3d, 0x86, 0x69, 0x26, 0xbd, 0x4c,
+ 0x38, 0xdb, 0x3b, 0xe0, 0xfa, 0x49, 0x3d, 0x62, 0xdf, 0xb4, 0xbc, 0x6a, 0xe4,
+ 0x89, 0xbc, 0x63, 0x50, 0x6d, 0x3d, 0xfa, 0x35, 0x46, 0xbd, 0xcb, 0xcb, 0x8c,
+ 0xbc, 0x46, 0x94, 0x66, 0x3d, 0xdd, 0xf8, 0xa2, 0xbc, 0x00, 0x34, 0x8c, 0x3d,
+ 0x0a, 0xa1, 0x05, 0x3d, 0x73, 0x92, 0x91, 0xbd, 0x64, 0x3e, 0xf4, 0xbc, 0xcd,
+ 0x5a, 0xa4, 0xbc, 0xe6, 0xce, 0x4b, 0x3d, 0x68, 0xb0, 0xcf, 0xbc, 0x38, 0xd3,
+ 0xe2, 0x3b, 0xfd, 0x03, 0x38, 0xbd, 0x11, 0xc0, 0x92, 0xbd, 0xa8, 0x82, 0x50,
+ 0x3d, 0x2a, 0x9a, 0xaf, 0xbc, 0x0e, 0xea, 0x7b, 0x3d, 0x11, 0xf4, 0x95, 0xbc,
+ 0x34, 0xed, 0xb6, 0x3c, 0x2b, 0x26, 0x6f, 0xbd, 0x15, 0xad, 0x7c, 0x3d, 0x19,
+ 0xc6, 0xed, 0x3c, 0x00, 0xf8, 0x81, 0xbd, 0x74, 0x82, 0x63, 0xbd, 0x62, 0x76,
+ 0x53, 0xbd, 0x48, 0x4f, 0x78, 0x3d, 0x76, 0x0e, 0x5c, 0xbb, 0x24, 0x30, 0x30,
+ 0xbd, 0x86, 0x0a, 0x14, 0x3d, 0x08, 0x29, 0xb3, 0xbc, 0xef, 0x7c, 0x2a, 0xbd,
+ 0x90, 0xb8, 0x09, 0x3d, 0x47, 0x45, 0x66, 0xbc, 0x30, 0x23, 0xb7, 0xbc, 0x8f,
+ 0xd2, 0x5e, 0x3d, 0x31, 0x72, 0x33, 0x3d, 0x26, 0xdc, 0x88, 0xbd, 0xeb, 0x0b,
+ 0x24, 0xbc, 0x14, 0x3c, 0xe9, 0xbc, 0x38, 0xc6, 0xd3, 0x3c, 0x55, 0xd6, 0x09,
+ 0xbd, 0xe5, 0xf7, 0x21, 0xbb, 0x7d, 0x03, 0x0d, 0x3d, 0xe9, 0x91, 0xd6, 0xbb,
+ 0x00, 0x90, 0xe4, 0x3a, 0x21, 0x2c, 0x1a, 0x3d, 0x0c, 0xe1, 0x82, 0x3c, 0x0a,
+ 0xb6, 0x38, 0x3d, 0x6c, 0x03, 0xe9, 0x3c, 0x83, 0x86, 0x05, 0x3d, 0x01, 0x6e,
+ 0x86, 0x3d, 0x99, 0xc2, 0x47, 0xbd, 0x27, 0x07, 0x57, 0x3d, 0xed, 0xd2, 0x59,
+ 0x3d, 0x0f, 0xa1, 0x0a, 0xbc, 0x12, 0x62, 0x6c, 0x3d, 0x16, 0x50, 0xf8, 0x3b,
+ 0x00, 0xf3, 0xdc, 0x3c, 0x5c, 0x4e, 0xa6, 0xbc, 0xfa, 0x73, 0x42, 0x3c, 0xd2,
+ 0x38, 0x8a, 0xbd, 0x35, 0x94, 0x8d, 0xbc, 0x69, 0x22, 0x3e, 0xbd, 0x83, 0xec,
+ 0x6f, 0xbc, 0xb6, 0x37, 0xb4, 0x3c, 0xf1, 0xa7, 0x83, 0x3d, 0x62, 0xbc, 0x82,
+ 0x3d, 0x88, 0x5d, 0xb8, 0xbc, 0xdd, 0x4d, 0x96, 0xbc, 0xaa, 0x38, 0x23, 0xbd,
+ 0x88, 0x3f, 0x4d, 0xbc, 0xc5, 0x2d, 0xfc, 0x3c, 0x78, 0x63, 0x20, 0x3d, 0xe5,
+ 0x87, 0x88, 0x3d, 0x08, 0xed, 0x77, 0xbc, 0x38, 0xef, 0x85, 0xbc, 0x19, 0xc5,
+ 0x90, 0x3d, 0xba, 0xc7, 0x4e, 0x3d, 0xe4, 0xc2, 0xd6, 0x3c, 0xac, 0x97, 0x22,
+ 0xbc, 0xa4, 0x4d, 0x55, 0xbd, 0x02, 0x71, 0x8b, 0xbd, 0xce, 0x55, 0x86, 0x3d,
+ 0xf9, 0x00, 0x9c, 0xbc, 0xbc, 0x84, 0x51, 0x3d, 0x3c, 0xaa, 0x21, 0xbd, 0xb3,
+ 0x0f, 0x43, 0xbd, 0x15, 0x2e, 0x90, 0xbd, 0xa9, 0x5c, 0x7a, 0x3d, 0x11, 0x1e,
+ 0x4b, 0x3d, 0xc7, 0x35, 0xc9, 0xbc, 0x86, 0x61, 0x77, 0xbd, 0x5c, 0xbb, 0x21,
+ 0xbc, 0x39, 0x3c, 0x6d, 0x3d, 0xaa, 0xde, 0xdd, 0x3a, 0xe5, 0xad, 0x0b, 0xbd,
+ 0xd5, 0x2c, 0x8f, 0xbd, 0x9b, 0xd2, 0x40, 0xbc, 0xae, 0xd1, 0x27, 0x3d, 0xa4,
+ 0x43, 0x61, 0x3c, 0x96, 0x2f, 0x26, 0xbd, 0x4c, 0xdb, 0x50, 0xbd, 0xd0, 0xee,
+ 0x55, 0xbc, 0xa9, 0xdf, 0x62, 0x3d, 0xa9, 0xc7, 0x14, 0xbd, 0x02, 0x65, 0x41,
+ 0x3b, 0xdc, 0x7c, 0x20, 0x3c, 0xb5, 0xb9, 0x89, 0x3d, 0x43, 0xc8, 0x8f, 0xbd,
+ 0xe5, 0x6b, 0x3e, 0x3c, 0xcb, 0x96, 0x8d, 0xbd, 0xe8, 0x9b, 0x7d, 0xbd, 0xad,
+ 0x41, 0x91, 0x3d, 0x84, 0x7b, 0xc2, 0x3c, 0xe9, 0xf8, 0x8c, 0x3c, 0x6d, 0x06,
+ 0xf1, 0xbb, 0xac, 0xcc, 0x43, 0x3d, 0x11, 0xd2, 0xe3, 0x3c, 0x69, 0xb6, 0x76,
+ 0xbc, 0x19, 0x3b, 0x71, 0xbd, 0x82, 0x8a, 0xb9, 0xbc, 0x28, 0x56, 0x3a, 0x3d,
+ 0xf6, 0x2b, 0x3c, 0x3d, 0x0f, 0x6e, 0xe1, 0xbb, 0x96, 0x11, 0x84, 0xbc, 0xae,
+ 0xf7, 0x81, 0x3d, 0xd2, 0xd1, 0x80, 0x3d, 0x97, 0xc3, 0xe6, 0xbc, 0x89, 0xe2,
+ 0x57, 0x3c, 0x3d, 0x6e, 0x8e, 0xbc, 0xca, 0x02, 0x4d, 0xbd, 0x62, 0x3c, 0xc1,
+ 0xbc, 0x16, 0x10, 0xed, 0xba, 0x3f, 0xe1, 0xef, 0x3c, 0x0a, 0x5c, 0xab, 0xbc,
+ 0x21, 0xad, 0xd1, 0xbb, 0xbc, 0xfe, 0x32, 0x3c, 0xac, 0x6c, 0x71, 0xbd, 0x15,
+ 0x98, 0x14, 0x3d, 0xb6, 0xee, 0x3a, 0x3c, 0x35, 0x4c, 0x87, 0x3d, 0xb6, 0xcd,
+ 0x4c, 0x3d, 0x10, 0xf7, 0xcc, 0x3b, 0xdb, 0x8a, 0x19, 0xbd, 0x00, 0x38, 0xdb,
+ 0xb8, 0xb3, 0x1b, 0x8e, 0xbd, 0x50, 0xa8, 0x41, 0xbd, 0x64, 0x53, 0x85, 0xbd,
+ 0x46, 0xcf, 0xcd, 0xbb, 0x65, 0xaf, 0xa4, 0x3c, 0x78, 0x82, 0x22, 0xbd, 0xb1,
+ 0xb2, 0x19, 0xbd, 0xaa, 0x2b, 0xe5, 0xbc, 0xb8, 0x9c, 0x3d, 0x3d, 0x30, 0x82,
+ 0x8c, 0x3c, 0xd9, 0x2c, 0x89, 0xbd, 0x27, 0x33, 0x8f, 0x3d, 0x20, 0x09, 0x87,
+ 0x3d, 0x50, 0x15, 0x05, 0xbd, 0x4b, 0xc1, 0x96, 0xbd, 0x82, 0x2a, 0x33, 0x3d,
+ 0xc1, 0x9b, 0x6c, 0xbd, 0xac, 0x51, 0x0c, 0xbd, 0xd7, 0xbc, 0x59, 0xbd, 0x69,
+ 0x2b, 0x37, 0x3c, 0xc0, 0xef, 0x26, 0xbd, 0xc8, 0xba, 0x59, 0x3c, 0xda, 0x1b,
+ 0x18, 0xbd, 0x11, 0xfb, 0x8b, 0x3d, 0xbf, 0xc8, 0x3d, 0xbd, 0x52, 0x1b, 0x00,
+ 0x3d, 0xe8, 0x9d, 0x4d, 0xba, 0xe4, 0x9d, 0x44, 0x3d, 0x87, 0x63, 0x06, 0xbd,
+ 0x76, 0xc3, 0x83, 0x3d, 0x32, 0xe3, 0x84, 0xbd, 0x5a, 0x34, 0x11, 0x3d, 0xe0,
+ 0xb2, 0x0e, 0xbd, 0xa8, 0x02, 0x8a, 0xbd, 0x9c, 0x92, 0x10, 0x3d, 0x47, 0xfd,
+ 0x90, 0xbd, 0x24, 0x45, 0x3c, 0x3d, 0x67, 0x62, 0x96, 0xbd, 0xbb, 0x91, 0x79,
+ 0xbd, 0x80, 0x99, 0x5b, 0xbd, 0x93, 0x7f, 0x83, 0xbd, 0x75, 0x82, 0x10, 0xbd,
+ 0x07, 0xb0, 0xa7, 0xbb, 0x5b, 0x41, 0x66, 0xbd, 0x82, 0xeb, 0x7a, 0xbc, 0x52,
+ 0xca, 0x57, 0xbd, 0x7e, 0xe3, 0x66, 0x3c, 0xab, 0x22, 0x68, 0xbd, 0x51, 0x4b,
+ 0xa9, 0xbc, 0x5e, 0x13, 0xa7, 0xbc, 0xe3, 0x6b, 0x88, 0xbb, 0x80, 0x4c, 0x02,
+ 0x3d, 0xf3, 0x3c, 0x59, 0xbd, 0xb2, 0x10, 0x7e, 0x3d, 0x1a, 0x9d, 0x13, 0xbd,
+ 0x8d, 0xd0, 0x5b, 0x3d, 0xca, 0x7a, 0x74, 0x3d, 0x16, 0x53, 0x4b, 0x3d, 0xc9,
+ 0x0a, 0x89, 0xbd, 0x44, 0x7e, 0x1b, 0xbc, 0x11, 0xca, 0xb2, 0xbc, 0x09, 0xe0,
+ 0x27, 0xbd, 0xe4, 0xed, 0xfb, 0x3c, 0xe4, 0x1a, 0xf9, 0xbc, 0x50, 0x47, 0x2e,
+ 0x3d, 0x1b, 0xed, 0x4e, 0x3d, 0x6d, 0x7c, 0x81, 0xbd, 0x72, 0x2a, 0xdc, 0xbc,
+ 0x6f, 0xa7, 0x59, 0x3d, 0xc0, 0xbd, 0x1e, 0xbc, 0xb2, 0xaf, 0xb9, 0xbc, 0x07,
+ 0x39, 0xba, 0xbc, 0xf4, 0x63, 0x46, 0xbd, 0x45, 0x7b, 0x1a, 0x3d, 0x79, 0xe9,
+ 0xf7, 0x3c, 0x9e, 0xba, 0xf0, 0xbc, 0xc1, 0x09, 0xbb, 0x3c, 0x0e, 0x21, 0x52,
+ 0xbc, 0xed, 0x78, 0x43, 0x3b, 0x73, 0x07, 0x62, 0x3d, 0x71, 0x92, 0x84, 0x3d,
+ 0x7b, 0x59, 0xb2, 0xbc, 0xe0, 0xba, 0x34, 0xbc, 0x0c, 0x23, 0x14, 0xbd, 0x93,
+ 0x93, 0x1f, 0xbd, 0xb7, 0x20, 0x6b, 0xbd, 0x8e, 0x60, 0x8c, 0xbd, 0x00, 0xe9,
+ 0x8c, 0x3d, 0xdf, 0xb4, 0xe1, 0xbb, 0xa0, 0x1a, 0xbf, 0xbc, 0xf6, 0x4c, 0x80,
+ 0x3c, 0x74, 0xeb, 0x18, 0x3d, 0x28, 0x64, 0x8c, 0x3c, 0xba, 0xbd, 0xd3, 0xbc,
+ 0x56, 0xc0, 0x6f, 0x3d, 0x09, 0x02, 0x88, 0xbd, 0x02, 0xd5, 0x58, 0x3d, 0xc1,
+ 0x57, 0x31, 0x3d, 0xfc, 0x52, 0x48, 0x3d, 0x61, 0xdc, 0x64, 0xbd, 0xa7, 0xc3,
+ 0x2b, 0x3d, 0x3b, 0xea, 0x13, 0xbc, 0x0e, 0xac, 0x3c, 0xbd, 0x7e, 0x92, 0x86,
+ 0x3c, 0xbf, 0x14, 0x29, 0xbc, 0xf3, 0x91, 0x7f, 0x3d, 0xf1, 0x9a, 0xac, 0x3c,
+ 0xf8, 0xf5, 0x76, 0x3c, 0xa2, 0x0f, 0x86, 0xbd, 0xc3, 0xeb, 0xb7, 0x3a, 0xff,
+ 0x56, 0x6c, 0x3d, 0x1c, 0xcc, 0x5a, 0xbd, 0x97, 0x3f, 0x78, 0x3d, 0x92, 0xea,
+ 0x9d, 0xbc, 0xbc, 0x51, 0x6a, 0x3d, 0xc5, 0x44, 0x65, 0x3c, 0xbc, 0x66, 0x30,
+ 0x3d, 0x70, 0xe2, 0x26, 0xbd, 0x2e, 0xbe, 0x19, 0x3d, 0x5e, 0xf3, 0x82, 0x3d,
+ 0x32, 0x2f, 0x86, 0xbd, 0x53, 0x73, 0x81, 0x3d, 0x86, 0xef, 0xa2, 0xbc, 0xdb,
+ 0xda, 0x62, 0xbd, 0x82, 0x4e, 0xd3, 0xbc, 0x80, 0xed, 0x93, 0xba, 0x50, 0xc2,
+ 0xd6, 0x3b, 0x82, 0x22, 0xf1, 0xbc, 0x49, 0xd7, 0x7a, 0xbc, 0xe9, 0x00, 0x85,
+ 0x3d, 0xb7, 0x12, 0x4c, 0xbd, 0x90, 0x25, 0x08, 0xb9, 0x2e, 0x76, 0xcb, 0xbc,
+ 0x47, 0x11, 0x97, 0xbd, 0x06, 0x96, 0x2f, 0x3d, 0x44, 0x62, 0x65, 0x3d, 0xe7,
+ 0xa5, 0x1f, 0x3d, 0x2e, 0x9e, 0xbf, 0xbc, 0x00, 0xd8, 0x6c, 0xbc, 0x20, 0xd1,
+ 0x44, 0xbb, 0x19, 0x61, 0x32, 0x3c, 0xf4, 0x7a, 0x30, 0x3d, 0x11, 0x7b, 0xe4,
+ 0xbc, 0x6e, 0x1c, 0x50, 0x3b, 0x9b, 0x64, 0x64, 0xbd, 0x89, 0x52, 0x1f, 0x3d,
+ 0x65, 0x20, 0x2c, 0x3d, 0xb9, 0x45, 0xd7, 0x3c, 0xe8, 0x37, 0x8e, 0x3d, 0x40,
+ 0x5e, 0x50, 0x3c, 0x7a, 0x66, 0x68, 0xbd, 0x45, 0x1b, 0x31, 0xbd, 0xcb, 0x31,
+ 0x47, 0x3d, 0x2f, 0x4a, 0xb3, 0x3c, 0x97, 0x3d, 0xbc, 0xbc, 0x55, 0x24, 0x80,
+ 0xbd, 0x85, 0x56, 0x69, 0xbc, 0x0e, 0x0a, 0x34, 0x3d, 0xec, 0xe8, 0x54, 0xbd,
+ 0xeb, 0x92, 0x6d, 0xbd, 0xe2, 0x61, 0x41, 0x3c, 0xf3, 0x3c, 0x93, 0xbd, 0x10,
+ 0xea, 0xbd, 0xb7, 0x42, 0xec, 0x3b, 0xbd, 0x66, 0xe6, 0x80, 0xbd, 0x84, 0xd9,
+ 0x85, 0x3d, 0x2c, 0xd8, 0xac, 0x3c, 0x72, 0x8e, 0x48, 0x3c, 0x11, 0xa8, 0x9c,
+ 0xbc, 0x08, 0x31, 0x39, 0x3d, 0x0f, 0x3c, 0x7c, 0x3d, 0x58, 0xba, 0x25, 0x3d,
+ 0xce, 0x5f, 0x27, 0x3c, 0x7c, 0x7b, 0x65, 0x3d, 0x96, 0xd6, 0x1e, 0x3d, 0x48,
+ 0x03, 0x73, 0xbd, 0x84, 0x7a, 0x26, 0xbd, 0x92, 0x82, 0x72, 0xbd, 0xeb, 0x8a,
+ 0x0c, 0xbd, 0x84, 0xe7, 0x5f, 0xbd, 0x0b, 0x83, 0xfc, 0x3c, 0xfb, 0xed, 0x8e,
+ 0xbd, 0x52, 0xe2, 0x65, 0x3d, 0xd1, 0xa1, 0x4e, 0xbb, 0x5f, 0x41, 0xce, 0xbc,
+ 0x4b, 0x3d, 0x15, 0xbb, 0x20, 0xc8, 0x90, 0xbd, 0x29, 0xfb, 0x28, 0xbd, 0x04,
+ 0x06, 0x8a, 0xbd, 0x8a, 0x65, 0x30, 0x3d, 0x00, 0x49, 0x93, 0x3a, 0x6e, 0xb0,
+ 0x61, 0x3d, 0x94, 0xcc, 0x87, 0xbc, 0x10, 0x13, 0x3a, 0x3d, 0x5a, 0x7e, 0x7f,
+ 0xbd, 0x4c, 0x1f, 0xd7, 0xbc, 0x82, 0xb3, 0x1e, 0x3d, 0x7e, 0xca, 0x00, 0xbc,
+ 0xe7, 0x69, 0xe4, 0xbb, 0xd5, 0xad, 0x1f, 0x3d, 0xb6, 0x02, 0x72, 0x3d, 0x4b,
+ 0x4f, 0x91, 0xbc, 0x69, 0xd1, 0xd2, 0xbc, 0xf4, 0x42, 0xce, 0x3c, 0xf9, 0x95,
+ 0x8f, 0x3d, 0x5f, 0xd1, 0x52, 0x3c, 0xec, 0xd5, 0x67, 0x3d, 0x79, 0x25, 0x84,
+ 0xba, 0xf3, 0x43, 0x5f, 0x3d, 0x39, 0xdc, 0x2b, 0x3d, 0xc6, 0x40, 0x67, 0xbd,
+ 0xbb, 0xfa, 0x02, 0xbd, 0xf6, 0x13, 0x31, 0xbc, 0x1a, 0x8a, 0x5b, 0x3d, 0x28,
+ 0x8c, 0x3d, 0xba, 0xbd, 0x41, 0x46, 0x3d, 0xc8, 0xb7, 0x80, 0xbb, 0xd7, 0xc5,
+ 0x71, 0x3b, 0x2a, 0x9d, 0x51, 0xbd, 0xfb, 0xe8, 0x66, 0xbd, 0x49, 0x55, 0xad,
+ 0xbc, 0x80, 0x74, 0x36, 0xbd, 0x00, 0x48, 0xc7, 0xbc, 0xec, 0x9e, 0xf8, 0x3c,
+ 0x2d, 0x31, 0x7e, 0x3d, 0x5d, 0xdd, 0x94, 0xbd, 0xfd, 0xce, 0x57, 0x3d, 0xe2,
+ 0x28, 0x0b, 0xbc, 0x00, 0xec, 0x38, 0x3d, 0x88, 0x2f, 0xc9, 0xbc, 0xe8, 0x5d,
+ 0x69, 0x3d, 0xd8, 0x1a, 0x04, 0xbc, 0xa5, 0x91, 0x78, 0x3d, 0x4f, 0x30, 0x06,
+ 0xbc, 0xdf, 0x59, 0x51, 0x3d, 0x00, 0xb6, 0x8f, 0x3a, 0x9f, 0x7e, 0x76, 0xbd,
+ 0x66, 0xc5, 0x1d, 0x3d, 0x99, 0x26, 0x91, 0xbd, 0x82, 0x51, 0x8e, 0xbd, 0xf6,
+ 0xf9, 0x81, 0xbc, 0x60, 0x4a, 0x9d, 0x3c, 0x40, 0xfa, 0xf8, 0xbb, 0x96, 0x7a,
+ 0xf4, 0xbb, 0x8d, 0xfb, 0x02, 0xbd, 0xf0, 0xf1, 0xa8, 0x3c, 0xc9, 0xa7, 0x38,
+ 0xbd, 0x85, 0xc8, 0x4b, 0xbc, 0xc8, 0x56, 0x13, 0x3d, 0x61, 0x4d, 0x88, 0xbd,
+ 0x4e, 0xe1, 0x42, 0x3d, 0xec, 0x20, 0x7c, 0xbc, 0x49, 0x1c, 0x91, 0x3d, 0x40,
+ 0xea, 0x8d, 0xbd, 0x90, 0xa9, 0x5b, 0xbd, 0xe1, 0x98, 0x8e, 0xbd, 0x2f, 0x06,
+ 0xed, 0xbc, 0xa9, 0xa1, 0xe0, 0x3c, 0x54, 0xa1, 0x76, 0xbd, 0x21, 0x88, 0x70,
+ 0xbd, 0x16, 0x25, 0x23, 0xbd, 0xb6, 0xdf, 0x4f, 0x3d, 0xaf, 0x39, 0x57, 0x3d,
+ 0x3f, 0xfa, 0x2a, 0xbd, 0xda, 0x39, 0xcf, 0x3c, 0xf6, 0x8b, 0x5e, 0x3d, 0x49,
+ 0x9e, 0xec, 0xbc, 0x5c, 0x6b, 0x7f, 0x3d, 0x38, 0xf8, 0x8a, 0xbc, 0x15, 0xc8,
+ 0x8a, 0xbd, 0xc9, 0xb5, 0x3f, 0x3d, 0x1c, 0xcd, 0x97, 0xbd, 0x3c, 0xa4, 0xb0,
+ 0xba, 0x85, 0x05, 0x18, 0xbc, 0x0b, 0xf9, 0x81, 0xbd, 0xa7, 0x64, 0x84, 0xbc,
+ 0x17, 0xa4, 0x86, 0x3d, 0x74, 0xbc, 0x6d, 0xbd, 0xbe, 0xaa, 0xe0, 0x3c, 0x70,
+ 0x71, 0x01, 0x3d, 0x34, 0x7c, 0x3b, 0x3d, 0xf7, 0xe5, 0x4a, 0x3d, 0x0b, 0x8a,
+ 0xe2, 0x3c, 0x3a, 0xce, 0x8c, 0xbd, 0xc3, 0x45, 0x17, 0xbc, 0x06, 0x14, 0x40,
+ 0xbd, 0xc8, 0x4e, 0x2a, 0x3d, 0x1e, 0x87, 0x38, 0x3d, 0x12, 0xe6, 0x8e, 0x3d,
+ 0x5d, 0x26, 0x24, 0xbc, 0x96, 0x16, 0x0e, 0xbb, 0xbd, 0x7b, 0xe7, 0xbb, 0xee,
+ 0xf1, 0x86, 0xbc, 0x21, 0x44, 0xe1, 0xba, 0x34, 0xc7, 0x76, 0xbd, 0x84, 0x41,
+ 0x0f, 0xba, 0x79, 0x2a, 0x77, 0x3d, 0xe0, 0x52, 0xce, 0x3c, 0xd3, 0xbd, 0x0c,
+ 0x3d, 0xff, 0x57, 0x8b, 0x3d, 0xc6, 0x60, 0xed, 0x3b, 0xfc, 0x72, 0x7f, 0xbd,
+ 0x18, 0xaa, 0x20, 0x3c, 0xcd, 0x28, 0x0d, 0x3d, 0x18, 0xf7, 0xdb, 0x3a, 0xd6,
+ 0x93, 0x6a, 0x3d, 0x46, 0x48, 0x55, 0xbd, 0x01, 0x2f, 0x7c, 0x3d, 0x75, 0x2d,
+ 0x80, 0x3c, 0x4c, 0x22, 0xd0, 0x3c, 0x17, 0x6d, 0x8b, 0xbb, 0x34, 0x25, 0xec,
+ 0xbc, 0x04, 0x8e, 0x56, 0x3d, 0xd8, 0xab, 0x88, 0x3d, 0x20, 0x51, 0x88, 0xbc,
+ 0x71, 0xdb, 0xd4, 0x3c, 0x41, 0xe5, 0x03, 0xbd, 0x28, 0x8d, 0x0c, 0x3c, 0xa1,
+ 0xe2, 0x7d, 0xbd, 0x10, 0xb2, 0xcd, 0x3c, 0x3b, 0xa9, 0xdf, 0xbc, 0x2d, 0x71,
+ 0x73, 0x3d, 0xfa, 0xcb, 0xd3, 0x3c, 0xb4, 0x04, 0x10, 0xbb, 0xca, 0xec, 0x8c,
+ 0xbd, 0xd1, 0x28, 0x9a, 0x3c, 0x0f, 0x12, 0x2f, 0x3d, 0x93, 0x67, 0x2a, 0x3d,
+ 0x94, 0x98, 0xb7, 0x3c, 0x8e, 0x0f, 0xae, 0xbc, 0xc6, 0x7c, 0xd9, 0x3c, 0xa0,
+ 0x4d, 0x3b, 0xbb, 0x20, 0xf7, 0xd5, 0x3c, 0x7b, 0xa2, 0x72, 0xbd, 0xc5, 0xb9,
+ 0xbd, 0x3c, 0x59, 0x61, 0x1e, 0x3d, 0x8b, 0x95, 0x8c, 0xbd, 0xbe, 0xbf, 0x9b,
+ 0xbc, 0x0f, 0x63, 0x7b, 0x3d, 0x92, 0x1a, 0x66, 0x3c, 0x4f, 0xef, 0xa0, 0x38,
+ 0x8c, 0x24, 0xd9, 0xbc, 0x7d, 0xfa, 0xf8, 0xbc, 0xde, 0xe7, 0x85, 0x3d, 0xa2,
+ 0xd6, 0x13, 0xbd, 0x5e, 0x38, 0x3d, 0xbd, 0xe7, 0x7e, 0xb0, 0x3d, 0xc5, 0x86,
+ 0xba, 0xbc, 0x49, 0x12, 0x93, 0xbd, 0x8e, 0x9e, 0xea, 0x3d, 0x48, 0x93, 0x84,
+ 0xbd, 0x33, 0x48, 0xc7, 0xbc, 0x23, 0x1f, 0x5f, 0x3d, 0x51, 0x20, 0xb5, 0xbb,
+ 0x93, 0xfa, 0x90, 0x3d, 0x99, 0xe1, 0x31, 0xbd, 0x82, 0x3e, 0x89, 0xbd, 0x99,
+ 0x5e, 0xe0, 0xbc, 0x0c, 0xc2, 0x03, 0x3d, 0xe2, 0x69, 0xb2, 0x3c, 0x3d, 0xdb,
+ 0x6e, 0xbd, 0x37, 0xd2, 0x36, 0x3c, 0x89, 0x66, 0x1e, 0xbd, 0xeb, 0x8a, 0x88,
+ 0x3d, 0x1a, 0x34, 0x3d, 0x3d, 0x84, 0x3a, 0x24, 0x3d, 0x2f, 0xd2, 0x78, 0xbd,
+ 0x45, 0x13, 0x82, 0x3d, 0x70, 0x07, 0x94, 0x3d, 0xf9, 0xc5, 0x7f, 0xbd, 0x40,
+ 0x1b, 0x04, 0xbd, 0x74, 0x6f, 0x3a, 0x3d, 0xa0, 0x7d, 0xf8, 0xbc, 0x7e, 0x95,
+ 0x61, 0x3d, 0xc0, 0x56, 0x5d, 0x3b, 0x16, 0xa4, 0x06, 0x3d, 0x4b, 0x46, 0xbf,
+ 0xbd, 0x64, 0x97, 0xe8, 0xbc, 0x79, 0xbd, 0x75, 0x3a, 0x50, 0xb6, 0x6a, 0x3c,
+ 0x7b, 0xcc, 0x29, 0x3c, 0xa8, 0x8f, 0x17, 0x3d, 0xf0, 0xf6, 0xbc, 0x3b, 0x48,
+ 0x26, 0x78, 0xbd, 0x96, 0x9b, 0xe4, 0x3b, 0x87, 0xe5, 0x70, 0x3c, 0x88, 0xf2,
+ 0xac, 0xbb, 0x79, 0x75, 0x05, 0x3c, 0x06, 0x38, 0xa5, 0x3d, 0x8b, 0x4e, 0x0a,
+ 0x3d, 0xf9, 0x2d, 0x95, 0x3d, 0x08, 0xca, 0x7f, 0x3d, 0xc7, 0x5e, 0x1c, 0x3d,
+ 0xf2, 0xbc, 0x57, 0xbc, 0xc6, 0xaf, 0x5a, 0xbd, 0x7f, 0xc5, 0xc7, 0x3c, 0x69,
+ 0x5c, 0x00, 0x3c, 0x69, 0xaf, 0x8a, 0x3d, 0x60, 0x07, 0x01, 0x3d, 0xc3, 0x8f,
+ 0xff, 0x3a, 0xd5, 0x44, 0x1d, 0x3d, 0x66, 0x63, 0x2a, 0xbd, 0xe9, 0xd3, 0x9a,
+ 0xbd, 0x50, 0xc0, 0x0a, 0xbd, 0x32, 0x2d, 0xc6, 0xbc, 0xf0, 0xb1, 0xd4, 0xbb,
+ 0x48, 0xcc, 0xdc, 0x3a, 0xcd, 0x33, 0x6f, 0x3d, 0xea, 0x34, 0x95, 0xbd, 0xb8,
+ 0x4b, 0x2f, 0xbc, 0xe0, 0xa1, 0x0f, 0xbc, 0x0f, 0xee, 0x01, 0x3c, 0x5e, 0x3d,
+ 0x35, 0x3d, 0x6e, 0x51, 0x81, 0xbd, 0xfa, 0x8d, 0x8b, 0x3c, 0x51, 0xc5, 0x0a,
+ 0x3d, 0x8a, 0xa8, 0xc4, 0xbc, 0x66, 0x86, 0x19, 0xbd, 0x50, 0x08, 0x8e, 0x3d,
+ 0x22, 0x74, 0xdd, 0x3b, 0xdb, 0xf4, 0xea, 0x3a, 0xa1, 0x2d, 0x68, 0x3d, 0x7e,
+ 0x82, 0xc6, 0x3d, 0xe6, 0x89, 0x16, 0xbd, 0xe2, 0x72, 0x78, 0xbd, 0x25, 0xe0,
+ 0x82, 0xbd, 0xc2, 0x61, 0x66, 0x3c, 0xb2, 0x57, 0x66, 0x3d, 0x47, 0xa3, 0x40,
+ 0xbc, 0xf7, 0x00, 0x3e, 0xbd, 0x78, 0x7e, 0x42, 0x3d, 0xc3, 0x09, 0x83, 0x3d,
+ 0x1d, 0xac, 0x09, 0x3d, 0x37, 0xc0, 0xd7, 0x3b, 0xae, 0xbb, 0x34, 0xbd, 0x12,
+ 0x34, 0x95, 0x3d, 0xf8, 0x3f, 0x20, 0x3d, 0xa8, 0x30, 0x0b, 0xbd, 0x09, 0x71,
+ 0x02, 0xbd, 0xb7, 0xbc, 0x80, 0x3d, 0x9e, 0x24, 0x48, 0x3d, 0xbb, 0xe7, 0xa6,
+ 0x3d, 0x59, 0xd4, 0x28, 0xbd, 0x98, 0x85, 0x14, 0xbc, 0x25, 0xbe, 0xae, 0x3c,
+ 0x1b, 0x82, 0x85, 0x3c, 0x6c, 0x23, 0xc3, 0x3c, 0x7a, 0xe2, 0x03, 0xbd, 0x75,
+ 0x65, 0x3a, 0x3d, 0x9e, 0x34, 0x76, 0x3b, 0xe1, 0x36, 0x05, 0x3d, 0xd6, 0x9a,
+ 0x37, 0xbd, 0x66, 0x1c, 0x99, 0x3c, 0x9d, 0x65, 0x2a, 0xbd, 0xc3, 0xdd, 0x60,
+ 0xbc, 0x6c, 0xa8, 0x06, 0xbd, 0xb8, 0xb4, 0x85, 0xbd, 0xca, 0x5d, 0x65, 0x3c,
+ 0xe2, 0xce, 0xfa, 0x3c, 0x18, 0xe2, 0x29, 0x3d, 0x4a, 0xd0, 0x31, 0xbc, 0x78,
+ 0xd4, 0x52, 0x3d, 0x7a, 0x03, 0x47, 0x3d, 0x0e, 0x3a, 0xde, 0xbc, 0xd1, 0x1c,
+ 0x72, 0xbd, 0x39, 0xb2, 0x8c, 0xbd, 0x1a, 0x1c, 0xba, 0xbd, 0x20, 0x30, 0x5e,
+ 0x3b, 0x4b, 0x1f, 0x40, 0xbc, 0x70, 0x8b, 0xbd, 0x3c, 0x02, 0x15, 0x12, 0xbd,
+ 0x92, 0x7d, 0x52, 0xbd, 0x98, 0x66, 0x78, 0xbc, 0x73, 0x75, 0x74, 0x3d, 0x91,
+ 0x42, 0x88, 0x3d, 0x8a, 0x00, 0x26, 0xbd, 0xca, 0xd7, 0x86, 0x3d, 0xea, 0xcb,
+ 0x66, 0xbd, 0xb8, 0x28, 0x26, 0x3c, 0xd5, 0x36, 0x90, 0xbd, 0xfa, 0x19, 0x5a,
+ 0x3d, 0xb2, 0x02, 0x81, 0xbd, 0xe3, 0x63, 0x8d, 0x3d, 0xad, 0x2e, 0x0e, 0x3d,
+ 0x01, 0x74, 0x4b, 0xbd, 0xa3, 0x91, 0x08, 0x3d, 0x6d, 0xa0, 0x23, 0xbd, 0x84,
+ 0xbd, 0x0a, 0xbd, 0x28, 0x54, 0x95, 0xba, 0x1c, 0x4a, 0x2f, 0x3d, 0xf0, 0x67,
+ 0xaf, 0xbc, 0xcc, 0x1e, 0x18, 0x3d, 0xd5, 0xf0, 0x29, 0x3d, 0xd9, 0x19, 0x0a,
+ 0xbc, 0x91, 0xf8, 0x1c, 0xbc, 0xf0, 0x4b, 0x1a, 0x3d, 0xc8, 0xdc, 0x52, 0xbc,
+ 0x65, 0x2b, 0x6c, 0xbd, 0x9f, 0x08, 0x9a, 0xbd, 0x11, 0xd4, 0x9e, 0xbc, 0xb0,
+ 0xa3, 0x0d, 0x3c, 0x20, 0x50, 0xd7, 0x3c, 0x65, 0xfc, 0xb7, 0xbc, 0x43, 0xf5,
+ 0x0d, 0xbd, 0xb9, 0x3c, 0x2a, 0x3d, 0x66, 0xb3, 0x5b, 0x3d, 0x6d, 0x26, 0xa0,
+ 0x3d, 0x3a, 0xc0, 0x15, 0xbb, 0x67, 0x1b, 0x0b, 0x3c, 0x20, 0x72, 0xa6, 0xbd,
+ 0xe2, 0x14, 0xa5, 0xbc, 0x37, 0x10, 0x92, 0x3d, 0x24, 0x2d, 0x1c, 0x3d, 0x47,
+ 0xbd, 0x2b, 0xbd, 0x68, 0x0f, 0xa5, 0x3d, 0x96, 0x58, 0x98, 0x3d, 0x25, 0x20,
+ 0xd3, 0x3b, 0xc2, 0x1b, 0xbd, 0x3d, 0x17, 0x2a, 0xa5, 0xbb, 0x34, 0x7e, 0x47,
+ 0x3d, 0x36, 0xb6, 0xd0, 0x3b, 0x6a, 0xba, 0xf3, 0x3c, 0x54, 0x95, 0x25, 0xbd,
+ 0x99, 0x51, 0x81, 0x3d, 0xe6, 0x1b, 0x20, 0xbc, 0x2e, 0xc2, 0x3b, 0xbd, 0xb8,
+ 0xa6, 0x17, 0xbd, 0x86, 0x1f, 0xd7, 0x3c, 0x60, 0x69, 0x8d, 0x3d, 0x00, 0x02,
+ 0x76, 0xbd, 0x86, 0xdb, 0x85, 0x3b, 0x52, 0xb1, 0xd7, 0x3d, 0x7c, 0xd1, 0x4f,
+ 0xbd, 0xb0, 0xe7, 0x13, 0xbd, 0xee, 0xe2, 0x0f, 0x3d, 0x2e, 0x0a, 0x11, 0xbd,
+ 0x59, 0x7e, 0x04, 0xbd, 0xf1, 0xdf, 0x10, 0xbc, 0x9f, 0xfd, 0x90, 0xbc, 0x0a,
+ 0xec, 0x47, 0x3c, 0x9b, 0x06, 0x5a, 0x3d, 0x0e, 0xe3, 0xee, 0xbc, 0x3b, 0xbf,
+ 0xc7, 0x3b, 0x1e, 0xc7, 0x17, 0xbd, 0x65, 0x6d, 0x75, 0x3c, 0x81, 0x92, 0xc3,
+ 0x3c, 0xee, 0x48, 0x9e, 0x3c, 0x6d, 0x2e, 0x4f, 0xbd, 0x42, 0x85, 0x64, 0xbd,
+ 0xe9, 0x0a, 0xbb, 0xbc, 0x73, 0x3f, 0x40, 0xbd, 0xbd, 0x8c, 0xae, 0x3b, 0x4a,
+ 0xae, 0x31, 0x3d, 0x9e, 0x39, 0xfd, 0x3c, 0xd7, 0x4e, 0xe0, 0xbd, 0xf6, 0x05,
+ 0x05, 0xbd, 0xbf, 0x61, 0x31, 0x3c, 0xba, 0x2f, 0x51, 0x3d, 0x16, 0xef, 0xdd,
+ 0x3c, 0x23, 0x64, 0x18, 0x3c, 0x44, 0x4b, 0xce, 0xbc, 0x13, 0xbd, 0xd7, 0xbc,
+ 0xc8, 0xc8, 0xb8, 0xbc, 0x76, 0x69, 0x19, 0xbd, 0x76, 0x51, 0x9c, 0xbd, 0xbe,
+ 0xbc, 0x7d, 0x3d, 0xa3, 0xa2, 0x74, 0x3d, 0xfe, 0xad, 0x06, 0x3c, 0x74, 0xb4,
+ 0x0f, 0x3b, 0x9f, 0x83, 0x8d, 0x3d, 0xa5, 0x84, 0x70, 0x3d, 0x99, 0xa1, 0xe6,
+ 0xbc, 0xf2, 0xf1, 0xbd, 0xbc, 0x29, 0xd8, 0x42, 0xbc, 0x48, 0xb0, 0xa7, 0x3c,
+ 0xce, 0x31, 0x0b, 0xbd, 0x8b, 0xef, 0x39, 0x3d, 0xc5, 0x28, 0xa4, 0x3c, 0xcd,
+ 0x1b, 0xb7, 0x3c, 0x3f, 0x50, 0x55, 0xbd, 0xf4, 0xa8, 0x9d, 0x3d, 0xe3, 0xdb,
+ 0xac, 0x3c, 0x5c, 0xae, 0x68, 0xbc, 0x8e, 0xf1, 0x0f, 0xbc, 0x17, 0x29, 0x87,
+ 0x3c, 0x19, 0x45, 0x23, 0xbd, 0xf0, 0x0f, 0x12, 0xbd, 0x06, 0x74, 0x8b, 0xbd,
+ 0x10, 0x65, 0x00, 0x3d, 0xa3, 0x9d, 0x8a, 0x3d, 0x1e, 0xf4, 0x3d, 0x3d, 0x4e,
+ 0x40, 0x7b, 0x3c, 0xa0, 0xc8, 0xf7, 0xbb, 0x2e, 0x19, 0x1a, 0xbc, 0x37, 0x47,
+ 0x36, 0xbd, 0x8b, 0x65, 0x6d, 0x3d, 0xc0, 0xcd, 0x21, 0xbd, 0x60, 0xb6, 0xa3,
+ 0xbb, 0xa9, 0x58, 0x42, 0xbc, 0x94, 0x1c, 0x73, 0xbd, 0x82, 0xa5, 0xad, 0xbc,
+ 0x51, 0xe5, 0xb5, 0x3d, 0xbd, 0xa1, 0x59, 0x3d, 0x13, 0x5b, 0xdb, 0xbc, 0x44,
+ 0xdc, 0xd3, 0xbc, 0xc8, 0x3f, 0xa5, 0x3d, 0x5d, 0x7c, 0x68, 0x3d, 0xcd, 0xb4,
+ 0xa7, 0xbc, 0x58, 0x2b, 0x48, 0x3d, 0xe6, 0x22, 0xf6, 0xbc, 0xde, 0x4b, 0x0b,
+ 0xbd, 0x71, 0x8f, 0x44, 0xbd, 0x8d, 0xa0, 0x17, 0xbd, 0xd3, 0xd3, 0x36, 0x3d,
+ 0x40, 0x04, 0x3c, 0xbd, 0x4a, 0xdf, 0x82, 0x3b, 0x23, 0x72, 0x20, 0x3d, 0xf5,
+ 0x84, 0x80, 0xbd, 0xf9, 0x1c, 0xf3, 0xbc, 0x84, 0xd9, 0x86, 0xbd, 0x28, 0x42,
+ 0x48, 0xbd, 0x90, 0xd7, 0x32, 0x3d, 0x80, 0x98, 0x01, 0xbc, 0x7f, 0x7a, 0x82,
+ 0xbd, 0x59, 0x12, 0xf3, 0x3c, 0x9b, 0x63, 0xaa, 0xbc, 0x5e, 0x84, 0xb5, 0xbd,
+ 0x95, 0x77, 0x90, 0x3d, 0xad, 0x26, 0xb4, 0xbd, 0xda, 0xfb, 0x0a, 0xbd, 0x44,
+ 0x70, 0x73, 0x3d, 0x70, 0x45, 0x41, 0x3d, 0xe6, 0x6b, 0x73, 0x3c, 0x93, 0x01,
+ 0x78, 0xbd, 0xc3, 0xda, 0xa2, 0x3d, 0x46, 0x41, 0x83, 0x3d, 0x16, 0x40, 0x32,
+ 0x3d, 0xa7, 0xfb, 0xa7, 0xbd, 0xc0, 0x57, 0x28, 0x3b, 0xd0, 0x2b, 0x84, 0xbc,
+ 0x85, 0x89, 0x88, 0x3d, 0xc4, 0xa3, 0x8f, 0xbc, 0xbb, 0xc6, 0x96, 0xbd, 0x7c,
+ 0xae, 0x36, 0xbd, 0xf8, 0x8b, 0x85, 0x3d, 0xfa, 0x35, 0xf5, 0x3c, 0xad, 0x86,
+ 0x63, 0xbc, 0x7c, 0xc1, 0x54, 0x3d, 0xad, 0xfc, 0x09, 0xbd, 0x3a, 0x1f, 0xf2,
+ 0x3c, 0xf4, 0x35, 0x65, 0x3c, 0xd0, 0x53, 0x38, 0xbd, 0x99, 0xf8, 0x36, 0x3d,
+ 0x95, 0xaf, 0x67, 0x3d, 0xd2, 0x76, 0x44, 0x3d, 0x03, 0x46, 0x82, 0x3d, 0xdc,
+ 0xe2, 0x53, 0xbd, 0x49, 0x59, 0x7b, 0xbd, 0x1c, 0x8b, 0xaf, 0x3a, 0x80, 0x30,
+ 0x27, 0xbd, 0xdb, 0x9c, 0x87, 0xbd, 0x8e, 0x09, 0x5c, 0x3d, 0x5e, 0x5d, 0x5d,
+ 0x3d, 0xcc, 0x97, 0xaa, 0xbb, 0x81, 0xe0, 0xb9, 0xbc, 0x61, 0x3a, 0x9a, 0x3b,
+ 0xc9, 0x99, 0x9f, 0x3d, 0x2d, 0x52, 0x10, 0xbd, 0x90, 0x0b, 0xa1, 0x3c, 0xaf,
+ 0x88, 0x81, 0xbd, 0xf4, 0x7a, 0x89, 0xbc, 0xb3, 0xe1, 0xc5, 0xbc, 0x8e, 0xe5,
+ 0x8a, 0xbd, 0x6d, 0xd9, 0x70, 0x3b, 0xdd, 0x1b, 0xa1, 0x3c, 0xdd, 0xeb, 0x42,
+ 0xbd, 0x01, 0xcb, 0xf2, 0x3c, 0x8e, 0x4f, 0xff, 0xbc, 0x28, 0x5e, 0x6a, 0xbc,
+ 0x3f, 0xff, 0x26, 0x3d, 0xc4, 0xfa, 0x87, 0xbc, 0xcb, 0x5e, 0x32, 0xbd, 0x1f,
+ 0xb7, 0xd1, 0xbd, 0x40, 0xb6, 0x8b, 0x3c, 0x22, 0xf5, 0xa5, 0xbc, 0x5e, 0xa1,
+ 0xf7, 0xbc, 0x1a, 0x43, 0x11, 0x3d, 0xc9, 0xfe, 0x18, 0xbd, 0x34, 0x8b, 0x2f,
+ 0x3d, 0x2f, 0xe3, 0x8d, 0x3d, 0xaf, 0x7b, 0x69, 0xbd, 0x63, 0x9d, 0xac, 0x3d,
+ 0xce, 0x45, 0x50, 0xbd, 0xe1, 0x8f, 0x6b, 0xbd, 0x6e, 0xc6, 0x07, 0xbd, 0x58,
+ 0x1e, 0x12, 0x3c, 0x79, 0xdd, 0x06, 0x3d, 0xea, 0x26, 0x83, 0xbd, 0xaa, 0x63,
+ 0xce, 0x3d, 0x3a, 0xb3, 0x81, 0x3b, 0x35, 0x9a, 0xc6, 0x3c, 0x27, 0xc4, 0x59,
+ 0xbd, 0x74, 0x21, 0x30, 0x3d, 0xfe, 0x21, 0x8f, 0xbc, 0xb2, 0x86, 0x78, 0xbc,
+ 0xbb, 0x4f, 0xd7, 0xbd, 0xda, 0xfe, 0x2c, 0xbd, 0x7b, 0x99, 0x21, 0x3b, 0x61,
+ 0xe4, 0x68, 0xbd, 0x66, 0xfd, 0xb2, 0xba, 0xbe, 0x3d, 0x53, 0x3d, 0x53, 0x3f,
+ 0x5c, 0xbd, 0x5b, 0xf9, 0xc4, 0x3c, 0x1c, 0xa3, 0x6c, 0x3d, 0x61, 0x44, 0xfa,
+ 0x3c, 0x35, 0xb8, 0xd9, 0x3c, 0x6d, 0x40, 0xc8, 0xbc, 0xbf, 0x20, 0x2a, 0x3d,
+ 0x84, 0xbd, 0x80, 0x3c, 0x19, 0x27, 0x1c, 0x3d, 0xc8, 0xf0, 0x56, 0x3c, 0x74,
+ 0x85, 0x29, 0x3c, 0xce, 0x5a, 0x91, 0xbc, 0x1f, 0xc3, 0x89, 0xbc, 0x8a, 0xec,
+ 0x62, 0x3d, 0xd0, 0xc0, 0xd2, 0xbb, 0x29, 0x30, 0x36, 0x3d, 0x71, 0xd4, 0xaf,
+ 0x3c, 0x29, 0x52, 0xb9, 0xbc, 0x33, 0xc8, 0x2c, 0x3a, 0x97, 0x8e, 0x18, 0xbb,
+ 0xda, 0xa7, 0x28, 0xbd, 0xaf, 0x8c, 0xc1, 0xbc, 0x62, 0xbb, 0xc7, 0x3b, 0xda,
+ 0x12, 0xbb, 0xbc, 0x7a, 0xfb, 0x3a, 0xbd, 0x04, 0xc0, 0xe3, 0x3c, 0x0f, 0x84,
+ 0xdd, 0xbd, 0xa4, 0x83, 0x87, 0x3d, 0x38, 0x8b, 0x5f, 0xbd, 0x60, 0xb4, 0x98,
+ 0x3c, 0x99, 0xef, 0x5d, 0x3b, 0xda, 0x0b, 0x83, 0x3d, 0x49, 0xf9, 0x93, 0x3d,
+ 0xe4, 0x29, 0x51, 0xbd, 0x5e, 0x33, 0x4b, 0xbd, 0x7a, 0xc5, 0xd5, 0x3b, 0xc2,
+ 0xbc, 0x67, 0x3d, 0x89, 0xa1, 0x55, 0xbd, 0x91, 0x0f, 0x55, 0x3d, 0xf8, 0x89,
+ 0x82, 0xbd, 0x4c, 0xdc, 0xc6, 0xbc, 0xc9, 0xb0, 0x3e, 0xbd, 0x7c, 0x95, 0x25,
+ 0x3d, 0xa2, 0x9f, 0xe1, 0x3b, 0x17, 0xcf, 0x90, 0xbb, 0xd6, 0x9c, 0x47, 0x3b,
+ 0xf6, 0x12, 0x74, 0x3d, 0xba, 0x2e, 0xde, 0x3c, 0x3e, 0x06, 0x74, 0x3d, 0x32,
+ 0x23, 0x5e, 0xbc, 0x02, 0xf3, 0x88, 0xbd, 0x16, 0x5d, 0xdd, 0xbc, 0x50, 0x9b,
+ 0x0a, 0xbd, 0x8e, 0x56, 0xb9, 0xbc, 0xc8, 0x8b, 0x18, 0x3d, 0xfd, 0x15, 0x80,
+ 0x3d, 0x4c, 0x97, 0x5a, 0xbc, 0xe2, 0x63, 0xa4, 0xbc, 0xc3, 0x3d, 0x84, 0xbc,
+ 0x7e, 0xa2, 0x83, 0x3b, 0x6e, 0x8b, 0x4e, 0x3c, 0x24, 0xb4, 0xb3, 0xbb, 0x03,
+ 0x9e, 0xfd, 0x3b, 0xa4, 0x8b, 0x53, 0x3d, 0xbc, 0x81, 0x61, 0xbd, 0x59, 0xde,
+ 0x48, 0x3d, 0x21, 0x16, 0x61, 0xbd, 0x31, 0xbc, 0x1c, 0xbd, 0xfc, 0xe8, 0xf4,
+ 0x3c, 0x88, 0x36, 0x59, 0x3d, 0x12, 0x10, 0xf8, 0xbb, 0xe4, 0x7b, 0x5f, 0xbc,
+ 0xf0, 0x9d, 0x9e, 0x3c, 0xfb, 0x94, 0xdb, 0xbc, 0x54, 0x67, 0x65, 0xbc, 0x5e,
+ 0x6e, 0x3b, 0xbd, 0x12, 0x92, 0x59, 0x3c, 0xf3, 0x69, 0x8b, 0x3b, 0x78, 0x99,
+ 0xdd, 0x3c, 0x85, 0x31, 0x21, 0x3d, 0xe4, 0x6c, 0x33, 0x3d, 0x9c, 0x58, 0x87,
+ 0xbd, 0xd9, 0xf5, 0x31, 0xbc, 0xce, 0xac, 0xb9, 0x3d, 0x0e, 0x2c, 0x5c, 0x3d,
+ 0x6a, 0x94, 0xa9, 0x3d, 0x0e, 0xca, 0x4d, 0xbc, 0x68, 0x0f, 0x4d, 0xbd, 0xd5,
+ 0x31, 0xa6, 0xbc, 0xf1, 0xdc, 0x9b, 0x3d, 0x71, 0x4d, 0xfd, 0xbc, 0xcc, 0x43,
+ 0x1a, 0x3d, 0x1f, 0x4f, 0x51, 0x3d, 0xf0, 0x07, 0xa4, 0x3b, 0x1a, 0x75, 0x40,
+ 0x3d, 0xf6, 0xef, 0x13, 0x3d, 0x58, 0x08, 0x04, 0xbd, 0xf3, 0x55, 0x58, 0x3d,
+ 0x55, 0x7e, 0x6d, 0xbd, 0x96, 0x39, 0x78, 0xbd, 0x19, 0x7d, 0x7f, 0xbd, 0xc3,
+ 0x4a, 0x9a, 0xbd, 0x64, 0xad, 0x24, 0x3d, 0xc8, 0xab, 0x10, 0x3b, 0xa2, 0x7f,
+ 0x76, 0xbd, 0xdd, 0xb6, 0x2e, 0x3d, 0xdb, 0xbf, 0x88, 0x3d, 0x49, 0x2e, 0xbd,
+ 0xbb, 0xdb, 0xdc, 0x86, 0x3d, 0x06, 0xf9, 0x85, 0xbd, 0x3c, 0x44, 0x39, 0xbc,
+ 0x8b, 0x1c, 0x32, 0x3d, 0xf6, 0x3c, 0x7a, 0x3d, 0x68, 0x1f, 0x13, 0xbd, 0x1d,
+ 0x1c, 0xed, 0x3c, 0xa8, 0x9b, 0x08, 0xbc, 0xe4, 0x25, 0xf6, 0xbc, 0xf6, 0xd8,
+ 0x19, 0xbd, 0x24, 0x39, 0x2f, 0xbd, 0x59, 0x25, 0x86, 0xbd, 0xbf, 0xf8, 0x78,
+ 0xbd, 0x33, 0xec, 0x93, 0xbd, 0x65, 0xdd, 0x55, 0xbd, 0x9d, 0x16, 0x05, 0xbd,
+ 0x69, 0xe6, 0x79, 0x3d, 0x64, 0xfd, 0xf0, 0xbc, 0xf7, 0xa3, 0x63, 0xbc, 0xb4,
+ 0x5f, 0xdb, 0xbc, 0x72, 0x22, 0x13, 0x3d, 0x0e, 0x28, 0x03, 0xbd, 0x64, 0x4b,
+ 0xad, 0x3c, 0xcb, 0x9c, 0x15, 0xbd, 0x58, 0x24, 0x55, 0x3d, 0x85, 0x90, 0x18,
+ 0xbc, 0x87, 0xb7, 0x95, 0x3d, 0x5e, 0xd9, 0x78, 0xbd, 0xa6, 0x19, 0x80, 0x3d,
+ 0xd3, 0xf6, 0x08, 0x3d, 0x8c, 0x74, 0x43, 0xbd, 0x06, 0x77, 0x8f, 0xbd, 0x68,
+ 0xc4, 0x6f, 0xbd, 0x6f, 0x45, 0x03, 0x3b, 0xb4, 0xf9, 0x9c, 0x3c, 0xe2, 0x85,
+ 0x8f, 0x3c, 0x3a, 0x70, 0x92, 0x3d, 0x06, 0xaa, 0x28, 0xbd, 0x51, 0x46, 0xc2,
+ 0xbd, 0x39, 0xf2, 0x8f, 0x3d, 0xda, 0xbd, 0x4e, 0x3d, 0x68, 0x6d, 0x57, 0xbc,
+ 0xb3, 0x41, 0x8b, 0x3d, 0xa8, 0x83, 0xa3, 0xbc, 0x3a, 0x05, 0xbf, 0xbc, 0x5b,
+ 0x8d, 0x6e, 0x3d, 0xfa, 0x17, 0x8b, 0xbd, 0xff, 0x33, 0x03, 0x3c, 0x4e, 0x35,
+ 0x6d, 0xbb, 0xf5, 0x98, 0x31, 0xbd, 0xfe, 0x46, 0x20, 0x3c, 0xb7, 0x91, 0x5d,
+ 0x3d, 0xa9, 0x64, 0x97, 0x3c, 0xd8, 0x6a, 0x59, 0xbd, 0x0b, 0xfb, 0x7c, 0x3d,
+ 0x05, 0xf1, 0x26, 0xbd, 0xd4, 0xfd, 0x2a, 0x3d, 0x70, 0xca, 0x1d, 0x3d, 0x76,
+ 0x80, 0xc7, 0xbc, 0xfa, 0x43, 0x7e, 0x3d, 0x6e, 0xda, 0xb6, 0x3c, 0x63, 0x63,
+ 0x25, 0xbd, 0x39, 0xad, 0x9c, 0xbc, 0x89, 0xa0, 0xbf, 0xbd, 0xc7, 0xd6, 0x19,
+ 0x3d, 0x36, 0x1d, 0x22, 0x3c, 0x11, 0x87, 0x8b, 0xbd, 0xa8, 0x59, 0x39, 0xbd,
+ 0xe4, 0x1d, 0x02, 0x3c, 0xf1, 0x0d, 0xf7, 0xbd, 0x16, 0x10, 0xb8, 0x3b, 0x03,
+ 0xfc, 0xa4, 0x3c, 0x32, 0x06, 0x8f, 0xbc, 0x47, 0x59, 0xa3, 0xbc, 0xac, 0x7f,
+ 0xda, 0xbc, 0x4b, 0x26, 0x80, 0x3d, 0x73, 0x33, 0x31, 0xbc, 0x83, 0x75, 0x98,
+ 0xbd, 0xb7, 0x95, 0x65, 0xbd, 0x64, 0x01, 0x21, 0xbd, 0xb8, 0x86, 0x8a, 0x3b,
+ 0xe5, 0x85, 0x4a, 0xbd, 0xe5, 0xc1, 0x45, 0xbc, 0x97, 0x00, 0xab, 0x3c, 0xb6,
+ 0x55, 0x1b, 0xbd, 0x41, 0xcb, 0x01, 0x3d, 0x3c, 0x4e, 0x2f, 0xbc, 0x4c, 0x54,
+ 0xad, 0x3c, 0x70, 0xec, 0x58, 0x3c, 0x57, 0x6e, 0xf9, 0x3c, 0xac, 0xa8, 0x28,
+ 0xbd, 0xea, 0x4c, 0xce, 0xbb, 0x5f, 0x87, 0x1d, 0xbd, 0x0d, 0xe2, 0x5c, 0x3d,
+ 0x1d, 0x21, 0x31, 0xbd, 0xf5, 0x47, 0xd7, 0xbd, 0xb5, 0xd5, 0x0c, 0xbd, 0x81,
+ 0x2b, 0xff, 0x3c, 0x40, 0x81, 0xd2, 0x3c, 0xc3, 0x64, 0x77, 0x3c, 0xd6, 0xdd,
+ 0xc9, 0xbc, 0xee, 0x42, 0x9e, 0xbc, 0x4a, 0xdb, 0x3c, 0x3d, 0xc2, 0x58, 0x82,
+ 0x3d, 0xfa, 0x36, 0x24, 0xbd, 0x36, 0x2e, 0x86, 0x3d, 0x68, 0xee, 0x5e, 0xbd,
+ 0x3c, 0x29, 0x1e, 0xbc, 0x80, 0x1f, 0x88, 0xbd, 0x27, 0xab, 0xb7, 0xbc, 0xce,
+ 0x18, 0xa7, 0xbd, 0xf6, 0x96, 0xa7, 0xbc, 0xde, 0x1b, 0x0a, 0xbd, 0x15, 0x9b,
+ 0x1d, 0x3c, 0x2e, 0xb4, 0x9d, 0x3d, 0x61, 0xba, 0xbe, 0xbc, 0xb8, 0xc8, 0x6a,
+ 0x3d, 0xcc, 0x06, 0xa8, 0xbd, 0x83, 0xae, 0x13, 0xbc, 0x3d, 0xb4, 0x4c, 0xbd,
+ 0xcc, 0xb5, 0x65, 0xbc, 0x0d, 0xad, 0x8b, 0x3c, 0x0e, 0x2f, 0x91, 0x3c, 0x1a,
+ 0xfa, 0x1e, 0x3d, 0xbf, 0xe3, 0xf8, 0x3c, 0x21, 0x8d, 0x8c, 0xbc, 0x30, 0x1b,
+ 0xcb, 0xbc, 0x34, 0x68, 0xf2, 0x3a, 0xed, 0x13, 0x0f, 0xbd, 0x66, 0x39, 0x61,
+ 0xbd, 0xee, 0x87, 0x42, 0x3d, 0xc0, 0x58, 0x69, 0xbc, 0x3e, 0xe4, 0xd5, 0x3c,
+ 0x46, 0x68, 0x30, 0xbd, 0x6c, 0x68, 0xad, 0x3c, 0x36, 0x63, 0x13, 0x3d, 0x0c,
+ 0xf5, 0xf7, 0xbc, 0x56, 0x99, 0x71, 0x3d, 0x4a, 0xba, 0x10, 0x3d, 0xfc, 0xba,
+ 0x3e, 0x3d, 0x5a, 0xd8, 0x82, 0x3d, 0x70, 0x17, 0x92, 0xbd, 0x0f, 0x9b, 0x77,
+ 0xbd, 0x06, 0x4d, 0x78, 0x3d, 0xcb, 0x90, 0x96, 0x3d, 0xa5, 0x6d, 0x04, 0xbd,
+ 0x4a, 0x4f, 0x0f, 0xbc, 0x83, 0x77, 0x3a, 0x3d, 0xdf, 0x43, 0x39, 0x3d, 0x17,
+ 0x17, 0xf7, 0x3c, 0x3d, 0x1a, 0x44, 0xbd, 0x42, 0x1b, 0xdb, 0xbc, 0x1f, 0x26,
+ 0x82, 0xbd, 0xfd, 0x51, 0xa5, 0x3d, 0xc5, 0x70, 0x45, 0x3d, 0x00, 0x17, 0xa1,
+ 0x3c, 0xe1, 0x5c, 0x56, 0xbd, 0x57, 0x8c, 0xe6, 0xbc, 0x87, 0x07, 0xef, 0x3b,
+ 0x9b, 0x41, 0xbf, 0xbd, 0xa1, 0x85, 0xd5, 0x3c, 0x07, 0x20, 0x0a, 0xbd, 0xc0,
+ 0x19, 0xf3, 0xbb, 0x1f, 0xb5, 0xba, 0x3b, 0xa0, 0x79, 0x86, 0xbc, 0x62, 0x56,
+ 0x40, 0xbd, 0x51, 0xf1, 0xa8, 0x3c, 0x83, 0x80, 0x86, 0x3c, 0x18, 0x2b, 0x2d,
+ 0x3d, 0x8d, 0x66, 0xb6, 0x3c, 0x1d, 0xac, 0x2e, 0xbd, 0x91, 0xbc, 0x3e, 0xbd,
+ 0xfb, 0x80, 0x75, 0x3d, 0x7d, 0xa1, 0x54, 0xba, 0x0f, 0xd1, 0x2f, 0xbd, 0xcb,
+ 0x3a, 0x14, 0xbd, 0x76, 0xd3, 0x82, 0xbc, 0x15, 0x06, 0xf5, 0x39, 0xa4, 0xdb,
+ 0x6e, 0x3d, 0x42, 0x46, 0xb7, 0x3c, 0xa3, 0x20, 0x00, 0x3d, 0xfc, 0x4f, 0x2b,
+ 0xbd, 0x06, 0xb1, 0x7e, 0x3d, 0xf8, 0x37, 0xc9, 0xbc, 0x0d, 0x90, 0xd7, 0xbc,
+ 0xb7, 0x8e, 0x0e, 0x3d, 0x68, 0xd8, 0x1d, 0xbc, 0x57, 0xb5, 0x11, 0x3d, 0x68,
+ 0x20, 0x0b, 0x3d, 0x85, 0xda, 0x1e, 0xbd, 0xe0, 0xc0, 0x6b, 0xbd, 0x44, 0x69,
+ 0x96, 0xbd, 0xec, 0xbd, 0x38, 0xbc, 0x09, 0x65, 0x85, 0xbd, 0xb4, 0xf4, 0x57,
+ 0xbd, 0x35, 0xe4, 0xb2, 0xbc, 0xf7, 0x90, 0xd0, 0x3c, 0x78, 0xd1, 0x83, 0xbd,
+ 0xe7, 0x8d, 0x1b, 0xbd, 0x49, 0xa3, 0x94, 0x3d, 0x56, 0xf3, 0x44, 0xbd, 0xb2,
+ 0xce, 0x5e, 0x3d, 0x42, 0x8e, 0x37, 0xbd, 0x22, 0x3e, 0x79, 0xbd, 0xa0, 0x71,
+ 0x6c, 0x3d, 0x23, 0x13, 0xb3, 0xbb, 0x0d, 0x32, 0x21, 0x3c, 0x35, 0x5e, 0xfd,
+ 0xba, 0x0d, 0x0c, 0xbd, 0x3b, 0xcb, 0x0c, 0xaa, 0xbb, 0x33, 0xe8, 0x08, 0xbd,
+ 0x43, 0x7a, 0xa5, 0xbc, 0x15, 0x50, 0x89, 0x3d, 0xd1, 0x86, 0x5b, 0x3d, 0x2a,
+ 0xd8, 0x4c, 0x3d, 0xe1, 0x63, 0x19, 0xbc, 0xee, 0xf0, 0x6f, 0x3d, 0xfa, 0xc2,
+ 0x44, 0x3d, 0x88, 0x3c, 0x6b, 0xbd, 0xe3, 0x24, 0xbb, 0xbc, 0x4c, 0xe6, 0x21,
+ 0x3b, 0x47, 0xf2, 0xa1, 0xbc, 0x46, 0x96, 0xfd, 0x3c, 0x4c, 0x21, 0x86, 0xbd,
+ 0x32, 0x28, 0x83, 0xbc, 0x70, 0x39, 0xa0, 0xbd, 0x80, 0xca, 0x4d, 0xbd, 0xc4,
+ 0x91, 0x8d, 0xbc, 0xab, 0xae, 0x08, 0x3c, 0x54, 0xff, 0xb5, 0xbb, 0x76, 0xae,
+ 0xbe, 0x3c, 0xd8, 0xd1, 0xa5, 0x3d, 0x03, 0x0c, 0x44, 0x3d, 0x92, 0x96, 0x40,
+ 0xbd, 0xd5, 0xc5, 0x1f, 0x3d, 0xdf, 0x09, 0xc0, 0x3c, 0xfb, 0x0d, 0x5f, 0x3d,
+ 0xfd, 0x07, 0x04, 0x3d, 0x1c, 0x43, 0x9a, 0xbd, 0xd7, 0x14, 0x72, 0xbd, 0x2d,
+ 0x50, 0x84, 0xbd, 0x6a, 0x16, 0x7d, 0x38, 0xa6, 0xff, 0x90, 0x3d, 0x44, 0xb7,
+ 0xcc, 0x3c, 0x5d, 0x5f, 0x69, 0xbd, 0x92, 0x8d, 0x6d, 0x3d, 0xf9, 0x02, 0x99,
+ 0xbc, 0xe5, 0x7a, 0xc5, 0xbd, 0xde, 0x5c, 0x69, 0x3d, 0xee, 0xbf, 0xf4, 0x3c,
+ 0x92, 0x19, 0x96, 0x3d, 0xf3, 0x5b, 0x35, 0xbd, 0xf3, 0x90, 0x3b, 0x3d, 0x90,
+ 0xe2, 0xc2, 0xbc, 0x98, 0x91, 0xf9, 0xbc, 0x3b, 0x3b, 0x82, 0xbd, 0xb0, 0x85,
+ 0x30, 0x3d, 0x14, 0x12, 0xea, 0xbc, 0x21, 0x84, 0x8c, 0x3d, 0x93, 0xcd, 0x65,
+ 0x3d, 0xc9, 0x26, 0xda, 0xbc, 0xd5, 0xc3, 0x4e, 0x3c, 0xcc, 0x6e, 0x0f, 0x3d,
+ 0x8d, 0xaf, 0x47, 0x3c, 0x9c, 0xfa, 0xe1, 0x3c, 0x3c, 0xe0, 0x4c, 0x3d, 0x79,
+ 0x22, 0xed, 0x3c, 0xf4, 0x05, 0x3a, 0x3d, 0x59, 0xc0, 0x22, 0xbd, 0x5e, 0xaa,
+ 0xf8, 0xbc, 0xc4, 0xda, 0x22, 0x3c, 0x76, 0x88, 0xaf, 0x3c, 0x1c, 0xf4, 0x3b,
+ 0x3d, 0x4e, 0x6a, 0x1b, 0x3d, 0x60, 0xc7, 0x85, 0x3c, 0xb2, 0xc7, 0x75, 0x3d,
+ 0xbd, 0xe4, 0xbe, 0xbc, 0x54, 0x8e, 0x82, 0x3d, 0x36, 0x27, 0x6a, 0xbc, 0x0d,
+ 0x99, 0x00, 0xbd, 0x38, 0x5e, 0x9f, 0xbc, 0x9d, 0x49, 0xd6, 0x3d, 0xbb, 0x1a,
+ 0x85, 0x3d, 0x6f, 0x89, 0x9f, 0x3c, 0xc5, 0x0b, 0xa7, 0xbc, 0x9e, 0x5a, 0xfa,
+ 0xbc, 0xd3, 0x59, 0x50, 0xba, 0x3f, 0xc6, 0xbc, 0xbd, 0xb3, 0x9c, 0x12, 0xbd,
+ 0x05, 0x39, 0xd6, 0x3b, 0x58, 0x14, 0x0d, 0x3d, 0x63, 0x0e, 0x19, 0x3d, 0x69,
+ 0x9b, 0xa2, 0x3d, 0x68, 0x4d, 0x13, 0x3c, 0x06, 0x73, 0x64, 0xbd, 0x28, 0x79,
+ 0x3c, 0xbd, 0x26, 0x23, 0x28, 0xbc, 0xb5, 0xa2, 0xa5, 0xba, 0xf6, 0x5f, 0x89,
+ 0xbc, 0x66, 0x2e, 0x79, 0xbd, 0x90, 0xee, 0x54, 0xbc, 0x99, 0xf4, 0x4e, 0x3c,
+ 0xdb, 0xdc, 0xd0, 0xbc, 0x3f, 0xed, 0x43, 0xbd, 0x03, 0xdf, 0xf4, 0x3c, 0x7d,
+ 0x40, 0x2b, 0x3c, 0xfb, 0x1d, 0x64, 0x3d, 0xcd, 0x1f, 0xb8, 0x3d, 0xb1, 0xb2,
+ 0x0f, 0x3d, 0x30, 0xf6, 0x38, 0xbd, 0x54, 0xef, 0x84, 0xbc, 0x2f, 0x3f, 0xac,
+ 0xbd, 0xe0, 0xe1, 0xc4, 0xbc, 0x49, 0x0a, 0x03, 0xbd, 0xb8, 0x78, 0x43, 0xbc,
+ 0xbf, 0xbc, 0x80, 0x3a, 0x1a, 0x41, 0x39, 0x3d, 0xd0, 0x5d, 0x8c, 0x3d, 0x8d,
+ 0x8f, 0x5e, 0xbc, 0xfd, 0x1b, 0xed, 0xbd, 0x22, 0x7c, 0x99, 0xbc, 0x4c, 0xb3,
+ 0x1d, 0xbc, 0x10, 0xbb, 0x1c, 0x3c, 0x19, 0x89, 0xd3, 0xbc, 0x2a, 0x64, 0x37,
+ 0x3d, 0x11, 0x87, 0x00, 0x3c, 0x39, 0x0d, 0x1c, 0x3d, 0xb8, 0xeb, 0xde, 0xbc,
+ 0x26, 0x9d, 0x05, 0xbd, 0x51, 0xca, 0x0d, 0xbd, 0xa9, 0xe0, 0xbc, 0x3c, 0xd6,
+ 0x01, 0x2d, 0xbd, 0x72, 0x14, 0xd3, 0x3c, 0xf2, 0x07, 0x81, 0x3c, 0xe4, 0xbb,
+ 0x00, 0x3d, 0x0b, 0x42, 0x09, 0x3b, 0x0e, 0x99, 0x71, 0xbd, 0x32, 0x91, 0x10,
+ 0xbd, 0xa0, 0x0b, 0x05, 0xbd, 0x7f, 0xf8, 0xf6, 0x3c, 0xd4, 0x72, 0xbd, 0x3c,
+ 0xdf, 0xcc, 0x8a, 0x3d, 0x0e, 0x3d, 0x24, 0x3d, 0x71, 0x5a, 0x52, 0xbd, 0xb6,
+ 0x11, 0xda, 0xbc, 0x5b, 0xec, 0x9c, 0x3d, 0x4a, 0x73, 0xfd, 0xbc, 0xc1, 0x2b,
+ 0x9f, 0xbd, 0x06, 0xed, 0x2f, 0xbd, 0x38, 0x4c, 0x53, 0x3d, 0x36, 0x8d, 0xc1,
+ 0x3c, 0x14, 0x26, 0xa3, 0xbd, 0x2d, 0x2f, 0x0a, 0xbb, 0xfd, 0x7d, 0xa5, 0xbd,
+ 0x10, 0xbe, 0xe4, 0x3b, 0x77, 0x22, 0x6a, 0x3d, 0xdd, 0x33, 0xc3, 0x3c, 0x3e,
+ 0x8e, 0xbb, 0xbd, 0x60, 0x54, 0x81, 0x3d, 0x02, 0xcf, 0x15, 0x3d, 0x06, 0x28,
+ 0xd5, 0x3d, 0xda, 0xb6, 0x6f, 0xbd, 0xf6, 0x93, 0x86, 0xbc, 0x98, 0x16, 0x45,
+ 0x3d, 0xdc, 0x9e, 0x47, 0x3c, 0x8b, 0x3a, 0x82, 0xbd, 0x11, 0x05, 0xb6, 0xbd,
+ 0x0e, 0x26, 0xc1, 0xbc, 0xe2, 0xdc, 0xab, 0x3d, 0x10, 0x6e, 0x84, 0x3d, 0x49,
+ 0x2f, 0x1c, 0xbb, 0x0e, 0x73, 0x7a, 0x3c, 0x82, 0x17, 0x29, 0x3d, 0x88, 0x40,
+ 0x91, 0x3b, 0x2d, 0xcd, 0xf3, 0xbc, 0xcc, 0x39, 0x37, 0xbd, 0xb0, 0x03, 0x17,
+ 0x3d, 0xb8, 0xd0, 0x22, 0x3d, 0xc6, 0x69, 0x90, 0x3c, 0x09, 0x0f, 0xc2, 0x3b,
+ 0x7a, 0x64, 0xcc, 0xbc, 0x26, 0x93, 0x22, 0x3d, 0xa3, 0xe0, 0x4b, 0xbd, 0x7d,
+ 0xca, 0x2f, 0xbb, 0xda, 0x26, 0x19, 0x3d, 0xe7, 0x88, 0x47, 0xbc, 0x4e, 0x0f,
+ 0x3b, 0x3d, 0xf8, 0x1c, 0x1c, 0x3d, 0xb4, 0x23, 0x8e, 0x3d, 0xaf, 0xa6, 0x10,
+ 0xbd, 0xfc, 0x9a, 0x9c, 0x3c, 0x35, 0x69, 0x9f, 0x3d, 0xe4, 0x5f, 0x8f, 0xbd,
+ 0xc7, 0xe3, 0x98, 0x3d, 0xab, 0xb8, 0xcc, 0x3b, 0x6a, 0xa9, 0x0f, 0xbd, 0x0d,
+ 0x8a, 0x6a, 0xbd, 0x1e, 0xec, 0x10, 0x3d, 0xa0, 0x13, 0xe8, 0x3b, 0xc0, 0x77,
+ 0x93, 0x3c, 0x3f, 0x03, 0x0b, 0x3d, 0xde, 0x40, 0xb4, 0x3c, 0xfc, 0xdb, 0x06,
+ 0xbd, 0xc3, 0x86, 0x90, 0x3d, 0x54, 0x89, 0x37, 0x3d, 0x55, 0xd4, 0x8d, 0xbd,
+ 0x39, 0x31, 0xb7, 0xbc, 0xab, 0x31, 0xc0, 0xbc, 0x60, 0x17, 0xdb, 0xbb, 0x49,
+ 0xa9, 0x2f, 0xbc, 0xbf, 0xcb, 0xd6, 0x3b, 0x83, 0x93, 0x16, 0x3d, 0xba, 0xdd,
+ 0x1b, 0xbd, 0xd1, 0x6a, 0x17, 0x3d, 0x45, 0x0f, 0x1d, 0xbd, 0xa3, 0xc1, 0xb5,
+ 0xbd, 0x88, 0x0e, 0x6e, 0x3d, 0x41, 0x5d, 0x06, 0x3d, 0xd8, 0xeb, 0xb4, 0x3c,
+ 0xe5, 0xc8, 0x88, 0xbb, 0x48, 0x65, 0x47, 0x3d, 0xff, 0xe8, 0xa6, 0xbd, 0x12,
+ 0x2a, 0x10, 0xbd, 0xd0, 0x90, 0x8b, 0x3d, 0x17, 0x08, 0xfc, 0xbc, 0x8e, 0xb4,
+ 0x9a, 0xbc, 0x70, 0x79, 0x3f, 0x3d, 0xd8, 0xad, 0x06, 0x3c, 0xf8, 0x4e, 0x81,
+ 0xbd, 0x82, 0xf1, 0x71, 0xbd, 0x9f, 0x19, 0xcc, 0xbd, 0xaf, 0x6a, 0x45, 0x3d,
+ 0x4e, 0x39, 0x25, 0x3d, 0x17, 0x43, 0x74, 0x3d, 0x52, 0x51, 0x53, 0xbd, 0x53,
+ 0x10, 0x5f, 0xbd, 0x5f, 0x60, 0xf7, 0x3c, 0xf4, 0x07, 0x6d, 0x3d, 0x68, 0x1d,
+ 0x29, 0x3d, 0xd6, 0xf7, 0xad, 0xbc, 0x09, 0x0d, 0x8f, 0xbd, 0x17, 0xae, 0xd7,
+ 0x3c, 0x63, 0xf2, 0xc7, 0xbc, 0x4e, 0xa0, 0x05, 0xbd, 0x53, 0x3b, 0xc5, 0xbc,
+ 0x81, 0xf4, 0x82, 0x3d, 0x5e, 0xc9, 0x56, 0xbd, 0x32, 0xb8, 0xbd, 0xbc, 0xf2,
+ 0x3e, 0xc7, 0xbc, 0x76, 0x7f, 0x76, 0xbd, 0x19, 0x45, 0x13, 0xbd, 0xb9, 0x17,
+ 0x88, 0x3d, 0xef, 0x15, 0x68, 0xbd, 0x7a, 0xb8, 0xf6, 0x3a, 0xa8, 0x56, 0x72,
+ 0xbb, 0x96, 0x68, 0xce, 0x3d, 0x13, 0x43, 0x0a, 0xbd, 0x87, 0x3f, 0x91, 0x3c,
+ 0xd7, 0x12, 0x8b, 0x3b, 0x2f, 0x85, 0xbf, 0xbc, 0x33, 0xfc, 0x62, 0xbc, 0x5f,
+ 0xb3, 0x8f, 0xbc, 0x9f, 0x1a, 0xf5, 0xbc, 0x3b, 0x75, 0x68, 0x3d, 0x58, 0xae,
+ 0x3c, 0x3d, 0xe3, 0x00, 0x5d, 0x3d, 0xcf, 0x69, 0x9c, 0x3d, 0xdb, 0x20, 0xb3,
+ 0x39, 0x31, 0x1a, 0x7a, 0xbc, 0x11, 0x37, 0xd0, 0x3c, 0x1d, 0x5d, 0x84, 0x3d,
+ 0xb2, 0x5d, 0xe9, 0xbc, 0x24, 0x74, 0xe5, 0xbc, 0x86, 0x1d, 0xea, 0xbb, 0x65,
+ 0x94, 0x76, 0x3d, 0x9a, 0xb2, 0xeb, 0x3c, 0x62, 0x9f, 0x44, 0xbb, 0xca, 0x35,
+ 0xa8, 0xbc, 0x25, 0x51, 0x23, 0x3d, 0xa9, 0xac, 0x00, 0xbd, 0xb9, 0x13, 0xa6,
+ 0x3d, 0x3e, 0x3e, 0x10, 0xbc, 0x5f, 0x40, 0x8b, 0x3d, 0x75, 0xef, 0x70, 0x3b,
+ 0xf8, 0x66, 0xa4, 0x3c, 0x69, 0x24, 0x84, 0x3c, 0x2a, 0xd2, 0x76, 0xbc, 0x67,
+ 0xef, 0x9f, 0xbc, 0xe1, 0x67, 0xcb, 0xbc, 0xe1, 0x4c, 0xa9, 0xbd, 0x18, 0xb6,
+ 0x96, 0x3d, 0x29, 0xaa, 0x84, 0xbd, 0x80, 0x0d, 0x5b, 0x3d, 0x35, 0xe7, 0x02,
+ 0x3d, 0xea, 0xf8, 0x46, 0xbd, 0xba, 0x63, 0x42, 0x3d, 0x3e, 0x6d, 0x83, 0x3d,
+ 0x0d, 0x47, 0x3c, 0xbd, 0x79, 0xe3, 0xa1, 0x3c, 0x7b, 0x77, 0x17, 0xbd, 0x4d,
+ 0x55, 0x53, 0x3d, 0xc3, 0x91, 0x7e, 0xbd, 0x9b, 0x6b, 0x49, 0x3d, 0x30, 0xad,
+ 0xc7, 0xbc, 0xc1, 0x27, 0x3e, 0xbd, 0xea, 0xaf, 0x51, 0x3d, 0x12, 0x3a, 0x94,
+ 0xbc, 0xf1, 0x36, 0xf1, 0x3c, 0x6a, 0x5a, 0x93, 0x3b, 0x88, 0x1e, 0xb1, 0xbc,
+ 0x3c, 0x43, 0x37, 0xbd, 0x74, 0xda, 0x9a, 0xbd, 0x53, 0x3d, 0x7b, 0x3d, 0xe7,
+ 0x18, 0xdd, 0xbc, 0xba, 0x1b, 0xd9, 0xbc, 0xe8, 0x9a, 0x64, 0xbd, 0xca, 0x36,
+ 0x2b, 0x3d, 0xc6, 0x99, 0xbc, 0x3c, 0xa6, 0x76, 0x72, 0x3d, 0x59, 0x8a, 0xb5,
+ 0x3c, 0x07, 0xf8, 0xd7, 0x3d, 0xdd, 0xaf, 0x2a, 0xb8, 0x77, 0xac, 0xb7, 0x3c,
+ 0x53, 0xd6, 0x12, 0xbd, 0x19, 0x6c, 0x63, 0x3c, 0xe0, 0xf5, 0x32, 0xbd, 0x72,
+ 0xc2, 0xae, 0xbd, 0x04, 0x6b, 0x12, 0x3c, 0xea, 0x76, 0x99, 0x3d, 0x5e, 0x14,
+ 0x25, 0xbd, 0x16, 0x01, 0x01, 0xbc, 0x6d, 0x0e, 0xb8, 0x3d, 0x78, 0x70, 0x85,
+ 0x3b, 0x7b, 0xb9, 0x55, 0xbb, 0x59, 0xa4, 0x2f, 0x3d, 0xbb, 0xf1, 0x4e, 0xbc,
+ 0x6e, 0x1e, 0x6f, 0x3d, 0x6d, 0xd0, 0x82, 0x3d, 0xa1, 0x2a, 0x38, 0xbd, 0x82,
+ 0x0e, 0x81, 0x3d, 0x51, 0x1a, 0xe8, 0x3c, 0x78, 0x0f, 0xb2, 0xbc, 0xdb, 0x4a,
+ 0x9f, 0x3d, 0xeb, 0xf7, 0x5f, 0x3b, 0xf0, 0x3e, 0xe2, 0xbc, 0x9c, 0x11, 0x91,
+ 0x3c, 0xb0, 0xbd, 0x1a, 0x3c, 0xce, 0x3f, 0x1c, 0xbb, 0x0e, 0xe3, 0x0b, 0x3d,
+ 0x2e, 0x44, 0x15, 0x3d, 0x90, 0x12, 0xe8, 0x3c, 0x84, 0xb7, 0x46, 0x3d, 0x4f,
+ 0x51, 0x90, 0x3c, 0x5f, 0xee, 0xe8, 0x3c, 0x8f, 0xa8, 0xd2, 0xbb, 0x86, 0x20,
+ 0x7c, 0x3d, 0xe8, 0x1f, 0x48, 0xbc, 0xbb, 0x7f, 0x59, 0x3d, 0x62, 0xf1, 0x8a,
+ 0xbc, 0x94, 0x28, 0x0c, 0x3c, 0xdd, 0x8f, 0x1a, 0xbd, 0xad, 0x5a, 0xa8, 0x39,
+ 0x4d, 0x0c, 0x71, 0x3d, 0x96, 0xa2, 0x91, 0x3d, 0xe7, 0x9c, 0x69, 0xbc, 0x1f,
+ 0x9d, 0x0c, 0xbd, 0x6e, 0xbe, 0xe7, 0x3c, 0x97, 0x28, 0x35, 0xbd, 0x11, 0xb7,
+ 0x8c, 0xbd, 0x3b, 0xc0, 0xc1, 0x3c, 0x02, 0x96, 0xd7, 0x3c, 0x79, 0x02, 0x4d,
+ 0xbc, 0x6c, 0xad, 0xb7, 0x3c, 0x9a, 0xef, 0x29, 0x3d, 0xe9, 0x73, 0x9b, 0x3d,
+ 0x58, 0xd3, 0x17, 0x3d, 0xea, 0xcc, 0x2d, 0xbd, 0x64, 0x3a, 0x9e, 0xbd, 0x9a,
+ 0x8b, 0x3c, 0xbd, 0x4f, 0x97, 0x88, 0xbc, 0x1b, 0x18, 0x27, 0xbc, 0x22, 0xdc,
+ 0xde, 0xbd, 0xb4, 0xbe, 0x94, 0xba, 0x5a, 0xc7, 0xe0, 0x3b, 0xe9, 0xd7, 0x07,
+ 0x3c, 0xcb, 0x47, 0xf2, 0x3c, 0x04, 0xca, 0x2f, 0x3d, 0x25, 0x4d, 0xd9, 0x3c,
+ 0xc1, 0xb9, 0x37, 0xbd, 0xa1, 0x9a, 0x0c, 0x3d, 0x78, 0xae, 0x88, 0xbd, 0x02,
+ 0xb5, 0x98, 0x3d, 0x63, 0x8b, 0x79, 0xbd, 0xab, 0xe4, 0xaa, 0x3d, 0x5a, 0x1e,
+ 0x02, 0xbc, 0x16, 0x17, 0x68, 0x3b, 0xf8, 0x36, 0x0d, 0x3b, 0x1f, 0x67, 0x8c,
+ 0xbd, 0xbc, 0x52, 0xe2, 0xbc, 0x2f, 0xee, 0xe2, 0xbb, 0x46, 0x45, 0x08, 0x3d,
+ 0xd2, 0xea, 0xc9, 0x3c, 0x00, 0xcc, 0x5c, 0x3d, 0x1e, 0x1f, 0x54, 0x3c, 0x10,
+ 0x3e, 0x8e, 0x3c, 0x1e, 0x6d, 0x5f, 0xbd, 0xfb, 0xdb, 0x64, 0x3d, 0x62, 0x27,
+ 0xb5, 0xbd, 0x0a, 0x8c, 0x51, 0xbd, 0x5e, 0x4d, 0xae, 0xbd, 0xd4, 0xd2, 0x65,
+ 0x3d, 0x88, 0xc4, 0xc0, 0x3c, 0x25, 0x97, 0xb9, 0xbb, 0x6d, 0x7c, 0x5b, 0x3d,
+ 0x42, 0x2f, 0x0e, 0xbb, 0x42, 0xfc, 0xb3, 0xba, 0x38, 0x1c, 0xae, 0xbc, 0x4d,
+ 0xba, 0x7a, 0xbd, 0x15, 0xf7, 0x9d, 0x3d, 0x51, 0xc4, 0x82, 0x3d, 0x70, 0xa9,
+ 0x47, 0x3d, 0x68, 0x1c, 0xdf, 0x3c, 0xef, 0x44, 0x71, 0x3c, 0xdf, 0x7d, 0x80,
+ 0x3d, 0x6c, 0x6c, 0xcd, 0xbc, 0x9b, 0xf2, 0x68, 0x3d, 0x61, 0x10, 0x64, 0x3d,
+ 0x31, 0x19, 0xda, 0x3c, 0xc3, 0x1c, 0xdc, 0xbb, 0xe1, 0x30, 0x13, 0xbc, 0x4d,
+ 0xd5, 0xaf, 0xbb, 0x39, 0xaa, 0x43, 0xbd, 0x9a, 0x51, 0x75, 0xbd, 0xc3, 0x2b,
+ 0x5e, 0x3c, 0x2f, 0x60, 0xed, 0x3c, 0x2a, 0x8e, 0x87, 0x3d, 0x0e, 0x88, 0x08,
+ 0xbd, 0xcb, 0x1a, 0xc2, 0x3b, 0x86, 0xdb, 0x44, 0xbd, 0x3c, 0xb2, 0xd8, 0xbc,
+ 0xd8, 0x5c, 0x2a, 0x3d, 0xf9, 0xb9, 0x06, 0xbd, 0xf6, 0x2f, 0x52, 0x3d, 0xda,
+ 0x46, 0xe9, 0x3b, 0xeb, 0x10, 0xd5, 0x3c, 0x5a, 0x5a, 0x70, 0x3b, 0x58, 0xd3,
+ 0x30, 0x3c, 0xb3, 0x7e, 0x00, 0xbd, 0x81, 0x37, 0x56, 0xbd, 0x0a, 0x66, 0x12,
+ 0xbd, 0xd7, 0xca, 0x80, 0xbd, 0x89, 0x4c, 0x52, 0x3d, 0x42, 0x49, 0xab, 0x3c,
+ 0x79, 0xe8, 0xa6, 0xbd, 0xa2, 0x35, 0xd5, 0xbd, 0xa3, 0x0c, 0x0e, 0xbd, 0x4f,
+ 0x10, 0x8a, 0x3d, 0xd4, 0xbe, 0x64, 0x3d, 0x38, 0x13, 0xfd, 0x3d, 0x86, 0xc8,
+ 0x82, 0xbd, 0xd2, 0x11, 0x46, 0x3d, 0xcc, 0x13, 0x6a, 0x3d, 0x29, 0x91, 0xe2,
+ 0xbc, 0x9a, 0x59, 0xc8, 0xbc, 0x6d, 0xd3, 0x79, 0xbd, 0x00, 0x17, 0xbd, 0x3d,
+ 0x2f, 0x3d, 0x13, 0xbd, 0xf2, 0x5e, 0x5a, 0x3d, 0x91, 0xd3, 0x22, 0xbc, 0x8d,
+ 0x7d, 0xdd, 0x3c, 0xcb, 0xd3, 0x47, 0x3d, 0x51, 0x39, 0x43, 0x3d, 0x8e, 0xba,
+ 0xb3, 0x3c, 0xcf, 0xdc, 0x5d, 0xbc, 0xe8, 0xf4, 0x69, 0xbd, 0x75, 0xed, 0x4a,
+ 0xbd, 0x3e, 0xa3, 0x52, 0x3d, 0x55, 0xbe, 0x6e, 0xbd, 0x84, 0x86, 0xb3, 0xbc,
+ 0x7d, 0x3b, 0x4f, 0xbd, 0xd0, 0x9c, 0x8f, 0xbb, 0xe4, 0x9f, 0x39, 0x3d, 0x10,
+ 0x5c, 0xf0, 0xbb, 0x64, 0x15, 0x82, 0xbc, 0x12, 0xf8, 0x45, 0x3d, 0xf6, 0xfc,
+ 0x40, 0x3d, 0x64, 0x01, 0x84, 0xbc, 0x4e, 0x97, 0x28, 0x3d, 0xc0, 0xb8, 0x30,
+ 0x3d, 0xf8, 0x94, 0x71, 0xbd, 0x59, 0x5a, 0x61, 0xbd, 0x9e, 0x55, 0x8d, 0xbd,
+ 0x00, 0x77, 0xfa, 0xbc, 0x9c, 0xbf, 0x17, 0x3d, 0x94, 0x7a, 0x4f, 0xbd, 0xb1,
+ 0xa6, 0x8f, 0xbd, 0xad, 0xc3, 0x8a, 0x3d, 0xf0, 0xca, 0x8b, 0x3c, 0x2a, 0xe4,
+ 0x2b, 0xbd, 0x34, 0x81, 0x44, 0xbd, 0x48, 0x55, 0x52, 0xbd, 0x2e, 0x7e, 0x63,
+ 0x3d, 0x3a, 0x07, 0x4e, 0x3d, 0xb0, 0xb9, 0x7a, 0x3c, 0x18, 0x7d, 0x6e, 0xbc,
+ 0x7a, 0x0e, 0x3c, 0xbd, 0xdc, 0x81, 0x8c, 0xbd, 0xc8, 0xa4, 0x71, 0x3c, 0xca,
+ 0x20, 0x28, 0x3d, 0x28, 0x36, 0xf6, 0x3c, 0x28, 0xef, 0x3c, 0x3d, 0x88, 0x83,
+ 0x3e, 0x3c, 0x74, 0x45, 0x34, 0x3d, 0x80, 0x11, 0x06, 0xba, 0x8c, 0xd1, 0x79,
+ 0xbc, 0x84, 0x71, 0x26, 0xbd, 0x98, 0x15, 0x15, 0x3c, 0x4a, 0x0e, 0x92, 0xbc,
+ 0x75, 0x17, 0x83, 0x3d, 0xfc, 0x9c, 0xc1, 0xbc, 0x4c, 0xe3, 0xb5, 0x3c, 0x10,
+ 0xc9, 0x23, 0x3c, 0xd0, 0xde, 0x1a, 0x3c, 0x22, 0x15, 0x92, 0xbd, 0xe6, 0x39,
+ 0x48, 0xbd, 0x16, 0x40, 0x91, 0xbd, 0x5c, 0xf1, 0xb4, 0x3c, 0x4a, 0xf7, 0xbc,
+ 0xbc, 0x80, 0x48, 0x44, 0x3c, 0xc8, 0x47, 0x15, 0xbc, 0xcb, 0x39, 0x4d, 0xbd,
+ 0x04, 0xe1, 0xc0, 0x3c, 0x86, 0x40, 0x43, 0xbd, 0x3f, 0x39, 0x6a, 0xbd, 0x00,
+ 0xfd, 0x30, 0xbb, 0x18, 0x14, 0x60, 0xbc, 0xf0, 0x88, 0x12, 0x3d, 0x21, 0xf7,
+ 0x90, 0x3d, 0xfc, 0xcc, 0xa1, 0x3c, 0xa6, 0x1f, 0x2d, 0x3d, 0x0a, 0x14, 0x46,
+ 0xbd, 0x37, 0x3c, 0x5f, 0xbd, 0x32, 0x53, 0x94, 0xbc, 0x58, 0x51, 0xb1, 0xbc,
+ 0xd7, 0x03, 0x89, 0x3d, 0xfe, 0x03, 0x37, 0xbd, 0x9e, 0x06, 0x89, 0xbd, 0xbc,
+ 0xf6, 0x41, 0x3d, 0xf0, 0x87, 0x32, 0x3d, 0xdc, 0x11, 0xeb, 0xbc, 0x4a, 0x89,
+ 0x3b, 0x3d, 0xd2, 0xf1, 0x2b, 0x3d, 0x78, 0xcb, 0x38, 0xbc, 0x46, 0xda, 0xff,
+ 0xbc, 0xee, 0x9c, 0x8d, 0xbd, 0x14, 0x8e, 0xcd, 0xbc, 0x08, 0x6f, 0x05, 0x3d,
+ 0x00, 0xac, 0x8e, 0xbd, 0x90, 0xa2, 0x84, 0xbb, 0x9b, 0x36, 0x32, 0xbd, 0x2b,
+ 0x3f, 0x89, 0x3d, 0x80, 0x9a, 0x03, 0xbb, 0x06, 0xac, 0x17, 0x3d, 0xf8, 0x22,
+ 0x3f, 0xbd, 0x75, 0xae, 0x90, 0xbd, 0x76, 0xdd, 0x3e, 0xbd, 0x7c, 0x72, 0x92,
+ 0x3c, 0x4c, 0x38, 0x44, 0xbd, 0xba, 0x8f, 0x21, 0x3d, 0x00, 0x88, 0x7e, 0xbb,
+ 0xdc, 0xd2, 0x92, 0x3c, 0x1a, 0x45, 0x77, 0x3d, 0x54, 0xa1, 0x50, 0xbc, 0x44,
+ 0xea, 0x2d, 0x3d, 0x8e, 0xbd, 0x1d, 0x3d, 0x1b, 0xb9, 0x88, 0x3d, 0x20, 0xc4,
+ 0x8b, 0xbd, 0x43, 0x9e, 0x05, 0xbd, 0x80, 0x93, 0x4a, 0x3d, 0x02, 0xb3, 0x8a,
+ 0xbd, 0x40, 0x5c, 0xbb, 0x3b, 0x54, 0x22, 0x37, 0xbd, 0x04, 0xd5, 0xed, 0xbc,
+ 0xae, 0xce, 0x87, 0xbd, 0x0c, 0x0f, 0xe3, 0xbc, 0xc1, 0x1f, 0x48, 0xbd, 0x68,
+ 0x6a, 0x9a, 0x3c, 0xd0, 0x0b, 0x8f, 0x3c, 0xc8, 0x5c, 0x00, 0x3d, 0x60, 0xf9,
+ 0xd5, 0xbb, 0x57, 0x9a, 0x88, 0xbd, 0xf2, 0x1a, 0x8d, 0xbd, 0x52, 0x69, 0x63,
+ 0x3d, 0xb8, 0x69, 0x89, 0x3c, 0x56, 0xfb, 0x0a, 0x3d, 0x00, 0xc3, 0x10, 0xba,
+ 0x0e, 0xcd, 0x56, 0xbd, 0x1a, 0xf7, 0x61, 0x3d, 0xf8, 0x95, 0x8b, 0xbd, 0x3c,
+ 0x34, 0x14, 0xbd, 0xed, 0xc6, 0x8f, 0x3d, 0xee, 0xc2, 0x1c, 0x3d, 0xa0, 0x9d,
+ 0x04, 0xbb, 0xfd, 0x06, 0x56, 0xbd, 0xa0, 0xe7, 0x12, 0x3b, 0xae, 0x01, 0xbd,
+ 0xbc, 0xb0, 0x52, 0x16, 0x3d, 0x00, 0x9e, 0x97, 0xba, 0x40, 0xaf, 0x58, 0x3d,
+ 0xa4, 0x80, 0x97, 0x3c, 0xa0, 0x07, 0x22, 0x3b, 0x59, 0x3b, 0x01, 0xbd, 0x83,
+ 0x64, 0x87, 0x3d, 0x0e, 0xfd, 0x96, 0xbc, 0x3a, 0xf8, 0x7b, 0xbd, 0x7d, 0x61,
+ 0x0a, 0xbd, 0xe2, 0x4c, 0x58, 0xbd, 0xc0, 0x1b, 0x81, 0xbb, 0x70, 0x48, 0x0b,
+ 0x3d, 0x5a, 0x4c, 0x94, 0xbc, 0x6a, 0x49, 0x5b, 0x3d, 0x58, 0x79, 0x7a, 0x3c,
+ 0x54, 0xe4, 0x10, 0xbd, 0x0f, 0x05, 0x8c, 0x3d, 0x00, 0x70, 0xb3, 0xba, 0xfe,
+ 0x52, 0xec, 0xbc, 0x80, 0x87, 0xe5, 0x3b, 0x76, 0x35, 0x7f, 0x3d, 0x20, 0x23,
+ 0x36, 0x3b, 0x48, 0xe0, 0x16, 0x3d, 0x0e, 0xdb, 0x53, 0x3d, 0x76, 0x7d, 0xcb,
+ 0xbc, 0x79, 0xf8, 0x5c, 0xbd, 0x8a, 0x7c, 0x39, 0x3d, 0x8c, 0x87, 0x1d, 0x3d,
+ 0x3a, 0x32, 0x08, 0xbd, 0x54, 0xa9, 0x6a, 0xbc, 0x22, 0xad, 0xad, 0xbc, 0xd2,
+ 0x4b, 0x68, 0x3d, 0x86, 0x89, 0xee, 0xbc, 0x42, 0xee, 0x7d, 0x3d, 0x56, 0x9e,
+ 0x46, 0x3d, 0x58, 0xcd, 0xd0, 0x3c, 0xb4, 0x6d, 0x9f, 0x3c, 0x0c, 0x5b, 0x20,
+ 0xbd, 0x40, 0xe8, 0x2c, 0x3b, 0x23, 0xd1, 0x80, 0x3d, 0xee, 0x0f, 0xc8, 0xbc,
+ 0x1c, 0x52, 0xd5, 0x3c, 0x68, 0x8d, 0x63, 0xbc, 0x9c, 0xb3, 0x37, 0xbd, 0x0c,
+ 0x04, 0xde, 0x3c, 0x50, 0x20, 0x93, 0x3b, 0xac, 0xef, 0xf6, 0x3c, 0xac, 0x6e,
+ 0x93, 0xbc, 0x92, 0x06, 0x64, 0x3d, 0x28, 0xdd, 0x74, 0x3c, 0xf7, 0x67, 0x86,
+ 0x3d, 0x2c, 0x86, 0x43, 0x3d, 0x30, 0x55, 0x89, 0xbd, 0xa0, 0xf0, 0xd7, 0xbb,
+ 0xe4, 0x7f, 0x05, 0x3d, 0x18, 0xf7, 0x3f, 0x3c, 0x46, 0xaf, 0xcb, 0xbc, 0x80,
+ 0xf0, 0xb3, 0x3b, 0xdc, 0xe9, 0x81, 0x3c, 0xef, 0x3f, 0x5c, 0xbd, 0xfe, 0xb8,
+ 0xa1, 0xbc, 0x90, 0x44, 0x41, 0x3c, 0x4e, 0xc8, 0x30, 0xbd, 0x63, 0x6e, 0x72,
+ 0xbd, 0xbc, 0x52, 0xbf, 0xbc, 0x7c, 0x04, 0x47, 0xbd, 0x4c, 0xe3, 0x4e, 0xbd,
+ 0x34, 0x8b, 0x36, 0x3d, 0xd1, 0xf2, 0x33, 0xbd, 0x16, 0x48, 0x09, 0x3d, 0x8c,
+ 0x31, 0x00, 0xbd, 0xd9, 0x91, 0x8e, 0xbd, 0xf2, 0x8d, 0x64, 0xbd, 0x48, 0x20,
+ 0xbf, 0xbc, 0x60, 0x89, 0x53, 0x3b, 0x00, 0x96, 0x71, 0x3a, 0x44, 0x6e, 0x8c,
+ 0xbd, 0x90, 0x6b, 0x7d, 0xbd, 0x64, 0x71, 0xa6, 0x3c, 0x52, 0x23, 0x70, 0x3d,
+ 0xf3, 0x05, 0x80, 0x3d, 0xb4, 0xe2, 0x68, 0xbd, 0x20, 0x6f, 0xf9, 0x3b, 0x60,
+ 0x31, 0x2c, 0x3d, 0x30, 0x78, 0x4b, 0xbd, 0xd8, 0xae, 0x23, 0xbc, 0x40, 0xea,
+ 0xc5, 0x3a, 0xd0, 0xe7, 0x86, 0xbd, 0xa0, 0x57, 0x47, 0x3d, 0x70, 0x78, 0xab,
+ 0x3b, 0x1c, 0xab, 0xb1, 0xbc, 0x2a, 0x75, 0x5d, 0xbd, 0xd0, 0xd1, 0x26, 0xbd,
+ 0x90, 0x93, 0x3a, 0xbd, 0xb4, 0x8a, 0xe9, 0xbc, 0xac, 0xf1, 0xa5, 0xbc, 0x10,
+ 0xa3, 0xa7, 0xbb, 0x02, 0xb2, 0x73, 0xbd, 0x2e, 0x27, 0xb7, 0xbc, 0xd0, 0x0c,
+ 0x92, 0xbd, 0x0e, 0x8e, 0x77, 0x3d, 0x5a, 0x78, 0x0a, 0x3d, 0xf4, 0xa9, 0xc5,
+ 0x3c, 0x82, 0x8a, 0x15, 0x3d, 0x3d, 0x25, 0x13, 0xbd, 0x7e, 0x35, 0x12, 0xbd,
+ 0x2a, 0xd2, 0x6e, 0x3d, 0x78, 0x60, 0xcb, 0xbc, 0x70, 0x92, 0x81, 0xbd, 0xca,
+ 0x3f, 0x2f, 0xbd, 0x3b, 0x71, 0x67, 0xbd, 0x80, 0x79, 0x83, 0xba, 0xc6, 0x2a,
+ 0x47, 0x3d, 0x86, 0x99, 0x72, 0x3d, 0x6c, 0x59, 0x8f, 0x3c, 0x73, 0x59, 0x14,
+ 0xbd, 0x23, 0x83, 0x82, 0x3d, 0x94, 0x4d, 0x8b, 0xbd, 0x9c, 0x05, 0x2f, 0xbd,
+ 0x60, 0xae, 0x57, 0x3d, 0x95, 0x1c, 0x86, 0x3d, 0x26, 0xaf, 0x78, 0x3d, 0x47,
+ 0x4b, 0x4e, 0xbd, 0x96, 0xfd, 0x75, 0x3d, 0xb2, 0x63, 0x35, 0x3d, 0xc0, 0x00,
+ 0xa3, 0x3b, 0x12, 0x16, 0x3d, 0x3d, 0x8e, 0xd2, 0x56, 0xbd, 0x02, 0xff, 0xec,
+ 0xbc, 0x96, 0x20, 0xcc, 0xbc, 0xf4, 0x61, 0x0b, 0x3d, 0x20, 0x12, 0x58, 0x3b,
+ 0x5a, 0xa3, 0x4c, 0x3d, 0x80, 0x86, 0x64, 0x3b, 0x0e, 0x77, 0x70, 0x3d, 0xd0,
+ 0x7b, 0xe8, 0xbb, 0x92, 0x2d, 0x20, 0xbd, 0xc8, 0x33, 0x6f, 0xbc, 0xf8, 0x0f,
+ 0x76, 0x3c, 0x3a, 0xea, 0x36, 0x3d, 0xc0, 0x6c, 0x47, 0x3b, 0x00, 0x3b, 0x98,
+ 0xbc, 0x88, 0x52, 0x3b, 0x3c, 0xa8, 0x58, 0x54, 0x3c, 0x5a, 0xff, 0x4f, 0x3d,
+ 0xfe, 0x26, 0x5e, 0x3d, 0x7c, 0x39, 0x8e, 0xbc, 0x96, 0x37, 0x75, 0x3d, 0xbd,
+ 0x95, 0x86, 0xbd, 0x6b, 0x40, 0x91, 0x3d, 0x40, 0x14, 0x3a, 0xbb, 0xf0, 0xe0,
+ 0x0f, 0xbc, 0xeb, 0x23, 0x82, 0x3d, 0xe0, 0x7c, 0x8e, 0x3b, 0x60, 0x71, 0x11,
+ 0xbc, 0x3e, 0x89, 0x2c, 0xbd, 0x9a, 0x0a, 0x7f, 0xbd, 0xe8, 0x86, 0xcd, 0x3c,
+ 0xd4, 0x1d, 0xfe, 0x3c, 0xc6, 0x1f, 0x63, 0x3d, 0xe8, 0x6a, 0x2d, 0x3c, 0xec,
+ 0xb5, 0x02, 0x3d, 0x78, 0xcb, 0xe0, 0xbc, 0x74, 0x19, 0x64, 0xbc, 0xf0, 0xf7,
+ 0x69, 0xbc, 0x11, 0x97, 0x92, 0xbd, 0xe2, 0x89, 0x8b, 0xbd, 0x36, 0xe1, 0xa2,
+ 0xbc, 0x38, 0x7d, 0xb2, 0xbc, 0xf4, 0x26, 0x16, 0x3d, 0x70, 0x40, 0x90, 0xbd,
+ 0xe0, 0x0a, 0x70, 0x3c, 0x86, 0xb8, 0x35, 0x3d, 0x67, 0xd7, 0x8d, 0x3d, 0xd0,
+ 0xdc, 0x17, 0xbc, 0x10, 0xf7, 0xcd, 0xbb, 0xfe, 0x64, 0x59, 0x3d, 0x34, 0xf3,
+ 0x3c, 0xbd, 0x40, 0xfe, 0xae, 0xba, 0xd1, 0x87, 0x85, 0x3d, 0x10, 0x58, 0x65,
+ 0xbd, 0x66, 0xaf, 0x5d, 0xbd, 0x42, 0x56, 0x5d, 0x3d, 0x7c, 0xce, 0x5f, 0xbd,
+ 0xc0, 0x38, 0x96, 0x3a, 0x33, 0x59, 0x90, 0x3d, 0x06, 0x1a, 0xa6, 0xbc, 0xd4,
+ 0xb0, 0x83, 0x3c, 0xa8, 0xf4, 0x07, 0x3c, 0xa5, 0x8f, 0x90, 0x3d, 0x36, 0xd8,
+ 0xc0, 0xbc, 0xf0, 0xf5, 0x31, 0x3d, 0x30, 0x56, 0x88, 0xbd, 0x3c, 0x96, 0x05,
+ 0xbd, 0x89, 0xc2, 0x89, 0x3d, 0x19, 0x10, 0x06, 0xbd, 0xa2, 0xaa, 0x63, 0x3d,
+ 0x5e, 0x9b, 0x76, 0xbd, 0xa5, 0x57, 0x8c, 0x3d, 0x48, 0xe9, 0x2a, 0x3c, 0xe0,
+ 0xd9, 0x3a, 0x3b, 0xd3, 0x1c, 0x7f, 0xbd, 0x8c, 0x60, 0x21, 0xbc, 0x38, 0xc1,
+ 0x67, 0xbc, 0xf0, 0x83, 0x62, 0x3c, 0x58, 0xcb, 0x3f, 0x3d, 0xc7, 0xd9, 0x83,
+ 0x3d, 0x3e, 0xf5, 0x90, 0xbd, 0xeb, 0xb8, 0x8b, 0xbd, 0x0a, 0x86, 0x05, 0x3d,
+ 0x61, 0xb6, 0x39, 0xbd, 0x56, 0x8f, 0x04, 0x3d, 0x19, 0xbd, 0x33, 0xbd, 0x24,
+ 0xd1, 0x50, 0x3d, 0xd0, 0x14, 0xf8, 0x3c, 0x2c, 0x43, 0x49, 0x3d, 0x98, 0xa1,
+ 0x53, 0xbc, 0xc2, 0x43, 0x26, 0x3d, 0x8e, 0xed, 0xff, 0xbc, 0xb7, 0x58, 0x75,
+ 0xbd, 0x00, 0xb7, 0x85, 0x3a, 0x8c, 0xb1, 0x83, 0xbc, 0x08, 0x40, 0x92, 0xbd,
+ 0x35, 0x28, 0x08, 0xbd, 0x30, 0x4f, 0x84, 0x3c, 0x34, 0x0b, 0x22, 0xbc, 0x30,
+ 0x1a, 0x07, 0x3c, 0xaa, 0xd6, 0x87, 0xbd, 0xa2, 0xfd, 0x7d, 0xbd, 0xfe, 0xa0,
+ 0xb7, 0xbc, 0xa2, 0x0a, 0x33, 0x3d, 0x10, 0x60, 0xe4, 0xbb, 0x64, 0x49, 0x10,
+ 0xbd, 0xf4, 0xd0, 0x48, 0xbc, 0x12, 0x7a, 0x38, 0x3d, 0x28, 0xb9, 0xee, 0xbc,
+ 0x05, 0xbe, 0x50, 0xbd, 0xce, 0x2f, 0xd5, 0xbc, 0x04, 0x8f, 0x39, 0xbd, 0xa8,
+ 0x16, 0x0c, 0xbd, 0x64, 0xe1, 0x79, 0xbc, 0xd4, 0x20, 0x8c, 0x3c, 0x28, 0x73,
+ 0x1c, 0x3d, 0x20, 0x66, 0x97, 0x3c, 0x66, 0x6e, 0xc1, 0xbc, 0x6d, 0xfc, 0x91,
+ 0xbd, 0xc5, 0x79, 0x89, 0xbd, 0xd0, 0x3c, 0x90, 0x3c, 0xfc, 0x19, 0x55, 0xbd,
+ 0x72, 0x96, 0x80, 0xbd, 0x80, 0x81, 0x46, 0x3d, 0xea, 0x10, 0x30, 0x3d, 0x00,
+ 0xdc, 0xe2, 0x3b, 0x44, 0x30, 0x78, 0xbc, 0x3a, 0x5b, 0x39, 0x3d, 0x00, 0x8d,
+ 0x8c, 0xbb, 0x70, 0x9f, 0x3b, 0xbc, 0x1c, 0xa9, 0x5c, 0xbc, 0x04, 0xa9, 0xe4,
+ 0xbc, 0x3a, 0xd9, 0x39, 0x3d, 0xa0, 0x11, 0xfd, 0x3c, 0x76, 0x3b, 0xf9, 0xbc,
+ 0xb9, 0xdd, 0x6f, 0xbd, 0xf5, 0xcb, 0x91, 0xbd, 0xee, 0x45, 0x5d, 0xbd, 0x13,
+ 0x1c, 0x8d, 0xbd, 0x10, 0xb7, 0xb6, 0x3b, 0x60, 0xc8, 0x77, 0x3b, 0x70, 0x4d,
+ 0xbf, 0xbb, 0x38, 0x4f, 0x80, 0xbd, 0xa9, 0x6b, 0x92, 0xbd, 0x78, 0x8e, 0x7e,
+ 0x3c, 0x70, 0xd1, 0x6e, 0x3c, 0x79, 0x4c, 0x85, 0xbd, 0xcc, 0xac, 0x2b, 0x3d,
+ 0x49, 0x46, 0x5f, 0xbd, 0x68, 0x60, 0x6d, 0xbc, 0x50, 0x53, 0xe4, 0x3b, 0x35,
+ 0x39, 0x81, 0x3d, 0xf0, 0x01, 0x12, 0x3c, 0x4c, 0x27, 0x8b, 0xbd, 0xce, 0x8d,
+ 0x71, 0x3d, 0xcc, 0x9a, 0x8e, 0xbd, 0x9e, 0x6f, 0xcd, 0xbc, 0xea, 0x23, 0x19,
+ 0x3d, 0xac, 0xed, 0x95, 0x3c, 0x76, 0x32, 0x68, 0x3d, 0x08, 0xcc, 0x58, 0x3c,
+ 0xc8, 0xe2, 0xcc, 0x3c, 0xf1, 0x85, 0x81, 0x3d, 0x06, 0xdc, 0x6b, 0x3d, 0x16,
+ 0x15, 0xf0, 0xbc, 0xda, 0x56, 0x4e, 0x3d, 0x58, 0x5c, 0x90, 0xbc, 0xe4, 0x79,
+ 0x37, 0xbd, 0x40, 0x1b, 0x6a, 0xbd, 0x00, 0x4e, 0x63, 0x3b, 0xbc, 0xfc, 0x35,
+ 0x3d, 0xe6, 0x87, 0xf9, 0xbc, 0xb0, 0xfc, 0x0c, 0x3d, 0x96, 0x7f, 0x53, 0xbd,
+ 0x1e, 0xe1, 0x04, 0x3d, 0x10, 0x11, 0x87, 0x3c, 0xce, 0xd1, 0x42, 0x3d, 0x1c,
+ 0x27, 0xca, 0xbc, 0xd8, 0x71, 0xfa, 0x3c, 0xea, 0xce, 0x76, 0x3d, 0x2c, 0x0e,
+ 0xbc, 0x3c, 0x9b, 0x96, 0x48, 0xbd, 0x60, 0x7b, 0x93, 0xbb, 0x8a, 0x69, 0xa8,
+ 0xbc, 0xc0, 0xcd, 0x79, 0x3c, 0xd0, 0xe0, 0x87, 0xbd, 0xe6, 0x91, 0x53, 0xbd,
+ 0x96, 0xe0, 0x03, 0x3d, 0x8b, 0x7a, 0x81, 0xbd, 0x16, 0x64, 0x80, 0xbd, 0x84,
+ 0xac, 0x87, 0x3c, 0xf8, 0xb7, 0xfc, 0xbc, 0x63, 0x2a, 0x38, 0xbd, 0x5a, 0x71,
+ 0x35, 0xbd, 0xda, 0xff, 0x49, 0xbd, 0x50, 0xcd, 0xdb, 0xbb, 0xc0, 0x85, 0x37,
+ 0xbb, 0x2a, 0x21, 0x35, 0x3d, 0xb6, 0x59, 0xcc, 0xbc, 0x10, 0x02, 0xe7, 0x3b,
+ 0x78, 0xf5, 0x54, 0xbc, 0xb0, 0x3c, 0x58, 0x3c, 0xf4, 0x96, 0x59, 0x3d, 0x10,
+ 0xd7, 0xd2, 0xbb, 0x1a, 0x0c, 0x79, 0x3d, 0x48, 0x2c, 0x6b, 0x3c, 0xc0, 0x44,
+ 0x89, 0xbb, 0x5c, 0xf0, 0xa3, 0x3c, 0xd0, 0x1c, 0x07, 0x3d, 0x02, 0xcd, 0x94,
+ 0xbc, 0xa8, 0x51, 0x99, 0xbc, 0xc0, 0xb9, 0x40, 0x3c, 0xe0, 0x85, 0x86, 0x3c,
+ 0x74, 0x77, 0x9f, 0x3c, 0x15, 0xe0, 0x71, 0xbd, 0x00, 0xf1, 0xfc, 0xb9, 0x50,
+ 0x39, 0x11, 0x3c, 0xb7, 0x13, 0x81, 0x3d, 0x60, 0x31, 0xe5, 0x3c, 0x8c, 0x42,
+ 0xf6, 0xbc, 0x4c, 0x34, 0x8a, 0xbc, 0xb8, 0x26, 0xe6, 0x3c, 0xf4, 0x56, 0x69,
+ 0xbc, 0xcc, 0xb4, 0xa1, 0x3c, 0xf0, 0x8e, 0x48, 0xbd, 0xcb, 0xab, 0x91, 0xbd,
+ 0x00, 0xc4, 0x5e, 0xbb, 0xdd, 0xf5, 0x8c, 0x3d, 0xc8, 0x1a, 0x8a, 0x3c, 0x1c,
+ 0x9c, 0xda, 0xbc, 0x89, 0x6e, 0x83, 0x3d, 0x00, 0x6e, 0x3c, 0x39, 0x80, 0x82,
+ 0xd0, 0x3a, 0x00, 0x09, 0xc2, 0xb9, 0x04, 0x06, 0x38, 0xbc, 0x0a, 0x7a, 0xf7,
+ 0xbc, 0x50, 0xac, 0x1d, 0x3c, 0x9e, 0xd8, 0xfa, 0xbc, 0xea, 0xed, 0x71, 0xbd,
+ 0x7f, 0xf6, 0x0a, 0xbd, 0x20, 0x2d, 0x30, 0x3b, 0xd0, 0x7c, 0x96, 0x3b, 0x2e,
+ 0x61, 0x3f, 0x3d, 0xb0, 0x0a, 0x2d, 0x3d, 0x80, 0xac, 0x47, 0xbb, 0x7a, 0x9e,
+ 0xe6, 0xbc, 0x50, 0x90, 0x44, 0x3c, 0x0d, 0x23, 0x8e, 0xbd, 0x00, 0x3a, 0x59,
+ 0x3a, 0x12, 0xa5, 0x52, 0xbd, 0xbc, 0x90, 0xac, 0x3c, 0x00, 0x77, 0xe1, 0x3a,
+ 0x83, 0x27, 0x8a, 0xbd, 0x40, 0xcd, 0xb0, 0xbc, 0x6a, 0xf8, 0x22, 0x3d, 0xc0,
+ 0xfe, 0xc8, 0xbb, 0x52, 0x28, 0x63, 0x3d, 0xb2, 0xd2, 0xbe, 0xbc, 0x80, 0x68,
+ 0x42, 0xbc, 0xa4, 0x31, 0x58, 0xbc, 0xae, 0xda, 0x3a, 0xbd, 0xcb, 0xd7, 0x80,
+ 0xbd, 0x32, 0x43, 0x60, 0x3d, 0x52, 0xc1, 0xa9, 0xbc, 0x18, 0x3a, 0x2d, 0x3c,
+ 0x8e, 0x17, 0x5f, 0xbd, 0x9d, 0xcc, 0x85, 0x3d, 0x5c, 0x7c, 0x12, 0x3d, 0xde,
+ 0x24, 0x78, 0x3d, 0xec, 0xba, 0x16, 0x3d, 0xd1, 0xb1, 0x3d, 0xbd, 0xf0, 0x7f,
+ 0xe3, 0x3c, 0xe0, 0xf7, 0xef, 0xbb, 0x28, 0x65, 0x18, 0xbd, 0x7a, 0x38, 0x48,
+ 0x3d, 0xad, 0xff, 0x81, 0xbd, 0x72, 0xe6, 0x69, 0x3d, 0x98, 0x35, 0x08, 0xbd,
+ 0x16, 0xb5, 0x3a, 0xbd, 0x26, 0x18, 0x52, 0xbd, 0xc4, 0xb5, 0xc9, 0x3c, 0xbc,
+ 0xcc, 0x93, 0x3c, 0x6e, 0x74, 0xc9, 0xbc, 0xae, 0x05, 0x14, 0x3d, 0x96, 0x6c,
+ 0x78, 0x3d, 0x48, 0xe7, 0x7a, 0xbc, 0xe2, 0x8b, 0x65, 0xbd, 0xda, 0x9c, 0x97,
+ 0xbc, 0xbc, 0xc8, 0xab, 0x3c, 0xf0, 0xb1, 0x5f, 0xbd, 0xbe, 0x43, 0x3d, 0x3d,
+ 0xf8, 0xc7, 0x81, 0xbd, 0xd0, 0xc7, 0xcd, 0x3c, 0xfe, 0x77, 0x72, 0xbd, 0x32,
+ 0x3c, 0x7c, 0x3d, 0xfa, 0x2e, 0x84, 0xbc, 0x4c, 0xbc, 0x04, 0x3d, 0xc6, 0x29,
+ 0x8f, 0xbd, 0x4c, 0x07, 0xb8, 0x3c, 0x51, 0xb8, 0x45, 0xbd, 0x4c, 0x84, 0x7b,
+ 0xbd, 0x8e, 0x26, 0x3e, 0xbd, 0x48, 0xcc, 0x96, 0xbc, 0xb0, 0x59, 0x32, 0x3d,
+ 0xd6, 0x47, 0xba, 0xbc, 0xf9, 0x32, 0x81, 0x3d, 0xb0, 0xb8, 0x88, 0xbb, 0x80,
+ 0x93, 0xfd, 0x3a, 0x4a, 0x8d, 0x39, 0x3d, 0x88, 0x34, 0xa1, 0x3c, 0x20, 0x3b,
+ 0x53, 0x3b, 0x10, 0x26, 0x35, 0x3d, 0x50, 0xab, 0x77, 0xbc, 0x89, 0x68, 0x69,
+ 0xbd, 0x56, 0xd0, 0x15, 0x3d, 0x56, 0x3f, 0x3e, 0xbd, 0xa0, 0x94, 0xb5, 0x3c,
+ 0xa9, 0x10, 0x90, 0xbd, 0xfa, 0xe9, 0x48, 0xbd, 0x66, 0x62, 0x6a, 0x3d, 0xdc,
+ 0x51, 0xb0, 0x3c, 0x20, 0x13, 0x4d, 0xbd, 0x40, 0xbf, 0xe5, 0xba, 0x50, 0x61,
+ 0x9e, 0x3b, 0xa0, 0xbd, 0xeb, 0xbc, 0xd9, 0x55, 0x48, 0xbd, 0x4c, 0xbf, 0x0e,
+ 0xbd, 0x80, 0x28, 0x20, 0x3b, 0xea, 0x77, 0x72, 0x3d, 0x08, 0xd6, 0x02, 0x3d,
+ 0x7b, 0x14, 0x42, 0xbd, 0x8c, 0x7f, 0x91, 0x3c, 0x82, 0xe4, 0x16, 0xbd, 0x30,
+ 0x61, 0xaf, 0x3c, 0xd2, 0x5c, 0x5a, 0xbd, 0xc0, 0x16, 0x69, 0x3b, 0xe9, 0x5b,
+ 0x84, 0x3d, 0x49, 0xc3, 0x7e, 0xbd, 0x90, 0x7f, 0xf7, 0x3c, 0x3e, 0xd5, 0x85,
+ 0xbd, 0x38, 0xb7, 0x43, 0x3c, 0x4e, 0x4d, 0xc0, 0xbc, 0x00, 0x78, 0xea, 0x3a,
+ 0x32, 0xb2, 0x92, 0xbd, 0xb0, 0xc3, 0x1d, 0x3c, 0x90, 0xc2, 0x23, 0x3c, 0x80,
+ 0x14, 0xc5, 0x3b, 0x00, 0xf1, 0x87, 0xbc, 0x26, 0xf4, 0x8a, 0xbd, 0x10, 0xa6,
+ 0x9a, 0x3b, 0x78, 0x8b, 0x72, 0xbd, 0x85, 0xef, 0x12, 0xbd, 0xd8, 0x93, 0x02,
+ 0x3d, 0x80, 0x8b, 0xca, 0x3a, 0x18, 0x72, 0x17, 0xbc, 0x65, 0x2d, 0x83, 0x3d,
+ 0xfb, 0xe9, 0x81, 0x3d, 0x60, 0xf3, 0x46, 0xbd, 0xb4, 0xab, 0x1a, 0xbc, 0x30,
+ 0x0c, 0xf9, 0x3c, 0xb6, 0xc5, 0x63, 0xbd, 0x8e, 0x20, 0xdd, 0xbc, 0x5c, 0x18,
+ 0x97, 0xbc, 0x10, 0x42, 0x43, 0x3d, 0x11, 0xab, 0x84, 0x3d, 0xec, 0xcf, 0x30,
+ 0x3d, 0x38, 0x0e, 0x6a, 0x3c, 0x3e, 0x40, 0xd9, 0xbc, 0xce, 0x14, 0x14, 0x3d,
+ 0x5c, 0xe6, 0x71, 0xbc, 0xf8, 0xd8, 0xf2, 0x3c, 0x98, 0x96, 0x21, 0xbc, 0xbe,
+ 0xdb, 0x18, 0xbd, 0xe6, 0x7f, 0x28, 0xbd, 0xab, 0x56, 0x23, 0xbd, 0xc2, 0x40,
+ 0x8e, 0xbd, 0x8c, 0x92, 0xc3, 0x3c, 0xd4, 0x0a, 0x13, 0xbd, 0xbe, 0x25, 0x05,
+ 0x3d, 0x12, 0x58, 0x0d, 0x3d, 0xd7, 0x65, 0x79, 0xbd, 0x9c, 0x54, 0x4e, 0x3d,
+ 0x02, 0x2a, 0x40, 0x3d, 0xef, 0xcd, 0x01, 0xbd, 0x11, 0x5c, 0x92, 0x3d, 0xb0,
+ 0x03, 0x95, 0x3c, 0xa0, 0x08, 0x19, 0x3b, 0x79, 0xad, 0x8c, 0x3d, 0x19, 0x93,
+ 0x7a, 0xbd, 0x40, 0xfa, 0xc6, 0xbb, 0x68, 0xb6, 0xa8, 0x3c, 0x45, 0x29, 0x8d,
+ 0xbd, 0x90, 0x3e, 0x13, 0xbc, 0x1a, 0x2d, 0x70, 0x3d, 0xc1, 0xdd, 0x6a, 0xbd,
+ 0x50, 0x75, 0x01, 0xbd, 0xc1, 0x8d, 0x91, 0xbd, 0xdd, 0x3f, 0x84, 0xbd, 0xa3,
+ 0xc6, 0x8d, 0x3d, 0xce, 0x23, 0x5b, 0x3d, 0x7e, 0xfb, 0x7d, 0x3d, 0xd5, 0xf4,
+ 0x23, 0xbd, 0x4c, 0x65, 0x8d, 0xbc, 0xb0, 0x76, 0x89, 0xbd, 0x28, 0xc4, 0x82,
+ 0xbd, 0x40, 0x70, 0x71, 0x3b, 0xfa, 0x55, 0x8e, 0xbc, 0x40, 0x08, 0xf0, 0x3a,
+ 0x02, 0x81, 0x56, 0x3d, 0xfe, 0x51, 0xf8, 0xbc, 0x1a, 0xcd, 0x91, 0xbd, 0xfb,
+ 0x66, 0x7b, 0xbd, 0xb0, 0xbb, 0xf2, 0xbc, 0xbb, 0x24, 0x23, 0xbd, 0x5c, 0x6c,
+ 0x6d, 0xbd, 0x08, 0xa0, 0x8b, 0x3c, 0xb7, 0x93, 0x1d, 0xbd, 0x74, 0x9f, 0x21,
+ 0x3d, 0x1c, 0x43, 0x33, 0xbd, 0x66, 0x2c, 0x1c, 0xbd, 0xfe, 0xf5, 0x11, 0xbd,
+ 0x10, 0x32, 0xef, 0xbc, 0x40, 0x70, 0x6f, 0xbb, 0xa1, 0xca, 0x8f, 0x3d, 0x12,
+ 0x42, 0x13, 0x3d, 0x38, 0x2e, 0xf3, 0x3c, 0x16, 0x69, 0x77, 0x3d, 0x6d, 0xa9,
+ 0x1e, 0xbd, 0xdc, 0xf5, 0xba, 0xbc, 0xc4, 0xe8, 0x1f, 0xbd, 0xfc, 0xc7, 0x08,
+ 0x3d, 0x8c, 0x9a, 0x28, 0x3d, 0x80, 0xbb, 0x14, 0x3b, 0xce, 0x47, 0x68, 0x3d,
+ 0xd3, 0x75, 0x10, 0xbd, 0x30, 0x9e, 0xb1, 0x3b, 0x48, 0x08, 0x80, 0x3c, 0x53,
+ 0xbe, 0x7e, 0xbd, 0x54, 0xdd, 0x5c, 0xbd, 0x89, 0x15, 0x77, 0xbd, 0x20, 0x13,
+ 0x00, 0x3b, 0xab, 0x6a, 0x15, 0xbd, 0x70, 0x62, 0x0b, 0xbc, 0xb6, 0x69, 0x44,
+ 0x3d, 0x9e, 0x71, 0x44, 0x3d, 0xfb, 0x84, 0x1e, 0xbd, 0xc8, 0x25, 0x3e, 0xbc,
+ 0xa8, 0x9e, 0xa6, 0x3c, 0xa0, 0x0c, 0x0b, 0x3d, 0x48, 0xe7, 0xb1, 0xbc, 0x2f,
+ 0xfc, 0x8a, 0x3d, 0xbc, 0x2a, 0x27, 0xbc, 0x80, 0x69, 0x38, 0x3c, 0xa0, 0x89,
+ 0xb4, 0xbb, 0x10, 0xb6, 0x56, 0xbc, 0x80, 0xaa, 0x37, 0x3b, 0xbd, 0x66, 0x1d,
+ 0xbd, 0xb9, 0x3e, 0x6c, 0xbd, 0x14, 0xc1, 0x1e, 0x3d, 0x10, 0xd3, 0xa5, 0x3b,
+ 0x1c, 0x9a, 0x43, 0xbc, 0xa0, 0xb3, 0xdd, 0xbc, 0xf8, 0x82, 0xb8, 0x3c, 0xc8,
+ 0x76, 0x1b, 0x3d, 0x7e, 0x2b, 0x5c, 0x3d, 0x20, 0xd8, 0x7f, 0xbd, 0x88, 0xe0,
+ 0xa0, 0x3c, 0x1c, 0x48, 0x26, 0x3d, 0x50, 0x53, 0x1e, 0x3c, 0xf0, 0x07, 0x54,
+ 0x3c, 0xc9, 0xde, 0x05, 0xbd, 0x2c, 0x34, 0x84, 0x3c, 0xa8, 0x30, 0x1b, 0x3c,
+ 0x6c, 0xa1, 0x3c, 0xbd, 0x00, 0x58, 0xc1, 0xb8, 0xf0, 0xd4, 0xf9, 0x3b, 0xf0,
+ 0xb3, 0x2e, 0x3d, 0x14, 0xe3, 0x4f, 0x3d, 0x70, 0x0b, 0x73, 0x3c, 0x8b, 0xca,
+ 0x89, 0xbd, 0x9c, 0xd8, 0x85, 0x3c, 0x9c, 0x34, 0x4b, 0xbc, 0xf5, 0x38, 0x71,
+ 0xbd, 0x01, 0xe5, 0x84, 0x3d, 0xd4, 0xde, 0x25, 0xbc, 0x80, 0xc0, 0xb1, 0xbb,
+ 0x80, 0xca, 0xfc, 0x3b, 0x78, 0xe0, 0x2d, 0xbd, 0xda, 0x90, 0x29, 0xbd, 0x3a,
+ 0xdb, 0x37, 0xbd, 0x00, 0x81, 0xa1, 0xbb, 0x3a, 0xcb, 0x71, 0xbd, 0x1c, 0x8e,
+ 0x29, 0xbc, 0x68, 0x0a, 0x5f, 0xbc, 0x0f, 0x86, 0x91, 0xbd, 0x98, 0x61, 0x62,
+ 0x3c, 0x82, 0x06, 0x4e, 0xbd, 0xa0, 0x7a, 0x35, 0x3b, 0xfa, 0xbc, 0x31, 0x3d,
+ 0xee, 0x18, 0x3a, 0x3d, 0xe0, 0xf0, 0x9d, 0xbb, 0x87, 0xba, 0x8f, 0x3d, 0x0e,
+ 0x75, 0x24, 0x3d, 0x92, 0xf6, 0x77, 0x3d, 0x78, 0xda, 0x72, 0xbc, 0xe4, 0x5c,
+ 0x55, 0xbc, 0xe3, 0xbf, 0x87, 0x3d, 0x74, 0x55, 0x5c, 0xbd, 0x88, 0x2b, 0x0b,
+ 0xbc, 0x68, 0xd5, 0x21, 0x3d, 0x0a, 0x05, 0x94, 0xbc, 0x5f, 0xb7, 0x8a, 0x3d,
+ 0x48, 0x83, 0x5c, 0x3c, 0x08, 0x83, 0x77, 0xbc, 0xc4, 0x31, 0xd6, 0x3c, 0xb8,
+ 0x48, 0x52, 0x3c, 0x00, 0xcb, 0xda, 0x3b, 0x32, 0x6a, 0x5f, 0xbd, 0x76, 0x7f,
+ 0x8f, 0xbd, 0xc0, 0xb7, 0xb2, 0x3c, 0x91, 0x5e, 0x1d, 0xbd, 0x92, 0x5d, 0x62,
+ 0x3d, 0x9c, 0x2b, 0x65, 0xbd, 0x3e, 0xe5, 0x2a, 0x3d, 0x29, 0xb7, 0x81, 0xbd,
+ 0x74, 0xa2, 0xda, 0x3c, 0x1a, 0xcb, 0x15, 0x3d, 0x56, 0x35, 0x60, 0x3d, 0x50,
+ 0x4a, 0x4f, 0xbc, 0xb2, 0x3c, 0x73, 0x3d, 0x88, 0x39, 0x71, 0xbd, 0xa0, 0x73,
+ 0x7d, 0xbd, 0x18, 0x14, 0xac, 0x3c, 0xa8, 0x1a, 0x57, 0x3d, 0x00, 0x3a, 0x77,
+ 0xbc, 0x2a, 0xd5, 0x93, 0xbc, 0x7e, 0x27, 0x41, 0x3d, 0xa0, 0x96, 0x19, 0x3d,
+ 0x18, 0x3e, 0xe5, 0x3c, 0x56, 0xda, 0x0d, 0x3d, 0xb2, 0x5f, 0x1d, 0x3d, 0x0c,
+ 0x27, 0xd6, 0x3c, 0xc6, 0x34, 0x89, 0xbd, 0x84, 0xe7, 0x65, 0xbd, 0xfc, 0x87,
+ 0xba, 0x3c, 0xd6, 0x7b, 0x3b, 0xbd, 0xe8, 0xf4, 0x49, 0xbd, 0x70, 0x19, 0x0d,
+ 0x3c, 0x5a, 0x0c, 0x18, 0x3d, 0xe6, 0x0e, 0x26, 0x3d, 0x12, 0xa0, 0x61, 0xbd,
+ 0xec, 0xa3, 0x26, 0x3d, 0xf4, 0xef, 0xe0, 0x3c, 0xdd, 0xc0, 0x88, 0xbd, 0x08,
+ 0x87, 0x0e, 0x3d, 0x2b, 0xb7, 0x18, 0xbd, 0xe6, 0xd5, 0x1f, 0xbd, 0x38, 0xc1,
+ 0x37, 0x3c, 0x88, 0x9a, 0x74, 0xbd, 0x04, 0xce, 0x04, 0x3d, 0x00, 0x5c, 0xab,
+ 0xbc, 0xbd, 0x47, 0x4b, 0xbd, 0xf0, 0xc1, 0x33, 0xbc, 0x2c, 0x4d, 0xca, 0x3c,
+ 0x84, 0xfd, 0xed, 0xbc, 0x6c, 0xf2, 0x2c, 0x3d, 0x1b, 0x24, 0x87, 0x3d, 0x7a,
+ 0x67, 0x8f, 0xbc, 0x84, 0xab, 0x50, 0xbc, 0x84, 0xd2, 0x0b, 0x3d, 0x18, 0x03,
+ 0x03, 0x3d, 0x80, 0x54, 0x01, 0x3d, 0xbc, 0x41, 0xd8, 0x3c, 0x60, 0xe4, 0x34,
+ 0x3d, 0x3d, 0xfb, 0x26, 0xbd, 0xcc, 0x6f, 0x1f, 0x3d, 0xc0, 0xb0, 0x30, 0xbb,
+ 0x7f, 0xb2, 0x83, 0xbd, 0x8f, 0xed, 0x91, 0x3d, 0xa0, 0xe6, 0xe2, 0xbb, 0xfa,
+ 0x94, 0x67, 0x3d, 0x70, 0xd4, 0x69, 0xbd, 0x80, 0xba, 0xed, 0x3c, 0xce, 0x26,
+ 0xb8, 0xbc, 0xfe, 0xd9, 0x1c, 0x3d, 0xae, 0x09, 0x0e, 0x3d, 0x4f, 0x3d, 0x52,
+ 0xbd, 0x87, 0xde, 0x62, 0xbd, 0x02, 0x63, 0xff, 0xbc, 0x70, 0x60, 0xbd, 0x3b,
+ 0x3c, 0x3f, 0xe7, 0x3c, 0x9c, 0x9c, 0x34, 0xbd, 0x82, 0xcf, 0x82, 0xbd, 0xa2,
+ 0xdb, 0x39, 0x3d, 0x70, 0x89, 0xe8, 0x3c, 0xad, 0x61, 0x80, 0xbd, 0xd8, 0x58,
+ 0x34, 0xbd, 0xf6, 0x79, 0x5f, 0xbd, 0xd0, 0x9b, 0xc6, 0x3c, 0x02, 0x91, 0x0f,
+ 0x3d, 0x90, 0xe4, 0xc1, 0x3b, 0xff, 0xa7, 0x8e, 0x3d, 0x99, 0x07, 0x92, 0xbd,
+ 0x30, 0x36, 0xe4, 0x3b, 0xf0, 0xd6, 0x38, 0xbd, 0xea, 0x6d, 0x2d, 0xbd, 0x0e,
+ 0x11, 0xf6, 0xbc, 0x80, 0x5b, 0x53, 0x3b, 0x1c, 0x44, 0x41, 0x3d, 0xab, 0x98,
+ 0x7b, 0xbd, 0x20, 0x36, 0x71, 0x3b, 0x87, 0x93, 0x20, 0xbd, 0xb0, 0x35, 0x27,
+ 0xbd, 0xd2, 0x2b, 0x75, 0x3d, 0x90, 0x12, 0xdc, 0xbc, 0x06, 0x6c, 0x2b, 0x3d,
+ 0xe0, 0x86, 0x20, 0xbb, 0x9d, 0xdd, 0x88, 0x3d, 0xec, 0xe2, 0x19, 0x3d, 0x70,
+ 0x76, 0xb4, 0x3c, 0x0e, 0x49, 0x42, 0xbd, 0x34, 0x9c, 0xe3, 0x3c, 0xe0, 0x1d,
+ 0xf8, 0xbb, 0xfc, 0x83, 0xc2, 0xbc, 0xdc, 0xe1, 0x8d, 0xbc, 0x04, 0x9b, 0xa7,
+ 0x3c, 0x54, 0x5a, 0xfc, 0x3c, 0x80, 0x63, 0x14, 0xba, 0xcc, 0x46, 0x08, 0x3d,
+ 0x46, 0xf5, 0x2b, 0x3d, 0xe0, 0x8b, 0x48, 0x3d, 0xa0, 0x99, 0xfd, 0x3b, 0x41,
+ 0x57, 0x87, 0x3d, 0xe4, 0xcb, 0x56, 0xbd, 0x1f, 0xa4, 0x3f, 0xbd, 0xac, 0x66,
+ 0x85, 0x3c, 0xaa, 0x3a, 0x55, 0x3d, 0x32, 0x06, 0x29, 0x3d, 0x9a, 0xb8, 0x5a,
+ 0xbd, 0x00, 0xfc, 0xbb, 0xba, 0xd7, 0x80, 0x86, 0x3d, 0xb4, 0x7c, 0xf5, 0x3c,
+ 0xac, 0xf4, 0x36, 0x3d, 0x82, 0xef, 0x65, 0x3d, 0x49, 0x63, 0x5c, 0xbd, 0x66,
+ 0xe0, 0x8f, 0xbd, 0x42, 0x66, 0x28, 0x3d, 0xfc, 0xec, 0x08, 0x3d, 0x0a, 0x9c,
+ 0x1e, 0x3d, 0x65, 0x3c, 0x45, 0xbd, 0x73, 0x4f, 0x88, 0x3d, 0xec, 0x1e, 0xbf,
+ 0xbc, 0xee, 0xa7, 0x55, 0x3d, 0x10, 0x84, 0x57, 0x3c, 0xd4, 0x12, 0xdf, 0x3c,
+ 0xa8, 0x8f, 0x8f, 0xbd, 0x56, 0x80, 0x89, 0xbd, 0x08, 0xc5, 0x09, 0xbc, 0xfd,
+ 0x84, 0x22, 0xbd, 0xb2, 0x0a, 0x66, 0x3d, 0x0a, 0x86, 0x61, 0x3d, 0x79, 0xf8,
+ 0x81, 0xbd, 0x7a, 0x81, 0x49, 0xbd, 0x88, 0x62, 0x7f, 0x3c, 0x8c, 0x81, 0x71,
+ 0xbd, 0x42, 0x9e, 0x86, 0xbd, 0x30, 0x5d, 0xf6, 0x3b, 0x6c, 0xc0, 0x29, 0xbc,
+ 0x88, 0x30, 0xdf, 0xbc, 0xda, 0xed, 0xf4, 0xbc, 0x98, 0x29, 0x34, 0xbd, 0xc0,
+ 0x10, 0xbe, 0x3a, 0x9b, 0x69, 0x8c, 0x3d, 0x40, 0x02, 0x98, 0xba, 0x2b, 0x85,
+ 0x76, 0xbd, 0x0c, 0xfd, 0xd3, 0x3c, 0x62, 0x37, 0x08, 0x3d, 0x0a, 0xe3, 0xe9,
+ 0xbc, 0x80, 0x1c, 0xc9, 0x3a, 0x54, 0x4b, 0x39, 0xbc, 0x28, 0xae, 0x7a, 0x3c,
+ 0x60, 0xd7, 0xe9, 0x3b, 0x08, 0xbe, 0x52, 0xbd, 0x04, 0x99, 0x3d, 0xbd, 0xd0,
+ 0xd2, 0x13, 0xbd, 0x1a, 0x86, 0x8e, 0xbc, 0xeb, 0xaa, 0x6a, 0xbd, 0x00, 0x23,
+ 0xa3, 0xb9, 0xc8, 0x76, 0x77, 0xbc, 0x36, 0x45, 0x72, 0xbd, 0xe4, 0xd7, 0x8a,
+ 0xbc, 0xfd, 0xfa, 0x8c, 0x3d, 0x2b, 0xc3, 0x07, 0xbd, 0x6d, 0xd0, 0x87, 0x3d,
+ 0xec, 0xa4, 0xde, 0x3c, 0x92, 0x4b, 0x65, 0x3d, 0x20, 0x6c, 0x2c, 0xbd, 0x00,
+ 0xb7, 0x0c, 0x3b, 0x96, 0x7f, 0x4b, 0x3d, 0xec, 0xe9, 0xdb, 0xbc, 0xaa, 0x06,
+ 0x3b, 0x3d, 0x20, 0x8c, 0x33, 0x3d, 0xe1, 0x03, 0x18, 0xbd, 0xe0, 0xa5, 0x0a,
+ 0xbc, 0x30, 0x1d, 0x5f, 0x3c, 0xfc, 0x28, 0x6d, 0xbd, 0x43, 0x41, 0x90, 0x3d,
+ 0x58, 0x87, 0x30, 0x3c, 0xdd, 0x8c, 0x60, 0xbd, 0xec, 0x2a, 0xba, 0xbc, 0xf2,
+ 0x9d, 0xa9, 0xbc, 0x30, 0xb0, 0x06, 0x3c, 0x68, 0x3e, 0x53, 0x3c, 0x78, 0xab,
+ 0xff, 0xbc, 0xa8, 0x34, 0x0d, 0xbc, 0x4e, 0x3f, 0x01, 0x3d, 0x00, 0x96, 0x44,
+ 0x3b, 0x2c, 0xa3, 0xda, 0x3c, 0xba, 0xc4, 0x2e, 0xbd, 0x72, 0xbd, 0x2f, 0x3d,
+ 0xfc, 0x1b, 0x7d, 0xbc, 0x9e, 0xbf, 0x7e, 0x3d, 0x02, 0x94, 0x19, 0x3d, 0x94,
+ 0x36, 0x4f, 0x3d, 0xf1, 0xee, 0x68, 0xbd, 0x54, 0x9c, 0x87, 0x3c, 0xfa, 0x3e,
+ 0x7e, 0x3d, 0x02, 0xec, 0x84, 0xbc, 0x12, 0xe7, 0x89, 0xbd, 0xa4, 0x90, 0xa6,
+ 0x3c, 0x3c, 0x7a, 0x89, 0xbc, 0x86, 0x5d, 0x54, 0x3d, 0xa4, 0xad, 0x53, 0xbc,
+ 0x32, 0xc5, 0x00, 0x3d, 0x1e, 0x53, 0x0b, 0x3d, 0xef, 0xae, 0x02, 0xbd, 0x7c,
+ 0xd8, 0x03, 0x3d, 0x38, 0x0e, 0xa5, 0xbc, 0x51, 0xc4, 0x83, 0x3d, 0x66, 0xcb,
+ 0x8f, 0xbd, 0xa6, 0xfe, 0xb6, 0xbc, 0xa4, 0xb1, 0x97, 0x3c, 0x00, 0xad, 0xb2,
+ 0x3a, 0x0f, 0xb7, 0x33, 0xbd, 0x37, 0x1f, 0x6f, 0xbd, 0x57, 0x39, 0x8c, 0x3d,
+ 0x54, 0xe4, 0xb7, 0xbc, 0x1e, 0x63, 0x52, 0xbd, 0x00, 0x3b, 0x43, 0xbd, 0x50,
+ 0x48, 0xf1, 0xbb, 0x18, 0x01, 0x81, 0xbd, 0x90, 0x1c, 0xaf, 0xbc, 0x06, 0xf8,
+ 0x7d, 0xbd, 0xf0, 0xe0, 0xa5, 0xbc, 0x08, 0x06, 0xc3, 0x3c, 0x22, 0xff, 0x83,
+ 0xbc, 0x4c, 0xef, 0x88, 0xbd, 0x36, 0xf2, 0x77, 0x3d, 0x54, 0x3b, 0xd4, 0xbc,
+ 0xa7, 0xa2, 0x8e, 0x3d, 0xac, 0xb2, 0x99, 0x3c, 0x10, 0x08, 0x88, 0xbb, 0x81,
+ 0x58, 0x8d, 0xbd, 0xf8, 0x25, 0x29, 0xbd, 0x1c, 0x0f, 0x26, 0xbd, 0x8e, 0x7a,
+ 0x81, 0xbd, 0x5c, 0x14, 0x8d, 0xbd, 0x81, 0xdd, 0x8f, 0xbd, 0xc8, 0xa2, 0x5f,
+ 0xbc, 0xc0, 0x48, 0xda, 0xba, 0xfe, 0x26, 0x14, 0x3d, 0xe2, 0x9a, 0x89, 0xbd,
+ 0x66, 0x8d, 0x59, 0x3d, 0xd8, 0xf8, 0x45, 0x3d, 0x0b, 0xb1, 0x04, 0xbd, 0x7a,
+ 0x32, 0xdd, 0xbc, 0x00, 0x01, 0x24, 0xbb, 0xc5, 0x97, 0x87, 0xbd, 0x7c, 0xea,
+ 0x46, 0x3d, 0x85, 0xc1, 0x81, 0x3d, 0xe8, 0x63, 0x24, 0x3d, 0x5d, 0xb3, 0x84,
+ 0xbd, 0xca, 0xa4, 0x04, 0x3d, 0xea, 0xe8, 0xf0, 0xbc, 0xdc, 0x41, 0x05, 0xbd,
+ 0xe8, 0x40, 0x4c, 0xbd, 0xb0, 0xb7, 0x2d, 0x3d, 0xa9, 0x0c, 0x1f, 0xbd, 0xd0,
+ 0x50, 0x97, 0x3b, 0x3f, 0x9c, 0x0f, 0xbd, 0xac, 0xa8, 0x59, 0xbd, 0xdb, 0x76,
+ 0x87, 0x3d, 0x08, 0xd7, 0x52, 0x3c, 0xc8, 0xf0, 0x1c, 0x3d, 0xec, 0xc1, 0x4a,
+ 0x3d, 0x44, 0x87, 0x81, 0x3c, 0xbe, 0x6f, 0x13, 0x3d, 0x80, 0x36, 0x49, 0x3c,
+ 0xae, 0xea, 0x73, 0x3d, 0x70, 0xd3, 0x2d, 0x3d, 0xde, 0xbb, 0x9d, 0xbc, 0xaa,
+ 0xba, 0x32, 0x3d, 0x7b, 0xc1, 0x3c, 0xbd, 0x42, 0x4e, 0x5f, 0xbd, 0x9a, 0xd4,
+ 0x75, 0xbd, 0x52, 0x8d, 0x4a, 0x3d, 0xb4, 0x42, 0x8f, 0x3c, 0x20, 0x32, 0x92,
+ 0xbc, 0x39, 0x52, 0x0a, 0xbd, 0xd8, 0xf6, 0x21, 0xbd, 0x8b, 0x5e, 0x26, 0xbd,
+ 0x42, 0x45, 0x5b, 0xbd, 0x06, 0x86, 0x7f, 0xbd, 0x65, 0x5a, 0x57, 0xbd, 0x78,
+ 0x0a, 0x41, 0xbd, 0x5d, 0x12, 0x89, 0xbd, 0x40, 0x70, 0x34, 0xbc, 0xa0, 0x15,
+ 0x43, 0xbb, 0x76, 0xc5, 0x48, 0x3d, 0x40, 0x0b, 0x36, 0x3d, 0x40, 0x3a, 0x3f,
+ 0x3b, 0x58, 0xc4, 0xa3, 0x3c, 0x70, 0xdc, 0xdf, 0x3c, 0x50, 0x13, 0x1c, 0x3d,
+ 0xc0, 0x6d, 0xcc, 0xbb, 0x62, 0xc7, 0x32, 0xbd, 0x15, 0x3f, 0x8b, 0x3d, 0xb5,
+ 0x5b, 0x14, 0xbd, 0xf1, 0x00, 0x3f, 0xbd, 0x90, 0xe9, 0x53, 0x3c, 0xae, 0xa0,
+ 0x1f, 0xbd, 0x54, 0x4f, 0xc8, 0xbc, 0x7c, 0x0b, 0x3a, 0xbc, 0x96, 0x74, 0x38,
+ 0x3d, 0xa6, 0x9b, 0x3f, 0xbd, 0xf4, 0xfd, 0x88, 0xbc, 0x18, 0x1c, 0x97, 0xbc,
+ 0xc8, 0xcf, 0xea, 0x3c, 0xd9, 0x76, 0x8c, 0x3d, 0x3e, 0x07, 0x87, 0xbc, 0xa8,
+ 0xb5, 0x3f, 0x3c, 0x74, 0x96, 0x79, 0xbd, 0x30, 0xfc, 0x4e, 0x3c, 0x60, 0x75,
+ 0x25, 0x3d, 0x28, 0xd6, 0x7a, 0x3c, 0x38, 0xf6, 0x3e, 0x3c, 0x90, 0xd8, 0xf6,
+ 0xbc, 0x0a, 0x8b, 0x78, 0x3d, 0x94, 0x29, 0xc7, 0xbc, 0xa0, 0x3e, 0xe9, 0xbc,
+ 0x20, 0xfc, 0xa9, 0x3c, 0xde, 0xab, 0xd2, 0xbc, 0x97, 0x63, 0x8b, 0xbd, 0xa0,
+ 0xe7, 0x52, 0xbb, 0xa4, 0xf2, 0x36, 0xbc, 0x50, 0x49, 0xb9, 0xbb, 0x1f, 0x9e,
+ 0x88, 0x3d, 0x86, 0xea, 0x9d, 0xbc, 0x38, 0x1b, 0xf5, 0x3c, 0x46, 0xea, 0x1e,
+ 0xbd, 0x00, 0xad, 0x18, 0xba, 0x1e, 0x19, 0x6b, 0xbd, 0xa4, 0x1f, 0x90, 0x3c,
+ 0xf5, 0xb4, 0x42, 0xbd, 0x48, 0xf2, 0x1f, 0xbd, 0x26, 0x05, 0x12, 0x3d, 0x80,
+ 0x01, 0x58, 0xbd, 0xee, 0x98, 0x51, 0xbd, 0xb8, 0xcd, 0x96, 0xbc, 0x65, 0xbc,
+ 0x81, 0x3d, 0x90, 0x57, 0xcd, 0x3b, 0xa0, 0x9a, 0x30, 0x3c, 0xa6, 0xa4, 0x82,
+ 0xbd, 0x20, 0xa1, 0xc6, 0xbb, 0x95, 0x3a, 0x8c, 0xbd, 0x00, 0xa2, 0x72, 0x3c,
+ 0x00, 0xd6, 0x58, 0x3b, 0xc8, 0x1f, 0x7d, 0x3c, 0xf0, 0x98, 0xe1, 0xbb, 0x02,
+ 0x83, 0xe7, 0xbc, 0x9a, 0xc9, 0x67, 0x3d, 0xf5, 0x03, 0x90, 0xbd, 0x00, 0x9e,
+ 0x55, 0xba, 0x80, 0xa0, 0x05, 0x3b, 0x00, 0x53, 0x6d, 0x3c, 0x16, 0xc9, 0x6a,
+ 0x3d, 0x96, 0x11, 0x04, 0x3d, 0x10, 0x45, 0xff, 0xbb, 0xd2, 0x78, 0x2a, 0xbd,
+ 0xbb, 0xe1, 0x8d, 0xbd, 0x8c, 0x4a, 0xc7, 0xbc, 0x20, 0x1c, 0x23, 0x3d, 0x10,
+ 0xb3, 0xff, 0x3b, 0xd8, 0xec, 0x36, 0x3c, 0x64, 0xf1, 0xa7, 0x3d, 0x22, 0xd3,
+ 0xb0, 0xbd, 0xba, 0xd3, 0xc4, 0x3c, 0x7f, 0x35, 0x0a, 0x3d, 0xb1, 0xba, 0xc0,
+ 0x3d, 0x70, 0x6e, 0x10, 0x3c, 0x0b, 0x3f, 0x43, 0x3d, 0x75, 0x57, 0x4f, 0xbd,
+ 0xf7, 0xae, 0x5e, 0xbd, 0xd6, 0xc7, 0x9f, 0x3d, 0x15, 0x89, 0x08, 0x3d, 0x02,
+ 0x77, 0x49, 0x3c, 0x19, 0x3b, 0xc5, 0xbc, 0xa2, 0x8d, 0x43, 0xbd, 0x7b, 0x63,
+ 0x22, 0xbc, 0xb8, 0x4c, 0xbe, 0x3d, 0x98, 0x23, 0x2a, 0xbd, 0xd2, 0x49, 0x69,
+ 0xbd, 0x58, 0xae, 0x14, 0x3d, 0xdc, 0x52, 0x85, 0xbd, 0xd0, 0x91, 0xea, 0x3c,
+ 0x93, 0x04, 0x5c, 0x3d, 0xdf, 0xf9, 0x20, 0x3d, 0xd3, 0x87, 0x3f, 0xbd, 0xae,
+ 0xe4, 0x6a, 0x3c, 0xed, 0x34, 0x27, 0x3c, 0x79, 0x2d, 0x67, 0x3d, 0x63, 0xb8,
+ 0x57, 0xbc, 0x9f, 0x7f, 0x79, 0xbd, 0x44, 0x92, 0x9b, 0x3d, 0x60, 0x08, 0x40,
+ 0xbd, 0xde, 0x4c, 0x9c, 0x3c, 0xdd, 0x61, 0x21, 0x3c, 0x86, 0xd4, 0x15, 0xbd,
+ 0xf9, 0xd9, 0xe1, 0xbd, 0x40, 0xc7, 0x2f, 0x3d, 0xa7, 0x36, 0x89, 0x3d, 0x8a,
+ 0xdc, 0xa0, 0xbd, 0x5a, 0x12, 0x99, 0x3c, 0x8a, 0x63, 0xfa, 0xba, 0x77, 0x80,
+ 0xa2, 0xbd, 0x68, 0x8f, 0x19, 0xbc, 0x91, 0x17, 0xfc, 0x3c, 0xc7, 0x5f, 0xa0,
+ 0x3c, 0x21, 0x34, 0xf2, 0xbc, 0x09, 0x55, 0x1d, 0xbc, 0xcf, 0x87, 0x01, 0xbc,
+ 0xba, 0xe9, 0x8c, 0x3d, 0x07, 0xf7, 0x93, 0x3c, 0xe2, 0x86, 0x80, 0x3c, 0xd7,
+ 0xf7, 0x45, 0xbd, 0x8d, 0x5c, 0x55, 0x3d, 0x40, 0x89, 0x73, 0x3c, 0x7a, 0xe1,
+ 0x5c, 0x3c, 0x6a, 0x34, 0xe7, 0xbc, 0x25, 0x79, 0xaa, 0x3a, 0x13, 0x23, 0xa1,
+ 0x3d, 0x4b, 0x1e, 0xe1, 0x3c, 0x49, 0xbb, 0xb5, 0xbc, 0xa6, 0x19, 0xa9, 0x3c,
+ 0x4e, 0xf1, 0x2a, 0x3d, 0x69, 0x81, 0xac, 0x3c, 0x00, 0x31, 0x46, 0x3c, 0x84,
+ 0x9b, 0x17, 0xbd, 0xa3, 0x50, 0x70, 0x3d, 0xf9, 0x6d, 0x91, 0xbd, 0x41, 0x1f,
+ 0xad, 0x3b, 0x9c, 0x7c, 0xa5, 0xbc, 0xd7, 0xa0, 0x8f, 0xbb, 0xfe, 0xeb, 0x05,
+ 0x3d, 0xc5, 0x31, 0xc5, 0x3a, 0x9a, 0x3c, 0x08, 0x3d, 0xc2, 0x6d, 0x27, 0xbd,
+ 0xa5, 0xc1, 0x7a, 0x3c, 0x4c, 0x25, 0x41, 0xbd, 0x3e, 0x6e, 0xd0, 0x3c, 0x6b,
+ 0x0e, 0x6d, 0x3d, 0xb4, 0x47, 0x86, 0x3c, 0x60, 0xc8, 0x03, 0x3d, 0x78, 0xb8,
+ 0xb3, 0x3d, 0xfb, 0x4b, 0x0d, 0x3d, 0x44, 0x4c, 0xc0, 0x3b, 0xd1, 0xa8, 0x33,
+ 0xbc, 0xf8, 0x4d, 0x8d, 0xbd, 0x3b, 0xeb, 0x15, 0xbd, 0x16, 0xef, 0x19, 0xbb,
+ 0x66, 0x45, 0x2c, 0xbd, 0x50, 0x0b, 0xab, 0xbb, 0x95, 0x0b, 0x06, 0xbd, 0x2c,
+ 0x1f, 0x33, 0xbd, 0xe4, 0xa5, 0xb7, 0x3a, 0xa0, 0xa0, 0xe4, 0xbc, 0x6c, 0x3b,
+ 0x65, 0x3d, 0x1e, 0xa8, 0x8b, 0x3b, 0xe0, 0xb7, 0x82, 0x3c, 0x3f, 0x77, 0x5b,
+ 0x3d, 0xd1, 0xd3, 0x0a, 0x3c, 0xdd, 0xbc, 0xaa, 0xbd, 0xb2, 0x81, 0x91, 0xbc,
+ 0x0f, 0xcb, 0x5d, 0x3d, 0x08, 0xa9, 0xf0, 0xbc, 0x9b, 0xc4, 0x0c, 0x3c, 0xf7,
+ 0x0d, 0x64, 0xbc, 0x1c, 0xa0, 0xa5, 0xbc, 0x5b, 0x1d, 0x2d, 0xbd, 0x03, 0x78,
+ 0x59, 0x3d, 0x1b, 0x8a, 0x13, 0x3d, 0xaa, 0x9c, 0x14, 0xbd, 0x57, 0xe2, 0xf1,
+ 0x3c, 0x5f, 0xaa, 0x58, 0x3d, 0x6c, 0x19, 0xb5, 0xbc, 0x20, 0xeb, 0x3c, 0x3d,
+ 0xe0, 0xda, 0xd5, 0x3c, 0x54, 0x6f, 0x6f, 0xbd, 0x91, 0x64, 0x82, 0x3d, 0xed,
+ 0xcd, 0x10, 0x3b, 0xec, 0x91, 0x1c, 0x3d, 0xad, 0xee, 0xc0, 0x3c, 0xb9, 0x84,
+ 0xb8, 0x3d, 0x67, 0xe4, 0x19, 0xba, 0xc5, 0xca, 0x00, 0x3b, 0xbc, 0x29, 0xcb,
+ 0xbc, 0xca, 0x3c, 0x20, 0xbd, 0x6e, 0xed, 0x2e, 0xbd, 0xd8, 0x47, 0x83, 0xbd,
+ 0x1f, 0x0b, 0x52, 0xbd, 0x10, 0x29, 0x29, 0x3c, 0xfa, 0x35, 0xd2, 0xbc, 0xbe,
+ 0x31, 0x1b, 0x3d, 0x9c, 0x28, 0xdc, 0xbc, 0xb7, 0x93, 0x70, 0xbb, 0x7b, 0xa8,
+ 0x83, 0xbc, 0xcb, 0xf0, 0x9a, 0x3c, 0x53, 0x7d, 0x31, 0xbd, 0x8a, 0x47, 0x4a,
+ 0x3c, 0xf2, 0xe7, 0x79, 0xbd, 0xe7, 0x10, 0x64, 0xbc, 0x69, 0xf1, 0xa9, 0xbc,
+ 0x5c, 0xfc, 0x9b, 0x3d, 0x5a, 0xcf, 0x14, 0x3d, 0xec, 0x08, 0x63, 0x3d, 0x69,
+ 0x0f, 0x99, 0xbd, 0x6a, 0x76, 0xeb, 0x3c, 0xbd, 0x2f, 0x8f, 0x3d, 0xa0, 0x54,
+ 0x8f, 0x3d, 0x7e, 0x08, 0x84, 0x3d, 0xba, 0x94, 0x42, 0x3d, 0x7c, 0xae, 0xf9,
+ 0xbd, 0x70, 0x32, 0x7f, 0x3c, 0x2f, 0xd3, 0x88, 0xbc, 0x9a, 0x1a, 0x49, 0x3d,
+ 0xf6, 0xed, 0x54, 0xbd, 0x7e, 0x15, 0x66, 0x3d, 0x81, 0x94, 0x7f, 0x3d, 0x4a,
+ 0xfb, 0x5f, 0x3c, 0xd7, 0x10, 0x3a, 0x3c, 0xf8, 0x02, 0x89, 0xbd, 0x9f, 0x9c,
+ 0xb9, 0xbc, 0x02, 0x4c, 0x5b, 0x3d, 0x80, 0xe7, 0x33, 0x3c, 0x55, 0x86, 0x99,
+ 0x3d, 0x9d, 0xa9, 0xad, 0xbd, 0x9e, 0x1b, 0x76, 0xbb, 0xb8, 0x62, 0x49, 0x3d,
+ 0x22, 0x21, 0x65, 0x3d, 0x22, 0x6d, 0x0f, 0x3d, 0x60, 0x23, 0x87, 0xbc, 0xc8,
+ 0xfc, 0x26, 0xbd, 0xc5, 0x47, 0x8c, 0xbd, 0x22, 0x6e, 0xe2, 0xbc, 0xf0, 0x78,
+ 0x2e, 0x3d, 0xa4, 0x7f, 0xa5, 0xbc, 0xf1, 0x41, 0xae, 0x3d, 0xa4, 0x08, 0x0b,
+ 0x3d, 0xe8, 0xbb, 0x1c, 0xbc, 0xf8, 0xdd, 0x85, 0xbc, 0x72, 0x87, 0xea, 0x3c,
+ 0x4a, 0xaa, 0x9a, 0x3d, 0x86, 0xdb, 0xb6, 0x3d, 0x0f, 0xb5, 0xd1, 0xba, 0xfc,
+ 0x88, 0x62, 0xbd, 0x08, 0x54, 0xfd, 0x3d, 0x35, 0xf8, 0x2e, 0xbd, 0x3b, 0xbb,
+ 0xc9, 0x3d, 0x9c, 0xb6, 0x57, 0x3d, 0x03, 0x65, 0x58, 0x3d, 0x13, 0xd0, 0x1d,
+ 0xbd, 0xbb, 0xb1, 0xbf, 0xbc, 0x78, 0x00, 0xde, 0xbc, 0x5c, 0xcb, 0x48, 0xbd,
+ 0xd3, 0xa1, 0x85, 0x3d, 0x08, 0x35, 0xf6, 0xbc, 0x4c, 0x66, 0x89, 0x3d, 0x09,
+ 0x92, 0xa6, 0xbc, 0x64, 0x99, 0x9e, 0xbd, 0xae, 0x80, 0x85, 0xbd, 0x99, 0xe0,
+ 0xe2, 0x3c, 0x8e, 0x75, 0x66, 0xbc, 0x1e, 0x8c, 0xb9, 0xbd, 0x57, 0x43, 0xa8,
+ 0x3c, 0x31, 0x71, 0xac, 0xbc, 0xb5, 0x75, 0x01, 0x3d, 0x10, 0x39, 0x5c, 0xbd,
+ 0xa6, 0xf9, 0x7b, 0xbd, 0xf6, 0xea, 0x5d, 0x3d, 0xd3, 0x34, 0xc7, 0xbc, 0x4e,
+ 0xdc, 0x76, 0xbc, 0x7c, 0x98, 0x26, 0x3c, 0xfb, 0x7a, 0x27, 0xbd, 0x44, 0xe6,
+ 0x44, 0xbd, 0x26, 0xc5, 0xb2, 0x3d, 0xb1, 0x6e, 0xfa, 0xbd, 0x79, 0xcc, 0x29,
+ 0xbd, 0x08, 0xae, 0x46, 0xbc, 0x9d, 0x74, 0x67, 0x3d, 0xa3, 0xb6, 0x98, 0x3d,
+ 0x92, 0xae, 0x3f, 0xbc, 0xef, 0x8c, 0x90, 0x3d, 0xeb, 0x4c, 0x02, 0xbc, 0x21,
+ 0x7d, 0xe5, 0x3c, 0xd4, 0x6f, 0x47, 0xbd, 0x1a, 0xe8, 0x84, 0x3c, 0x0c, 0x96,
+ 0x85, 0xbd, 0xa9, 0x69, 0xa7, 0xbb, 0x8c, 0x1e, 0x82, 0xba, 0xff, 0x78, 0x04,
+ 0xbc, 0x25, 0xb9, 0xaa, 0xbd, 0x0b, 0x03, 0x48, 0xbc, 0xb3, 0xbb, 0x88, 0xbd,
+ 0x00, 0x26, 0xba, 0xbd, 0x82, 0x41, 0x81, 0x3d, 0xfa, 0x3d, 0xc7, 0x3c, 0x38,
+ 0x5c, 0x49, 0xbd, 0x0d, 0x4d, 0x3a, 0x3d, 0x67, 0x58, 0x0a, 0xbd, 0x7e, 0xf6,
+ 0x82, 0x3b, 0x1a, 0x7a, 0x7b, 0x3d, 0xba, 0xff, 0x84, 0x3c, 0x46, 0x87, 0x84,
+ 0x3c, 0xe8, 0x6c, 0x29, 0x3d, 0x8c, 0x6a, 0xac, 0xbc, 0x89, 0x34, 0x91, 0xbd,
+ 0xb9, 0xaf, 0xa6, 0x3c, 0xe0, 0x9e, 0xaf, 0xbc, 0xd2, 0x7a, 0x38, 0x3d, 0xac,
+ 0xbf, 0xc9, 0x3d, 0x73, 0xa1, 0x13, 0x3d, 0x7d, 0xe1, 0xf2, 0x3c, 0x73, 0xec,
+ 0xcf, 0x3b, 0xfd, 0x7b, 0x8e, 0x3d, 0x1e, 0xb2, 0xf3, 0xbc, 0xdc, 0x32, 0x03,
+ 0xbe, 0x5e, 0xfa, 0x1b, 0x3d, 0xdc, 0x1a, 0x25, 0x3d, 0x00, 0xcd, 0x48, 0xba,
+ 0x13, 0x9d, 0xbe, 0x3d, 0x2e, 0x05, 0x77, 0xbd, 0x17, 0x74, 0x9e, 0xbd, 0xae,
+ 0xc5, 0x62, 0x3c, 0x95, 0xf4, 0x59, 0x3d, 0x36, 0xd2, 0xa4, 0x3d, 0xab, 0x2b,
+ 0x84, 0xbc, 0x87, 0x89, 0x55, 0x3d, 0xd0, 0xde, 0x5d, 0xbc, 0xcd, 0xb0, 0xce,
+ 0xbc, 0x29, 0xa0, 0xc8, 0xbc, 0x8a, 0x0b, 0xf1, 0x3c, 0xb8, 0xce, 0x9c, 0x3c,
+ 0x14, 0xd1, 0x36, 0x3d, 0x50, 0x4b, 0x08, 0xbd, 0x85, 0x95, 0x4b, 0xbd, 0x31,
+ 0x9e, 0xcf, 0xbc, 0xff, 0x96, 0x83, 0x3d, 0x6c, 0x32, 0x15, 0x3c, 0x6d, 0xfd,
+ 0xb0, 0x3d, 0x05, 0xd8, 0x33, 0xbd, 0x1b, 0x74, 0x8d, 0xbd, 0xfb, 0x92, 0x21,
+ 0xbd, 0xde, 0x6c, 0x8f, 0xbc, 0xcc, 0x1e, 0x0f, 0xbd, 0xfa, 0xc4, 0xb8, 0xbb,
+ 0xc6, 0xe2, 0x1e, 0x3d, 0x9b, 0xd2, 0x99, 0xbb, 0x0f, 0x21, 0x5a, 0xbd, 0x32,
+ 0xb3, 0x8b, 0x3c, 0x08, 0x0c, 0x2e, 0x3b, 0x81, 0xda, 0x5f, 0xbd, 0x44, 0x42,
+ 0x81, 0x3c, 0x11, 0xf4, 0xb3, 0xbb, 0xf5, 0x91, 0xdd, 0xbd, 0x20, 0xdd, 0xb0,
+ 0x3b, 0x94, 0xc1, 0xe4, 0x3c, 0x7c, 0x2f, 0x5d, 0xbd, 0x8b, 0x1f, 0xf3, 0x3c,
+ 0xf7, 0xc1, 0xd1, 0xbd, 0x2e, 0x5f, 0x5d, 0xbd, 0x35, 0x2c, 0x92, 0x3b, 0x47,
+ 0x24, 0x34, 0x3d, 0x7f, 0x44, 0x71, 0x3d, 0x39, 0xd7, 0xfc, 0x3c, 0x60, 0x34,
+ 0x49, 0xbd, 0x70, 0xdc, 0x80, 0x3c, 0x3b, 0xe4, 0x5d, 0xbc, 0x7d, 0x7f, 0xe3,
+ 0x3c, 0x6d, 0x96, 0x2e, 0x3d, 0x7b, 0x5c, 0x15, 0x3d, 0xc3, 0x8f, 0x78, 0x3c,
+ 0x5b, 0x2f, 0x2d, 0xbc, 0x30, 0xfd, 0x3a, 0x3d, 0x79, 0x6a, 0xbb, 0x3d, 0x1a,
+ 0xb0, 0x4d, 0x3c, 0xe2, 0x91, 0x9a, 0x3b, 0x3c, 0x03, 0xa4, 0x3d, 0xa9, 0x2a,
+ 0x3a, 0xbd, 0xfc, 0xbb, 0x88, 0x3d, 0x16, 0x7f, 0x2a, 0x3c, 0xdd, 0xfc, 0x43,
+ 0x3d, 0x41, 0x34, 0x3f, 0x3d, 0x80, 0x68, 0x76, 0xbd, 0xbb, 0xab, 0xa9, 0x3d,
+ 0x4f, 0x4c, 0x17, 0x3d, 0xa3, 0x6e, 0x48, 0x3c, 0x24, 0xdf, 0xed, 0xbc, 0xa9,
+ 0xca, 0x8e, 0xbd, 0x28, 0x64, 0x51, 0x3d, 0x65, 0xea, 0x94, 0x3d, 0x80, 0xc3,
+ 0x08, 0x3b, 0xba, 0xc6, 0x38, 0x3d, 0xa3, 0x2f, 0x64, 0xba, 0x16, 0xc1, 0x28,
+ 0x3d, 0xfb, 0x5a, 0x4c, 0x3c, 0xd9, 0x21, 0x26, 0xbd, 0xb9, 0x19, 0xbd, 0x3d,
+ 0xba, 0x00, 0x59, 0x3c, 0xeb, 0x40, 0x14, 0xbc, 0x24, 0x37, 0xe9, 0xbc, 0x5e,
+ 0x99, 0xd0, 0xbc, 0x7c, 0xbc, 0x18, 0xbd, 0x71, 0x23, 0x56, 0x3d, 0xca, 0xa7,
+ 0x30, 0xbe, 0x37, 0x29, 0x5b, 0xbd, 0x73, 0xfa, 0x30, 0x3d, 0xb7, 0x67, 0xcd,
+ 0xbc, 0x92, 0xa3, 0x54, 0x3c, 0xf8, 0x54, 0xaa, 0x3d, 0xba, 0x13, 0x8c, 0x3d,
+ 0x35, 0xa3, 0xa6, 0x3c, 0x11, 0x44, 0x1d, 0xbc, 0x56, 0xe4, 0x18, 0xbd, 0xd6,
+ 0x33, 0xab, 0x3c, 0x2c, 0x70, 0xa8, 0xbc, 0xa0, 0xd7, 0xc8, 0xb8, 0x56, 0xd9,
+ 0x69, 0x3d, 0xab, 0xaf, 0x5e, 0xbd, 0x09, 0xbf, 0xb1, 0xbd, 0xad, 0xf1, 0x50,
+ 0x3c, 0xe0, 0x69, 0x47, 0xbd, 0x21, 0x32, 0x2b, 0xbb, 0x66, 0x24, 0x90, 0xbd,
+ 0xf8, 0xca, 0xbf, 0xbc, 0x1f, 0x85, 0x02, 0xbd, 0xc9, 0x47, 0xa6, 0x3d, 0xaa,
+ 0xeb, 0x9b, 0xbc, 0xcf, 0x49, 0x88, 0xbd, 0x40, 0xf0, 0x4e, 0xbc, 0xe3, 0x45,
+ 0x16, 0x3d, 0xd4, 0x2e, 0xa4, 0xbc, 0xaf, 0xe6, 0x81, 0x3d, 0x62, 0xef, 0x2c,
+ 0xbc, 0x95, 0xea, 0x63, 0xbd, 0x33, 0x76, 0x9e, 0x3d, 0x16, 0xdf, 0xd6, 0xbd,
+ 0xa4, 0xb0, 0xde, 0x39, 0xee, 0xfc, 0x89, 0x3d, 0xbd, 0x48, 0xbe, 0x3b, 0xd1,
+ 0xbb, 0x31, 0xbc, 0x69, 0x1b, 0x26, 0xbd, 0xc1, 0x34, 0xec, 0x3c, 0x33, 0x47,
+ 0xd5, 0x3c, 0xd0, 0xfb, 0x5c, 0x3b, 0xec, 0x71, 0x27, 0xbc, 0x48, 0x88, 0x62,
+ 0x3c, 0x60, 0x89, 0x76, 0x3b, 0x4c, 0x07, 0xe8, 0x3c, 0xd5, 0xb4, 0x16, 0x3d,
+ 0x9d, 0x21, 0x9f, 0x3c, 0x9d, 0x78, 0xb3, 0xbd, 0xeb, 0x74, 0x21, 0xbd, 0xdb,
+ 0x5e, 0x75, 0xbd, 0x02, 0xf1, 0x9b, 0x3d, 0x50, 0x67, 0x30, 0xbc, 0xc4, 0xa7,
+ 0xe6, 0x3c, 0x77, 0x75, 0x6e, 0x3c, 0xfd, 0x7e, 0x9e, 0xbb, 0x79, 0xed, 0x77,
+ 0xbc, 0x18, 0x82, 0x40, 0x3d, 0x18, 0xd1, 0x93, 0x3d, 0x4a, 0xa2, 0x32, 0xbb,
+ 0x83, 0xd5, 0x51, 0x3c, 0xa1, 0x52, 0xd9, 0x38, 0x6a, 0x5e, 0xb4, 0x3d, 0x73,
+ 0xb2, 0x1f, 0xbd, 0x02, 0xe7, 0x06, 0xbd, 0x25, 0x20, 0x5c, 0xbd, 0x6a, 0x66,
+ 0x16, 0x3d, 0xef, 0x75, 0x7c, 0x3d, 0x4b, 0xa8, 0x89, 0x3d, 0x17, 0x5e, 0x82,
+ 0xbc, 0xd7, 0x41, 0x80, 0x3d, 0x67, 0x41, 0xaf, 0xbc, 0x93, 0x11, 0x9b, 0x3d,
+ 0x4a, 0x03, 0xb3, 0xbd, 0x0d, 0x82, 0x32, 0xbd, 0x39, 0x35, 0xee, 0xbc, 0x07,
+ 0x60, 0x87, 0xbd, 0x51, 0xb7, 0x4d, 0x3b, 0xe4, 0x6e, 0xbf, 0xbb, 0x24, 0x01,
+ 0x36, 0xbd, 0x24, 0x02, 0x10, 0xbd, 0xfe, 0x24, 0x4f, 0xbd, 0xaf, 0xc2, 0x34,
+ 0xbc, 0x21, 0x39, 0xd9, 0x3c, 0x80, 0x73, 0x88, 0x3c, 0x8e, 0xaf, 0x84, 0xbd,
+ 0x1e, 0x05, 0x8b, 0xbd, 0xd2, 0xa7, 0x0e, 0x3d, 0x53, 0xe6, 0x89, 0x3b, 0xf3,
+ 0xd7, 0xa7, 0x3d, 0x58, 0xf7, 0x29, 0x3d, 0xb1, 0x45, 0x9f, 0x3c, 0x3d, 0xf4,
+ 0x73, 0x3d, 0x73, 0xd2, 0x4d, 0xbd, 0x6f, 0x4a, 0x0f, 0x3d, 0xc1, 0x60, 0x95,
+ 0xbd, 0xf4, 0x0f, 0x8e, 0x3d, 0x83, 0x58, 0xed, 0xbd, 0x58, 0x39, 0x12, 0x3c,
+ 0x20, 0x58, 0x39, 0x3d, 0xf4, 0xc9, 0x14, 0x3d, 0x5f, 0xa1, 0x0a, 0x3d, 0xd0,
+ 0x80, 0x42, 0xbd, 0x2b, 0xc9, 0x35, 0xbd, 0xa5, 0xe0, 0xf9, 0xbc, 0x11, 0xe4,
+ 0x8b, 0x3c, 0x0f, 0x18, 0x33, 0xbd, 0xb7, 0x53, 0x8f, 0xbc, 0xa8, 0xfe, 0x4f,
+ 0xbd, 0x1f, 0x8d, 0xf9, 0x3b, 0x33, 0x31, 0xa6, 0x3d, 0xb7, 0x6d, 0x03, 0x3c,
+ 0x80, 0xaa, 0xda, 0xbd, 0x82, 0x6e, 0xc5, 0x3c, 0x22, 0xaa, 0xba, 0x3c, 0xfd,
+ 0xd9, 0xcd, 0x3c, 0x16, 0x60, 0x5a, 0x3c, 0x48, 0xdb, 0x36, 0x3d, 0x10, 0xf4,
+ 0x84, 0xbc, 0x78, 0xf4, 0x8c, 0x3d, 0x24, 0xd3, 0xf2, 0xbc, 0x8e, 0xac, 0x16,
+ 0xbd, 0x41, 0x7a, 0xf1, 0x3c, 0xd3, 0x25, 0x77, 0x3d, 0x26, 0xf2, 0x63, 0x3d,
+ 0x7a, 0xb2, 0xa0, 0x3d, 0x00, 0xbb, 0xa4, 0x3c, 0x11, 0xd2, 0xf7, 0xbc, 0x92,
+ 0x58, 0xa7, 0x3d, 0xa1, 0x9e, 0xaf, 0xbd, 0x38, 0xb3, 0x0b, 0x3c, 0xf3, 0xbb,
+ 0x62, 0x3c, 0x98, 0x07, 0x9c, 0x3d, 0xa3, 0x56, 0xba, 0xba, 0x1a, 0x8d, 0x95,
+ 0x3d, 0x13, 0x14, 0x7b, 0x3d, 0xfe, 0x05, 0xb3, 0x3d, 0xd2, 0x56, 0x01, 0x3c,
+ 0x9e, 0xad, 0x44, 0x3d, 0xc7, 0xd7, 0x98, 0x3c, 0x1e, 0xfb, 0x18, 0x3d, 0x58,
+ 0x4c, 0x53, 0xbc, 0xf2, 0x16, 0xf1, 0xbb, 0xae, 0x3a, 0xad, 0xbd, 0x3d, 0xdd,
+ 0x40, 0xbd, 0x9f, 0xa1, 0x9c, 0xbd, 0xb6, 0xb7, 0x09, 0xbc, 0x74, 0xc3, 0xbc,
+ 0xbd, 0x22, 0xf9, 0x61, 0xbc, 0x71, 0x46, 0x80, 0xbc, 0x26, 0x48, 0x53, 0xbd,
+ 0x6a, 0xb7, 0x5d, 0x3d, 0xb9, 0xc9, 0x66, 0x3d, 0xaf, 0x27, 0x00, 0xbd, 0x24,
+ 0x28, 0xd3, 0x3a, 0x53, 0xfb, 0x5d, 0xbd, 0xf4, 0x8b, 0x8a, 0x3d, 0x80, 0x14,
+ 0x8e, 0xbd, 0x72, 0xcc, 0xa7, 0x3d, 0xd4, 0x5b, 0xff, 0xbc, 0xdf, 0x54, 0x43,
+ 0xbd, 0x6a, 0x25, 0xe1, 0x3b, 0xe2, 0xe9, 0x09, 0xbd, 0x55, 0xad, 0x63, 0xbd,
+ 0x14, 0xb6, 0xa9, 0x3b, 0x0c, 0xba, 0xd8, 0xbc, 0xc3, 0x6d, 0x53, 0xbd, 0x42,
+ 0xa5, 0x5f, 0xbd, 0x7b, 0x04, 0x22, 0xbd, 0x15, 0x56, 0x77, 0x3c, 0x53, 0x67,
+ 0xe6, 0xbc, 0x69, 0xe6, 0x89, 0x3c, 0x80, 0xcc, 0xbb, 0xbb, 0xea, 0x11, 0xb5,
+ 0x3d, 0x02, 0x35, 0xb6, 0x3b, 0x98, 0x78, 0x19, 0x3d, 0xae, 0x02, 0xdd, 0xbd,
+ 0x88, 0x78, 0x35, 0x3c, 0x30, 0x8b, 0x9d, 0xbd, 0xce, 0x4f, 0xad, 0xbd, 0x27,
+ 0xf3, 0xcf, 0x3c, 0xda, 0x15, 0x82, 0xbd, 0x50, 0x43, 0x86, 0x3c, 0xff, 0x0b,
+ 0xca, 0x3b, 0xec, 0x3f, 0xd1, 0xbc, 0x53, 0xc4, 0x15, 0x3d, 0x72, 0x9f, 0x12,
+ 0x3d, 0xcb, 0x3b, 0xcc, 0x3c, 0x90, 0xd2, 0x3a, 0x3d, 0x42, 0x53, 0x0d, 0xbc,
+ 0x46, 0x82, 0x93, 0x3d, 0xe9, 0x9a, 0xb1, 0xbd, 0x05, 0x99, 0x98, 0xbb, 0x52,
+ 0x17, 0x71, 0xbd, 0x6e, 0xb6, 0x8d, 0xbd, 0x0f, 0xe1, 0x66, 0xbd, 0x2b, 0x2f,
+ 0x1b, 0x3d, 0x97, 0x2f, 0xf4, 0xbc, 0xc0, 0xc0, 0x0f, 0x3d, 0xf3, 0x36, 0x6f,
+ 0x3d, 0x38, 0x99, 0x97, 0x3c, 0xca, 0x4a, 0xca, 0xbd, 0xe2, 0x66, 0x11, 0x3b,
+ 0xa8, 0xe8, 0x03, 0xbd, 0x60, 0xbf, 0x7e, 0xbb, 0x6d, 0x53, 0xb9, 0x3d, 0x50,
+ 0x02, 0x0c, 0x3c, 0xe3, 0x5f, 0xbb, 0xbd, 0xd1, 0xc0, 0xbd, 0xbc, 0x42, 0x35,
+ 0x89, 0x3d, 0x36, 0x8e, 0x9c, 0xbd, 0xac, 0x4a, 0x92, 0xbd, 0x7c, 0xb8, 0x65,
+ 0xbd, 0x77, 0xdd, 0x5e, 0xbd, 0x58, 0x55, 0x38, 0xbd, 0x2e, 0xa6, 0x67, 0x3c,
+ 0x7d, 0x81, 0x0b, 0xbd, 0x7b, 0xda, 0x92, 0x3d, 0x07, 0xec, 0x98, 0xbc, 0x6c,
+ 0x89, 0x35, 0xbd, 0x1b, 0x09, 0x0a, 0x3d, 0xca, 0x57, 0x27, 0x3c, 0xab, 0xff,
+ 0x2e, 0x3d, 0x97, 0xd7, 0x8d, 0xbd, 0xfa, 0x59, 0xb3, 0x3d, 0xb2, 0x38, 0x31,
+ 0x3d, 0xd2, 0x30, 0x2b, 0x3d, 0xa5, 0x8d, 0xa4, 0x3b, 0xc9, 0xca, 0xe4, 0x3c,
+ 0x0a, 0x75, 0x99, 0x3d, 0x3f, 0x85, 0x08, 0x3d, 0xff, 0x4e, 0x4e, 0x3d, 0x00,
+ 0xfb, 0x74, 0x3d, 0x90, 0x22, 0xb2, 0xbb, 0xed, 0xe6, 0x8c, 0xbb, 0x23, 0x48,
+ 0xe6, 0x3b, 0xfc, 0x6e, 0x62, 0xbd, 0xd5, 0x72, 0x58, 0x3d, 0xc8, 0x23, 0xce,
+ 0x3c, 0xf2, 0x1f, 0x3b, 0x3c, 0xd0, 0x69, 0xc6, 0x3b, 0x18, 0x15, 0x62, 0x3c,
+ 0xa8, 0x0a, 0x2b, 0x3d, 0x94, 0xed, 0x79, 0xbd, 0xf1, 0xff, 0x81, 0xbc, 0xb8,
+ 0x90, 0x3e, 0xbd, 0x4d, 0x8e, 0x25, 0x3d, 0x04, 0x91, 0xef, 0x3d, 0xb9, 0x57,
+ 0x17, 0x3d, 0x3a, 0xef, 0x01, 0xbd, 0xc4, 0x52, 0x59, 0xbc, 0x8a, 0x5e, 0x8e,
+ 0xbd, 0xe7, 0x23, 0xf5, 0xbc, 0x4f, 0xe7, 0x1f, 0xbd, 0x1f, 0x86, 0x82, 0xbc,
+ 0x1e, 0xf9, 0x53, 0x3d, 0xdf, 0x9c, 0x0a, 0x3c, 0xbf, 0xc9, 0xcc, 0x3c, 0xec,
+ 0xa1, 0x3e, 0xbc, 0x9c, 0x8e, 0x5e, 0x3a, 0xfd, 0xd8, 0x90, 0xbc, 0xe8, 0x4c,
+ 0xc7, 0xbc, 0xf2, 0x0f, 0x4b, 0x3a, 0x08, 0x9d, 0xbc, 0xbc, 0xab, 0x39, 0x4d,
+ 0x3d, 0xea, 0x3d, 0x6b, 0x3d, 0x5c, 0x84, 0x80, 0x3d, 0x7d, 0x95, 0xf8, 0xbc,
+ 0x70, 0xb2, 0x18, 0xbd, 0x2a, 0x02, 0x79, 0x3d, 0xe8, 0xd9, 0x3c, 0x3d, 0x67,
+ 0xaf, 0x29, 0x3d, 0x39, 0x45, 0x27, 0xbd, 0x0a, 0x7b, 0x12, 0xbd, 0xbb, 0xdc,
+ 0xe9, 0xbc, 0x73, 0x04, 0x83, 0xbd, 0x5d, 0xe4, 0x1c, 0xbd, 0xf0, 0x70, 0x29,
+ 0x3d, 0x87, 0x1e, 0x0d, 0xbd, 0x39, 0x86, 0xf0, 0x3c, 0xf5, 0x57, 0x3e, 0xbd,
+ 0xc8, 0x3c, 0x18, 0xbc, 0xf4, 0xa8, 0xa0, 0x3d, 0x5c, 0xa0, 0x6c, 0x3d, 0x02,
+ 0x7a, 0x7e, 0xbc, 0x0b, 0xb6, 0x6d, 0xbd, 0xb0, 0x9a, 0xa8, 0x3c, 0xee, 0x24,
+ 0x11, 0x3d, 0x54, 0x87, 0xf7, 0xbc, 0x57, 0x52, 0x70, 0xbd, 0x1e, 0x35, 0x46,
+ 0xbd, 0x38, 0x2d, 0x82, 0x3d, 0x9d, 0x1a, 0x3c, 0xbd, 0x53, 0x7b, 0xa6, 0x3d,
+ 0x29, 0x4b, 0xab, 0x3d, 0x0c, 0x43, 0x2d, 0x3d, 0x1a, 0x12, 0x95, 0x3d, 0x3b,
+ 0xf1, 0x3e, 0x3d, 0x80, 0xf6, 0x8d, 0xbd, 0x1b, 0xb6, 0xb4, 0xbc, 0x98, 0x23,
+ 0x79, 0xbd, 0xb7, 0xf6, 0xc5, 0x3d, 0x10, 0xd5, 0x48, 0x3d, 0x58, 0x7c, 0x9f,
+ 0xbd, 0xa0, 0x5a, 0x16, 0xbd, 0x82, 0xfb, 0x8e, 0xbd, 0x0b, 0xec, 0xed, 0xbc,
+ 0x92, 0xb7, 0xa3, 0xbd, 0xd5, 0xfd, 0x85, 0xbd, 0x54, 0xc9, 0x20, 0x3d, 0xad,
+ 0xa1, 0x90, 0xbd, 0x83, 0xd6, 0xfb, 0xbc, 0xe2, 0x46, 0x43, 0x3b, 0xfe, 0xa6,
+ 0xbd, 0xb7, 0x8f, 0xd3, 0xaf, 0x3d, 0x75, 0xb9, 0x9d, 0x3d, 0xd5, 0xfc, 0x2a,
+ 0x3c, 0xc6, 0x7e, 0xd6, 0xbc, 0x08, 0xcd, 0x4c, 0xbd, 0xcf, 0x4f, 0x73, 0x3d,
+ 0x3e, 0x7f, 0xb7, 0xbc, 0xbc, 0xa9, 0xfd, 0xbc, 0xf4, 0x8b, 0xa6, 0xbc, 0x11,
+ 0x90, 0xd0, 0xbc, 0x47, 0xf7, 0x4d, 0x3c, 0xed, 0x09, 0x64, 0xbd, 0x61, 0x49,
+ 0x8d, 0xbc, 0xc8, 0xd3, 0x3c, 0x3d, 0x72, 0x23, 0x88, 0x3d, 0xc3, 0xa7, 0x2e,
+ 0x3d, 0x67, 0x01, 0x2d, 0xbd, 0xcc, 0x34, 0xa0, 0xbd, 0x7e, 0xc7, 0xf8, 0xbc,
+ 0x0c, 0xf5, 0xaf, 0xbb, 0x6e, 0xa6, 0x4f, 0x3d, 0xe2, 0xb9, 0x88, 0xbd, 0x87,
+ 0x6f, 0xf9, 0xbc, 0x82, 0x23, 0x16, 0x3c, 0x10, 0x0c, 0x69, 0x3b, 0xab, 0x02,
+ 0xe2, 0x3c, 0x57, 0x6a, 0x08, 0xba, 0x4e, 0xc7, 0x6a, 0x3d, 0x30, 0x86, 0x6d,
+ 0x3c, 0xee, 0xb3, 0x84, 0x3d, 0xf9, 0xc4, 0x3a, 0x3d, 0x6f, 0x21, 0x8d, 0xbb,
+ 0xef, 0x7e, 0xc1, 0x3b, 0x05, 0xca, 0x12, 0xbc, 0x8a, 0x77, 0x2b, 0xbd, 0x1e,
+ 0x23, 0x32, 0x3d, 0x32, 0x8b, 0x03, 0x3d, 0xd3, 0x33, 0x0a, 0xbd, 0x3f, 0xdd,
+ 0x59, 0xbd, 0x18, 0xfa, 0x00, 0x3d, 0x46, 0x0b, 0xdd, 0x3b, 0x96, 0x2b, 0x4c,
+ 0xbd, 0xc8, 0xcc, 0xa7, 0x3d, 0xe2, 0xad, 0x2e, 0x3d, 0xbc, 0x68, 0x54, 0x3d,
+ 0xcb, 0x88, 0xae, 0x3c, 0x00, 0xd8, 0x15, 0xbc, 0x18, 0x4b, 0xb5, 0xbd, 0x89,
+ 0x31, 0x93, 0xbd, 0x84, 0xd3, 0x57, 0x3d, 0x86, 0x2c, 0x6c, 0x3d, 0x18, 0x08,
+ 0xb1, 0x3d, 0x14, 0x61, 0xbc, 0xbc, 0x25, 0xa4, 0x27, 0xbd, 0xfa, 0xdd, 0xb7,
+ 0xbd, 0x81, 0xaf, 0x1d, 0xbc, 0x06, 0x91, 0x5d, 0x3d, 0x54, 0xfb, 0xc9, 0xbc,
+ 0x0b, 0x35, 0x9a, 0x3b, 0x48, 0x7f, 0x1c, 0xbd, 0xaa, 0x85, 0x54, 0x3d, 0x3e,
+ 0x43, 0xfe, 0xbb, 0xcb, 0xf9, 0xbf, 0x3b, 0x4b, 0x03, 0xed, 0x3c, 0xe0, 0x7f,
+ 0x85, 0x3d, 0xe2, 0x52, 0x82, 0x3d, 0x98, 0x11, 0x94, 0x3d, 0x39, 0x2d, 0x26,
+ 0x3c, 0xce, 0x96, 0x5e, 0xbd, 0x6c, 0x42, 0x31, 0xbd, 0xca, 0x90, 0xd4, 0x3b,
+ 0x66, 0xa9, 0xc0, 0xbd, 0x23, 0x2e, 0x8d, 0x3d, 0x26, 0xc8, 0x4a, 0xbc, 0x2a,
+ 0xbd, 0x09, 0xbd, 0x26, 0xa5, 0xe6, 0x3c, 0x1e, 0x7c, 0xaa, 0x3d, 0x1b, 0x52,
+ 0x15, 0x3d, 0xb2, 0xa4, 0x81, 0x3d, 0x73, 0x78, 0x8a, 0x3c, 0x60, 0x6d, 0x4a,
+ 0xbd, 0x60, 0xc1, 0x3b, 0xbc, 0x14, 0xc6, 0xfb, 0x3c, 0x48, 0x70, 0x05, 0xbd,
+ 0xc1, 0xa4, 0x98, 0x3d, 0x71, 0x0a, 0xc4, 0xbd, 0x25, 0xdd, 0x31, 0xbd, 0x99,
+ 0x3a, 0x94, 0xbd, 0xa1, 0x45, 0xbf, 0x3c, 0x54, 0x14, 0xbf, 0xbc, 0xfd, 0x98,
+ 0xd2, 0xbd, 0xca, 0x27, 0x87, 0xbd, 0x1a, 0x52, 0x3a, 0x3d, 0xc3, 0xcf, 0x42,
+ 0xbc, 0x4c, 0x2f, 0xe0, 0x3a, 0x96, 0x3f, 0x5e, 0x3b, 0xba, 0xc2, 0x1d, 0xbd,
+ 0xed, 0x26, 0x42, 0xbd, 0xf6, 0xe0, 0xb4, 0x3d, 0xbe, 0x39, 0x23, 0xbc, 0x05,
+ 0x9d, 0xba, 0x3c, 0xe9, 0x38, 0x2f, 0xbb, 0x15, 0x9c, 0xbb, 0x3d, 0x22, 0xca,
+ 0x66, 0x3c, 0x10, 0x16, 0xdb, 0xbc, 0x11, 0x3d, 0xda, 0x3d, 0xac, 0x48, 0x37,
+ 0xbd, 0xac, 0x3e, 0x08, 0xbd, 0x8b, 0xb1, 0x7f, 0x3d, 0xe7, 0x31, 0xa3, 0x3c,
+ 0xd5, 0xe9, 0xb6, 0x3d, 0x53, 0xc1, 0x19, 0xbd, 0x2f, 0xc2, 0x35, 0xbd, 0xf9,
+ 0xa6, 0xa2, 0xbd, 0x46, 0x22, 0x2b, 0x3d, 0x2a, 0x2c, 0x3b, 0xbd, 0xf3, 0x8e,
+ 0x07, 0x3c, 0xff, 0xb1, 0x09, 0xbd, 0xbd, 0x01, 0x0f, 0xbb, 0x04, 0x7f, 0x4a,
+ 0xbd, 0xb9, 0xca, 0x87, 0x3d, 0x4e, 0x96, 0x12, 0xbc, 0x7b, 0x9a, 0x7d, 0x3d,
+ 0x1b, 0x48, 0x08, 0xbc, 0x1b, 0x36, 0x8a, 0x3d, 0xd1, 0x48, 0xe1, 0x3c, 0xb9,
+ 0xb0, 0x6f, 0x3d, 0x51, 0x6a, 0x83, 0xbb, 0xaa, 0xf0, 0xac, 0x3d, 0x61, 0xdb,
+ 0x43, 0xbd, 0x2e, 0xcf, 0xa2, 0x3d, 0xa6, 0x41, 0x89, 0x3d, 0x53, 0x86, 0xe1,
+ 0xbc, 0xda, 0x91, 0x9a, 0xbd, 0xba, 0xf7, 0x86, 0x3d, 0x8b, 0x8c, 0xab, 0xbd,
+ 0xa2, 0x2c, 0x6b, 0x3d, 0x31, 0x66, 0x83, 0x3c, 0xce, 0xd5, 0x0e, 0xbd, 0x35,
+ 0x29, 0x73, 0x3d, 0x9b, 0xf7, 0xb0, 0x3d, 0x51, 0x33, 0x21, 0x3d, 0x4c, 0xa1,
+ 0x4b, 0x3d, 0x58, 0xe3, 0xd5, 0xbc, 0x9f, 0xe4, 0x68, 0x3b, 0xed, 0x0b, 0x1e,
+ 0x3b, 0xc8, 0x06, 0x8c, 0x3c, 0x67, 0x47, 0x17, 0xbd, 0x63, 0xb4, 0xd1, 0xbc,
+ 0xf3, 0x34, 0x55, 0xbc, 0xde, 0x7b, 0x31, 0xbd, 0x17, 0x4e, 0x74, 0xba, 0x8b,
+ 0x65, 0x43, 0xbc, 0x01, 0xcc, 0xa0, 0x3d, 0xc7, 0x20, 0xa2, 0xbd, 0x63, 0x70,
+ 0x67, 0x3c, 0x65, 0xa0, 0x8d, 0x3d, 0xdf, 0xc9, 0x3d, 0xbc, 0x2f, 0xfa, 0x44,
+ 0x3b, 0xd2, 0xcf, 0x42, 0x3d, 0x9a, 0x40, 0x06, 0x3d, 0x67, 0x53, 0x4b, 0xbc,
+ 0x43, 0x50, 0x4a, 0x3c, 0x23, 0xb9, 0xa1, 0xbc, 0xad, 0x34, 0xe3, 0xbc, 0xac,
+ 0xc4, 0x4f, 0xbd, 0x4b, 0x40, 0xe5, 0xbb, 0xc3, 0xf1, 0x50, 0xbd, 0x98, 0x34,
+ 0x28, 0xbd, 0x28, 0xf8, 0xae, 0x3d, 0xd1, 0x27, 0x8f, 0x3c, 0xb4, 0x8c, 0x8b,
+ 0x3d, 0x73, 0xf2, 0x07, 0xbb, 0x65, 0x39, 0x61, 0xbd, 0x9a, 0x90, 0xcb, 0xbb,
+ 0x18, 0x2f, 0x8e, 0xbd, 0x65, 0xab, 0x4b, 0x3d, 0xd1, 0x40, 0x64, 0xbd, 0x10,
+ 0xdb, 0x83, 0xbd, 0x3b, 0x12, 0xa5, 0x3d, 0x31, 0x45, 0x78, 0x3d, 0xa4, 0xb1,
+ 0x26, 0x3d, 0xac, 0x10, 0x42, 0xbc, 0xbe, 0x62, 0xb3, 0xbd, 0x4e, 0x3d, 0x76,
+ 0x3c, 0x66, 0x0e, 0xde, 0xbc, 0x4f, 0x82, 0xd0, 0xbd, 0xf1, 0x86, 0x8e, 0xbd,
+ 0xf1, 0xe8, 0x37, 0x3c, 0xb7, 0xbb, 0x0e, 0x3d, 0x1c, 0xc4, 0x05, 0x3d, 0x15,
+ 0x50, 0x86, 0x3d, 0x81, 0x10, 0x92, 0x3b, 0x0a, 0xff, 0xed, 0x3c, 0x91, 0x9b,
+ 0xb3, 0xbb, 0xb5, 0xba, 0x26, 0xbc, 0x89, 0xef, 0x0f, 0x3d, 0x52, 0xde, 0x47,
+ 0x3d, 0x9d, 0x0f, 0x0c, 0x3d, 0x80, 0xee, 0xcb, 0xbd, 0xe2, 0xc7, 0x82, 0xbd,
+ 0x1a, 0xf6, 0x64, 0x3c, 0xaf, 0xa7, 0xbf, 0xbc, 0xfc, 0x41, 0x37, 0x3c, 0xf9,
+ 0x88, 0xfe, 0xbc, 0xdf, 0x47, 0x8d, 0xbc, 0x55, 0x09, 0x0b, 0xbd, 0x32, 0x50,
+ 0x00, 0xbd, 0x83, 0x62, 0xaf, 0xbc, 0xdc, 0xac, 0x5e, 0xbd, 0xb6, 0x22, 0x54,
+ 0xbd, 0x74, 0xd7, 0x00, 0x3c, 0xe3, 0x5a, 0xcb, 0xbc, 0xaa, 0x37, 0x25, 0xbd,
+ 0x64, 0x98, 0x5f, 0x3d, 0x81, 0xdf, 0x8b, 0x3c, 0x23, 0xef, 0x66, 0x3b, 0x84,
+ 0x67, 0x55, 0xbb, 0xd2, 0x11, 0x98, 0xbd, 0x2b, 0x15, 0x82, 0x3d, 0xeb, 0x1e,
+ 0xc6, 0x3c, 0x56, 0x83, 0xcb, 0xba, 0xd0, 0xc7, 0x2d, 0x3d, 0xd1, 0xcd, 0x0c,
+ 0x3d, 0xe4, 0x5c, 0x5a, 0xbc, 0x4a, 0xf3, 0x73, 0xbd, 0x43, 0xdc, 0xfe, 0x3c,
+ 0x00, 0xd6, 0x2f, 0x3d, 0x06, 0x22, 0x49, 0xbb, 0x4e, 0x45, 0x71, 0xbc, 0xb3,
+ 0x3c, 0x00, 0x3d, 0x1a, 0xae, 0x58, 0xbd, 0x15, 0x61, 0x92, 0x3d, 0x14, 0xb9,
+ 0xf8, 0xbc, 0x15, 0x2c, 0x1b, 0x3d, 0x31, 0x97, 0x3b, 0xbc, 0xe2, 0xe7, 0x18,
+ 0x3d, 0xcf, 0xf0, 0x1f, 0xbd, 0x7c, 0x1e, 0x0f, 0x3d, 0xb1, 0x27, 0x7f, 0xbd,
+ 0xb8, 0xdd, 0xb2, 0xbd, 0xcc, 0xc2, 0x44, 0x3d, 0x44, 0x5c, 0x06, 0xbd, 0x4f,
+ 0x6a, 0x4a, 0xbd, 0x43, 0x2c, 0x87, 0x3d, 0xb7, 0xe9, 0x48, 0xbd, 0x60, 0x01,
+ 0x07, 0xbd, 0x0b, 0xe4, 0x78, 0x3a, 0x92, 0x5d, 0x64, 0xbd, 0x7c, 0xcf, 0x81,
+ 0xbc, 0xe2, 0x59, 0xab, 0x3c, 0xf0, 0xbc, 0x68, 0xbc, 0xc3, 0x2d, 0x3d, 0x3d,
+ 0x27, 0xb2, 0xce, 0x3d, 0x44, 0x61, 0x0e, 0x3c, 0x94, 0x6d, 0x02, 0xbd, 0xe5,
+ 0x6f, 0xc2, 0x3c, 0x70, 0xab, 0x8a, 0x3a, 0x14, 0xab, 0x04, 0x3c, 0x9d, 0xd4,
+ 0xab, 0x3d, 0x0a, 0x7d, 0x64, 0x3c, 0x17, 0xb5, 0xce, 0x3b, 0x66, 0xbd, 0x24,
+ 0x3d, 0xed, 0xce, 0x77, 0xbd, 0xed, 0x6e, 0x7f, 0xbd, 0x70, 0xe8, 0x10, 0xbc,
+ 0x6a, 0x80, 0x37, 0x3d, 0x2d, 0x0b, 0x83, 0x3d, 0x8e, 0x4b, 0x5e, 0xbd, 0xd6,
+ 0x38, 0x34, 0xbd, 0xce, 0xaf, 0x88, 0x3d, 0xef, 0x64, 0x10, 0xbc, 0xa0, 0x8b,
+ 0xac, 0xbd, 0x70, 0xa5, 0x50, 0x3c, 0x87, 0x3d, 0x83, 0x3d, 0x70, 0x63, 0x57,
+ 0xbd, 0xf3, 0x6a, 0x44, 0x3d, 0x3a, 0x49, 0xda, 0xbd, 0x1b, 0x74, 0xde, 0xbd,
+ 0x0d, 0xb2, 0x34, 0x3d, 0x04, 0x0f, 0x87, 0x3d, 0x04, 0xb1, 0x25, 0xbd, 0x5f,
+ 0x2c, 0x01, 0xbc, 0x9a, 0x55, 0x6b, 0x3b, 0xad, 0xdf, 0x5e, 0x3d, 0x7f, 0x85,
+ 0x2a, 0x3c, 0xfa, 0x88, 0xfa, 0xbc, 0x0d, 0x79, 0x8b, 0xbd, 0x01, 0x45, 0x73,
+ 0x3d, 0x11, 0xde, 0xb6, 0x3c, 0xcc, 0xb5, 0xa4, 0x3c, 0xe8, 0xc5, 0x67, 0xbc,
+ 0x66, 0x99, 0x92, 0x3d, 0x36, 0xb0, 0x79, 0xbd, 0x14, 0x41, 0xa7, 0x3d, 0xfe,
+ 0x98, 0xcf, 0x3c, 0x32, 0xf7, 0x0a, 0x3d, 0xa6, 0x4a, 0x45, 0x3d, 0x83, 0xa0,
+ 0x9e, 0x3d, 0x86, 0x2e, 0x71, 0x3d, 0x92, 0x9c, 0x4d, 0x3d, 0xed, 0x24, 0xeb,
+ 0xbc, 0x3e, 0xfe, 0xc0, 0xbc, 0xcd, 0x6e, 0x4f, 0x3c, 0x83, 0x86, 0xa5, 0xbd,
+ 0xa4, 0xd7, 0xa5, 0xbc, 0xe0, 0x9a, 0x38, 0x3d, 0xe2, 0x79, 0xcd, 0x3c, 0x4a,
+ 0xe2, 0xa1, 0x3c, 0x94, 0x66, 0xd1, 0xbc, 0xe6, 0xed, 0x9b, 0x3c, 0x68, 0xb1,
+ 0x41, 0x3b, 0x1b, 0x65, 0x0b, 0x3d, 0xdd, 0x50, 0xae, 0xbd, 0x29, 0xf9, 0xfc,
+ 0xbc, 0x33, 0xe6, 0x37, 0xbd, 0xb6, 0x53, 0xbb, 0x3c, 0x0c, 0x5e, 0xf6, 0x3d,
+ 0x75, 0xbb, 0xf6, 0xbc, 0xf8, 0xc6, 0x9a, 0x3d, 0x8f, 0xe5, 0xc4, 0x3c, 0x88,
+ 0xee, 0x33, 0xbc, 0x73, 0xb2, 0x87, 0x3c, 0xd4, 0xd8, 0x58, 0x3c, 0x15, 0x37,
+ 0x82, 0x3d, 0xc1, 0x4f, 0x38, 0xbc, 0xba, 0x8e, 0xf9, 0xbb, 0x7c, 0x56, 0xe0,
+ 0xbd, 0xca, 0x23, 0x94, 0xbc, 0x24, 0x41, 0xae, 0x3d, 0x89, 0x4e, 0x9a, 0x3c,
+ 0xcb, 0x28, 0xe3, 0x3c, 0xf1, 0xfa, 0x05, 0x3d, 0xe3, 0xa4, 0x80, 0xbd, 0x6f,
+ 0xda, 0x16, 0x3d, 0xc7, 0xee, 0x77, 0xbd, 0xa8, 0xe3, 0xb1, 0xbc, 0x6f, 0x70,
+ 0x90, 0xbc, 0x78, 0x35, 0x48, 0x3d, 0xac, 0xdb, 0x23, 0xbd, 0x4e, 0xbd, 0xe4,
+ 0xbb, 0x79, 0x88, 0xd0, 0xbb, 0xf2, 0xa9, 0xb6, 0xbd, 0x54, 0x46, 0x5d, 0xbd,
+ 0xc6, 0xb2, 0x95, 0x3d, 0xe6, 0x67, 0x52, 0x3d, 0xa6, 0x5d, 0x7f, 0xbd, 0x0b,
+ 0xe5, 0xad, 0x3b, 0x91, 0xf6, 0x0c, 0x3c, 0x33, 0x45, 0xab, 0xbc, 0xa7, 0x84,
+ 0xb3, 0xbc, 0xf5, 0xb0, 0x6c, 0x3c, 0x08, 0xc9, 0xb4, 0x3c, 0x61, 0x9d, 0x8b,
+ 0x3c, 0x0d, 0x19, 0x87, 0x3d, 0xaa, 0xbc, 0xd3, 0xbc, 0x85, 0x92, 0x8e, 0x3b,
+ 0xfc, 0x26, 0x49, 0xbd, 0x56, 0x7e, 0x7f, 0x3d, 0xf3, 0x85, 0x61, 0xbd, 0x8c,
+ 0x5b, 0xf0, 0x3c, 0x14, 0x09, 0x65, 0xbd, 0x66, 0x78, 0x38, 0xbb, 0x2c, 0x69,
+ 0x4d, 0xbd, 0x33, 0x31, 0x46, 0x3d, 0x6d, 0xb8, 0xa6, 0xbc, 0x69, 0x4e, 0xc3,
+ 0x3d, 0xc9, 0x54, 0x93, 0xbd, 0x1a, 0x80, 0x83, 0x3d, 0x06, 0x1b, 0xa8, 0x3c,
+ 0xf0, 0x64, 0x65, 0x3c, 0xae, 0xd7, 0xb2, 0x3d, 0x03, 0xc0, 0xf0, 0x3c, 0x9d,
+ 0xbf, 0x84, 0xbd, 0xa6, 0x60, 0xfd, 0xbd, 0x58, 0x27, 0x41, 0x3d, 0x3f, 0x70,
+ 0x9f, 0x3c, 0x13, 0x59, 0x37, 0xbd, 0x6b, 0x61, 0x4e, 0xbd, 0xb5, 0xf3, 0x26,
+ 0x39, 0x10, 0x99, 0xc5, 0x3c, 0x7c, 0xda, 0x28, 0x3d, 0x23, 0x7b, 0x78, 0x3b,
+ 0xa5, 0x5f, 0x1c, 0xbd, 0x8e, 0x82, 0xd0, 0x3c, 0x42, 0x5a, 0x29, 0x3d, 0x5c,
+ 0x7a, 0x1d, 0xb8, 0xf8, 0x4e, 0x3c, 0xbc, 0x24, 0xee, 0x52, 0x3b, 0x56, 0xfa,
+ 0x0b, 0x3d, 0xe2, 0xa4, 0xc4, 0x3b, 0xd1, 0x51, 0xe1, 0xbd, 0x22, 0xbb, 0x7f,
+ 0xbd, 0xd3, 0x54, 0x6d, 0x3d, 0x75, 0x61, 0xaa, 0x3d, 0x4a, 0xd4, 0x33, 0x3d,
+ 0x2d, 0x5f, 0x91, 0x3c, 0x38, 0xc6, 0xe3, 0xb9, 0x91, 0x94, 0x38, 0x3d, 0x87,
+ 0x92, 0xd5, 0x3c, 0xb3, 0x59, 0x34, 0xbd, 0x74, 0x48, 0x64, 0xbd, 0x90, 0xb1,
+ 0xba, 0x3c, 0xd1, 0x21, 0x97, 0x3c, 0xb9, 0x24, 0xa7, 0x3c, 0xa0, 0xe7, 0xe8,
+ 0xbd, 0xf1, 0xc5, 0x45, 0x3c, 0x93, 0x0e, 0x2e, 0x3d, 0x31, 0x84, 0xd5, 0xbc,
+ 0xd7, 0x86, 0xbf, 0x3c, 0x5b, 0xae, 0xb8, 0x3c, 0xc3, 0x7e, 0xf3, 0xbc, 0xb1,
+ 0xd7, 0x0c, 0x3d, 0x2a, 0x33, 0xcc, 0x3d, 0x86, 0x09, 0x6b, 0x3d, 0xb6, 0xa4,
+ 0x97, 0x3d, 0x15, 0x03, 0x89, 0x3d, 0x5c, 0x5c, 0x85, 0x3d, 0x47, 0x39, 0x65,
+ 0x3d, 0xd2, 0x8b, 0x06, 0xbd, 0x6c, 0xed, 0x55, 0x3b, 0x30, 0xd5, 0x99, 0xbc,
+ 0x7d, 0x00, 0xb5, 0xbb, 0x54, 0xe8, 0x12, 0xbd, 0x8c, 0x6f, 0x3e, 0x3c, 0x07,
+ 0x15, 0x9a, 0x3d, 0xf2, 0x93, 0xa1, 0x3d, 0x0a, 0xf7, 0x7c, 0x3d, 0x89, 0xe9,
+ 0xc0, 0x3c, 0xc4, 0x63, 0x6d, 0x3d, 0x02, 0x6a, 0xa9, 0x3d, 0x85, 0x9b, 0x4b,
+ 0x3d, 0x20, 0x90, 0x99, 0x3c, 0xcd, 0xb5, 0x1f, 0x3d, 0x7f, 0x5e, 0x72, 0xbd,
+ 0x19, 0x42, 0x08, 0xbc, 0x4c, 0xd0, 0x60, 0xbd, 0x28, 0x45, 0x5d, 0xbd, 0x9f,
+ 0x9e, 0x95, 0xbd, 0xf8, 0x82, 0x82, 0xbd, 0x14, 0xd6, 0x3c, 0x3d, 0x55, 0x69,
+ 0x6e, 0x3d, 0x6e, 0xd1, 0x37, 0xbc, 0x6a, 0x72, 0x34, 0xbd, 0x67, 0x77, 0xa4,
+ 0xbc, 0xd0, 0xb2, 0xaa, 0x3d, 0xfa, 0xbb, 0x32, 0x3d, 0x5b, 0xfd, 0x1e, 0x3d,
+ 0x6b, 0x18, 0x8a, 0x3b, 0xd1, 0xe0, 0x3b, 0x3c, 0x0e, 0xaa, 0xb8, 0xbc, 0xd8,
+ 0x60, 0x73, 0x3d, 0x18, 0xea, 0xac, 0x3d, 0x0a, 0x98, 0x8c, 0xbd, 0xa8, 0xae,
+ 0x90, 0x3d, 0xa4, 0x92, 0x81, 0x3b, 0xfa, 0x7d, 0x67, 0x3d, 0xd1, 0x86, 0xad,
+ 0x3d, 0xa0, 0x03, 0x2e, 0xbc, 0xa7, 0x6d, 0xf7, 0x3c, 0x93, 0xfe, 0x81, 0x3d,
+ 0x55, 0x43, 0xdd, 0x3b, 0x9e, 0xc7, 0x19, 0x3d, 0xc1, 0x4e, 0x1e, 0x3d, 0x4a,
+ 0xb6, 0x3c, 0xbd, 0xae, 0x17, 0x16, 0xbd, 0xa1, 0xf5, 0x4d, 0xbd, 0x89, 0x2c,
+ 0x04, 0xbd, 0xd3, 0xeb, 0x93, 0x3d, 0x35, 0xae, 0x19, 0x3c, 0xf8, 0x48, 0xa5,
+ 0x3c, 0x94, 0x41, 0xf4, 0xbc, 0x67, 0x32, 0x41, 0xbd, 0x19, 0x2d, 0x38, 0x3d,
+ 0x57, 0x90, 0x6f, 0xbc, 0xea, 0xb3, 0x89, 0xbc, 0x73, 0x19, 0x5b, 0x3d, 0x9d,
+ 0x72, 0xae, 0x3d, 0xb9, 0x8b, 0x23, 0xbd, 0xa4, 0x13, 0x43, 0xbc, 0xd0, 0x4d,
+ 0x12, 0x3d, 0xd7, 0xa3, 0x38, 0xbd, 0xc9, 0xb4, 0xd5, 0x3d, 0x4b, 0x93, 0x24,
+ 0x3c, 0xd2, 0xfa, 0xe8, 0xbc, 0xdb, 0xa3, 0x0b, 0xbd, 0xc2, 0xdd, 0x5e, 0x3d,
+ 0x4c, 0x2c, 0xa5, 0xbd, 0xd2, 0x24, 0x77, 0xbd, 0x50, 0xd3, 0xa1, 0x3d, 0xca,
+ 0xe7, 0x00, 0x3a, 0xbf, 0x15, 0xed, 0xbc, 0x83, 0xc3, 0x60, 0x3d, 0xba, 0x44,
+ 0x82, 0x3d, 0xa4, 0x8d, 0x93, 0x3d, 0x7a, 0xdf, 0x92, 0xbd, 0x2e, 0x60, 0xcd,
+ 0x3b, 0x8a, 0xc9, 0x67, 0x3d, 0xbc, 0x59, 0x2e, 0xbd, 0xd6, 0x96, 0xb0, 0x3d,
+ 0x89, 0x2f, 0xd1, 0xbc, 0x18, 0xd2, 0x0c, 0xbc, 0xc4, 0xf8, 0x84, 0x3d, 0x50,
+ 0xc8, 0x52, 0xbd, 0xa8, 0xc1, 0x58, 0xbd, 0xa3, 0xe1, 0x26, 0x3d, 0x61, 0x05,
+ 0x00, 0x3d, 0x5d, 0xe9, 0x84, 0x3d, 0xc2, 0x44, 0x37, 0x3d, 0xfb, 0xf3, 0xb0,
+ 0xbc, 0x69, 0x4b, 0x6c, 0xbd, 0xa9, 0x6b, 0xa4, 0xbc, 0x77, 0x53, 0x84, 0x3c,
+ 0x12, 0x21, 0x0c, 0xbd, 0x0d, 0x59, 0x08, 0xbc, 0x44, 0xb6, 0x11, 0xbd, 0xaa,
+ 0xef, 0x8e, 0x3d, 0x4e, 0x39, 0x32, 0x3d, 0x40, 0x7f, 0x7a, 0xbd, 0xa8, 0x2d,
+ 0xbf, 0xbc, 0x3a, 0xff, 0x30, 0x3d, 0xff, 0x61, 0xbb, 0x3b, 0xc3, 0xdf, 0x96,
+ 0xbc, 0x22, 0x74, 0x53, 0xbd, 0x69, 0x07, 0x8a, 0xbd, 0x46, 0x58, 0xe0, 0x3c,
+ 0x91, 0x62, 0x31, 0xbd, 0x38, 0x57, 0x01, 0xbc, 0x09, 0x74, 0x93, 0xbc, 0x3e,
+ 0xb2, 0x8a, 0x3c, 0xd8, 0x12, 0x1d, 0xbd, 0xd7, 0xf6, 0xc2, 0xbc, 0x86, 0x55,
+ 0x11, 0x3c, 0x28, 0x0d, 0x70, 0x3d, 0x98, 0xa3, 0x8a, 0x3d, 0x7b, 0xf0, 0x93,
+ 0xbd, 0xc2, 0x7c, 0x0b, 0xbd, 0xfa, 0x05, 0xcc, 0x3c, 0x5f, 0x77, 0x19, 0x3d,
+ 0xe0, 0x09, 0xb3, 0x3c, 0x13, 0x77, 0x8a, 0xbc, 0x1f, 0x76, 0x36, 0x3c, 0xfb,
+ 0x4f, 0x97, 0x3d, 0x1f, 0xec, 0x31, 0x3d, 0xf9, 0x14, 0x79, 0x3d, 0x50, 0xab,
+ 0x92, 0xbd, 0xda, 0x3c, 0xf3, 0xba, 0x2f, 0x4d, 0x72, 0xbc, 0x0f, 0x3a, 0xc6,
+ 0x3c, 0x7e, 0xf5, 0x40, 0xbd, 0x0f, 0xf2, 0x87, 0xbd, 0xc9, 0x6e, 0xef, 0xbc,
+ 0x06, 0xec, 0xce, 0xbc, 0x3d, 0x26, 0x2b, 0xbd, 0x4a, 0x6a, 0x53, 0x3d, 0x1b,
+ 0x90, 0x1a, 0xbb, 0x39, 0xb6, 0x23, 0x3d, 0xa2, 0xbd, 0x88, 0xbd, 0xd7, 0x0d,
+ 0x2a, 0xbc, 0xf5, 0xf6, 0x94, 0xbd, 0xf0, 0xd7, 0x52, 0xbc, 0x85, 0x99, 0x83,
+ 0xbd, 0xdd, 0xc4, 0x8c, 0xbd, 0xaa, 0x19, 0x4a, 0x3d, 0x26, 0x21, 0xec, 0x3c,
+ 0x0f, 0xe7, 0x1b, 0xbc, 0x39, 0x8e, 0xea, 0xbc, 0x03, 0xdc, 0x2f, 0xbd, 0x03,
+ 0x8c, 0x8c, 0x3d, 0xe4, 0xcb, 0x7f, 0xbc, 0xc6, 0xb9, 0xfd, 0x3b, 0x78, 0x5b,
+ 0x44, 0xbd, 0xd0, 0x3d, 0x89, 0xbc, 0xe0, 0xdb, 0xc2, 0xbc, 0x84, 0x8d, 0x39,
+ 0xbd, 0x9a, 0x7b, 0x9a, 0x3b, 0x5d, 0xb4, 0x88, 0xbc, 0xf3, 0xf0, 0x8e, 0xbd,
+ 0x27, 0x0c, 0x41, 0x3d, 0xe7, 0x60, 0xa0, 0x3c, 0x86, 0xb6, 0xa9, 0xbc, 0x15,
+ 0x55, 0x4f, 0xbd, 0xf4, 0x53, 0xfb, 0xbc, 0xdf, 0x4d, 0x0d, 0x3d, 0x06, 0x46,
+ 0x7d, 0xbd, 0x37, 0x4d, 0xb0, 0xbc, 0x7d, 0x65, 0x1e, 0xbd, 0x30, 0x1a, 0x00,
+ 0xbb, 0x16, 0x56, 0x28, 0xbd, 0xb4, 0xef, 0xdd, 0xbc, 0xcc, 0xbc, 0x40, 0xbd,
+ 0x95, 0xce, 0x84, 0xbd, 0x97, 0x26, 0x98, 0xbd, 0x86, 0x1f, 0x80, 0xbd, 0x64,
+ 0x16, 0x97, 0x3c, 0x9b, 0xd0, 0x22, 0x3c, 0x05, 0x08, 0x52, 0xbb, 0xd2, 0x11,
+ 0x8e, 0xbd, 0x3c, 0xa3, 0x8c, 0x3d, 0x4c, 0xdb, 0xa0, 0xbd, 0x24, 0xe2, 0x0a,
+ 0xbd, 0x24, 0x87, 0x69, 0x3c, 0x7c, 0x72, 0xb2, 0x3c, 0xda, 0xcd, 0x0c, 0x3d,
+ 0xd1, 0x51, 0x4c, 0x3d, 0xb6, 0xaf, 0x30, 0xbd, 0x07, 0xa0, 0x64, 0x3d, 0x09,
+ 0x30, 0x59, 0x3d, 0x68, 0xb3, 0x06, 0xbd, 0x01, 0x85, 0xe4, 0xbc, 0x10, 0x9f,
+ 0x2a, 0xbd, 0xe0, 0x85, 0x93, 0x3d, 0x71, 0xe0, 0x13, 0xbd, 0x28, 0x8b, 0x8e,
+ 0x3c, 0x53, 0x74, 0x71, 0xbc, 0x6a, 0x6d, 0xad, 0x3d, 0x88, 0xf7, 0x32, 0x3c,
+ 0xfb, 0xde, 0x41, 0x3c, 0x90, 0x33, 0x4c, 0xba, 0x89, 0xe4, 0x1d, 0x3c, 0x47,
+ 0x26, 0xb5, 0xbc, 0x5c, 0x9c, 0x9d, 0xbd, 0xd4, 0xe8, 0xdb, 0x3b, 0x7f, 0x88,
+ 0x99, 0x3d, 0x79, 0xd9, 0xb8, 0xbc, 0x76, 0x00, 0xb9, 0x3d, 0x74, 0x04, 0xb9,
+ 0xbc, 0xde, 0x84, 0x38, 0x3d, 0x5c, 0x38, 0x91, 0x3d, 0x80, 0x37, 0x04, 0xbd,
+ 0xfa, 0x1a, 0x34, 0x3d, 0x36, 0x16, 0x11, 0x3d, 0xf3, 0x66, 0x86, 0x3d, 0x84,
+ 0x83, 0x16, 0xbd, 0xec, 0x1a, 0x43, 0xbd, 0x06, 0xf8, 0x64, 0x3d, 0x96, 0x19,
+ 0x31, 0x3b, 0x75, 0x30, 0x9e, 0x3d, 0xf5, 0xfa, 0xd1, 0xbb, 0x96, 0xf3, 0xc8,
+ 0xbc, 0x84, 0x0f, 0x6d, 0xbd, 0xd1, 0x3e, 0x77, 0x3c, 0xbb, 0xb8, 0xf1, 0xbc,
+ 0x49, 0xf5, 0x70, 0x3d, 0x33, 0x33, 0x44, 0xbd, 0xc9, 0xca, 0xf5, 0x3c, 0x5d,
+ 0xe3, 0x2c, 0xbc, 0x06, 0x48, 0xb8, 0x3d, 0xfe, 0xac, 0x12, 0x3d, 0x1d, 0xd6,
+ 0x86, 0x3d, 0x54, 0xa5, 0x39, 0x3d, 0x4d, 0x88, 0xeb, 0x3c, 0x14, 0xe2, 0x3e,
+ 0x3c, 0xb5, 0xe9, 0xd3, 0xbc, 0x97, 0xe0, 0x7e, 0x3c, 0x9b, 0xa2, 0x5a, 0xbc,
+ 0x14, 0xab, 0x89, 0x3d, 0x4a, 0xdc, 0x93, 0x3d, 0xe8, 0xee, 0xb5, 0xbc, 0x5f,
+ 0x9a, 0x9b, 0x3b, 0x26, 0x69, 0x55, 0x3c, 0x7d, 0x50, 0x89, 0xbc, 0xe0, 0x93,
+ 0x8c, 0x3b, 0x44, 0xbc, 0x23, 0xbd, 0x47, 0x76, 0x85, 0x3d, 0xfd, 0x6a, 0x25,
+ 0x39, 0x3e, 0x57, 0x9c, 0x3d, 0x70, 0xdd, 0xd0, 0x3b, 0x40, 0xdf, 0x3b, 0x3d,
+ 0x47, 0x5c, 0xbd, 0xbc, 0x90, 0x3d, 0x33, 0xbd, 0xd8, 0xc6, 0x76, 0xbd, 0xf2,
+ 0xd8, 0x51, 0x3d, 0x17, 0x60, 0x9c, 0xbd, 0x32, 0x78, 0x1b, 0xbd, 0xb4, 0xef,
+ 0x70, 0x3d, 0xfa, 0x9d, 0xb6, 0x3b, 0x88, 0x5c, 0xe0, 0x3a, 0x47, 0x1b, 0xf8,
+ 0xbc, 0x3b, 0x66, 0xcb, 0xba, 0x30, 0xe1, 0x04, 0xbd, 0x58, 0xbe, 0x87, 0xbd,
+ 0xc2, 0xa5, 0x10, 0xbc, 0x48, 0x34, 0xa3, 0x3d, 0x44, 0xa4, 0x77, 0x3d, 0x7d,
+ 0xe5, 0x94, 0xba, 0x23, 0xd9, 0xa3, 0xbc, 0xf6, 0xf6, 0xc6, 0xbc, 0xea, 0xd8,
+ 0x31, 0xbd, 0x9f, 0x50, 0x24, 0x3d, 0xc8, 0x2a, 0x37, 0x3d, 0xaf, 0xe4, 0x82,
+ 0x3d, 0x28, 0x20, 0x70, 0x3d, 0xa3, 0x27, 0x52, 0x3d, 0xbd, 0x34, 0x8a, 0x3c,
+ 0x8c, 0x2c, 0xde, 0x3c, 0x35, 0xf4, 0x70, 0xbd, 0x35, 0x89, 0x19, 0x3d, 0x54,
+ 0x59, 0x46, 0xb9, 0xa6, 0xfb, 0xc0, 0xbc, 0x56, 0x95, 0x8d, 0x3d, 0xd1, 0x4f,
+ 0x71, 0x3d, 0xe1, 0xe3, 0x9f, 0x3d, 0x05, 0xe2, 0x82, 0xbd, 0xb7, 0xcf, 0x06,
+ 0x3d, 0x02, 0x28, 0xa3, 0xbc, 0xd0, 0xcf, 0x48, 0x3d, 0x8e, 0x69, 0x3b, 0xbc,
+ 0x1e, 0x83, 0x14, 0xbb, 0x72, 0x67, 0x82, 0x3b, 0x64, 0x7d, 0xeb, 0xbc, 0x2a,
+ 0x76, 0xe5, 0xba, 0x6a, 0xd8, 0x3c, 0xbd, 0x10, 0xc0, 0x4c, 0x3d, 0x64, 0x44,
+ 0x64, 0x3d, 0xbe, 0xb4, 0x31, 0xbd, 0x0c, 0x43, 0x09, 0xbd, 0xa4, 0x6d, 0x8d,
+ 0xbd, 0xd0, 0xbf, 0x4a, 0x3d, 0x09, 0x76, 0x90, 0xbd, 0x29, 0x9c, 0x0b, 0x3d,
+ 0x7c, 0x61, 0x74, 0xbd, 0xb9, 0x1c, 0x1c, 0xbd, 0x09, 0x6d, 0xad, 0x3b, 0x3e,
+ 0xb4, 0x93, 0xbc, 0x1f, 0x5a, 0xa4, 0x3c, 0xe2, 0x7a, 0x89, 0xbd, 0x1c, 0x1d,
+ 0x49, 0x3c, 0x0c, 0xc3, 0x06, 0xbd, 0xf9, 0xe2, 0xd6, 0x3c, 0x1a, 0x44, 0x57,
+ 0xbd, 0x7a, 0xac, 0x50, 0x3d, 0x39, 0xe4, 0xc4, 0x3c, 0xfb, 0x1e, 0x04, 0x3d,
+ 0x8a, 0xf6, 0x53, 0xbd, 0xfc, 0xac, 0x62, 0xbc, 0x44, 0xcc, 0x20, 0x3d, 0xf6,
+ 0x5e, 0xa0, 0x3c, 0x88, 0x20, 0xcd, 0xba, 0x6b, 0xc7, 0x1c, 0xbd, 0x66, 0xd2,
+ 0x16, 0xbb, 0x8b, 0x02, 0x58, 0xbd, 0x17, 0x15, 0x83, 0x3d, 0xef, 0x6a, 0x84,
+ 0x3d, 0x00, 0x91, 0xd1, 0xba, 0x9a, 0xa6, 0x83, 0x3d, 0x6e, 0x12, 0x9c, 0xbd,
+ 0x4c, 0x00, 0x46, 0x3d, 0x08, 0x8e, 0xcf, 0x3b, 0x53, 0x98, 0xb9, 0xbc, 0x5c,
+ 0x33, 0x43, 0x3d, 0x05, 0x7b, 0x03, 0xbd, 0x82, 0x26, 0x35, 0xbd, 0xbf, 0x76,
+ 0x75, 0xbd, 0x08, 0x78, 0x49, 0xbd, 0xe1, 0x7e, 0x53, 0xbc, 0xf0, 0x64, 0xf2,
+ 0x3c, 0x56, 0xaf, 0x1a, 0x3d, 0x1c, 0x8f, 0x08, 0x3d, 0x11, 0xac, 0x91, 0xbd,
+ 0xe8, 0x21, 0x06, 0x3d, 0xf5, 0xbb, 0xdb, 0xbc, 0x0c, 0xc9, 0x81, 0xbd, 0x74,
+ 0x76, 0x83, 0xbd, 0x5e, 0xf3, 0x40, 0xbd, 0xd6, 0xbb, 0x98, 0x3d, 0x4b, 0x9a,
+ 0x93, 0x3c, 0x25, 0x64, 0x9d, 0xbd, 0xf4, 0xf4, 0x9e, 0xbc, 0x66, 0xbe, 0x2b,
+ 0xbb, 0xad, 0xa4, 0x82, 0x3c, 0x76, 0x08, 0x5d, 0xbd, 0x2c, 0xf4, 0x2f, 0xbd,
+ 0xb3, 0x5e, 0x84, 0x3d, 0x62, 0xad, 0x06, 0x3d, 0x6a, 0xe5, 0xea, 0xbc, 0xd8,
+ 0x06, 0x23, 0x3d, 0x85, 0x25, 0xeb, 0xbc, 0xa9, 0x01, 0xab, 0xbb, 0x28, 0xe4,
+ 0xf3, 0x3c, 0x9f, 0x9e, 0x8e, 0xbd, 0x3f, 0xe2, 0x2c, 0xbc, 0xe0, 0xfd, 0xc1,
+ 0x3c, 0x84, 0x67, 0xa7, 0xbb, 0xc5, 0x1d, 0xfc, 0xbc, 0xee, 0x05, 0x6b, 0xbd,
+ 0x9a, 0x29, 0xc9, 0xbc, 0x35, 0x9c, 0x0f, 0x3d, 0xff, 0xd3, 0x1c, 0xbd, 0x60,
+ 0x5c, 0x3d, 0xbd, 0x85, 0xf0, 0x81, 0x3d, 0xe6, 0x58, 0x0f, 0xbc, 0xda, 0x46,
+ 0x01, 0xbd, 0xe4, 0xae, 0x88, 0xbd, 0xe2, 0x4a, 0x47, 0xbd, 0x51, 0xf0, 0x7e,
+ 0xbd, 0x18, 0xc7, 0x82, 0x3d, 0x85, 0xf7, 0x26, 0x3d, 0x7f, 0xe0, 0xc0, 0xbc,
+ 0x28, 0xa7, 0x56, 0x3b, 0x86, 0xe9, 0x17, 0xbb, 0x75, 0xc7, 0x81, 0x3d, 0x0c,
+ 0x95, 0x19, 0xbc, 0x27, 0x0d, 0x62, 0xbd, 0xae, 0x2f, 0x14, 0x3b, 0xcf, 0x26,
+ 0x47, 0xbd, 0x75, 0xe8, 0x26, 0x3d, 0x99, 0x94, 0x48, 0x3d, 0xac, 0xe6, 0x3f,
+ 0x3d, 0x50, 0xa8, 0xee, 0x3c, 0x25, 0x3e, 0xef, 0xbc, 0x98, 0xfe, 0x37, 0xbc,
+ 0x05, 0x4b, 0x28, 0x3d, 0xa5, 0x42, 0xfc, 0x3c, 0x40, 0xda, 0x68, 0x3d, 0xf7,
+ 0x91, 0x35, 0x3d, 0xae, 0xa1, 0x1a, 0x3d, 0xeb, 0xc7, 0x1b, 0xbd, 0x98, 0x7d,
+ 0xb1, 0x3c, 0xf7, 0xe7, 0x0b, 0xbd, 0x72, 0x31, 0x47, 0x3d, 0x47, 0xeb, 0x85,
+ 0xbd, 0x4f, 0x71, 0x1f, 0xbc, 0xae, 0x19, 0x1b, 0xbd, 0x30, 0xc5, 0xd7, 0xbb,
+ 0x94, 0xbe, 0x05, 0x3d, 0x39, 0x66, 0x94, 0x3c, 0x68, 0xab, 0x65, 0xbc, 0x4a,
+ 0x43, 0xd3, 0xbc, 0x66, 0x6e, 0x22, 0x3d, 0x2c, 0xb6, 0x45, 0x3d, 0xec, 0xf0,
+ 0x09, 0xbd, 0x15, 0x84, 0xd6, 0x3c, 0x67, 0xb6, 0x5e, 0xbd, 0x48, 0xb9, 0x1b,
+ 0x3d, 0xef, 0x6b, 0x36, 0x3d, 0xfa, 0x9f, 0x60, 0x3c, 0xfb, 0x49, 0x8c, 0x3d,
+ 0x50, 0x0b, 0xfd, 0x3c, 0x43, 0x24, 0xf5, 0x3c, 0x48, 0xf5, 0x1c, 0x3d, 0x24,
+ 0xed, 0x55, 0xbd, 0x12, 0x2a, 0x33, 0xbd, 0x6f, 0x59, 0x3b, 0xbb, 0xeb, 0x66,
+ 0xe0, 0xbc, 0x7b, 0x67, 0x60, 0xbb, 0x19, 0x8c, 0x85, 0x3c, 0x72, 0x71, 0x22,
+ 0x3b, 0x7f, 0xa1, 0x22, 0xbd, 0x9e, 0xcd, 0x04, 0x3d, 0x00, 0xf6, 0xff, 0xb9,
+ 0xdf, 0x8b, 0x16, 0xbd, 0xc1, 0x0c, 0xfd, 0x3c, 0x9b, 0xf9, 0x5b, 0xbd, 0x71,
+ 0x73, 0x8c, 0x3d, 0x0f, 0x55, 0x63, 0x3d, 0x20, 0xbf, 0xb9, 0x3c, 0xa3, 0xc5,
+ 0x85, 0x3d, 0xfd, 0x98, 0x2e, 0xbd, 0xb4, 0x02, 0x2e, 0xbc, 0xe2, 0x12, 0x46,
+ 0xbc, 0x90, 0x41, 0x6f, 0xbd, 0x0d, 0xc7, 0x68, 0x3d, 0x4e, 0x58, 0x4f, 0x3c,
+ 0xc0, 0xeb, 0x1d, 0xbb, 0x3d, 0xcb, 0x9f, 0xbd, 0x29, 0x0c, 0x7f, 0x3d, 0x8a,
+ 0x62, 0x4d, 0xbc, 0x01, 0x3c, 0x7b, 0x3d, 0x3c, 0x41, 0xb8, 0x3c, 0xa9, 0x70,
+ 0x53, 0x3d, 0x32, 0x94, 0xab, 0x3d, 0xdc, 0x75, 0x4c, 0x3d, 0xab, 0x5d, 0xd6,
+ 0xbc, 0xae, 0x74, 0x0a, 0xbd, 0x7f, 0xf5, 0xec, 0x3c, 0xff, 0x6e, 0x4c, 0xbd,
+ 0x0c, 0x65, 0x16, 0xbc, 0x4f, 0x2a, 0x58, 0x3c, 0xe2, 0x17, 0xa0, 0x3d, 0x6a,
+ 0x10, 0x83, 0xbc, 0xfc, 0x40, 0xc0, 0x3d, 0xbc, 0xa0, 0xad, 0xbc, 0xde, 0xdc,
+ 0x98, 0x3d, 0xaf, 0x54, 0x84, 0xbb, 0x64, 0xcd, 0xdf, 0x3c, 0xab, 0x93, 0x2c,
+ 0xbc, 0x44, 0x5c, 0x29, 0x3c, 0xac, 0x7f, 0x27, 0x3d, 0xb2, 0x34, 0xee, 0x3c,
+ 0x66, 0xf2, 0xd9, 0x3c, 0x4d, 0xaf, 0x86, 0x3d, 0xee, 0x79, 0x10, 0xbd, 0xa2,
+ 0x84, 0x31, 0xbd, 0xe2, 0xf9, 0x43, 0x3d, 0x26, 0x87, 0xf1, 0x3b, 0xf0, 0x3a,
+ 0x8f, 0xbd, 0x3e, 0x23, 0x5d, 0xbd, 0x75, 0x0a, 0x7c, 0x3d, 0x15, 0xe4, 0x5a,
+ 0xbd, 0x45, 0xb3, 0xb2, 0x3c, 0xe3, 0xc4, 0x36, 0x3d, 0x7d, 0x89, 0x9f, 0x3c,
+ 0x9e, 0x54, 0xaa, 0xbb, 0x89, 0x2e, 0x88, 0xbd, 0xad, 0xe0, 0x89, 0xbc, 0x69,
+ 0xe9, 0x66, 0xbd, 0x94, 0xa9, 0xf4, 0xbc, 0xb3, 0xde, 0x21, 0xbd, 0x0b, 0x5a,
+ 0x82, 0xbd, 0x55, 0x78, 0x00, 0x3d, 0x1f, 0x1d, 0xa2, 0xbd, 0x5c, 0xe4, 0x4b,
+ 0xbd, 0x63, 0x9e, 0xa6, 0xbd, 0x44, 0xdb, 0x75, 0xbd, 0x6a, 0xe7, 0xf3, 0xbc,
+ 0xdc, 0xa5, 0x2c, 0xbd, 0xc7, 0xcd, 0x8d, 0x3c, 0xd4, 0x97, 0x85, 0x3c, 0xc5,
+ 0x19, 0x4a, 0xbc, 0x48, 0x7d, 0x09, 0xbc, 0xd6, 0x74, 0x2c, 0xbd, 0x94, 0xb6,
+ 0xf9, 0x3c, 0xfd, 0x54, 0x8d, 0x3d, 0xdf, 0x85, 0x57, 0x3d, 0x82, 0x58, 0x67,
+ 0x3d, 0x67, 0x4a, 0xe8, 0xba, 0xec, 0xb0, 0xe9, 0x3c, 0x9a, 0xf0, 0x1f, 0x3d,
+ 0x80, 0xbc, 0x7e, 0xbd, 0x15, 0xe3, 0x16, 0x3d, 0x49, 0xb7, 0x33, 0xbc, 0x03,
+ 0xbe, 0x65, 0xbd, 0x6c, 0x41, 0x8b, 0x3d, 0x93, 0x68, 0x85, 0xbc, 0x50, 0x1a,
+ 0x50, 0xbd, 0x10, 0xbe, 0x7f, 0xbc, 0x15, 0x0c, 0x58, 0xbc, 0x48, 0xe9, 0x92,
+ 0xbd, 0x48, 0x67, 0x3e, 0xbc, 0x38, 0x60, 0x66, 0xbd, 0x76, 0xac, 0x9e, 0xbd,
+ 0x4d, 0xc9, 0x61, 0x3d, 0x0b, 0xa6, 0x9f, 0xbd, 0x8f, 0x08, 0xcb, 0x3c, 0x60,
+ 0x17, 0x35, 0x3d, 0x60, 0x75, 0x7a, 0x3c, 0x24, 0x97, 0x48, 0x3a, 0x64, 0x78,
+ 0x90, 0xbc, 0xf3, 0x93, 0xb8, 0xbb, 0x46, 0x84, 0x69, 0xbd, 0xd6, 0x71, 0x43,
+ 0x3d, 0xb4, 0x2b, 0x62, 0xbc, 0x47, 0x6b, 0x08, 0x3c, 0x0e, 0x23, 0xeb, 0xbc,
+ 0xf4, 0xc8, 0xb0, 0xbc, 0x3f, 0x17, 0xbe, 0xbc, 0x11, 0xc5, 0x99, 0x3d, 0x50,
+ 0x81, 0x15, 0x3d, 0x8e, 0xd8, 0x7d, 0x3d, 0xfd, 0x07, 0x8d, 0xbb, 0x7a, 0x46,
+ 0xea, 0x3c, 0x7d, 0xc9, 0x2c, 0x3d, 0x1e, 0x27, 0x2f, 0x3d, 0x67, 0x04, 0x05,
+ 0xbc, 0x8f, 0x0a, 0x71, 0xbc, 0x44, 0xcb, 0x78, 0xbc, 0x3b, 0x8e, 0x17, 0x3d,
+ 0x8c, 0x61, 0xf6, 0x3c, 0xdf, 0x7a, 0x54, 0x3d, 0x93, 0xe6, 0xaa, 0xbc, 0xef,
+ 0x19, 0xd2, 0xbc, 0xb8, 0xec, 0x13, 0x3d, 0xed, 0x16, 0x39, 0x3d, 0x7c, 0xb2,
+ 0xdc, 0x3c, 0x03, 0xf9, 0x84, 0xb9, 0xe7, 0xbd, 0x70, 0xbc, 0xea, 0x33, 0x77,
+ 0x3d, 0xa8, 0xd3, 0x55, 0x3c, 0x3b, 0x55, 0x04, 0x3c, 0x72, 0x75, 0x67, 0xbc,
+ 0xde, 0x63, 0x4b, 0xbc, 0x73, 0xc5, 0x01, 0xbd, 0x2e, 0x1b, 0x01, 0x3c, 0xb2,
+ 0xeb, 0x57, 0x3d, 0x81, 0xaa, 0x2d, 0xbd, 0x68, 0x5f, 0x1c, 0xbd, 0x0e, 0x36,
+ 0x77, 0x3d, 0xd9, 0xb5, 0x27, 0x3c, 0x99, 0x74, 0x27, 0x3d, 0xae, 0x86, 0x74,
+ 0xbd, 0x57, 0x12, 0x0e, 0xbd, 0x37, 0x30, 0x2a, 0x3d, 0x5e, 0xf5, 0x3b, 0x3d,
+ 0x37, 0x81, 0x6f, 0x3d, 0xd3, 0xe7, 0x4b, 0xbd, 0x4a, 0x7f, 0x85, 0x3d, 0xce,
+ 0x31, 0x21, 0x3d, 0xda, 0xf8, 0x86, 0xbc, 0x5e, 0x6d, 0x1f, 0x3c, 0x80, 0x1b,
+ 0x06, 0x3b, 0xd7, 0x82, 0x5f, 0x3d, 0x74, 0xc0, 0x26, 0xbd, 0x1d, 0x0e, 0x8d,
+ 0xbc, 0x00, 0xfe, 0x06, 0x3d, 0x5f, 0x91, 0x79, 0xbd, 0x53, 0x7a, 0xee, 0xbc,
+ 0x64, 0x03, 0x41, 0x3d, 0x66, 0xa9, 0xfa, 0xba, 0x67, 0x37, 0x40, 0xbd, 0xd8,
+ 0x7f, 0x23, 0xbd, 0x1a, 0x9f, 0x03, 0xbc, 0x93, 0x26, 0x03, 0xbd, 0xeb, 0xf7,
+ 0x58, 0xbc, 0x04, 0xe4, 0xdc, 0xb9, 0xb6, 0xbb, 0x9b, 0x3b, 0x9e, 0x4b, 0x14,
+ 0x3d, 0x5a, 0x9a, 0xd4, 0xba, 0x59, 0xcd, 0x21, 0xbd, 0x00, 0xc3, 0x85, 0x3c,
+ 0xec, 0xbf, 0xf2, 0xbc, 0x0e, 0x59, 0x3a, 0xbd, 0xa7, 0x8f, 0x81, 0x3d, 0x11,
+ 0x2d, 0x63, 0xbd, 0x55, 0x42, 0xe8, 0xbc, 0x6b, 0x6e, 0x8c, 0x3c, 0xa3, 0x84,
+ 0x1d, 0xbd, 0x8c, 0xda, 0x4f, 0x3c, 0xb2, 0x36, 0xd1, 0x3c, 0x4f, 0x27, 0x71,
+ 0x3d, 0xf8, 0x32, 0x8c, 0x3c, 0x5c, 0xe8, 0x69, 0xbc, 0x42, 0xcb, 0x24, 0x3d,
+ 0x8f, 0xd8, 0x6b, 0xbd, 0x87, 0xd2, 0x9c, 0xbd, 0xc5, 0x3f, 0xb5, 0x3c, 0x08,
+ 0xfc, 0xf9, 0x3c, 0x5b, 0x21, 0x7e, 0x3d, 0xef, 0x06, 0x65, 0xbc, 0xda, 0x92,
+ 0x02, 0x3c, 0xb1, 0xf0, 0x99, 0xbc, 0x2e, 0x72, 0xe7, 0xbc, 0x32, 0x44, 0x6a,
+ 0xbd, 0xdd, 0xbb, 0x20, 0x3b, 0xa1, 0xbf, 0xa3, 0x3c, 0xd2, 0x4f, 0x9b, 0x3c,
+ 0xf8, 0x55, 0xbe, 0x3c, 0x35, 0xe3, 0x0a, 0x3d, 0xf0, 0x8a, 0x89, 0xbc, 0xd7,
+ 0xd7, 0x6f, 0x3d, 0x96, 0xd9, 0x70, 0xbd, 0x00, 0x50, 0x20, 0x39, 0x1f, 0xa7,
+ 0x17, 0x3d, 0x4f, 0x4f, 0xc3, 0xbb, 0xf6, 0x99, 0x40, 0xbd, 0x87, 0xd4, 0x2a,
+ 0xbd, 0x09, 0x54, 0x06, 0x3d, 0x87, 0x46, 0xf4, 0xbb, 0x9c, 0x12, 0x12, 0x3c,
+ 0x2f, 0xc9, 0xd1, 0x3c, 0x4c, 0x47, 0x4e, 0x3d, 0xf9, 0x77, 0x64, 0xbd, 0xd1,
+ 0xa5, 0x17, 0xbd, 0xf3, 0x5b, 0xdb, 0x3c, 0x98, 0x30, 0x55, 0x3d, 0x3f, 0x3d,
+ 0x37, 0xbd, 0x54, 0x12, 0xed, 0xbc, 0x30, 0x26, 0x1d, 0x3d, 0x72, 0x80, 0x8a,
+ 0x3d, 0xf1, 0xd7, 0x4c, 0xbd, 0xa9, 0xc7, 0x83, 0x3d, 0x86, 0xba, 0x93, 0xbd,
+ 0x6b, 0x0a, 0x90, 0xbd, 0x96, 0x8c, 0x64, 0xbd, 0x40, 0x70, 0xf1, 0x3a, 0xc0,
+ 0x39, 0x79, 0x3d, 0x27, 0xda, 0x24, 0xbc, 0x36, 0x2e, 0x3c, 0x3d, 0xb0, 0xbe,
+ 0x90, 0xbd, 0x20, 0x68, 0x14, 0xbc, 0x00, 0xa4, 0x3e, 0xbc, 0x85, 0xb9, 0x44,
+ 0xbd, 0xa2, 0x06, 0x52, 0xbd, 0x6e, 0xae, 0x4a, 0xbd, 0xbe, 0x73, 0x6c, 0xbd,
+ 0x49, 0xee, 0x3e, 0xbd, 0x36, 0x8a, 0xe0, 0x3c, 0x7f, 0x94, 0x8a, 0xbd, 0x19,
+ 0x1d, 0x11, 0xbd, 0x15, 0x3e, 0x55, 0xbd, 0x4b, 0xcd, 0x7b, 0x3d, 0x63, 0xd7,
+ 0x9f, 0xba, 0x83, 0xcb, 0x37, 0xbd, 0xa4, 0x4f, 0x21, 0xbd, 0xa5, 0xaf, 0xec,
+ 0xbc, 0xcd, 0x46, 0xae, 0xbd, 0xe8, 0x66, 0x9d, 0x3c, 0x7c, 0x84, 0xa6, 0xbc,
+ 0x85, 0xcc, 0x7f, 0x3d, 0xa5, 0x28, 0xa6, 0xbd, 0x2f, 0x3a, 0x55, 0xbc, 0xb4,
+ 0x8b, 0xc8, 0xbc, 0xd3, 0x90, 0x5e, 0x3d, 0x49, 0x79, 0x81, 0xbd, 0x50, 0xc3,
+ 0x79, 0xbc, 0x90, 0x04, 0x9b, 0xbd, 0x1e, 0xdb, 0x73, 0x3d, 0x97, 0x15, 0x7e,
+ 0x3c, 0x5f, 0xf6, 0x83, 0x3d, 0x1d, 0x20, 0x32, 0x3c, 0xda, 0x32, 0x7a, 0xbd,
+ 0x8f, 0xa0, 0x69, 0x3c, 0x20, 0xe0, 0x87, 0xbd, 0x08, 0xb7, 0x2f, 0x3d, 0x5e,
+ 0x6c, 0x26, 0xbd, 0xba, 0xa8, 0xbe, 0xbc, 0xb3, 0x9b, 0xb7, 0xbc, 0xc1, 0x3e,
+ 0x8e, 0x3d, 0x45, 0x90, 0x3f, 0xbd, 0x82, 0xee, 0x0c, 0x3d, 0x62, 0xe1, 0x38,
+ 0xbc, 0x30, 0x95, 0x8b, 0x3c, 0xc6, 0x6b, 0x58, 0x3d, 0x7c, 0xca, 0x06, 0xbd,
+ 0x03, 0xa3, 0x7b, 0x3d, 0x77, 0xef, 0x83, 0x3c, 0x24, 0xc7, 0x69, 0x3d, 0xf6,
+ 0xed, 0x35, 0xbd, 0xaa, 0x2d, 0x33, 0x3d, 0x71, 0x69, 0x72, 0x3c, 0xed, 0x0d,
+ 0x80, 0x3c, 0x02, 0x0d, 0x47, 0x3d, 0x30, 0x51, 0x86, 0xbc, 0x0a, 0xad, 0x8d,
+ 0xbc, 0x80, 0xab, 0x1c, 0x3d, 0x68, 0x17, 0x3d, 0x3d, 0x47, 0x3c, 0x36, 0xbd,
+ 0x32, 0x58, 0xfb, 0x3c, 0x27, 0x47, 0x82, 0x3d, 0xb8, 0x9c, 0x92, 0xbc, 0xab,
+ 0xa8, 0xaf, 0xbb, 0x97, 0xb4, 0x7b, 0x3d, 0xdb, 0x16, 0xad, 0xbc, 0xa8, 0x50,
+ 0x8b, 0xbd, 0x50, 0x91, 0x4d, 0x3c, 0xe1, 0x69, 0x73, 0x3c, 0x62, 0x4f, 0x30,
+ 0xbd, 0x00, 0x70, 0x6a, 0x3c, 0x57, 0xbb, 0x8f, 0x3d, 0xe6, 0x60, 0x44, 0xbd,
+ 0x33, 0x5a, 0xc2, 0xbc, 0xe6, 0xae, 0x82, 0xbd, 0x1e, 0xad, 0x6e, 0xbd, 0xc9,
+ 0x43, 0x30, 0x3d, 0x30, 0x4a, 0x65, 0x3c, 0x79, 0x1d, 0xc7, 0x3c, 0x97, 0xab,
+ 0x1e, 0x3b, 0x95, 0x60, 0xd7, 0xbc, 0xcc, 0xed, 0xa1, 0xbc, 0xa3, 0x6d, 0x6b,
+ 0xbd, 0xd8, 0xc4, 0x30, 0x3c, 0xcf, 0x3e, 0x8b, 0xbc, 0x82, 0xd9, 0x0d, 0xbc,
+ 0x6b, 0x1f, 0xdb, 0xbc, 0xb7, 0x65, 0x76, 0xbd, 0x19, 0x3a, 0xfb, 0x3c, 0xe8,
+ 0x08, 0x08, 0xbd, 0x0b, 0xdb, 0x00, 0xbd, 0x4c, 0x51, 0x19, 0xbd, 0x2e, 0x6c,
+ 0x37, 0x3d, 0xc0, 0xdf, 0x1e, 0x3b, 0x64, 0x10, 0x49, 0x3d, 0x77, 0x9b, 0xca,
+ 0xbc, 0xca, 0x17, 0xfb, 0xbc, 0xe6, 0xa4, 0x92, 0x3d, 0xfd, 0x90, 0x77, 0x3d,
+ 0x82, 0x5e, 0x6b, 0x3d, 0xe5, 0x15, 0x3c, 0x3d, 0xc3, 0x45, 0xf9, 0xbb, 0x0c,
+ 0x61, 0x88, 0xbd, 0x26, 0xa1, 0x68, 0xbd, 0x67, 0x2c, 0x1e, 0xbd, 0x2b, 0xfe,
+ 0x3e, 0xbd, 0xb9, 0x45, 0x0b, 0xbd, 0x8e, 0x79, 0x09, 0xbd, 0x16, 0xdf, 0x45,
+ 0xbd, 0x52, 0xbb, 0x24, 0xbc, 0x84, 0x55, 0x78, 0xbd, 0xb7, 0x6d, 0x55, 0x3d,
+ 0xb8, 0xe4, 0x8a, 0x3d, 0xcc, 0x8e, 0x2d, 0xbd, 0xf8, 0x0a, 0x13, 0x3c, 0xda,
+ 0x22, 0x23, 0x3d, 0xee, 0x07, 0x1e, 0x3d, 0xee, 0x5c, 0x38, 0xbd, 0x1b, 0xfa,
+ 0xc1, 0xbc, 0x62, 0x88, 0x82, 0xbc, 0x9e, 0x6c, 0x39, 0xbd, 0xe8, 0xc8, 0x90,
+ 0xbd, 0xb2, 0xaf, 0x0e, 0xbd, 0x87, 0xc1, 0x61, 0xbc, 0x91, 0xcf, 0x21, 0x3b,
+ 0xaa, 0x52, 0x88, 0xbd, 0x2b, 0xcb, 0x8e, 0xbd, 0x42, 0x58, 0xb0, 0x3c, 0x72,
+ 0x3e, 0x9a, 0x3c, 0x1e, 0x92, 0x09, 0x3d, 0xc6, 0x67, 0x9a, 0xbd, 0xa0, 0xb0,
+ 0x29, 0x3b, 0x51, 0x6e, 0x0c, 0xbd, 0x88, 0x0d, 0x4d, 0xbd, 0x1c, 0xc3, 0xee,
+ 0x3c, 0x43, 0xfc, 0x61, 0x3d, 0x74, 0x13, 0x84, 0x3c, 0x10, 0xbc, 0xd4, 0x3c,
+ 0x8a, 0x20, 0x9d, 0x39, 0x0a, 0x33, 0xdd, 0x3b, 0xee, 0x75, 0x96, 0xbd, 0x77,
+ 0x4f, 0xa2, 0x3c, 0x1a, 0x55, 0xe4, 0xbc, 0x17, 0x4b, 0x5c, 0xbc, 0xe8, 0x22,
+ 0x5a, 0xbd, 0xcf, 0xa8, 0x46, 0x3c, 0x2e, 0x1d, 0x2c, 0xbd, 0x7c, 0x53, 0x62,
+ 0xbc, 0x4e, 0xdc, 0x25, 0x3d, 0x3c, 0x94, 0x4e, 0xbd, 0xba, 0x9a, 0x3b, 0xbd,
+ 0x32, 0x01, 0x02, 0x3d, 0x57, 0xd2, 0x80, 0x3d, 0x88, 0x7d, 0xb4, 0xbc, 0x81,
+ 0xbf, 0x7f, 0xbd, 0xf7, 0xbb, 0x89, 0x3d, 0xa0, 0xba, 0x30, 0x3d, 0x13, 0xd5,
+ 0x91, 0x3d, 0xc7, 0x59, 0x37, 0x3d, 0x3c, 0xc1, 0x95, 0xbd, 0x41, 0x62, 0x94,
+ 0xbc, 0x09, 0x66, 0x25, 0xbc, 0x4a, 0x10, 0x84, 0xbd, 0xf0, 0x61, 0x09, 0x3d,
+ 0x7c, 0xba, 0x6d, 0x3d, 0x43, 0x44, 0x60, 0x3d, 0xbc, 0x42, 0x2d, 0x3d, 0x09,
+ 0x6d, 0x2d, 0x3d, 0x3b, 0x61, 0xb1, 0x3c, 0xd7, 0xb2, 0x36, 0xbc, 0x10, 0xe9,
+ 0x06, 0xbd, 0xd4, 0x30, 0x64, 0x3d, 0x4e, 0xb2, 0x8d, 0xbc, 0x54, 0x0d, 0x24,
+ 0xbd, 0xb6, 0x13, 0xe8, 0x3c, 0xe1, 0xd2, 0xd3, 0x3c, 0xd2, 0xc8, 0x99, 0xbc,
+ 0x5c, 0x05, 0x75, 0x3d, 0x58, 0x19, 0x91, 0x3d, 0x66, 0x5b, 0x03, 0xbd, 0xf4,
+ 0x88, 0xbd, 0xbc, 0xff, 0x51, 0x93, 0xbc, 0xaa, 0xc8, 0x3e, 0x3d, 0x57, 0x16,
+ 0xbc, 0xba, 0xf4, 0xe1, 0xa0, 0xbd, 0x3a, 0x82, 0x94, 0xbd, 0x77, 0xfa, 0x86,
+ 0xbd, 0xa6, 0xfd, 0x84, 0xbb, 0x91, 0x28, 0xeb, 0xbb, 0x86, 0xfd, 0xca, 0xbc,
+ 0x7f, 0xd4, 0x10, 0xbc, 0xea, 0x09, 0x08, 0xbd, 0xbe, 0x9e, 0x23, 0xbc, 0x5a,
+ 0x6a, 0x4f, 0xbd, 0x00, 0xf1, 0x54, 0x3d, 0xf4, 0x72, 0xb8, 0xbc, 0x0a, 0xde,
+ 0x0f, 0x3d, 0x27, 0x61, 0x1b, 0x3d, 0xed, 0xb6, 0x49, 0xbd, 0x11, 0x6d, 0xfb,
+ 0x3c, 0x51, 0x41, 0x75, 0x3d, 0x0b, 0x3b, 0x68, 0x3d, 0x1e, 0xb2, 0x6c, 0xbd,
+ 0xd0, 0x5a, 0xfe, 0x3c, 0x3d, 0xa0, 0x30, 0xbd, 0xc8, 0xf9, 0x89, 0x3c, 0x10,
+ 0x06, 0x72, 0x3d, 0xed, 0x61, 0xe1, 0x3a, 0x35, 0x65, 0x7e, 0x3d, 0x16, 0x6c,
+ 0x4d, 0x3d, 0x8a, 0xf6, 0x5a, 0x3d, 0x3e, 0x18, 0x64, 0x3d, 0x36, 0x9a, 0xbe,
+ 0x3c, 0x14, 0xa7, 0xba, 0xbc, 0x93, 0x98, 0xe3, 0x3c, 0x14, 0x13, 0x30, 0x3d,
+ 0xa8, 0x9a, 0x71, 0xbc, 0xd0, 0x9e, 0xfd, 0xbc, 0x10, 0x8b, 0xa7, 0xbd, 0xb9,
+ 0x47, 0x2f, 0x3d, 0x44, 0xff, 0x9c, 0xbd, 0x5b, 0x84, 0x3e, 0xbd, 0xc6, 0xa4,
+ 0xaa, 0x3c, 0x5b, 0xa9, 0x0e, 0xbd, 0x6b, 0xa6, 0x33, 0x3d, 0x65, 0x26, 0x46,
+ 0x3d, 0x8e, 0x5d, 0xdc, 0xbc, 0x62, 0xcf, 0x43, 0xbd, 0xfd, 0x0e, 0x86, 0x3d,
+ 0x52, 0xd5, 0xf3, 0x3c, 0x10, 0x00, 0x50, 0xbc, 0x55, 0xec, 0x6c, 0xbd, 0x9b,
+ 0x21, 0x46, 0x3d, 0xb3, 0xe4, 0x80, 0xbc, 0xa1, 0xf7, 0x84, 0xbd, 0x64, 0x01,
+ 0x4e, 0xbd, 0x01, 0xfb, 0x3e, 0xbc, 0x28, 0xfc, 0xac, 0xbc, 0x84, 0xf6, 0x17,
+ 0x3c, 0x69, 0x7c, 0xd9, 0xbc, 0x30, 0xb8, 0xfe, 0xbc, 0x0e, 0x3a, 0x87, 0xbd,
+ 0x88, 0xad, 0x93, 0xbd, 0xe1, 0x85, 0x8d, 0xbd, 0x42, 0x8c, 0x12, 0x3d, 0x41,
+ 0x59, 0x84, 0xbd, 0x1c, 0x0e, 0x70, 0xbb, 0xb0, 0x9e, 0xd3, 0xbc, 0x3c, 0x03,
+ 0xdb, 0xbb, 0xf4, 0x19, 0x01, 0x3d, 0x6f, 0x20, 0xc6, 0x3c, 0x77, 0xc0, 0xb4,
+ 0x3c, 0x4a, 0xa0, 0xa7, 0x3c, 0x1c, 0xaa, 0x2a, 0xbd, 0x49, 0x9b, 0x60, 0xbd,
+ 0x30, 0xff, 0xf9, 0xbc, 0x2f, 0x70, 0xc9, 0xbb, 0x72, 0x4b, 0x8f, 0xbd, 0x47,
+ 0xc6, 0x34, 0x3d, 0x18, 0x49, 0x21, 0x3c, 0x04, 0x19, 0x30, 0x3d, 0x74, 0xbe,
+ 0x7b, 0xbb, 0xbc, 0x92, 0x43, 0xbc, 0x6f, 0xb6, 0xdf, 0xbc, 0x20, 0xdb, 0x90,
+ 0x3c, 0x45, 0x29, 0x95, 0xbc, 0x4c, 0x9c, 0xa6, 0x3c, 0x2b, 0xbf, 0xe4, 0xbc,
+ 0xa9, 0x41, 0xff, 0xbc, 0x62, 0x15, 0xd4, 0x3c, 0x29, 0x60, 0x8e, 0xbd, 0x8d,
+ 0xce, 0x56, 0xbc, 0x84, 0x09, 0x41, 0x3d, 0x16, 0xb8, 0x35, 0x3d, 0x03, 0x5c,
+ 0x09, 0xbd, 0x82, 0xfe, 0x64, 0x3d, 0x16, 0x2e, 0x6d, 0xbd, 0xbf, 0x4b, 0x05,
+ 0xbd, 0x15, 0x9a, 0x28, 0xbd, 0x1d, 0x3d, 0x4f, 0xbd, 0x7c, 0x8a, 0x99, 0x3b,
+ 0xf9, 0x8c, 0x35, 0xbd, 0xef, 0xc2, 0x2a, 0xbd, 0xe6, 0xea, 0x85, 0xbc, 0xfd,
+ 0xf1, 0xde, 0x3b, 0xce, 0xb3, 0x5f, 0x3d, 0x2f, 0x4a, 0x30, 0xbc, 0xc5, 0xa1,
+ 0x09, 0xbd, 0x63, 0x5f, 0x5e, 0xbd, 0x44, 0xc9, 0xc2, 0xbc, 0xb6, 0x2a, 0xf8,
+ 0xbc, 0x58, 0x39, 0x34, 0x3d, 0x49, 0xbe, 0x5c, 0xbd, 0x45, 0xad, 0x1d, 0x3c,
+ 0x3f, 0x9f, 0x19, 0xbd, 0xfb, 0xef, 0x2e, 0x3c, 0xd5, 0xe8, 0x88, 0x3c, 0x13,
+ 0x36, 0x5c, 0xbd, 0x04, 0xeb, 0x78, 0x3c, 0x6e, 0x39, 0x64, 0x3d, 0xdc, 0x1e,
+ 0x70, 0x3d, 0x79, 0x43, 0x4d, 0x3d, 0xfd, 0x0f, 0x30, 0xbd, 0xd2, 0x88, 0x18,
+ 0x3d, 0x87, 0x62, 0xcc, 0x3c, 0x00, 0x39, 0x30, 0x3d, 0xba, 0xa0, 0xfa, 0xbc,
+ 0x00, 0x3d, 0x41, 0x3d, 0xed, 0xfa, 0x73, 0xbd, 0x0c, 0x09, 0x54, 0xbd, 0x77,
+ 0x2f, 0x5f, 0xbd, 0x01, 0x38, 0x7f, 0xbd, 0x98, 0x08, 0xee, 0xbc, 0x53, 0x34,
+ 0x48, 0xbc, 0x8a, 0x25, 0x72, 0xbc, 0xf3, 0x71, 0x70, 0xbd, 0x44, 0xdf, 0x1b,
+ 0x3d, 0xd8, 0x6e, 0x6f, 0xbd, 0xdf, 0x4d, 0x23, 0x3c, 0x9c, 0xfb, 0x21, 0x3d,
+ 0x72, 0xe1, 0xa4, 0xbc, 0x74, 0xc3, 0x2e, 0xbd, 0x63, 0x0c, 0x8a, 0xbc, 0x24,
+ 0x09, 0x6e, 0xbd, 0xbb, 0x68, 0x68, 0xbd, 0x7d, 0xd7, 0x6c, 0x3d, 0xd8, 0x63,
+ 0x63, 0x3c, 0x1a, 0x16, 0xdb, 0xbb, 0x86, 0x5e, 0x40, 0xbd, 0x50, 0x6d, 0x31,
+ 0xbb, 0xdd, 0xb6, 0x96, 0xbd, 0x19, 0x27, 0x56, 0xbd, 0xf3, 0xd5, 0x11, 0x3d,
+ 0x91, 0x8e, 0x68, 0x3d, 0xea, 0xed, 0x86, 0xbd, 0xd6, 0x51, 0x87, 0xbc, 0xfb,
+ 0x6c, 0x76, 0xbd, 0x50, 0x6f, 0x38, 0x3d, 0x9b, 0xa5, 0x71, 0xbd, 0x9b, 0x1f,
+ 0x16, 0xbd, 0x25, 0xee, 0x93, 0x3d, 0xa9, 0x05, 0xca, 0xbc, 0x9f, 0xee, 0x36,
+ 0xbd, 0x5c, 0x03, 0x28, 0x3d, 0x52, 0x3b, 0xb1, 0x3c, 0xe3, 0x45, 0x13, 0x3d,
+ 0x38, 0xec, 0x82, 0xbd, 0xba, 0xc6, 0x5f, 0x3d, 0x18, 0xf7, 0x59, 0x3d, 0xc4,
+ 0x2f, 0x89, 0x3c, 0x3c, 0x23, 0xd1, 0xbc, 0x39, 0xa7, 0x28, 0x3d, 0x07, 0x78,
+ 0x17, 0xbc, 0x72, 0xe3, 0xaf, 0xbc, 0x15, 0x2e, 0x2d, 0x3d, 0x2c, 0x3d, 0xa3,
+ 0x3c, 0x33, 0x96, 0x18, 0xbd, 0xee, 0x47, 0x30, 0xbd, 0x56, 0xc0, 0x0e, 0xbd,
+ 0xae, 0x3b, 0x74, 0x3c, 0x79, 0x3e, 0x94, 0x3d, 0xee, 0x19, 0x3d, 0xbd, 0x8d,
+ 0x14, 0x7a, 0xbd, 0x49, 0xfa, 0x2e, 0x3d, 0x9a, 0x0e, 0x8e, 0xbd, 0x41, 0x87,
+ 0x45, 0x3c, 0x3b, 0x28, 0x66, 0xbd, 0x3d, 0xbd, 0x20, 0x3d, 0x60, 0x4e, 0x80,
+ 0xbd, 0x7a, 0x3c, 0x50, 0xbd, 0xaa, 0x0f, 0x9e, 0xbd, 0xa2, 0x81, 0x57, 0xbd,
+ 0x69, 0xf7, 0x27, 0x3d, 0x62, 0x88, 0x17, 0xbc, 0x47, 0x5d, 0xac, 0x3c, 0xe7,
+ 0x41, 0x31, 0xbd, 0xde, 0xec, 0x85, 0xbd, 0x74, 0xa1, 0x48, 0xbd, 0x80, 0x0d,
+ 0x2a, 0xbd, 0x5e, 0x67, 0x7e, 0x3c, 0x35, 0xa5, 0xc6, 0x3c, 0xc4, 0xeb, 0x89,
+ 0xbc, 0xcb, 0xa7, 0x97, 0x3c, 0x0f, 0xca, 0x68, 0x3c, 0xeb, 0x57, 0xea, 0xbc,
+ 0x88, 0xf8, 0xb3, 0x3c, 0x44, 0x92, 0xee, 0x3c, 0x89, 0xa1, 0x92, 0x3d, 0x61,
+ 0xa5, 0x23, 0x3a, 0x1e, 0x6c, 0x28, 0xbd, 0x18, 0x89, 0xa4, 0x3c, 0xd1, 0x26,
+ 0x47, 0x3b, 0x4a, 0x06, 0x80, 0x3c, 0x3a, 0x5f, 0x58, 0xbd, 0x6e, 0x1d, 0x77,
+ 0xbd, 0xe1, 0x43, 0x89, 0x3a, 0x41, 0xd0, 0x71, 0xbc, 0x90, 0x43, 0x40, 0xbd,
+ 0xa5, 0xc3, 0x3a, 0x3c, 0xc2, 0x45, 0xb1, 0xbb, 0xf1, 0x81, 0x32, 0x3d, 0x80,
+ 0x8e, 0x20, 0x3d, 0x0a, 0xbd, 0x14, 0x3d, 0xbb, 0x93, 0x3e, 0xbd, 0x50, 0x1f,
+ 0x5b, 0x3d, 0xb7, 0xd1, 0x99, 0xbd, 0xbe, 0x77, 0x4b, 0x3d, 0x5f, 0xd4, 0x58,
+ 0x3d, 0xdc, 0xab, 0xa4, 0x3c, 0x41, 0x6c, 0x78, 0xbd, 0xbd, 0x11, 0x71, 0x3c,
+ 0xc9, 0x97, 0x50, 0xbd, 0x93, 0xca, 0xe9, 0x3b, 0xec, 0x1b, 0xb4, 0xbc, 0xcf,
+ 0xb1, 0x48, 0x3c, 0x26, 0xd1, 0x99, 0x3c, 0x9b, 0xca, 0x26, 0xbd, 0xe0, 0xaf,
+ 0x2f, 0xbc, 0xef, 0x23, 0x84, 0xbd, 0x10, 0x75, 0xe1, 0x3b, 0xe6, 0x8c, 0x3c,
+ 0x3d, 0xad, 0x1a, 0x48, 0x3d, 0xfe, 0x04, 0x3f, 0x3d, 0xf2, 0x2f, 0xe0, 0xbc,
+ 0x98, 0x58, 0xe3, 0xbb, 0xe2, 0x78, 0x84, 0x3d, 0xde, 0x9e, 0x97, 0x3b, 0xe3,
+ 0x90, 0x35, 0xbd, 0xb9, 0xf5, 0x57, 0x3c, 0x29, 0x97, 0x18, 0x3c, 0xa7, 0xe6,
+ 0x02, 0x3d, 0x6e, 0xd3, 0x0b, 0x3d, 0x09, 0x9f, 0x51, 0xbd, 0xca, 0x5b, 0xac,
+ 0x3a, 0x38, 0xd9, 0x55, 0xbd, 0xc0, 0x50, 0x0b, 0x3d, 0x63, 0xe8, 0x69, 0xbd,
+ 0x96, 0xeb, 0x86, 0xbd, 0x43, 0x18, 0x26, 0x3d, 0x76, 0xab, 0xd8, 0x3a, 0xe3,
+ 0x0e, 0xb9, 0xbc, 0xed, 0xb2, 0x33, 0x3c, 0x67, 0x1d, 0x7c, 0xbd, 0x13, 0x39,
+ 0xa8, 0x3b, 0x4b, 0xa3, 0x39, 0xbd, 0x17, 0xb9, 0x44, 0xbd, 0x88, 0x76, 0x43,
+ 0xbd, 0xdd, 0x31, 0x61, 0xbd, 0x2d, 0x7d, 0xae, 0xbc, 0xe9, 0xb8, 0x05, 0x3d,
+ 0xdd, 0x80, 0x2a, 0xbd, 0x55, 0x66, 0x08, 0xbd, 0xea, 0x09, 0x8a, 0xbd, 0x13,
+ 0xd8, 0x0d, 0xbd, 0x7e, 0x9d, 0x5a, 0x3d, 0x08, 0x68, 0x8d, 0x3c, 0x02, 0x87,
+ 0xdc, 0x3c, 0xfb, 0x55, 0xda, 0xb9, 0xc4, 0x69, 0x71, 0xbd, 0xd1, 0x02, 0xf6,
+ 0xbc, 0x92, 0x01, 0x0c, 0x3d, 0xbb, 0x2c, 0x40, 0xbd, 0x82, 0x69, 0x97, 0x3d,
+ 0x2b, 0xda, 0x57, 0xbd, 0x7b, 0x9b, 0xe0, 0x3b, 0xff, 0xfd, 0x4b, 0xbd, 0x5c,
+ 0xa6, 0x2e, 0x3d, 0x40, 0xec, 0x85, 0xbd, 0x3b, 0x5d, 0x17, 0xbd, 0x52, 0x04,
+ 0x2c, 0xbd, 0x61, 0x00, 0x20, 0x3c, 0x65, 0x33, 0x28, 0xbc, 0x77, 0x76, 0x07,
+ 0x3d, 0x7a, 0xff, 0x32, 0x3b, 0xb9, 0x96, 0x59, 0xbd, 0xe0, 0xe1, 0x43, 0xbd,
+ 0x17, 0xa7, 0x6b, 0xbd, 0xf8, 0xa6, 0x4d, 0xbd, 0x4f, 0xc3, 0x9d, 0xbb, 0xfa,
+ 0x3a, 0x39, 0xbd, 0xe3, 0x59, 0x9a, 0xbd, 0xbd, 0xb9, 0x43, 0xbc, 0x21, 0xc4,
+ 0x0c, 0x3c, 0x3e, 0x70, 0x47, 0xbd, 0x42, 0xcf, 0x93, 0x3b, 0x9b, 0xe0, 0x34,
+ 0x3d, 0x00, 0x5d, 0xeb, 0x39, 0x5f, 0x65, 0x80, 0xbd, 0x37, 0x8a, 0x65, 0x3d,
+ 0x0e, 0x1b, 0x67, 0xbc, 0xa0, 0x0a, 0x68, 0x3c, 0xc5, 0x6d, 0xf7, 0x3c, 0xe1,
+ 0x9d, 0x85, 0x3d, 0xa8, 0xe7, 0x69, 0xbd, 0x30, 0x9c, 0x36, 0xbd, 0xcf, 0x55,
+ 0xdf, 0x3c, 0x85, 0xe9, 0x4c, 0x3d, 0x3e, 0x03, 0x8a, 0xbd, 0x19, 0xe1, 0x86,
+ 0xbb, 0xa0, 0x51, 0xec, 0x3c, 0x11, 0xc9, 0x84, 0x3d, 0x48, 0xa9, 0x1d, 0x3d,
+ 0x1c, 0xd6, 0xee, 0x3b, 0x82, 0x07, 0x96, 0xbc, 0x33, 0x6b, 0xd0, 0x3c, 0x62,
+ 0x62, 0xb6, 0x3c, 0x4a, 0x35, 0x62, 0x3d, 0x10, 0x85, 0x66, 0xbd, 0xc9, 0xf5,
+ 0x53, 0xbc, 0x70, 0x4a, 0xfa, 0x3b, 0xa5, 0x21, 0x33, 0xbd, 0xe7, 0x07, 0x40,
+ 0x3b, 0x6d, 0xe3, 0x16, 0x3d, 0x11, 0xa2, 0xa7, 0x3a, 0x01, 0x73, 0x95, 0xbc,
+ 0x5c, 0xd1, 0x2e, 0xbd, 0x5c, 0x41, 0x00, 0xbd, 0x02, 0x40, 0x8a, 0x3d, 0x66,
+ 0xcf, 0x2b, 0x3d, 0x3d, 0x54, 0x8b, 0xbc, 0x1b, 0x25, 0x44, 0x3d, 0x56, 0xda,
+ 0x15, 0xbd, 0xfc, 0x0c, 0xc1, 0xbc, 0x4d, 0xcd, 0x5e, 0xbd, 0x40, 0x55, 0x2c,
+ 0x3d, 0xb9, 0xe6, 0xc5, 0xbc, 0x6b, 0x0d, 0xd2, 0xba, 0xd0, 0x10, 0x28, 0x3c,
+ 0x6b, 0xd8, 0x63, 0xbd, 0xf7, 0xed, 0xca, 0x3c, 0xa3, 0x63, 0x5a, 0x3b, 0x45,
+ 0x41, 0x8e, 0x3d, 0x48, 0x23, 0xd7, 0x3c, 0x71, 0xbb, 0xa8, 0x3c, 0xe2, 0x55,
+ 0x98, 0x3c, 0x27, 0xae, 0x5e, 0xbc, 0x06, 0x79, 0xb4, 0xbb, 0x8c, 0xdb, 0x13,
+ 0xbd, 0x7b, 0x59, 0x18, 0x3d, 0xbb, 0x91, 0xfc, 0xbc, 0x4b, 0x7d, 0x80, 0xbd,
+ 0x58, 0x76, 0x8a, 0x3c, 0x5f, 0x71, 0xa8, 0x3c, 0xb3, 0x8f, 0x89, 0xbd, 0xb4,
+ 0x4c, 0x64, 0xbd, 0xf9, 0x1a, 0x81, 0x3d, 0x8f, 0xa5, 0x90, 0xbd, 0x24, 0x93,
+ 0xbf, 0x3c, 0x1c, 0x73, 0x68, 0x3d, 0xa5, 0x53, 0x4a, 0xbd, 0xec, 0x40, 0x34,
+ 0xbd, 0xb2, 0x5f, 0x90, 0x3d, 0x0d, 0xe3, 0x11, 0x3d, 0x5b, 0x77, 0x91, 0x3d,
+ 0xe4, 0x5b, 0x8b, 0x3d, 0x99, 0x6e, 0x6a, 0xbd, 0x05, 0xcb, 0x99, 0xbd, 0xb5,
+ 0x26, 0x1f, 0xbd, 0xfd, 0xc3, 0x2f, 0xbd, 0xd2, 0x82, 0x96, 0x3d, 0x06, 0xf6,
+ 0x78, 0xbd, 0x8e, 0x08, 0x30, 0x3d, 0x16, 0x22, 0x6d, 0xbd, 0xda, 0x25, 0x4b,
+ 0x3d, 0xf7, 0x44, 0x43, 0xbc, 0xba, 0x20, 0xbc, 0xbc, 0x41, 0xd7, 0x04, 0xbc,
+ 0xe1, 0x62, 0x0d, 0xbd, 0x93, 0x78, 0x2f, 0xbd, 0x2a, 0xad, 0xd5, 0xbc, 0x13,
+ 0xd3, 0x6f, 0xbd, 0x88, 0xc4, 0x12, 0xbd, 0x49, 0x73, 0x84, 0xbd, 0xd6, 0x50,
+ 0x2c, 0x3d, 0xa9, 0xb7, 0x7d, 0xbd, 0x9a, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80, 0x04, 0x00, 0x00,
+ 0xae, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xc0,
+ 0x02, 0x74, 0xbb, 0xc6, 0x58, 0x47, 0x39, 0x07, 0x36, 0x4d, 0x3c, 0xf5, 0x20,
+ 0xc5, 0x3c, 0xce, 0x88, 0x6c, 0x3a, 0xd2, 0x40, 0x7d, 0xbc, 0x2f, 0x7e, 0xf5,
+ 0x3a, 0x3d, 0xe1, 0x3e, 0xbc, 0xda, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+ 0x40, 0x00, 0x00, 0x00, 0x1d, 0xe1, 0xa3, 0xbc, 0xe7, 0x98, 0x88, 0x3c, 0xe4,
+ 0xc0, 0x49, 0x3b, 0xa6, 0x49, 0x38, 0x3c, 0x0e, 0x65, 0xbc, 0xbc, 0xd8, 0x59,
+ 0x73, 0xbc, 0x15, 0x66, 0x0a, 0xbd, 0x7c, 0x75, 0x24, 0xba, 0x37, 0xc4, 0x65,
+ 0x3c, 0x94, 0x0d, 0x84, 0x3c, 0x26, 0xcc, 0x87, 0x3c, 0x59, 0xea, 0x03, 0xbd,
+ 0x33, 0x39, 0x48, 0xbc, 0xac, 0x3e, 0x6d, 0x3c, 0xc7, 0x46, 0xb1, 0xbb, 0xcf,
+ 0xee, 0x07, 0x3d, 0x26, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x00, 0x7c, 0xe9, 0x43, 0x3c, 0xd3, 0x16, 0xd7, 0xbc, 0x15, 0x37, 0x4a,
+ 0xba, 0xa4, 0xad, 0x1c, 0x3c, 0x20, 0x66, 0x3b, 0xbb, 0x22, 0x84, 0x97, 0x3a,
+ 0xa5, 0x65, 0x86, 0x3c, 0x68, 0x0b, 0xf7, 0xbb, 0x52, 0xaf, 0x8c, 0x3b, 0xe1,
+ 0x81, 0x00, 0x3d, 0x3c, 0xf9, 0xd9, 0x3c, 0x96, 0xa8, 0x80, 0x3c, 0x94, 0xdf,
+ 0x21, 0x3c, 0xc7, 0x26, 0xd7, 0x3a, 0x96, 0xb2, 0x8c, 0x3c, 0x17, 0x29, 0x20,
+ 0x3c, 0xfa, 0xe0, 0x59, 0x3c, 0xf7, 0x08, 0x14, 0x3c, 0xad, 0x71, 0x61, 0x3c,
+ 0x2e, 0x73, 0x1a, 0xbc, 0x0f, 0xd0, 0x55, 0xbb, 0xa8, 0xde, 0x68, 0x3c, 0xd9,
+ 0x86, 0x44, 0x3c, 0x54, 0x22, 0x05, 0xbc, 0x3c, 0x7a, 0x92, 0x3c, 0x70, 0x16,
+ 0x01, 0x3c, 0x69, 0x1e, 0xaf, 0xbb, 0xe8, 0x4b, 0xc5, 0xbc, 0x8b, 0xfd, 0x23,
+ 0x3c, 0xb8, 0x1e, 0xfd, 0xbc, 0x49, 0x11, 0x50, 0xbb, 0x2a, 0x7b, 0x9c, 0x3c,
+ 0xb2, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x6e,
+ 0x5f, 0x06, 0xba, 0xca, 0x9c, 0x99, 0xbb, 0x00, 0x00, 0x00, 0x00, 0xa4, 0x8a,
+ 0xfe, 0xba, 0x12, 0xed, 0xa7, 0x3c, 0xc0, 0x7d, 0x37, 0xbb, 0xa3, 0x8a, 0x30,
+ 0xbb, 0xd0, 0x95, 0x99, 0xbc, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9c, 0x1c, 0x3d,
+ 0x5c, 0x2a, 0x8e, 0xbb, 0x8c, 0xc0, 0x1a, 0xbb, 0x5b, 0xa1, 0xe5, 0x3b, 0x00,
+ 0x00, 0x00, 0x00, 0x6a, 0x50, 0xef, 0x3c, 0xdc, 0xbc, 0x9a, 0x3a, 0x00, 0x00,
+ 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0x6e, 0x6b, 0xdf, 0xbb, 0x54, 0xe6, 0xe6, 0x3c,
+ 0xd0, 0xf4, 0xff, 0xff, 0xd4, 0xf4, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x4d,
+ 0x4c, 0x49, 0x52, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x64,
+ 0x2e, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e,
+ 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00,
+ 0x0e, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0,
+ 0x02, 0x00, 0x00, 0xa4, 0x02, 0x00, 0x00, 0xa8, 0x02, 0x00, 0x00, 0x04, 0x00,
+ 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00,
+ 0x00, 0x38, 0x02, 0x00, 0x00, 0xd4, 0x01, 0x00, 0x00, 0x80, 0x01, 0x00, 0x00,
+ 0x3c, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x8c,
+ 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5a, 0xfe,
+ 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x10, 0x00, 0x00,
+ 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x68, 0xf5, 0xff, 0xff,
+ 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x13,
+ 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x8e, 0xfe,
+ 0xff, 0xff, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x1c, 0x00, 0x00,
+ 0x00, 0x20, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00,
+ 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x12, 0x00,
+ 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a,
+ 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00,
+ 0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x11, 0x00,
+ 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xff, 0xff, 0x14, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0xee, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x10, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xd0,
+ 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+ 0x03, 0x00, 0x00, 0x00, 0x7e, 0xff, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x6e, 0xff, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+ 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0e,
+ 0x00, 0x00, 0x00, 0x5e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x50, 0xff, 0xff,
+ 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x03,
+ 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x04, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1a, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c,
+ 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x05, 0x34, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x17, 0x00, 0x10, 0x00,
+ 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+ 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00,
+ 0x00, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00,
+ 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x28, 0x00, 0x00, 0x00,
+ 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x13, 0x00, 0x0c, 0x00, 0x08,
+ 0x00, 0x07, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+ 0x00, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x09, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00,
+ 0x00, 0x00, 0x10, 0x08, 0x00, 0x00, 0xc4, 0x07, 0x00, 0x00, 0x7c, 0x07, 0x00,
+ 0x00, 0x44, 0x07, 0x00, 0x00, 0x0c, 0x07, 0x00, 0x00, 0xd4, 0x06, 0x00, 0x00,
+ 0x88, 0x06, 0x00, 0x00, 0x2c, 0x06, 0x00, 0x00, 0xe0, 0x05, 0x00, 0x00, 0x8c,
+ 0x05, 0x00, 0x00, 0x38, 0x05, 0x00, 0x00, 0xe4, 0x04, 0x00, 0x00, 0x28, 0x04,
+ 0x00, 0x00, 0xb4, 0x03, 0x00, 0x00, 0xf8, 0x02, 0x00, 0x00, 0x84, 0x02, 0x00,
+ 0x00, 0xc8, 0x01, 0x00, 0x00, 0x54, 0x01, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00,
+ 0x5c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xf8, 0xff, 0xff, 0x14,
+ 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x15, 0x00,
+ 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff,
+ 0xff, 0x02, 0x00, 0x00, 0x00, 0x3c, 0xf8, 0xff, 0xff, 0x19, 0x00, 0x00, 0x00,
+ 0x53, 0x74, 0x61, 0x74, 0x65, 0x66, 0x75, 0x6c, 0x50, 0x61, 0x72, 0x74, 0x69,
+ 0x74, 0x69, 0x6f, 0x6e, 0x65, 0x64, 0x43, 0x61, 0x6c, 0x6c, 0x3a, 0x30, 0x00,
+ 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+ 0x00, 0xac, 0xf8, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+ 0x1c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x90, 0xf8,
+ 0xff, 0xff, 0x5b, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+ 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+ 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x3b, 0x73, 0x65,
+ 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64,
+ 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x52, 0x65, 0x6c, 0x75,
+ 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+ 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x42,
+ 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x40, 0xf9, 0xff, 0xff, 0x14, 0x00, 0x00,
+ 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
+ 0x3c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80,
+ 0x04, 0x00, 0x00, 0x24, 0xf9, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x73, 0x65,
+ 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x66,
+ 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f, 0x52, 0x65, 0x73,
+ 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x80, 0x04, 0x00, 0x00, 0x9c, 0xf9, 0xff, 0xff, 0x14, 0x00,
+ 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00,
+ 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+ 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x88,
+ 0xf9, 0xff, 0xff, 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e,
+ 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70,
+ 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x38, 0x2f,
+ 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00,
+ 0x00, 0x00, 0x0c, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00,
+ 0x00, 0x24, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0c,
+ 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xff, 0xff, 0x6e, 0x00,
+ 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f,
+ 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33,
+ 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+ 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64,
+ 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43,
+ 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f,
+ 0x32, 0x34, 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
+ 0x20, 0x00, 0x00, 0x00, 0xc4, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24,
+ 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00,
+ 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00,
+ 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xb0, 0xfa, 0xff, 0xff,
+ 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61,
+ 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c,
+ 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x37, 0x2f, 0x4d, 0x61, 0x78,
+ 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+ 0x0e, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x34,
+ 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00,
+ 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0xff, 0xff, 0xff, 0xff, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x00, 0x00, 0x20, 0xfb, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73,
+ 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f,
+ 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x52, 0x65,
+ 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c,
+ 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x32, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71,
+ 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f,
+ 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x43, 0x6f, 0x6e, 0x76,
+ 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32,
+ 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0xec, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+ 0x24, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04,
+ 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x1f, 0x00,
+ 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xd8, 0xfb, 0xff, 0xff, 0x27, 0x00, 0x00,
+ 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+ 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67,
+ 0x32, 0x64, 0x5f, 0x31, 0x39, 0x36, 0x2f, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f,
+ 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00,
+ 0x00, 0x1f, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x5c, 0xfc, 0xff, 0xff,
+ 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0d,
+ 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff,
+ 0xff, 0xff, 0x3e, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00,
+ 0x00, 0x48, 0xfc, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75,
+ 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e,
+ 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x42,
+ 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e,
+ 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32,
+ 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b,
+ 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x62, 0x69,
+ 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3e,
+ 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x56, 0xfd,
+ 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00,
+ 0x00, 0x2c, 0x00, 0x00, 0x00, 0xe8, 0xfc, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43,
+ 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+ 0xa6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0b,
+ 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x38, 0xfd, 0xff, 0xff, 0x1f, 0x00,
+ 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f,
+ 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32,
+ 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10,
+ 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00,
+ 0x00, 0x00, 0xf6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x0a, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x88, 0xfd, 0xff, 0xff,
+ 0x1f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61,
+ 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32,
+ 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x46, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10,
+ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0xd8, 0xfd,
+ 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+ 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+ 0x31, 0x36, 0x34, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x8e, 0xfe,
+ 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00,
+ 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61,
+ 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x80, 0x04, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x14, 0x00,
+ 0x13, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x02, 0x2c, 0x00, 0x00, 0x00, 0x7c, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00,
+ 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+ 0x33, 0x2f, 0x66, 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f,
+ 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00,
+ 0x00, 0x00, 0x2e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xc0, 0xfe, 0xff, 0xff,
+ 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x31, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00,
+ 0x00, 0x00, 0x62, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x05, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff,
+ 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x32, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x96, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x28, 0xff, 0xff, 0xff,
+ 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00,
+ 0x00, 0x00, 0xca, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x5c, 0xff, 0xff, 0xff,
+ 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33,
+ 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c,
+ 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0,
+ 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+ 0x31, 0x36, 0x34, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0xff, 0xff, 0xff, 0xff, 0x40, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x22,
+ 0x00, 0x00, 0x00, 0x73, 0x65, 0x72, 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65,
+ 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f,
+ 0x32, 0x34, 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3a, 0x30, 0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x40,
+ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00,
+ 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0xdc, 0xff, 0xff, 0xff, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,
+ 0xe8, 0xff, 0xff, 0xff, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0xf4,
+ 0xff, 0xff, 0xff, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x0c, 0x00,
+ 0x0c, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03
+};
diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c
new file mode 100644
index 0000000000..2fab99dd8b
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/dwt.h"
+
+// Note: block length must be even for this implementation
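+//
+// Informal sketch of the lifting structure (derived from the code below):
+// this is a LeGall 5/3 analysis. In the row pass, the first loop is the
+// predict step, producing
+//   highpass[i] = x[2i+1] - ((x[2i] + x[2i+2] + 1) >> 1)
+// while staging the even samples, scaled by 2, in lowpass (the column pass
+// uses different scaling). The second loop is the update step, adding
+// (highpass[i-1] + highpass[i] + 1) >> 1 into lowpass[i], with highpass[0]
+// mirrored at the left boundary.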
+static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass,
+ tran_low_t *highpass) {
+ int n;
+ tran_low_t r, *a, *b;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ while (--n) {
+ *a++ = (r = *x++) * 2;
+ *b++ = *x - ((r + x[1] + 1) >> 1);
+ x++;
+ }
+ *a = (r = *x++) * 2;
+ *b = *x - r;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ r = *highpass;
+ while (n--) {
+ *a++ += (r + (*b) + 1) >> 1;
+ r = *b++;
+ }
+}
+
+static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
+ tran_low_t *highpass) {
+ int n;
+ tran_low_t r, *a, *b;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ while (--n) {
+ *a++ = (r = *x++);
+ *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2;
+ x++;
+ }
+ *a = (r = *x++);
+ *b = (*x - r + 1) >> 1;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ r = *highpass;
+ while (n--) {
+ *a++ += (r + (*b) + 1) >> 1;
+ r = *b++;
+ }
+}
+
+static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
+ const uint8_t *x, int pitch_x,
+ tran_low_t *c, int pitch_c,
+ int dwt_scale_bits, int hbd) {
+ int lv, i, j, nh, nw, hh = height, hw = width;
+ tran_low_t buffer[2 * DWT_MAX_LENGTH];
+
+ if (hbd) {
+ const uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
+ }
+ }
+ } else {
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
+ }
+ }
+ }
+
+ for (lv = 0; lv < levels; lv++) {
+ nh = hh;
+ hh = (hh + 1) >> 1;
+ nw = hw;
+ hw = (hw + 1) >> 1;
+ if ((nh < 2) || (nw < 2)) return;
+ for (i = 0; i < nh; i++) {
+ memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+ analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+ }
+ for (j = 0; j < nw; j++) {
+ for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j];
+ analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+ for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i];
+ }
+ }
+}
+
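+// Note: with an 8x8 input, the level loop in dyadic_analyze_53_uint8_input()
+// exits once the low band reaches 1x1, so only three of the four requested
+// decomposition levels are actually performed (8x8 -> 4x4 -> 2x2 -> 1x1).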
+void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
+ int stride, int hbd) {
+ dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);
+}
+
+static int haar_ac_sad(const tran_low_t *output, int bw, int bh, int stride) {
+ int acsad = 0;
+
+ for (int r = 0; r < bh; ++r)
+ for (int c = 0; c < bw; ++c) {
+ if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]);
+ }
+ return acsad;
+}
+
+static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride,
+ int hbd) {
+ tran_low_t output[64];
+
+ av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
+ return haar_ac_sad(output, 8, 8, 8);
+}
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+ int hbd, int num_8x8_rows,
+ int num_8x8_cols) {
+ int64_t wavelet_energy = 0;
+ for (int r8 = 0; r8 < num_8x8_rows; ++r8) {
+ for (int c8 = 0; c8 < num_8x8_cols; ++c8) {
+ wavelet_energy += haar_ac_sad_8x8_uint8_input(
+ input + c8 * 8 + r8 * 8 * stride, stride, hbd);
+ }
+ }
+ return wavelet_energy;
+}
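+
+// Usage sketch (illustrative only; `src` and `stride` are hypothetical
+// caller-side values): measuring the wavelet AC energy of a 32x32 8-bit
+// luma block, which is covered by a 4x4 grid of 8x8 tiles:
+//
+//   const uint8_t *src = ...;
+//   const int stride = ...;
+//   const int64_t energy = av1_haar_ac_sad_mxn_uint8_input(
+//       src, stride, /*hbd=*/0, /*num_8x8_rows=*/4, /*num_8x8_cols=*/4);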
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
new file mode 100644
index 0000000000..443b6bc12c
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_DWT_H_
+#define AOM_AV1_ENCODER_DWT_H_
+
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+
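+// Upper bound on the row/column length handled by the dyadic analysis; the
+// scratch buffer in dwt.c is sized as 2 * DWT_MAX_LENGTH entries.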
+#define DWT_MAX_LENGTH 64
+
+void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
+ int stride, int hbd);
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+ int hbd, int num_8x8_rows,
+ int num_8x8_cols);
+
+#endif // AOM_AV1_ENCODER_DWT_H_
diff --git a/third_party/aom/av1/encoder/enc_enums.h b/third_party/aom/av1/encoder/enc_enums.h
new file mode 100644
index 0000000000..20cefa16a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/enc_enums.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_
+#define AOM_AV1_ENCODER_ENC_ENUMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code.
+enum {
+ THR_NEARESTMV,
+ THR_NEARESTL2,
+ THR_NEARESTL3,
+ THR_NEARESTB,
+ THR_NEARESTA2,
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ THR_NEWMV,
+ THR_NEWL2,
+ THR_NEWL3,
+ THR_NEWB,
+ THR_NEWA2,
+ THR_NEWA,
+ THR_NEWG,
+
+ THR_NEARMV,
+ THR_NEARL2,
+ THR_NEARL3,
+ THR_NEARB,
+ THR_NEARA2,
+ THR_NEARA,
+ THR_NEARG,
+
+ THR_GLOBALMV,
+ THR_GLOBALL2,
+ THR_GLOBALL3,
+ THR_GLOBALB,
+ THR_GLOBALA2,
+ THR_GLOBALA,
+ THR_GLOBALG,
+
+ THR_COMP_NEAREST_NEARESTLA,
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+ THR_COMP_NEAREST_NEARESTGA,
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTLA2,
+ THR_COMP_NEAREST_NEARESTL2A2,
+ THR_COMP_NEAREST_NEARESTL3A2,
+ THR_COMP_NEAREST_NEARESTGA2,
+ THR_COMP_NEAREST_NEARESTLL2,
+ THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG,
+ THR_COMP_NEAREST_NEARESTBA,
+
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_GLOBAL_GLOBALLB,
+
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_GLOBAL_GLOBALLA,
+
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_GLOBAL_GLOBALL2A,
+
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_GLOBAL_GLOBALL3A,
+
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_GLOBAL_GLOBALGA,
+
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_GLOBAL_GLOBALL2B,
+
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_GLOBAL_GLOBALL3B,
+
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_GLOBAL_GLOBALGB,
+
+ THR_COMP_NEAR_NEARLA2,
+ THR_COMP_NEW_NEWLA2,
+ THR_COMP_NEW_NEARESTLA2,
+ THR_COMP_NEAREST_NEWLA2,
+ THR_COMP_NEW_NEARLA2,
+ THR_COMP_NEAR_NEWLA2,
+ THR_COMP_GLOBAL_GLOBALLA2,
+
+ THR_COMP_NEAR_NEARL2A2,
+ THR_COMP_NEW_NEWL2A2,
+ THR_COMP_NEW_NEARESTL2A2,
+ THR_COMP_NEAREST_NEWL2A2,
+ THR_COMP_NEW_NEARL2A2,
+ THR_COMP_NEAR_NEWL2A2,
+ THR_COMP_GLOBAL_GLOBALL2A2,
+
+ THR_COMP_NEAR_NEARL3A2,
+ THR_COMP_NEW_NEWL3A2,
+ THR_COMP_NEW_NEARESTL3A2,
+ THR_COMP_NEAREST_NEWL3A2,
+ THR_COMP_NEW_NEARL3A2,
+ THR_COMP_NEAR_NEWL3A2,
+ THR_COMP_GLOBAL_GLOBALL3A2,
+
+ THR_COMP_NEAR_NEARGA2,
+ THR_COMP_NEW_NEWGA2,
+ THR_COMP_NEW_NEARESTGA2,
+ THR_COMP_NEAREST_NEWGA2,
+ THR_COMP_NEW_NEARGA2,
+ THR_COMP_NEAR_NEWGA2,
+ THR_COMP_GLOBAL_GLOBALGA2,
+
+ THR_COMP_NEAR_NEARLL2,
+ THR_COMP_NEW_NEWLL2,
+ THR_COMP_NEW_NEARESTLL2,
+ THR_COMP_NEAREST_NEWLL2,
+ THR_COMP_NEW_NEARLL2,
+ THR_COMP_NEAR_NEWLL2,
+ THR_COMP_GLOBAL_GLOBALLL2,
+
+ THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEW_NEWLL3,
+ THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEAR_NEWLL3,
+ THR_COMP_GLOBAL_GLOBALLL3,
+
+ THR_COMP_NEAR_NEARLG,
+ THR_COMP_NEW_NEWLG,
+ THR_COMP_NEW_NEARESTLG,
+ THR_COMP_NEAREST_NEWLG,
+ THR_COMP_NEW_NEARLG,
+ THR_COMP_NEAR_NEWLG,
+ THR_COMP_GLOBAL_GLOBALLG,
+
+ THR_COMP_NEAR_NEARBA,
+ THR_COMP_NEW_NEWBA,
+ THR_COMP_NEW_NEARESTBA,
+ THR_COMP_NEAREST_NEWBA,
+ THR_COMP_NEW_NEARBA,
+ THR_COMP_NEAR_NEWBA,
+ THR_COMP_GLOBAL_GLOBALBA,
+
+ THR_DC,
+ THR_PAETH,
+ THR_SMOOTH,
+ THR_SMOOTH_V,
+ THR_SMOOTH_H,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D203_PRED,
+ THR_D157_PRED,
+ THR_D67_PRED,
+ THR_D113_PRED,
+ THR_D45_PRED,
+
+ MAX_MODES,
+ SINGLE_REF_MODE_START = THR_NEARESTMV,
+ SINGLE_REF_MODE_END = THR_COMP_NEAREST_NEARESTLA,
+ NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START,
+ THR_MODE_START = THR_NEARESTMV,
+ THR_MODE_END = MAX_MODES,
+ THR_INTER_MODE_START = THR_MODE_START,
+ THR_INTER_MODE_END = THR_DC,
+ THR_INVALID = 255
+} UENUM1BYTE(THR_MODES);
+
+enum {
+ THR_LAST,
+ THR_LAST2,
+ THR_LAST3,
+ THR_BWDR,
+ THR_ALTR2,
+ THR_GOLD,
+ THR_ALTR,
+
+ THR_COMP_LA,
+ THR_COMP_L2A,
+ THR_COMP_L3A,
+ THR_COMP_GA,
+
+ THR_COMP_LB,
+ THR_COMP_L2B,
+ THR_COMP_L3B,
+ THR_COMP_GB,
+
+ THR_COMP_LA2,
+ THR_COMP_L2A2,
+ THR_COMP_L3A2,
+ THR_COMP_GA2,
+
+ THR_INTRA,
+
+ MAX_REFS
+} UENUM1BYTE(THR_MODES_SUB8X8);
+
+enum {
+ FULL_TXFM_RD,
+ LOW_TXFM_RD,
+} UENUM1BYTE(TXFM_RD_MODEL);
+
+enum {
+ USE_FULL_RD = 0,
+ USE_FAST_RD,
+ USE_LARGESTALL,
+} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENC_ENUMS_H_
diff --git a/third_party/aom/av1/encoder/encode_strategy.c b/third_party/aom/av1/encoder/encode_strategy.c
new file mode 100644
index 0000000000..35ca83c3f4
--- /dev/null
+++ b/third_party/aom/av1/encoder/encode_strategy.c
@@ -0,0 +1,1767 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "av1/common/blockd.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/temporal_filter.h"
+#if CONFIG_THREE_PASS
+#include "av1/encoder/thirdpass.h"
+#endif // CONFIG_THREE_PASS
+#include "av1/encoder/tpl_model.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
+
+static INLINE void set_refresh_frame_flags(
+ RefreshFrameInfo *const refresh_frame, bool refresh_gf, bool refresh_bwdref,
+ bool refresh_arf) {
+ refresh_frame->golden_frame = refresh_gf;
+ refresh_frame->bwd_ref_frame = refresh_bwdref;
+ refresh_frame->alt_ref_frame = refresh_arf;
+}
+
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+ RefreshFrameInfo *const refresh_frame,
+ const FRAME_UPDATE_TYPE type,
+ const REFBUF_STATE refbuf_state,
+ int force_refresh_all) {
+ // NOTE(weitinglin): Should we define another function to take care of
+ // cpi->rc.is_$Source_Type so that this function behaves as described in
+ // its comment?
+ const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+ &cpi->ext_flags.refresh_frame;
+ cpi->rc.is_src_frame_alt_ref = 0;
+
+ switch (type) {
+ case KF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+ break;
+
+ case LF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, false, false, false);
+ break;
+
+ case GF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, true, false, false);
+ break;
+
+ case OVERLAY_UPDATE:
+ if (refbuf_state == REFBUF_RESET)
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+ else
+ set_refresh_frame_flags(refresh_frame, true, false, false);
+
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+
+ case ARF_UPDATE:
+ // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+ if (refbuf_state == REFBUF_RESET)
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+ else
+ set_refresh_frame_flags(refresh_frame, false, false, true);
+
+ break;
+
+ case INTNL_OVERLAY_UPDATE:
+ set_refresh_frame_flags(refresh_frame, false, false, false);
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+
+ case INTNL_ARF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, false, true, false);
+ break;
+
+ default: assert(0); break;
+ }
+
+ if (ext_refresh_frame_flags->update_pending &&
+ (!is_stat_generation_stage(cpi))) {
+ set_refresh_frame_flags(refresh_frame,
+ ext_refresh_frame_flags->golden_frame,
+ ext_refresh_frame_flags->bwd_ref_frame,
+ ext_refresh_frame_flags->alt_ref_frame);
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (ext_refresh_frame_flags->golden_frame)
+ gf_group->update_type[cpi->gf_frame_index] = GF_UPDATE;
+ if (ext_refresh_frame_flags->alt_ref_frame)
+ gf_group->update_type[cpi->gf_frame_index] = ARF_UPDATE;
+ if (ext_refresh_frame_flags->bwd_ref_frame)
+ gf_group->update_type[cpi->gf_frame_index] = INTNL_ARF_UPDATE;
+ }
+
+ if (force_refresh_all)
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+}
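+
+// For quick reference, the switch above sets the (golden, bwdref, altref)
+// refresh triple per update type as follows:
+//   KF_UPDATE (1,1,1); LF_UPDATE (0,0,0); GF_UPDATE (1,0,0);
+//   OVERLAY_UPDATE (1,0,0), or (1,1,1) on REFBUF_RESET;
+//   ARF_UPDATE (0,0,1), or (1,1,1) on REFBUF_RESET;
+//   INTNL_OVERLAY_UPDATE (0,0,0); INTNL_ARF_UPDATE (0,1,0).
+// External refresh overrides and force_refresh_all may then rewrite these.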
+
+static void set_additional_frame_flags(const AV1_COMMON *const cm,
+ unsigned int *const frame_flags) {
+ if (frame_is_intra_only(cm)) {
+ *frame_flags |= FRAMEFLAGS_INTRAONLY;
+ }
+ if (frame_is_sframe(cm)) {
+ *frame_flags |= FRAMEFLAGS_SWITCH;
+ }
+ if (cm->features.error_resilient_mode) {
+ *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
+ }
+}
+
+static void set_ext_overrides(AV1_COMMON *const cm,
+ EncodeFrameParams *const frame_params,
+ ExternalFlags *const ext_flags) {
+ // Overrides the defaults with the values supplied externally via the
+ // av1_update_reference() and av1_update_entropy() calls.
+ // Note: The overrides are valid only for the next frame passed
+ // to av1_encode_lowlevel()
+
+ if (ext_flags->use_s_frame) {
+ frame_params->frame_type = S_FRAME;
+ }
+
+ if (ext_flags->refresh_frame_context_pending) {
+ cm->features.refresh_frame_context = ext_flags->refresh_frame_context;
+ ext_flags->refresh_frame_context_pending = 0;
+ }
+ cm->features.allow_ref_frame_mvs = ext_flags->use_ref_frame_mvs;
+
+ frame_params->error_resilient_mode = ext_flags->use_error_resilient;
+ // A keyframe is already error resilient, and keyframes with
+ // error_resilient_mode interfere with the use of show_existing_frame
+ // when forward reference keyframes are enabled.
+ frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME;
+ // For bitstream conformance, s-frames must be error-resilient
+ frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME;
+}
+
+static int choose_primary_ref_frame(
+ AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const int intra_only = frame_params->frame_type == KEY_FRAME ||
+ frame_params->frame_type == INTRA_ONLY_FRAME;
+ if (intra_only || frame_params->error_resilient_mode ||
+ cpi->ext_flags.use_primary_ref_none) {
+ return PRIMARY_REF_NONE;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode) {
+ int wanted_fb = cpi->ppi->gf_group.primary_ref_idx[cpi->gf_frame_index];
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb)
+ return ref_frame - LAST_FRAME;
+ }
+
+ return PRIMARY_REF_NONE;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ // In large scale case, always use Last frame's frame contexts.
+ // Note(yunqing): In other cases, primary_ref_frame is chosen based on
+ // cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], which also controls
+ // frame bit allocation.
+ if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME);
+
+ if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config)
+ return av1_svc_primary_ref_frame(cpi);
+
+ // Find the most recent reference frame with the same reference type as the
+ // current frame
+ const int current_ref_type = get_current_frame_ref_type(cpi);
+ int wanted_fb = cpi->ppi->fb_of_context_type[current_ref_type];
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ if (gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ int frame_level = gf_group->frame_parallel_level[cpi->gf_frame_index];
+ // Bookkeep the wanted_fb of the frame_parallel_level 1 frame in an FP2
+ // set.
+ if (frame_level == 1) {
+ cpi->wanted_fb = wanted_fb;
+ }
+ // Use the wanted_fb of the level 1 frame in an FP2 set for a level 2
+ // frame in the same set.
+ if (frame_level == 2 &&
+ gf_group->update_type[cpi->gf_frame_index - 1] == INTNL_ARF_UPDATE) {
+ assert(gf_group->frame_parallel_level[cpi->gf_frame_index - 1] == 1);
+ wanted_fb = cpi->wanted_fb;
+ }
+ }
+ }
+#endif // CONFIG_FPMT_TEST
+ int primary_ref_frame = PRIMARY_REF_NONE;
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) {
+ primary_ref_frame = ref_frame - LAST_FRAME;
+ }
+ }
+
+ return primary_ref_frame;
+}
+
+static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) {
+ TimeStamps *time_stamps = &cpi->time_stamps;
+ int64_t this_duration;
+ int step = 0;
+
+ if (cpi->ppi->use_svc && cpi->ppi->rtc_ref.set_ref_frame_config &&
+ cpi->svc.number_spatial_layers > 1) {
+ // ts_start is the timestamp for the current frame and ts_end is the
+ // expected next timestamp given the duration passed into codec_encode().
+ // See the setting in encoder_encode() in av1_cx_iface.c:
+ // ts_start = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol),
+ // ts_end = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol +
+ // duration). So the difference ts_end - ts_start is the duration passed
+ // in by the user. For spatial layers SVC set the framerate based directly
+ // on the duration, and bypass the adjustments below.
+ this_duration = ts_end - ts_start;
+ if (this_duration > 0) {
+ cpi->new_framerate = 10000000.0 / this_duration;
+ av1_new_framerate(cpi, cpi->new_framerate);
+ time_stamps->prev_ts_start = ts_start;
+ time_stamps->prev_ts_end = ts_end;
+ return;
+ }
+ }
+
+ if (ts_start == time_stamps->first_ts_start) {
+ this_duration = ts_end - ts_start;
+ step = 1;
+ } else {
+ int64_t last_duration =
+ time_stamps->prev_ts_end - time_stamps->prev_ts_start;
+
+ this_duration = ts_end - time_stamps->prev_ts_end;
+
+ // do a step update if the duration changes by 10%
+ if (last_duration)
+ step = (int)((this_duration - last_duration) * 10 / last_duration);
+ }
+
+ if (this_duration) {
+ if (step) {
+ cpi->new_framerate = 10000000.0 / this_duration;
+ av1_new_framerate(cpi, cpi->new_framerate);
+ } else {
+ // Average this frame's rate into the last second's average
+ // frame rate. If we haven't seen 1 second yet, then average
+ // over the whole interval seen.
+ const double interval =
+ AOMMIN((double)(ts_end - time_stamps->first_ts_start), 10000000.0);
+ double avg_duration = 10000000.0 / cpi->framerate;
+ avg_duration *= (interval - avg_duration + this_duration);
+ avg_duration /= interval;
+ cpi->new_framerate = (10000000.0 / avg_duration);
+ // For parallel frames update cpi->framerate with new_framerate
+ // during av1_post_encode_updates()
+ double framerate =
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ ? cpi->framerate
+ : cpi->new_framerate;
+ av1_new_framerate(cpi, framerate);
+ }
+ }
+
+ time_stamps->prev_ts_start = ts_start;
+ time_stamps->prev_ts_end = ts_end;
+}
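+
+// Worked example for the averaging branch above (illustrative numbers, in
+// 10 MHz timestamp ticks): with cpi->framerate == 50.0, avg_duration starts
+// at 200000. If interval == 10000000 and this_duration == 250000 (one
+// 40 fps frame), then
+//   avg_duration = 200000 * (10000000 - 200000 + 250000) / 10000000 = 201000
+// and new_framerate = 10000000 / 201000 ~= 49.75, a gentle nudge toward the
+// observed rate rather than a step change.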
+
+// Determine whether there is a forced keyframe pending in the lookahead buffer
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+ const int up_to_index,
+ const COMPRESSOR_STAGE compressor_stage) {
+ for (int i = 0; i <= up_to_index; i++) {
+ const struct lookahead_entry *e =
+ av1_lookahead_peek(lookahead, i, compressor_stage);
+ if (e == NULL) {
+ // We have reached the end of the lookahead buffer and not early-returned
+ // so there isn't a forced key-frame pending.
+ return -1;
+ } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+ return i;
+ }
+ }
+ return -1; // No forced key frame found up to up_to_index.
+}
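+
+// Example (hypothetical lookahead state): if entries 0 and 1 carry no flags
+// and entry 2 has flags equal to AOM_EFLAG_FORCE_KF, then
+// is_forced_keyframe_pending(lookahead, 4, stage) returns 2. It returns -1
+// when no entry up to up_to_index is a forced key frame.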
+
+// Check if we should encode an ARF or internal ARF. If not, try a LAST
+// Do some setup associated with the chosen source
+// temporal_filtered, flush, and frame_update_type are outputs.
+// Return the frame source, or NULL if we couldn't find one
+static struct lookahead_entry *choose_frame_source(
+ AV1_COMP *const cpi, int *const flush, int *pop_lookahead,
+ struct lookahead_entry **last_source, int *const show_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ struct lookahead_entry *source = NULL;
+
+ // Source index in lookahead buffer.
+ int src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+
+ // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q
+ if (src_index &&
+ (is_forced_keyframe_pending(cpi->ppi->lookahead, src_index,
+ cpi->compressor_stage) != -1) &&
+ cpi->oxcf.rc_cfg.mode != AOM_Q && !is_stat_generation_stage(cpi)) {
+ src_index = 0;
+ *flush = 1;
+ }
+
+ // If the current frame is an arf, then we should not pop it from the
+ // lookahead buffer; if it is not an arf, pop it. This assumes the first
+ // frame in the GF group is not an arf, and may need to change if that
+ // assumption stops holding.
+ *pop_lookahead = (src_index == 0);
+ // If this is a key frame and keyframe filtering is enabled with overlay,
+ // then do not pop.
+ if (*pop_lookahead && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 &&
+ gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE &&
+ !is_stat_generation_stage(cpi) && cpi->ppi->lookahead) {
+ if (cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz &&
+ (*flush ||
+ cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz ==
+ cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].pop_sz)) {
+ *pop_lookahead = 0;
+ }
+ }
+
+ // The LAP stage does not have ARFs or forward key-frames; hence, always
+ // set pop_lookahead here.
+ if (is_stat_generation_stage(cpi)) {
+ *pop_lookahead = 1;
+ src_index = 0;
+ }
+
+ *show_frame = *pop_lookahead;
+
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE) {
+#else
+ {
+#endif // CONFIG_FPMT_TEST
+ // Future frame in parallel encode set
+ if (gf_group->src_offset[cpi->gf_frame_index] != 0 &&
+ !is_stat_generation_stage(cpi))
+ src_index = gf_group->src_offset[cpi->gf_frame_index];
+ }
+ if (*show_frame) {
+ // show frame, pop from buffer
+ // Get last frame source.
+ if (cm->current_frame.frame_number > 0) {
+ *last_source = av1_lookahead_peek(cpi->ppi->lookahead, src_index - 1,
+ cpi->compressor_stage);
+ }
+ // Read in the source frame.
+ source = av1_lookahead_peek(cpi->ppi->lookahead, src_index,
+ cpi->compressor_stage);
+ } else {
+ // No-show frames are arf frames.
+ source = av1_lookahead_peek(cpi->ppi->lookahead, src_index,
+ cpi->compressor_stage);
+ if (source != NULL) {
+ cm->showable_frame = 1;
+ }
+ }
+ return source;
+}
+
+// Don't allow a show_existing_frame to coincide with an error resilient or
+// S-Frame. An exception can be made in the case of a keyframe, since it does
+// not depend on any previous frames.
+static int allow_show_existing(const AV1_COMP *const cpi,
+ unsigned int frame_flags) {
+ if (cpi->common.current_frame.frame_number == 0) return 0;
+
+ const struct lookahead_entry *lookahead_src =
+ av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
+ if (lookahead_src == NULL) return 1;
+
+ const int is_error_resilient =
+ cpi->oxcf.tool_cfg.error_resilient_mode ||
+ (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
+ const int is_s_frame = cpi->oxcf.kf_cfg.enable_sframe ||
+ (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+ const int is_key_frame =
+ (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY);
+ return !(is_error_resilient || is_s_frame) || is_key_frame;
+}
+
+// Update frame_flags to tell the encoder's caller what sort of frame was
+// encoded.
+static void update_frame_flags(const AV1_COMMON *const cm,
+ const RefreshFrameInfo *const refresh_frame,
+ unsigned int *frame_flags) {
+ if (encode_show_existing_frame(cm)) {
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY;
+ return;
+ }
+
+ if (refresh_frame->golden_frame) {
+ *frame_flags |= FRAMEFLAGS_GOLDEN;
+ } else {
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN;
+ }
+
+ if (refresh_frame->alt_ref_frame) {
+ *frame_flags |= FRAMEFLAGS_ALTREF;
+ } else {
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF;
+ }
+
+ if (refresh_frame->bwd_ref_frame) {
+ *frame_flags |= FRAMEFLAGS_BWDREF;
+ } else {
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF;
+ }
+
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ *frame_flags |= FRAMEFLAGS_KEY;
+ } else {
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY;
+ }
+}
+
+#define DUMP_REF_FRAME_IMAGES 0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+static int dump_one_image(AV1_COMMON *cm,
+ const YV12_BUFFER_CONFIG *const ref_buf,
+ char *file_name) {
+ int h;
+ FILE *f_ref = NULL;
+
+ if (ref_buf == NULL) {
+ printf("Frame data buffer is NULL.\n");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ if ((f_ref = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+
+ fclose(f_ref);
+
+ return AOM_CODEC_OK;
+}
+
+static void dump_ref_frame_images(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ char file_name[256] = "";
+ snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
+ cm->current_frame.frame_number, ref_frame);
+ dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref_frame), file_name);
+ }
+}
+#endif // DUMP_REF_FRAME_IMAGES == 1
+
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags) {
+ int ref_map_index;
+
+ for (ref_map_index = 0; ref_map_index < REF_FRAMES; ++ref_map_index)
+ if ((refresh_frame_flags >> ref_map_index) & 1) break;
+
+ if (ref_map_index == REF_FRAMES) ref_map_index = INVALID_IDX;
+ return ref_map_index;
+}
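+
+// Example: refresh_frame_flags == 0x28 (slots 3 and 5 set) returns 3, the
+// lowest set bit; a zero mask returns INVALID_IDX.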
+
+static int get_free_ref_map_index(RefFrameMapPair ref_map_pairs[REF_FRAMES]) {
+ for (int idx = 0; idx < REF_FRAMES; ++idx)
+ if (ref_map_pairs[idx].disp_order == -1) return idx;
+ return INVALID_IDX;
+}
+
+static int get_refresh_idx(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int update_arf, GF_GROUP *gf_group, int gf_index,
+ int enable_refresh_skip, int cur_frame_disp) {
+ int arf_count = 0;
+ int oldest_arf_order = INT32_MAX;
+ int oldest_arf_idx = -1;
+
+ int oldest_frame_order = INT32_MAX;
+ int oldest_idx = -1;
+
+ for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+ RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx];
+ if (ref_pair.disp_order == -1) continue;
+ const int frame_order = ref_pair.disp_order;
+ const int reference_frame_level = ref_pair.pyr_level;
+ // Keep future frames and three closest previous frames in output order.
+ if (frame_order > cur_frame_disp - 3) continue;
+
+ if (enable_refresh_skip) {
+ int skip_frame = 0;
+ // Prevent refreshing a frame in gf_group->skip_frame_refresh.
+ for (int i = 0; i < REF_FRAMES; i++) {
+ int frame_to_skip = gf_group->skip_frame_refresh[gf_index][i];
+ if (frame_to_skip == INVALID_IDX) break;
+ if (frame_order == frame_to_skip) {
+ skip_frame = 1;
+ break;
+ }
+ }
+ if (skip_frame) continue;
+ }
+
+ // Keep track of the oldest level 1 frame if the current frame is also level
+ // 1.
+ if (reference_frame_level == 1) {
+ // If there are more than 2 level 1 frames in the reference list,
+ // discard the oldest.
+ if (frame_order < oldest_arf_order) {
+ oldest_arf_order = frame_order;
+ oldest_arf_idx = map_idx;
+ }
+ arf_count++;
+ continue;
+ }
+
+ // Update the overall oldest reference frame.
+ if (frame_order < oldest_frame_order) {
+ oldest_frame_order = frame_order;
+ oldest_idx = map_idx;
+ }
+ }
+ if (update_arf && arf_count > 2) return oldest_arf_idx;
+ if (oldest_idx >= 0) return oldest_idx;
+ if (oldest_arf_idx >= 0) return oldest_arf_idx;
+ if (oldest_idx == -1) {
+ assert(arf_count > 2 && enable_refresh_skip);
+ return oldest_arf_idx;
+ }
+ assert(0 && "No valid refresh index found");
+ return -1;
+}
+
+// Computes the reference refresh index for INTNL_ARF_UPDATE frame.
+int av1_calc_refresh_idx_for_intnl_arf(
+ AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int gf_index) {
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+ // Search for an open slot to store the current frame.
+ int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs);
+
+ // Use a free slot if available.
+ if (free_fb_index != INVALID_IDX) {
+ return free_fb_index;
+ } else {
+ int enable_refresh_skip = !is_one_pass_rt_params(cpi);
+ int refresh_idx =
+ get_refresh_idx(ref_frame_map_pairs, 0, gf_group, gf_index,
+ enable_refresh_skip, gf_group->display_idx[gf_index]);
+ return refresh_idx;
+ }
+}
+
+int av1_get_refresh_frame_flags(
+ const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+ FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order,
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+ &cpi->ext_flags.refresh_frame;
+
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (gf_group->refbuf_state[gf_index] == REFBUF_RESET)
+ return SELECT_ALL_BUF_SLOTS;
+
+ // TODO(jingning): Deprecate the following operations.
+ // Switch frames and shown key-frames overwrite all reference slots
+ if (frame_params->frame_type == S_FRAME) return SELECT_ALL_BUF_SLOTS;
+
+ // show_existing_frames don't actually send refresh_frame_flags so set the
+ // flags to 0 to keep things consistent.
+ if (frame_params->show_existing_frame) return 0;
+
+ const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ if (is_frame_droppable(rtc_ref, ext_refresh_frame_flags)) return 0;
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+ int new_fb_map_idx = cpi->ppi->gf_group.update_ref_idx[gf_index];
+ if (new_fb_map_idx == INVALID_IDX) return 0;
+ return 1 << new_fb_map_idx;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ int refresh_mask = 0;
+ if (ext_refresh_frame_flags->update_pending) {
+ if (rtc_ref->set_ref_frame_config ||
+ use_rtc_reference_structure_one_layer(cpi)) {
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ refresh_mask |= rtc_ref->refresh[ref_frame_map_idx]
+ << ref_frame_map_idx;
+ }
+ return refresh_mask;
+ }
+ // Unfortunately the encoder interface reflects the old refresh_*_frame
+ // flags so we have to replicate the old refresh_frame_flags logic here in
+ // order to preserve the behaviour of the flag overrides.
+ int ref_frame_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->last_frame << ref_frame_map_idx;
+
+ ref_frame_map_idx = get_ref_frame_map_idx(cm, EXTREF_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->bwd_ref_frame
+ << ref_frame_map_idx;
+
+ ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->alt2_ref_frame
+ << ref_frame_map_idx;
+
+ if (frame_update_type == OVERLAY_UPDATE) {
+ ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->golden_frame
+ << ref_frame_map_idx;
+ } else {
+ ref_frame_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->golden_frame
+ << ref_frame_map_idx;
+
+ ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->alt_ref_frame
+ << ref_frame_map_idx;
+ }
+ return refresh_mask;
+ }
+
+ // Search for an open slot to store the current frame.
+ int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs);
+
+ // No refresh necessary for these frame types.
+ if (frame_update_type == OVERLAY_UPDATE ||
+ frame_update_type == INTNL_OVERLAY_UPDATE)
+ return refresh_mask;
+
+ // If there is an open slot, refresh that one instead of replacing a
+ // reference.
+ if (free_fb_index != INVALID_IDX) {
+ refresh_mask = 1 << free_fb_index;
+ return refresh_mask;
+ }
+ const int enable_refresh_skip = !is_one_pass_rt_params(cpi);
+ const int update_arf = frame_update_type == ARF_UPDATE;
+ const int refresh_idx =
+ get_refresh_idx(ref_frame_map_pairs, update_arf, &cpi->ppi->gf_group,
+ gf_index, enable_refresh_skip, cur_disp_order);
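+ // The returned value is a one-hot mask over the eight reference slots.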
+ return 1 << refresh_idx;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params->sb_size);
+
+ av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y, num_planes);
+
+ set_mi_offsets(&cm->mi_params, xd, 0, 0);
+}
+
+// Apply temporal filtering to source frames and encode the filtered frame.
+// If the current frame does not require filtering, this function is identical
+// to av1_encode() except that tpl is not performed.
+static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
+ EncodeFrameInput *const frame_input,
+ const EncodeFrameParams *const frame_params,
+ EncodeFrameResults *const frame_results) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) start_timing(cpi, denoise_and_encode_time);
+#endif
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int is_second_arf =
+ av1_gop_is_second_arf(gf_group, cpi->gf_frame_index);
+
+ // Decide whether to apply temporal filtering to the source frame.
+ int apply_filtering =
+ av1_is_temporal_filter_on(oxcf) && !is_stat_generation_stage(cpi);
+ if (update_type != KF_UPDATE && update_type != ARF_UPDATE && !is_second_arf) {
+ apply_filtering = 0;
+ }
+ if (apply_filtering) {
+ if (frame_params->frame_type == KEY_FRAME) {
+ // TODO(angiebird): Move the noise level check to av1_tf_info_filtering.
+ // Decide whether it is allowed to perform key frame filtering
+ int allow_kf_filtering = oxcf->kf_cfg.enable_keyframe_filtering &&
+ !frame_params->show_existing_frame &&
+ !is_lossless_requested(&oxcf->rc_cfg);
+ if (allow_kf_filtering) {
+ double y_noise_level = 0.0;
+ av1_estimate_noise_level(
+ frame_input->source, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+ cm->seq_params->bit_depth, NOISE_ESTIMATION_EDGE_THRESHOLD);
+ apply_filtering = y_noise_level > 0;
+ } else {
+ apply_filtering = 0;
+ }
+ // If we are doing kf filtering, set up a few things.
+ if (apply_filtering) {
+ av1_setup_past_independence(cm);
+ }
+ } else if (is_second_arf) {
+ apply_filtering = cpi->sf.hl_sf.second_alt_ref_filtering;
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time);
+#endif
+ // Save the pointer to the original source image.
+ YV12_BUFFER_CONFIG *source_buffer = frame_input->source;
+ // Apply filtering to the frame.
+ if (apply_filtering) {
+ int show_existing_alt_ref = 0;
+ FRAME_DIFF frame_diff;
+ int top_index = 0;
+ int bottom_index = 0;
+ const int q_index = av1_rc_pick_q_and_bounds(
+ cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
+ cpi->gf_frame_index, &bottom_index, &top_index);
+
+ // TODO(bohanli): figure out why we need frame_type in cm here.
+ cm->current_frame.frame_type = frame_params->frame_type;
+ if (update_type == KF_UPDATE || update_type == ARF_UPDATE) {
+ YV12_BUFFER_CONFIG *tf_buf = av1_tf_info_get_filtered_buf(
+ &cpi->ppi->tf_info, cpi->gf_frame_index, &frame_diff);
+ if (tf_buf != NULL) {
+ frame_input->source = tf_buf;
+ show_existing_alt_ref = av1_check_show_filtered_frame(
+ tf_buf, &frame_diff, q_index, cm->seq_params->bit_depth);
+ if (show_existing_alt_ref) {
+ cpi->common.showable_frame |= 1;
+ } else {
+ cpi->common.showable_frame = 0;
+ }
+ }
+ if (gf_group->frame_type[cpi->gf_frame_index] != KEY_FRAME) {
+ cpi->ppi->show_existing_alt_ref = show_existing_alt_ref;
+ }
+ }
+
+ if (is_second_arf) {
+ // Allocate the memory for the tf_buf_second_arf buffer only when it is
+ // required.
+ int ret = aom_realloc_frame_buffer(
+ &cpi->ppi->tf_info.tf_buf_second_arf, oxcf->frm_dim_cfg.width,
+ oxcf->frm_dim_cfg.height, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0);
+ if (ret)
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate tf_buf_second_arf");
+
+ YV12_BUFFER_CONFIG *tf_buf_second_arf =
+ &cpi->ppi->tf_info.tf_buf_second_arf;
+ // Temporal filtering was not applied to the second arf earlier in
+ // av1_tf_info_filtering(), so apply it here.
+ const int arf_src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+ // Right now, we are still using tf_buf_second_arf due to
+ // implementation complexity.
+ // TODO(angiebird): Reuse tf_info->tf_buf here.
+ av1_temporal_filter(cpi, arf_src_index, cpi->gf_frame_index, &frame_diff,
+ tf_buf_second_arf);
+ show_existing_alt_ref = av1_check_show_filtered_frame(
+ tf_buf_second_arf, &frame_diff, q_index, cm->seq_params->bit_depth);
+ if (show_existing_alt_ref) {
+ aom_extend_frame_borders(tf_buf_second_arf, av1_num_planes(cm));
+ frame_input->source = tf_buf_second_arf;
+ }
+ // Currently INTNL_ARF_UPDATE frames only do show_existing.
+ cpi->common.showable_frame |= 1;
+ }
+
+ // Copy source metadata to the temporal filtered frame
+ if (source_buffer->metadata &&
+ aom_copy_metadata_to_frame_buffer(frame_input->source,
+ source_buffer->metadata)) {
+ aom_internal_error(
+ cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to copy source metadata to the temporal filtered frame");
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) end_timing(cpi, apply_filtering_time);
+#endif
+
+ int set_mv_params = frame_params->frame_type == KEY_FRAME ||
+ update_type == ARF_UPDATE || update_type == GF_UPDATE;
+ cm->show_frame = frame_params->show_frame;
+ cm->current_frame.frame_type = frame_params->frame_type;
+ // TODO(bohanli): Why is this? What part of it is necessary?
+ av1_set_frame_size(cpi, cm->width, cm->height);
+ if (set_mv_params) av1_set_mv_search_params(cpi);
+
+#if CONFIG_RD_COMMAND
+ if (frame_params->frame_type == KEY_FRAME) {
+ char filepath[] = "rd_command.txt";
+ av1_read_rd_command(filepath, &cpi->rd_command);
+ }
+#endif // CONFIG_RD_COMMAND
+ if (cpi->gf_frame_index == 0 && !is_stat_generation_stage(cpi)) {
+ // perform tpl after filtering
+ int allow_tpl =
+ oxcf->gf_cfg.lag_in_frames > 1 && oxcf->algo_cfg.enable_tpl_model;
+ if (gf_group->size > MAX_LENGTH_TPL_FRAME_STATS) {
+ allow_tpl = 0;
+ }
+ if (frame_params->frame_type != KEY_FRAME) {
+ // In rare cases, it's possible to have a non-ARF/GF update_type here.
+ // Set allow_tpl to zero in that situation.
+ allow_tpl =
+ allow_tpl && (update_type == ARF_UPDATE || update_type == GF_UPDATE ||
+ (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode ==
+ DUCKY_ENCODE_GOP_MODE_RCL));
+ }
+
+ if (allow_tpl) {
+ if (!cpi->skip_tpl_setup_stats) {
+ av1_tpl_preload_rc_estimate(cpi, frame_params);
+ av1_tpl_setup_stats(cpi, 0, frame_params);
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ assert(cpi->gf_frame_index == 0);
+ av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+ gf_group, cm->seq_params->bit_depth);
+#endif
+ }
+ } else {
+ av1_init_tpl_stats(&cpi->ppi->tpl_data);
+ }
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ if (cpi->oxcf.pass == AOM_RC_SECOND_PASS &&
+ cpi->second_pass_log_stream != NULL) {
+ TPL_INFO *tpl_info;
+ AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info)));
+ av1_pack_tpl_info(tpl_info, gf_group, &cpi->ppi->tpl_data);
+ av1_write_tpl_info(tpl_info, cpi->second_pass_log_stream,
+ cpi->common.error);
+ aom_free(tpl_info);
+ }
+#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ }
+
+ if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ // Set frame_input source to true source for psnr calculation.
+ if (apply_filtering && is_psnr_calc_enabled(cpi)) {
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, source_buffer, &cpi->scaled_source, cm->features.interp_filter, 0,
+ false, true, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ cpi->unscaled_source = source_buffer;
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) end_timing(cpi, denoise_and_encode_time);
+#endif
+ return AOM_CODEC_OK;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+/*!\cond */
+// Struct to keep track of relevant reference frame data.
+typedef struct {
+ int map_idx;
+ int disp_order;
+ int pyr_level;
+ int used;
+} RefBufMapData;
+/*!\endcond */
+
+// Comparison function to sort reference frames in ascending display order.
+static int compare_map_idx_pair_asc(const void *a, const void *b) {
+ if (((const RefBufMapData *)a)->disp_order ==
+ ((const RefBufMapData *)b)->disp_order) {
+ return 0;
+ } else if (((const RefBufMapData *)a)->disp_order >
+ ((const RefBufMapData *)b)->disp_order) {
+ return 1;
+ } else {
+ return -1;
+ }
+}
+
+// Checks to see if a particular reference frame is already in the reference
+// frame map.
+static int is_in_ref_map(RefBufMapData *map, int disp_order, int n_frames) {
+ for (int i = 0; i < n_frames; i++) {
+ if (disp_order == map[i].disp_order) return 1;
+ }
+ return 0;
+}
+
+// Add a reference buffer index to a named reference slot.
+static void add_ref_to_slot(RefBufMapData *ref, int *const remapped_ref_idx,
+ int frame) {
+ remapped_ref_idx[frame - LAST_FRAME] = ref->map_idx;
+ ref->used = 1;
+}
+
+// Threshold dictating when we are allowed to start considering
+// leaving lowest level frames unmapped.
+#define LOW_LEVEL_FRAMES_TR 5
+
+// Find which reference buffer should be left out of the named mapping.
+// This is because there are 8 reference buffers and only 7 named slots.
+static void set_unmapped_ref(RefBufMapData *buffer_map, int n_bufs,
+ int n_min_level_refs, int min_level,
+ int cur_frame_disp) {
+ int max_dist = 0;
+ int unmapped_idx = -1;
+ if (n_bufs <= ALTREF_FRAME) return;
+ for (int i = 0; i < n_bufs; i++) {
+ if (buffer_map[i].used) continue;
+ if (buffer_map[i].pyr_level != min_level ||
+ n_min_level_refs >= LOW_LEVEL_FRAMES_TR) {
+ int dist = abs(cur_frame_disp - buffer_map[i].disp_order);
+ if (dist > max_dist) {
+ max_dist = dist;
+ unmapped_idx = i;
+ }
+ }
+ }
+ assert(unmapped_idx >= 0 && "Unmapped reference not found");
+ buffer_map[unmapped_idx].used = 1;
+}
+
+void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int cur_frame_disp, const AV1_COMP *cpi, int gf_index,
+ int is_parallel_encode,
+ int remapped_ref_idx[REF_FRAMES]) {
+ int buf_map_idx = 0;
+
+ // Initialize reference frame mappings.
+ for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX;
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+ for (int rf = LAST_FRAME; rf < REF_FRAMES; ++rf) {
+ if (cpi->ppi->gf_group.ref_frame_list[gf_index][rf] != INVALID_IDX) {
+ remapped_ref_idx[rf - LAST_FRAME] =
+ cpi->ppi->gf_group.ref_frame_list[gf_index][rf];
+ }
+ }
+
+ int valid_rf_idx = 0;
+ static const int ref_frame_type_order[REF_FRAMES - LAST_FRAME] = {
+ GOLDEN_FRAME, ALTREF_FRAME, LAST_FRAME, BWDREF_FRAME,
+ ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME
+ };
+ for (int i = 0; i < REF_FRAMES - LAST_FRAME; i++) {
+ int rf = ref_frame_type_order[i];
+ if (remapped_ref_idx[rf - LAST_FRAME] != INVALID_IDX) {
+ valid_rf_idx = remapped_ref_idx[rf - LAST_FRAME];
+ break;
+ }
+ }
+
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (remapped_ref_idx[i] == INVALID_IDX) {
+ remapped_ref_idx[i] = valid_rf_idx;
+ }
+ }
+
+ return;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ RefBufMapData buffer_map[REF_FRAMES];
+ int n_bufs = 0;
+ memset(buffer_map, 0, REF_FRAMES * sizeof(buffer_map[0]));
+ int min_level = MAX_ARF_LAYERS;
+ int max_level = 0;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ int skip_ref_unmapping = 0;
+ int is_one_pass_rt = is_one_pass_rt_params(cpi);
+
+ // Go through current reference buffers and store display order, pyr level,
+ // and map index.
+ for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+ // Get reference frame buffer.
+ RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx];
+ if (ref_pair.disp_order == -1) continue;
+ const int frame_order = ref_pair.disp_order;
+ // Avoid duplicates.
+ if (is_in_ref_map(buffer_map, frame_order, n_bufs)) continue;
+ const int reference_frame_level = ref_pair.pyr_level;
+
+ // Keep track of the lowest and highest levels that currently exist.
+ if (reference_frame_level < min_level) min_level = reference_frame_level;
+ if (reference_frame_level > max_level) max_level = reference_frame_level;
+
+ buffer_map[n_bufs].map_idx = map_idx;
+ buffer_map[n_bufs].disp_order = frame_order;
+ buffer_map[n_bufs].pyr_level = reference_frame_level;
+ buffer_map[n_bufs].used = 0;
+ n_bufs++;
+ }
+
+ // Sort frames in ascending display order.
+ qsort(buffer_map, n_bufs, sizeof(buffer_map[0]), compare_map_idx_pair_asc);
+
+ int n_min_level_refs = 0;
+ int closest_past_ref = -1;
+ int golden_idx = -1;
+ int altref_idx = -1;
+
+ // Find the GOLDEN_FRAME and BWDREF_FRAME.
+ // Also collect various stats about the reference frames for the remaining
+ // mappings.
+ for (int i = n_bufs - 1; i >= 0; i--) {
+ if (buffer_map[i].pyr_level == min_level) {
+ // Keep track of the number of lowest level frames.
+ n_min_level_refs++;
+ if (buffer_map[i].disp_order < cur_frame_disp && golden_idx == -1 &&
+ remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] == INVALID_IDX) {
+ // Save index for GOLDEN.
+ golden_idx = i;
+ } else if (buffer_map[i].disp_order > cur_frame_disp &&
+ altref_idx == -1 &&
+ remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] == INVALID_IDX) {
+ // Save index for ALTREF.
+ altref_idx = i;
+ }
+ } else if (buffer_map[i].disp_order == cur_frame_disp) {
+ // Map the BWDREF_FRAME if this is the show_existing_frame.
+ add_ref_to_slot(&buffer_map[i], remapped_ref_idx, BWDREF_FRAME);
+ }
+
+ // During parallel encodes of lower layer frames, exclude the first frame
+ // (frame_parallel_level 1) from being used for the reference assignment of
+ // the second frame (frame_parallel_level 2).
+ if (!is_one_pass_rt && gf_group->frame_parallel_level[gf_index] == 2 &&
+ gf_group->frame_parallel_level[gf_index - 1] == 1 &&
+ gf_group->update_type[gf_index - 1] == INTNL_ARF_UPDATE) {
+ assert(gf_group->update_type[gf_index] == INTNL_ARF_UPDATE);
+#if CONFIG_FPMT_TEST
+ is_parallel_encode = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE)
+ ? is_parallel_encode
+ : 0;
+#endif // CONFIG_FPMT_TEST
+ // If parallel cpis are active, use ref_idx_to_skip, else, use display
+ // index.
+ assert(IMPLIES(is_parallel_encode, cpi->ref_idx_to_skip != INVALID_IDX));
+ assert(IMPLIES(!is_parallel_encode,
+ gf_group->skip_frame_as_ref[gf_index] != INVALID_IDX));
+ buffer_map[i].used = is_parallel_encode
+ ? (buffer_map[i].map_idx == cpi->ref_idx_to_skip)
+ : (buffer_map[i].disp_order ==
+ gf_group->skip_frame_as_ref[gf_index]);
+ // In case a ref frame is excluded from being used during assignment,
+ // skip the call to set_unmapped_ref(). Applicable in steady state.
+ if (buffer_map[i].used) skip_ref_unmapping = 1;
+ }
+
+ // Keep track of where the frames change from being past frames to future
+ // frames.
+ if (buffer_map[i].disp_order < cur_frame_disp && closest_past_ref < 0)
+ closest_past_ref = i;
+ }
+
+ // Do not map GOLDEN and ALTREF based on their pyramid level if all reference
+ // frames have the same level.
+ if (n_min_level_refs <= n_bufs) {
+ // Map the GOLDEN_FRAME.
+ if (golden_idx > -1)
+ add_ref_to_slot(&buffer_map[golden_idx], remapped_ref_idx, GOLDEN_FRAME);
+ // Map the ALTREF_FRAME.
+ if (altref_idx > -1)
+ add_ref_to_slot(&buffer_map[altref_idx], remapped_ref_idx, ALTREF_FRAME);
+ }
+
+ // Find the buffer to be excluded from the mapping.
+ if (!skip_ref_unmapping)
+ set_unmapped_ref(buffer_map, n_bufs, n_min_level_refs, min_level,
+ cur_frame_disp);
+
+ // Place past frames in LAST_FRAME, LAST2_FRAME, and LAST3_FRAME.
+ for (int frame = LAST_FRAME; frame < GOLDEN_FRAME; frame++) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer
+ // in decreasing output order relative to the current picture.
+ int next_buf_max = 0;
+ int next_disp_order = INT_MIN;
+ for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used &&
+ buffer_map[buf_map_idx].disp_order < cur_frame_disp &&
+ buffer_map[buf_map_idx].disp_order > next_disp_order) {
+ next_disp_order = buffer_map[buf_map_idx].disp_order;
+ next_buf_max = buf_map_idx;
+ }
+ }
+ buf_map_idx = next_buf_max;
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Place future frames (if there are any) in BWDREF_FRAME and ALTREF2_FRAME.
+ for (int frame = BWDREF_FRAME; frame < REF_FRAMES; frame++) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer
+ // in increasing output order relative to the current picture.
+ int next_buf_max = 0;
+ int next_disp_order = INT_MAX;
+ for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used &&
+ buffer_map[buf_map_idx].disp_order > cur_frame_disp &&
+ buffer_map[buf_map_idx].disp_order < next_disp_order) {
+ next_disp_order = buffer_map[buf_map_idx].disp_order;
+ next_buf_max = buf_map_idx;
+ }
+ }
+ buf_map_idx = next_buf_max;
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Place remaining past frames.
+ buf_map_idx = closest_past_ref;
+ for (int frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer.
+ for (; buf_map_idx >= 0; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used) break;
+ }
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Place remaining future frames.
+ buf_map_idx = n_bufs - 1;
+ for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; frame--) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer.
+ for (; buf_map_idx > closest_past_ref; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used) break;
+ }
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Fill any slots that are empty (should only happen for the first 7 frames).
+ for (int i = 0; i < REF_FRAMES; ++i)
+ if (remapped_ref_idx[i] == INVALID_IDX) remapped_ref_idx[i] = 0;
+}
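+
+// Worked example of the mapping above (hypothetical display orders, not taken
+// from a real encode): with cur_frame_disp = 16, unmapped past buffers with
+// display orders {12, 14, 15} and future buffers {18, 24}, the past frames
+// fill LAST_FRAME, LAST2_FRAME and LAST3_FRAME in decreasing display order
+// (15, 14, 12), and the future frames fill BWDREF_FRAME and ALTREF2_FRAME in
+// increasing display order (18, 24). Any slot still INVALID_IDX after the
+// remaining past/future passes falls back to buffer 0.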
+
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+ uint8_t *const dest, unsigned int *frame_flags,
+ int64_t *const time_stamp, int64_t *const time_end,
+ const aom_rational64_t *const timestamp_ratio,
+ int *const pop_lookahead, int flush) {
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ GFConfig *const gf_cfg = &oxcf->gf_cfg;
+
+ EncodeFrameInput frame_input;
+ EncodeFrameParams frame_params;
+ EncodeFrameResults frame_results;
+ memset(&frame_input, 0, sizeof(frame_input));
+ memset(&frame_params, 0, sizeof(frame_params));
+ memset(&frame_results, 0, sizeof(frame_results));
+
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ VBR_RATECTRL_INFO *vbr_rc_info = &cpi->vbr_rc_info;
+ if (oxcf->pass == AOM_RC_THIRD_PASS && vbr_rc_info->ready == 0) {
+ THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF];
+ av1_open_second_pass_log(cpi, 1);
+ FILE *second_pass_log_stream = cpi->second_pass_log_stream;
+ fseek(second_pass_log_stream, 0, SEEK_END);
+ size_t file_size = ftell(second_pass_log_stream);
+ rewind(second_pass_log_stream);
+ size_t read_size = 0;
+ while (read_size < file_size) {
+ THIRD_PASS_GOP_INFO gop_info;
+ struct aom_internal_error_info *error = cpi->common.error;
+ // Read in GOP information from the second pass file.
+ av1_read_second_pass_gop_info(second_pass_log_stream, &gop_info, error);
+ TPL_INFO *tpl_info;
+ AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info)));
+ av1_read_tpl_info(tpl_info, second_pass_log_stream, error);
+ // Read in per-frame info from second-pass encoding
+ av1_read_second_pass_per_frame_info(second_pass_log_stream, frame_info,
+ gop_info.num_frames, error);
+ av1_vbr_rc_append_tpl_info(vbr_rc_info, tpl_info);
+ read_size = ftell(second_pass_log_stream);
+ aom_free(tpl_info);
+ }
+ av1_close_second_pass_log(cpi);
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ vbr_rc_info->base_q_index = cpi->oxcf.rc_cfg.cq_level;
+ av1_vbr_rc_compute_q_indices(
+ vbr_rc_info->base_q_index, vbr_rc_info->total_frame_count,
+ vbr_rc_info->qstep_ratio_list, cm->seq_params->bit_depth,
+ vbr_rc_info->q_index_list);
+ } else {
+ vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q(
+ vbr_rc_info->total_bit_budget, cm->seq_params->bit_depth,
+ vbr_rc_info->scale_factors, vbr_rc_info->total_frame_count,
+ vbr_rc_info->update_type_list, vbr_rc_info->qstep_ratio_list,
+ vbr_rc_info->txfm_stats_list, vbr_rc_info->q_index_list, NULL);
+ }
+ vbr_rc_info->ready = 1;
+#if CONFIG_RATECTRL_LOG
+ rc_log_record_chunk_info(&cpi->rc_log, vbr_rc_info->base_q_index,
+ vbr_rc_info->total_frame_count);
+#endif // CONFIG_RATECTRL_LOG
+ }
+#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+
+  // Check if we need to buffer more source frames
+ if (flush == 0) {
+ int srcbuf_size =
+ av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+ int pop_size =
+ av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage);
+
+    // Keep buffering frames in the lookahead buffer.
+ if (srcbuf_size < pop_size) return -1;
+ }
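+  // In effect: while not flushing, encoding is deferred (returning -1) until
+  // the lookahead holds at least as many source frames as this stage needs to
+  // pop, so the caller keeps feeding frames until then.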
+
+ if (!av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage)) {
+#if !CONFIG_REALTIME_ONLY
+ if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+ !cpi->ppi->twopass.first_pass_done) {
+ av1_end_first_pass(cpi); /* get last stats packet */
+ cpi->ppi->twopass.first_pass_done = 1;
+ }
+#endif
+ return -1;
+ }
+
+ // TODO(sarahparker) finish bit allocation for one pass pyramid
+ if (has_no_stats_stage(cpi)) {
+ gf_cfg->gf_max_pyr_height =
+ AOMMIN(gf_cfg->gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS);
+ gf_cfg->gf_min_pyr_height =
+ AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height);
+ }
+
+ // Allocation of mi buffers.
+ alloc_mb_mode_info_buffers(cpi);
+
+ cpi->skip_tpl_setup_stats = 0;
+#if !CONFIG_REALTIME_ONLY
+ if (oxcf->pass != AOM_RC_FIRST_PASS) {
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ if (tpl_data->tpl_stats_pool[0] == NULL) {
+ av1_setup_tpl_buffers(cpi->ppi, &cm->mi_params, oxcf->frm_dim_cfg.width,
+ oxcf->frm_dim_cfg.height, 0,
+ oxcf->gf_cfg.lag_in_frames);
+ }
+ }
+ cpi->twopass_frame.this_frame = NULL;
+ const int use_one_pass_rt_params = is_one_pass_rt_params(cpi);
+ if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_get_second_pass_params_time);
+#endif
+
+    // Initialise frame_level_rate_correction_factors with the values in use
+    // prior to the parallel frames.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ for (int i = 0; i < RATE_FACTOR_LEVELS; i++) {
+ cpi->rc.frame_level_rate_correction_factors[i] =
+#if CONFIG_FPMT_TEST
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)
+ ? cpi->ppi->p_rc.temp_rate_correction_factors[i]
+ :
+#endif // CONFIG_FPMT_TEST
+ cpi->ppi->p_rc.rate_correction_factors[i];
+ }
+ }
+
+ // copy mv_stats from ppi to frame_level cpi.
+ cpi->mv_stats = cpi->ppi->mv_stats;
+ av1_get_second_pass_params(cpi, &frame_params, *frame_flags);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_get_second_pass_params_time);
+#endif
+ }
+#endif
+
+ if (!is_stat_generation_stage(cpi)) {
+ // TODO(jingning): fwd key frame always uses show existing frame?
+ if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE &&
+ gf_group->refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+ frame_params.show_existing_frame = 1;
+ } else {
+ frame_params.show_existing_frame =
+ (cpi->ppi->show_existing_alt_ref &&
+ gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE;
+ }
+ frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags);
+
+ // Special handling to reset 'show_existing_frame' in case of dropped
+ // frames.
+ if (oxcf->rc_cfg.drop_frames_water_mark &&
+ (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE)) {
+ // During the encode of an OVERLAY_UPDATE/INTNL_OVERLAY_UPDATE frame, loop
+ // over the gf group to check if the corresponding
+ // ARF_UPDATE/INTNL_ARF_UPDATE frame was dropped.
+ int cur_disp_idx = gf_group->display_idx[cpi->gf_frame_index];
+ for (int idx = 0; idx < cpi->gf_frame_index; idx++) {
+ if (cur_disp_idx == gf_group->display_idx[idx]) {
+ assert(IMPLIES(
+ gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE,
+ gf_group->update_type[idx] == ARF_UPDATE));
+ assert(IMPLIES(gf_group->update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE,
+ gf_group->update_type[idx] == INTNL_ARF_UPDATE));
+ // Reset show_existing_frame and set cpi->is_dropped_frame to true if
+ // the frame was dropped during its first encode.
+ if (gf_group->is_frame_dropped[idx]) {
+ frame_params.show_existing_frame = 0;
+ assert(!cpi->is_dropped_frame);
+ cpi->is_dropped_frame = true;
+ }
+ break;
+ }
+ }
+ }
+
+ // Reset show_existing_alt_ref decision to 0 after it is used.
+ if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+ cpi->ppi->show_existing_alt_ref = 0;
+ }
+ } else {
+ frame_params.show_existing_frame = 0;
+ }
+
+ struct lookahead_entry *source = NULL;
+ struct lookahead_entry *last_source = NULL;
+ if (frame_params.show_existing_frame) {
+ source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
+ *pop_lookahead = 1;
+ frame_params.show_frame = 1;
+ } else {
+ source = choose_frame_source(cpi, &flush, pop_lookahead, &last_source,
+ &frame_params.show_frame);
+ }
+
+ if (source == NULL) { // If no source was found, we can't encode a frame.
+#if !CONFIG_REALTIME_ONLY
+ if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+ !cpi->ppi->twopass.first_pass_done) {
+ av1_end_first_pass(cpi); /* get last stats packet */
+ cpi->ppi->twopass.first_pass_done = 1;
+ }
+#endif
+ return -1;
+ }
+
+  // Reset src_offset so that the actual encode call for this frame gets its
+  // source.
+ gf_group->src_offset[cpi->gf_frame_index] = 0;
+
+  // Source may be changed if temporally filtered later.
+ frame_input.source = &source->img;
+ if ((cpi->ppi->use_svc || cpi->rc.prev_frame_is_dropped) &&
+ last_source != NULL)
+ av1_svc_set_last_source(cpi, &frame_input, &last_source->img);
+ else
+ frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
+ frame_input.ts_duration = source->ts_end - source->ts_start;
+ // Save unfiltered source. It is used in av1_get_second_pass_params().
+ cpi->unfiltered_source = frame_input.source;
+
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ if (source->ts_start < cpi->time_stamps.first_ts_start) {
+ cpi->time_stamps.first_ts_start = source->ts_start;
+ cpi->time_stamps.prev_ts_end = source->ts_start;
+ }
+
+ av1_apply_encoding_flags(cpi, source->flags);
+ *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ cpi->framerate = cpi->temp_framerate;
+ }
+ }
+#endif // CONFIG_FPMT_TEST
+
+  // Shown frames and arf-overlay frames need frame-rate consideration
+ if (frame_params.show_frame)
+ adjust_frame_rate(cpi, source->ts_start, source->ts_end);
+
+ if (!frame_params.show_existing_frame) {
+ if (cpi->film_grain_table) {
+ cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup(
+ cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
+ &cm->film_grain_params);
+ } else {
+ cm->cur_frame->film_grain_params_present =
+ cm->seq_params->film_grain_params_present;
+ }
+    // Only one operating point is supported now.
+ const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp);
+ if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
+
+ cm->frame_presentation_time = (uint32_t)pts64;
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
+#if CONFIG_REALTIME_ONLY
+ av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input,
+ *frame_flags);
+ if (use_rtc_reference_structure_one_layer(cpi))
+ av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0);
+#else
+ if (use_one_pass_rt_params) {
+ av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input,
+ *frame_flags);
+ if (use_rtc_reference_structure_one_layer(cpi))
+ av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0);
+ }
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
+
+ FRAME_UPDATE_TYPE frame_update_type =
+ get_frame_update_type(gf_group, cpi->gf_frame_index);
+
+ if (frame_params.show_existing_frame &&
+ frame_params.frame_type != KEY_FRAME) {
+ // Force show-existing frames to be INTER, except forward keyframes
+ frame_params.frame_type = INTER_FRAME;
+ }
+
+ // Per-frame encode speed. In theory this can vary, but things may have
+ // been written assuming speed-level will not change within a sequence, so
+ // this parameter should be used with caution.
+ frame_params.speed = oxcf->speed;
+
+#if !CONFIG_REALTIME_ONLY
+ // Set forced key frames when necessary. For two-pass encoding / lap mode,
+ // this is already handled by av1_get_second_pass_params. However when no
+ // stats are available, we still need to check if the new frame is a keyframe.
+ // For one pass rt, this is already checked in av1_get_one_pass_rt_params.
+ if (!use_one_pass_rt_params &&
+ (is_stat_generation_stage(cpi) || has_no_stats_stage(cpi))) {
+    // The current frame is coded as a key frame in any of the following cases:
+    // 1) It is the first frame of the video
+    // 2) All-intra encoding is used
+    // 3) A key frame is forced
+ const int kf_requested =
+ (cm->current_frame.frame_number == 0 ||
+ oxcf->kf_cfg.key_freq_max == 0 || (*frame_flags & FRAMEFLAGS_KEY));
+ if (kf_requested && frame_update_type != OVERLAY_UPDATE &&
+ frame_update_type != INTNL_OVERLAY_UPDATE) {
+ frame_params.frame_type = KEY_FRAME;
+ } else if (is_stat_generation_stage(cpi)) {
+ // For stats generation, set the frame type to inter here.
+ frame_params.frame_type = INTER_FRAME;
+ }
+ }
+#endif
+
+ // Work out some encoding parameters specific to the pass:
+ if (has_no_stats_stage(cpi) && oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_update_parameters(cpi);
+ } else if (is_stat_generation_stage(cpi)) {
+ cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&oxcf->rc_cfg);
+ } else if (is_stat_consumption_stage(cpi)) {
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_move_frame_idx_w();
+#endif
+#if TXCOEFF_COST_TIMER
+ cm->txcoeff_cost_timer = 0;
+ cm->txcoeff_cost_count = 0;
+#endif
+ }
+
+ if (!is_stat_generation_stage(cpi))
+ set_ext_overrides(cm, &frame_params, ext_flags);
+
+ // Shown keyframes and S frames refresh all reference buffers
+ const int force_refresh_all =
+ ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) ||
+ frame_params.frame_type == S_FRAME) &&
+ !frame_params.show_existing_frame;
+
+ av1_configure_buffer_updates(
+ cpi, &frame_params.refresh_frame, frame_update_type,
+ gf_group->refbuf_state[cpi->gf_frame_index], force_refresh_all);
+
+ if (!is_stat_generation_stage(cpi)) {
+ const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME];
+
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+ init_ref_map_pair(cpi, ref_frame_map_pairs);
+ const int order_offset = gf_group->arf_src_offset[cpi->gf_frame_index];
+ const int cur_frame_disp =
+ cpi->common.current_frame.frame_number + order_offset;
+
+ int get_ref_frames = 0;
+#if CONFIG_FPMT_TEST
+ get_ref_frames =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (get_ref_frames ||
+ gf_group->frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (!ext_flags->refresh_frame.update_pending) {
+ av1_get_ref_frames(ref_frame_map_pairs, cur_frame_disp, cpi,
+ cpi->gf_frame_index, 1, cm->remapped_ref_idx);
+ } else if (cpi->ppi->rtc_ref.set_ref_frame_config ||
+ use_rtc_reference_structure_one_layer(cpi)) {
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++)
+ cm->remapped_ref_idx[i] = cpi->ppi->rtc_ref.ref_idx[i];
+ }
+ }
+
+ // Get the reference frames
+ bool has_ref_frames = false;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ const RefCntBuffer *ref_frame =
+ get_ref_frame_buf(cm, ref_frame_priority_order[i]);
+ ref_frame_buf[i] = ref_frame != NULL ? &ref_frame->buf : NULL;
+ if (ref_frame != NULL) has_ref_frames = true;
+ }
+ if (!has_ref_frames && (frame_params.frame_type == INTER_FRAME ||
+ frame_params.frame_type == S_FRAME)) {
+ return AOM_CODEC_ERROR;
+ }
+
+ // Work out which reference frame slots may be used.
+ frame_params.ref_frame_flags =
+ get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), ref_frame_buf,
+ ext_flags->ref_frame_flags);
+
+    // Set primary_ref_frame of non-reference frames to PRIMARY_REF_NONE.
+ if (cpi->ppi->gf_group.is_frame_non_ref[cpi->gf_frame_index]) {
+ frame_params.primary_ref_frame = PRIMARY_REF_NONE;
+ } else {
+ frame_params.primary_ref_frame =
+ choose_primary_ref_frame(cpi, &frame_params);
+ }
+
+ frame_params.order_offset = gf_group->arf_src_offset[cpi->gf_frame_index];
+
+    // Call av1_get_refresh_frame_flags() if the refresh index is not
+    // available.
+ if (!cpi->refresh_idx_available) {
+ frame_params.refresh_frame_flags = av1_get_refresh_frame_flags(
+ cpi, &frame_params, frame_update_type, cpi->gf_frame_index,
+ cur_frame_disp, ref_frame_map_pairs);
+ } else {
+ assert(cpi->ref_refresh_index != INVALID_IDX);
+ frame_params.refresh_frame_flags = (1 << cpi->ref_refresh_index);
+ }
+
+    // Make frames marked as is_frame_non_ref into non-reference frames.
+ if (gf_group->is_frame_non_ref[cpi->gf_frame_index])
+ frame_params.refresh_frame_flags = 0;
+
+ frame_params.existing_fb_idx_to_show = INVALID_IDX;
+ // Find the frame buffer to show based on display order.
+ if (frame_params.show_existing_frame) {
+ for (int frame = 0; frame < REF_FRAMES; frame++) {
+ const RefCntBuffer *const buf = cm->ref_frame_map[frame];
+ if (buf == NULL) continue;
+ const int frame_order = (int)buf->display_order_hint;
+ if (frame_order == cur_frame_disp)
+ frame_params.existing_fb_idx_to_show = frame;
+ }
+ }
+ }
+
+  // The way frame_params->remapped_ref_idx is set up is a placeholder.
+  // Currently, reference buffer assignment is done by update_ref_frame_map()
+  // which is called by the high-level strategy AFTER encoding a frame. It
+  // modifies cm->remapped_ref_idx. If you want to use an alternative method
+  // to determine reference buffer assignment, just put your assignments into
+  // frame_params->remapped_ref_idx here and they will be used when encoding
+  // this frame. If frame_params->remapped_ref_idx is set up independently of
+  // cm->remapped_ref_idx then update_ref_frame_map() will have no effect.
+ memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx,
+ REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+ cpi->td.mb.rdmult_delta_qindex = cpi->td.mb.delta_qindex = 0;
+
+ if (!frame_params.show_existing_frame) {
+ cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm;
+ }
+
+ const int is_intra_frame = frame_params.frame_type == KEY_FRAME ||
+ frame_params.frame_type == INTRA_ONLY_FRAME;
+ FeatureFlags *const features = &cm->features;
+ if (!is_stat_generation_stage(cpi) &&
+ (oxcf->pass == AOM_RC_ONE_PASS || oxcf->pass >= AOM_RC_SECOND_PASS) &&
+ is_intra_frame) {
+ av1_set_screen_content_options(cpi, features);
+ }
+
+#if CONFIG_REALTIME_ONLY
+ if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+#else
+ if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME &&
+ gf_cfg->lag_in_frames == 0) {
+ if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ } else if (denoise_and_encode(cpi, dest, &frame_input, &frame_params,
+ &frame_results) != AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+#endif // CONFIG_REALTIME_ONLY
+
+  // This is used in the rtc temporal filter case. Use the true source in the
+  // PSNR calculation.
+ if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf &&
+ cpi->common.current_frame.frame_type != KEY_FRAME) {
+ assert(cpi->orig_source.buffer_alloc_sz > 0);
+ cpi->source = &cpi->orig_source;
+ }
+
+ if (!is_stat_generation_stage(cpi)) {
+    // The first pass doesn't modify reference buffer assignment or produce
+    // frame flags.
+ update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags);
+ set_additional_frame_flags(cm, frame_flags);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#if TXCOEFF_COST_TIMER
+ if (!is_stat_generation_stage(cpi)) {
+ cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
+ fprintf(stderr,
+ "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
+ "in us\n",
+ cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
+ cm->cum_txcoeff_cost_timer);
+ }
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_TUNE_VMAF
+ if (!is_stat_generation_stage(cpi) &&
+ (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
+ av1_update_vmaf_curve(cpi);
+ }
+#endif
+
+ // Unpack frame_results:
+ *size = frame_results.size;
+
+  // Leave a signal for a higher-level caller about whether this frame is
+  // droppable.
+ if (*size > 0) {
+ cpi->droppable =
+ is_frame_droppable(&cpi->ppi->rtc_ref, &ext_flags->refresh_frame);
+ }
+
+  // For SVC, or when the frame-dropper is enabled: keep track of the
+  // (unscaled) source corresponding to the refresh of the LAST reference
+  // (base temporal layer - TL0). Copy only for the top spatial enhancement
+  // layer so that all spatial layers of the next superframe have last_source
+  // aligned with the previous TL0 superframe. Avoid cases where the resolution
+  // changes for the unscaled source (top spatial layer). This only needs to be
+  // done for frames that are encoded (size > 0).
+ if (*size > 0 &&
+ (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+ cpi->svc.temporal_layer_id == 0 &&
+ cpi->unscaled_source->y_width == cpi->svc.source_last_TL0.y_width &&
+ cpi->unscaled_source->y_height == cpi->svc.source_last_TL0.y_height) {
+ aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ }
+
+ return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/encoder/encode_strategy.h b/third_party/aom/av1/encoder/encode_strategy.h
new file mode 100644
index 0000000000..c1d14d134c
--- /dev/null
+++ b/third_party/aom/av1/encoder/encode_strategy.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares frame encoding functions.
+ */
+#ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+#define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom/aom_encoder.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+/*!\brief Implement high-level encode strategy
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function implements the high-level encode strategy: it chooses the
+ * frame type, frame placement, etc., populates an EncodeFrameParams struct
+ * with the results of these decisions, and then encodes the frame. The caller
+ * should use the output parameters *time_stamp and *time_end only when this
+ * function returns AOM_CODEC_OK.
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[out]   size            Bitstream size
+ * \param[out]   dest            Bitstream output buffer
+ * \param[in]    frame_flags     Flags to decide how to encode the frame
+ * \param[out]   time_stamp      Time stamp of the frame
+ * \param[out]   time_end        End time of the frame
+ * \param[in]    timestamp_ratio Time base
+ * \param[out]   pop_lookahead   Whether to pop the source frame from the queue
+ * \param[in]    flush           Whether to encode one frame or the remaining
+ *                               frames
+ *
+ * \return Returns a value to indicate whether the encoding was successful.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+ uint8_t *const dest, unsigned int *frame_flags,
+ int64_t *const time_stamp, int64_t *const time_end,
+ const aom_rational64_t *const timestamp_ratio,
+ int *const pop_lookahead, int flush);
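+
+// A minimal calling sketch (hypothetical caller; the variable names here are
+// illustrative only, not part of this API):
+//
+//   size_t frame_size = 0;
+//   int64_t ts_start = 0, ts_end = 0;
+//   int pop_lookahead = 0;
+//   const int ret =
+//       av1_encode_strategy(cpi, &frame_size, dest, &frame_flags, &ts_start,
+//                           &ts_end, timestamp_ratio, &pop_lookahead, flush);
+//   if (ret == -1) {
+//     // Not an error: the encoder needs more source frames in the lookahead.
+//   } else if (ret == AOM_CODEC_OK && frame_size > 0) {
+//     // A packet of frame_size bytes was written to dest.
+//   }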
+
+/*!\cond */
+// Set individual buffer update flags based on frame reference type.
+// force_refresh_all is used when we have a KEY_FRAME or S_FRAME. It forces all
+// refresh_*_frame flags to be set, because we refresh all buffers in this case.
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+ RefreshFrameInfo *const refresh_frame,
+ const FRAME_UPDATE_TYPE type,
+ const REFBUF_STATE refbuf_state,
+ int force_refresh_all);
+
+int av1_get_refresh_frame_flags(
+ const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+ FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order,
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]);
+
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags);
+
+/*!\brief Obtain indices of reference frames in ref_frame_map
+ *
+ * \callgraph
+ * \callergraph
+ *
+ * \param[out] remapped_ref_idx An array for storing indices of reference
+ * frames. The index is used to retrieve a
+ * reference frame buffer from ref_frame_map
+ * in AV1Common.
+ */
+void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int cur_frame_disp, const AV1_COMP *cpi, int gf_index,
+ int is_parallel_encode,
+ int remapped_ref_idx[REF_FRAMES]);
+
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+ const int up_to_index,
+ const COMPRESSOR_STAGE compressor_stage);
+
+static AOM_INLINE int is_frame_droppable(
+ const RTC_REF *const rtc_ref,
+ const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) {
+ // Droppable frame is only used by external refresh flags. VoD setting won't
+  // A droppable frame is only used with external refresh flags; a VoD setting
+  // won't trigger its use case.
+ return rtc_ref->non_reference_frame;
+ else if (ext_refresh_frame_flags->update_pending)
+ return !(ext_refresh_frame_flags->alt_ref_frame ||
+ ext_refresh_frame_flags->alt2_ref_frame ||
+ ext_refresh_frame_flags->bwd_ref_frame ||
+ ext_refresh_frame_flags->golden_frame ||
+ ext_refresh_frame_flags->last_frame);
+ else
+ return 0;
+}
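+
+// For instance (hypothetical flag settings): with set_ref_frame_config = 0,
+// update_pending = 1 and every refresh flag (last/golden/bwd/alt/alt2)
+// cleared, the frame is droppable (returns 1); setting any one of those
+// refresh flags makes it non-droppable again.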
+
+static AOM_INLINE int get_current_frame_ref_type(const AV1_COMP *const cpi) {
+ // We choose the reference "type" of this frame from the flags which indicate
+ // which reference frames will be refreshed by it. More than one of these
+ // flags may be set, so the order here implies an order of precedence. This is
+ // just used to choose the primary_ref_frame (as the most recent reference
+ // buffer of the same reference-type as the current frame).
+
+ switch (cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]) {
+ case 0: return 0;
+ case 1: return 1;
+ case MAX_ARF_LAYERS:
+ case MAX_ARF_LAYERS + 1: return 4;
+ default: return 7;
+ }
+}
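+
+// A rough reading of the mapping above: layer depths 0 and 1 keep their own
+// types (0 and 1), the two deepest ARF layers share type 4, and all remaining
+// depths share type 7, so frames at similar pyramid depths tend to pick
+// primary references of the same type.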
+
+int av1_calc_refresh_idx_for_intnl_arf(
+ AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int gf_index);
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
new file mode 100644
index 0000000000..e2213a8355
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -0,0 +1,2408 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <float.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/partition_model_weights.h"
+#endif
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+/*!\cond */
+// This is used as a reference when computing the source variance for the
+// purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
+static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16
+};
+#endif // CONFIG_AV1_HIGHBITDEPTH
+/*!\endcond */
+
+// For the given bit depth, returns a constant array used to assist the
+// calculation of source block variance, which is then used to drive
+// adaptive quantization.
+static const uint8_t *get_var_offs(int use_hbd, int bd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) {
+ assert(bd == 8 || bd == 10 || bd == 12);
+ const int off_index = (bd - 8) >> 1;
+ static const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8,
+ AV1_HIGH_VAR_OFFS_10,
+ AV1_HIGH_VAR_OFFS_12 };
+ return CONVERT_TO_BYTEPTR(high_var_offs[off_index]);
+ }
+#else
+ (void)use_hbd;
+ (void)bd;
+ assert(!use_hbd);
+#endif
+ assert(bd == 8);
+ return AV1_VAR_OFFS;
+}
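+
+// The (bd - 8) >> 1 index above maps bit depth 8, 10 and 12 to
+// AV1_HIGH_VAR_OFFS_8, AV1_HIGH_VAR_OFFS_10 and AV1_HIGH_VAR_OFFS_12
+// respectively; each table is a flat block at the mid-grey level for that
+// depth (128, scaled by 4 or 16 for the higher depths).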
+
+void av1_init_rtc_counters(MACROBLOCK *const x) {
+ av1_init_cyclic_refresh_counters(x);
+ x->cnt_zeromv = 0;
+}
+
+void av1_accumulate_rtc_counters(AV1_COMP *cpi, const MACROBLOCK *const x) {
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh, x);
+ cpi->rc.cnt_zeromv += x->cnt_zeromv;
+}
+
+unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi,
+ const MACROBLOCKD *xd,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bsize, int plane,
+ int use_hbd) {
+ const int subsampling_x = xd->plane[plane].subsampling_x;
+ const int subsampling_y = xd->plane[plane].subsampling_y;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+ unsigned int sse;
+ const unsigned int var = cpi->ppi->fn_ptr[plane_bsize].vf(
+ ref->buf, ref->stride, get_var_offs(use_hbd, xd->bd), 0, &sse);
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[plane_bsize]);
+}
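+
+// Example of the normalization above: BLOCK_16X16 holds 2^8 = 256 pixels
+// (num_pels_log2_lookup[BLOCK_16X16] == 8), so the per-pixel variance is
+// ROUND_POWER_OF_TWO(var, 8), i.e. (var + 128) >> 8.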
+
+unsigned int av1_get_perpixel_variance_facade(const AV1_COMP *cpi,
+ const MACROBLOCKD *xd,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bsize, int plane) {
+ const int use_hbd = is_cur_buf_hbd(xd);
+ return av1_get_perpixel_variance(cpi, xd, ref, bsize, plane, use_hbd);
+}
+
+void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+ int mi_row, int mi_col, const int num_planes,
+ BLOCK_SIZE bsize) {
+ // Set current frame pointer.
+ x->e_mbd.cur_buf = src;
+
+ // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+ // the static analysis warnings.
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) {
+ const int is_uv = i > 0;
+ setup_pred_plane(
+ &x->plane[i].src, bsize, src->buffers[i], src->crop_widths[is_uv],
+ src->crop_heights[is_uv], src->strides[is_uv], mi_row, mi_col, NULL,
+ x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+/*!\brief Assigns different quantization parameters to each super
+ * block based on its TPL weight.
+ *
+ * \ingroup tpl_modelling
+ *
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in,out] td Thread data structure
+ * \param[in,out] x Macro block level data for this block.
+ * \param[in]     tile_info      Tile information / identification
+ * \param[in] mi_row Block row (in "MI_SIZE" units) index
+ * \param[in] mi_col Block column (in "MI_SIZE" units) index
+ * \param[in]     num_planes     Number of image planes (e.g. Y,U,V)
+ *
+ * \remark No return value but updates macroblock and thread data
+ * related to the q / q delta to be used.
+ */
+static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
+ MACROBLOCK *const x,
+ const TileInfo *const tile_info,
+ int mi_row, int mi_col, int num_planes) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ assert(delta_q_info->delta_q_present_flag);
+
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ // Delta-q modulation based on variance
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
+
+ const int delta_q_res = delta_q_info->delta_q_res;
+ int current_qindex = cm->quant_params.base_qindex;
+ if (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.qp_mode ==
+ DUCKY_ENCODE_FRAME_MODE_QINDEX) {
+ const int sb_row = mi_row >> cm->seq_params->mib_size_log2;
+ const int sb_col = mi_col >> cm->seq_params->mib_size_log2;
+ const int sb_cols =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+ const int sb_index = sb_row * sb_cols + sb_col;
+ current_qindex =
+ cpi->ducky_encode_info.frame_info.superblock_encode_qindex[sb_index];
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) {
+ if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
+ const int block_wavelet_energy_level =
+ av1_block_wavelet_energy_level(cpi, x, sb_size);
+ x->sb_energy_level = block_wavelet_energy_level;
+ current_qindex = av1_compute_q_from_energy_level_deltaq_mode(
+ cpi, block_wavelet_energy_level);
+ } else {
+ const int block_var_level = av1_log_block_var(cpi, x, sb_size);
+ x->sb_energy_level = block_var_level;
+ current_qindex =
+ av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level);
+ }
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_OBJECTIVE &&
+ cpi->oxcf.algo_cfg.enable_tpl_model) {
+ // Setup deltaq based on tpl stats
+ current_qindex =
+ av1_get_q_for_deltaq_objective(cpi, td, NULL, sb_size, mi_row, mi_col);
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI) {
+ current_qindex = av1_get_sbq_perceptual_ai(cpi, sb_size, mi_row, mi_col);
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+ current_qindex = av1_get_sbq_user_rating_based(cpi, mi_row, mi_col);
+ } else if (cpi->oxcf.q_cfg.enable_hdr_deltaq) {
+ current_qindex = av1_get_q_for_hdr(cpi, x, sb_size, mi_row, mi_col);
+ }
+
+ x->rdmult_cur_qindex = current_qindex;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int adjusted_qindex = av1_adjust_q_from_delta_q_res(
+ delta_q_res, xd->current_base_qindex, current_qindex);
+ if (cpi->use_ducky_encode) {
+ assert(adjusted_qindex == current_qindex);
+ }
+ current_qindex = adjusted_qindex;
+
+ x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+ x->rdmult_delta_qindex = x->delta_qindex;
+
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ xd->mi[0]->current_qindex = current_qindex;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0);
+
+ // keep track of any non-zero delta-q used
+ td->deltaq_used |= (x->delta_qindex != 0);
+
+ if (cpi->oxcf.tool_cfg.enable_deltalf_mode) {
+ const int delta_lf_res = delta_q_info->delta_lf_res;
+ const int lfmask = ~(delta_lf_res - 1);
+ const int delta_lf_from_base =
+ ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask);
+ const int8_t delta_lf =
+ (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ const int mib_size = cm->seq_params->mib_size;
+
+    // Pre-set the delta lf for the loop filter. Note that this value is set
+    // before mi is assigned for each block in the current superblock.
+ for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) {
+ for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) {
+ const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k);
+ mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf;
+ }
+ }
+ }
+ }
+}
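+
+// Numeric sketch of the delta-lf derivation above (illustrative values): with
+// x->delta_qindex = 20 and delta_lf_res = 4, lfmask = ~3 and
+// delta_lf_from_base = ((20 / 4) + 2) & ~3 = 7 & ~3 = 4; that is, delta_q / 4
+// rounded to a multiple of delta_lf_res and then clamped to
+// [-MAX_LOOP_FILTER, MAX_LOOP_FILTER].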
+
+static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row,
+ int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCK *x = &td->mb;
+ const int frame_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+ av1_zero(x->tpl_keep_ref_frame);
+
+ if (!av1_tpl_stats_ready(tpl_data, frame_idx)) return;
+ if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return;
+ if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
+
+ const int is_overlay =
+ cpi->ppi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE;
+ if (is_overlay) {
+ memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame));
+ return;
+ }
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ const int tpl_stride = tpl_frame->stride;
+ int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 };
+ const int step = 1 << block_mis_log2;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+ const int mi_row_end =
+ AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ AOMMIN(coded_to_superres_mi(mi_col + mi_size_wide[sb_size],
+ cm->superres_scale_denominator),
+ mi_cols_sr);
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < mi_row_end; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ const TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+ int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 };
+      // Find the winning ref frame idx for the current block
+ int64_t best_inter_cost = this_stats->pred_error[0];
+ int best_rf_idx = 0;
+ for (int idx = 1; idx < INTER_REFS_PER_FRAME; ++idx) {
+ if ((this_stats->pred_error[idx] < best_inter_cost) &&
+ (this_stats->pred_error[idx] != 0)) {
+ best_inter_cost = this_stats->pred_error[idx];
+ best_rf_idx = idx;
+ }
+ }
+ // tpl_pred_error is the pred_error reduction of best_ref w.r.t.
+ // LAST_FRAME.
+ tpl_pred_error[best_rf_idx] = this_stats->pred_error[best_rf_idx] -
+ this_stats->pred_error[LAST_FRAME - 1];
+
+ for (int rf_idx = 1; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx)
+ inter_cost[rf_idx] += tpl_pred_error[rf_idx];
+ }
+ }
+
+ int rank_index[INTER_REFS_PER_FRAME - 1];
+ for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) {
+ rank_index[idx] = idx + 1;
+ for (int i = idx; i > 0; --i) {
+ if (inter_cost[rank_index[i - 1]] > inter_cost[rank_index[i]]) {
+ const int tmp = rank_index[i - 1];
+ rank_index[i - 1] = rank_index[i];
+ rank_index[i] = tmp;
+ }
+ }
+ }
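+  // rank_index now orders the non-LAST references by ascending accumulated
+  // inter_cost, i.e. roughly from the largest to the smallest TPL coding gain
+  // over LAST_FRAME for this superblock.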
+
+ x->tpl_keep_ref_frame[INTRA_FRAME] = 1;
+ x->tpl_keep_ref_frame[LAST_FRAME] = 1;
+
+ int cutoff_ref = 0;
+ for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) {
+ x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 1;
+ if (idx > 2) {
+ if (!cutoff_ref) {
+        // If the predictive coding gain is smaller than that of the previous,
+        // more relevant frame by a certain amount, discard this frame and all
+        // the frames after it.
+ if (llabs(inter_cost[rank_index[idx]]) <
+ llabs(inter_cost[rank_index[idx - 1]]) / 8 ||
+ inter_cost[rank_index[idx]] == 0)
+ cutoff_ref = 1;
+ }
+
+ if (cutoff_ref) x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 0;
+ }
+ }
+}
+
+static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col) {
+ const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+ const int orig_rdmult = cpi->rd.RDMULT;
+
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int gf_group_index = cpi->gf_frame_index;
+ if (cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+ cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 &&
+ cpi->ppi->gf_group.update_type[gf_group_index] == ARF_UPDATE) {
+ const int dr =
+ av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult);
+ x->rdmult = dr;
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_RT_ML_PARTITIONING
+// Get a prediction(stored in x->est_pred) for the whole superblock.
+static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCK *x, int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_key_frame = frame_is_intra_only(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ // TODO(kyslov) Extend to 128x128
+ assert(cm->seq_params->sb_size == BLOCK_64X64);
+
+ av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+
+ if (!is_key_frame) {
+ MB_MODE_INFO *mi = xd->mi[0];
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+
+ assert(yv12 != NULL);
+
+ av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ get_ref_scale_factors(cm, LAST_FRAME), 1);
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE;
+ mi->bsize = BLOCK_64X64;
+ mi->mv[0].as_int = 0;
+ mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+
+ xd->plane[0].dst.buf = x->est_pred;
+ xd->plane[0].dst.stride = 64;
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+ switch (xd->bd) {
+ case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break;
+ case 10:
+ memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0]));
+ break;
+ case 12:
+ memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0]));
+ break;
+ }
+#else
+ memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0]));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+ }
+}
+#endif // CONFIG_RT_ML_PARTITIONING
+
+#define AVG_CDF_WEIGHT_LEFT 3
+#define AVG_CDF_WEIGHT_TOP_RIGHT 1
+
+/*!\brief Encode a superblock (minimal RD search involved)
+ *
+ * \ingroup partition_search
+ * Encodes the superblock with a pre-determined partition pattern; only minor
+ * rd-based searches are allowed to adjust the initial pattern. It is only
+ * used by realtime encoding.
+ */
+static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ const int mi_row, const int mi_col,
+ const int seg_skip) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ PC_TREE *const pc_root = td->pc_root;
+
+#if CONFIG_RT_ML_PARTITIONING
+ if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) {
+ RD_STATS dummy_rdc;
+ get_estimated_pred(cpi, tile_info, x, mi_row, mi_col);
+ av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, pc_root);
+ return;
+ }
+#endif
+ // Set the partition
+ if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
+ (sf->rt_sf.use_fast_fixed_part &&
+ x->content_state_sb.source_sad_nonrd < kMedSad)) {
+ // set a fixed-size partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size;
+ if (sf->rt_sf.use_fast_fixed_part &&
+ x->content_state_sb.source_sad_nonrd < kLowSad) {
+ bsize_select = BLOCK_64X64;
+ }
+ const BLOCK_SIZE bsize = seg_skip ? sb_size : bsize_select;
+ av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ // set a variance-based partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
+ }
+ assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
+ sf->part_sf.partition_search_type == VAR_BASED_PARTITION);
+ set_cb_offsets(td->mb.cb_offset, 0, 0);
+
+ // Initialize the flag to skip cdef to 1.
+ if (sf->rt_sf.skip_cdef_sb) {
+ const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1;
+    // If a 128x128 block is used, we need to set the flag for all four 64x64
+    // sub-blocks.
+ for (int r = 0; r < block64_in_sb; ++r) {
+ for (int c = 0; c < block64_in_sb; ++c) {
+ const int idx_in_sb =
+ r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64;
+ if (mi[idx_in_sb]) mi[idx_in_sb]->cdef_strength = 1;
+ }
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, nonrd_use_partition_time);
+#endif
+ av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+ pc_root);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, nonrd_use_partition_time);
+#endif
+}
+
+// This function initializes the stats for encode_rd_sb.
+static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
+ const TileDataEnc *tile_data,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ RD_STATS *rd_cost, int mi_row, int mi_col,
+ int gather_tpl_data) {
+ const AV1_COMMON *cm = &cpi->common;
+ const TileInfo *tile_info = &tile_data->tile_info;
+ MACROBLOCK *x = &td->mb;
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const int use_simple_motion_search =
+ (sf->part_sf.simple_motion_search_split ||
+ sf->part_sf.simple_motion_search_prune_rect ||
+ sf->part_sf.simple_motion_search_early_term_none ||
+ sf->part_sf.ml_early_term_after_part_split_level) &&
+ !frame_is_intra_only(cm);
+ if (use_simple_motion_search) {
+ av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_root,
+ mi_row, mi_col);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (!(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+ cpi->oxcf.gf_cfg.lag_in_frames == 0)) {
+ init_ref_frame_space(cpi, td, mi_row, mi_col);
+ x->sb_energy_level = 0;
+ x->part_search_info.cnn_output_valid = 0;
+ if (gather_tpl_data) {
+ if (cm->delta_q_info.delta_q_present_flag) {
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes);
+ av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col);
+ }
+
+ // TODO(jingning): revisit this function.
+ if (cpi->oxcf.algo_cfg.enable_tpl_model && (0)) {
+ adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col);
+ }
+ }
+ }
+#else
+ (void)tile_info;
+ (void)mi_row;
+ (void)mi_col;
+ (void)gather_tpl_data;
+#endif
+
+ x->reuse_inter_pred = false;
+ x->txfm_search_params.mode_eval_type = DEFAULT_EVAL;
+ reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+ av1_zero(x->picked_ref_frames_mask);
+ av1_invalid_rd_stats(rd_cost);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void sb_qp_sweep_init_quantizers(AV1_COMP *cpi, ThreadData *td,
+ const TileDataEnc *tile_data,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ RD_STATS *rd_cost, int mi_row,
+ int mi_col, int delta_qp_ofs) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const TileInfo *tile_info = &tile_data->tile_info;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ assert(delta_q_info->delta_q_present_flag);
+ const int delta_q_res = delta_q_info->delta_q_res;
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const int use_simple_motion_search =
+ (sf->part_sf.simple_motion_search_split ||
+ sf->part_sf.simple_motion_search_prune_rect ||
+ sf->part_sf.simple_motion_search_early_term_none ||
+ sf->part_sf.ml_early_term_after_part_split_level) &&
+ !frame_is_intra_only(cm);
+ if (use_simple_motion_search) {
+ av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_tree,
+ mi_row, mi_col);
+ }
+
+ int current_qindex = x->rdmult_cur_qindex + delta_qp_ofs;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ current_qindex = av1_adjust_q_from_delta_q_res(
+ delta_q_res, xd->current_base_qindex, current_qindex);
+
+ x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ xd->mi[0]->current_qindex = current_qindex;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0);
+
+ // keep track of any non-zero delta-q used
+ td->deltaq_used |= (x->delta_qindex != 0);
+
+ if (cpi->oxcf.tool_cfg.enable_deltalf_mode) {
+ const int delta_lf_res = delta_q_info->delta_lf_res;
+ const int lfmask = ~(delta_lf_res - 1);
+ const int delta_lf_from_base =
+ ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask);
+ const int8_t delta_lf =
+ (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ const int mib_size = cm->seq_params->mib_size;
+
+    // Pre-set the delta lf for the loop filter. Note that this value is set
+    // before mi is assigned for each block in the current superblock.
+ for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) {
+ for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) {
+ const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k);
+ mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf;
+ }
+ }
+ }
+ }
+
+ x->reuse_inter_pred = false;
+ x->txfm_search_params.mode_eval_type = DEFAULT_EVAL;
+ reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+ av1_zero(x->picked_ref_frames_mask);
+ av1_invalid_rd_stats(rd_cost);
+}
+
+static int sb_qp_sweep(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ SB_FIRST_PASS_STATS *sb_org_stats) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ RD_STATS rdc_winner, cur_rdc;
+ av1_invalid_rd_stats(&rdc_winner);
+
+ int best_qindex = td->mb.rdmult_delta_qindex;
+ const int start = cm->current_frame.frame_type == KEY_FRAME ? -20 : -12;
+ const int end = cm->current_frame.frame_type == KEY_FRAME ? 20 : 12;
+ const int step = cm->delta_q_info.delta_q_res;
+
+ for (int sweep_qp_delta = start; sweep_qp_delta <= end;
+ sweep_qp_delta += step) {
+ sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_tree, &cur_rdc, mi_row,
+ mi_col, sweep_qp_delta);
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ const int backup_current_qindex =
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+ av1_reset_mbmi(&cm->mi_params, bsize, mi_row, mi_col);
+ av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = backup_current_qindex;
+
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+ &cur_rdc, cur_rdc, td->pc_root, sms_tree, NULL,
+ SB_DRY_PASS, NULL);
+
+ if ((rdc_winner.rdcost > cur_rdc.rdcost) ||
+ (abs(sweep_qp_delta) < abs(best_qindex - x->rdmult_delta_qindex) &&
+ rdc_winner.rdcost == cur_rdc.rdcost)) {
+ rdc_winner = cur_rdc;
+ best_qindex = x->rdmult_delta_qindex + sweep_qp_delta;
+ }
+ }
+
+ return best_qindex;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+/*!\brief Encode a superblock (RD-search-based)
+ *
+ * \ingroup partition_search
+ * Conducts partition search for a superblock, based on rate-distortion costs,
+ * from scratch or adjusting from a pre-calculated partition pattern.
+ */
+static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ const int mi_row, const int mi_col,
+ const int seg_skip) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int num_planes = av1_num_planes(cm);
+ int dummy_rate;
+ int64_t dummy_dist;
+ RD_STATS dummy_rdc;
+ SIMPLE_MOTION_DATA_TREE *const sms_root = td->sms_root;
+
+#if CONFIG_REALTIME_ONLY
+ (void)seg_skip;
+#endif // CONFIG_REALTIME_ONLY
+
+ init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col,
+ 1);
+
+ // Encode the superblock
+ if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ // partition search starting from a variance-based partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_use_partition_time);
+#endif
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+ &dummy_rate, &dummy_dist, 1, td->pc_root);
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0,
+ sf->part_sf.partition_search_type);
+ td->pc_root = NULL;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_use_partition_time);
+#endif
+ }
+#if !CONFIG_REALTIME_ONLY
+ else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
+ // partition search by adjusting a fixed-size partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ const BLOCK_SIZE bsize =
+ seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
+ av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+ &dummy_rate, &dummy_dist, 1, td->pc_root);
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0,
+ sf->part_sf.partition_search_type);
+ td->pc_root = NULL;
+ } else {
+ // The most exhaustive recursive partition search
+ SuperBlockEnc *sb_enc = &x->sb_enc;
+ // No stats for overlay frames. Exclude key frame.
+ av1_get_tpl_stats_sb(cpi, sb_size, mi_row, mi_col, sb_enc);
+
+ // Reset the tree for simple motion search data
+ av1_reset_simple_motion_tree_partition(sms_root, sb_size);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_partition_time);
+#endif
+
+ // Estimate the maximum square partition block size, which will be used
+ // as the starting block size for partitioning the sb
+ set_max_min_partition_size(sb_enc, cpi, x, sf, sb_size, mi_row, mi_col);
+
+ // The superblock can be searched only once, or twice consecutively for
+ // better quality. Note that the meaning of passes here is different from
+ // the general concept of 1-pass/2-pass encoders.
+ const int num_passes =
+ cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 2 : 1;
+
+ if (cpi->oxcf.sb_qp_sweep &&
+ !(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+ cpi->oxcf.gf_cfg.lag_in_frames == 0) &&
+ cm->delta_q_info.delta_q_present_flag) {
+ AOM_CHECK_MEM_ERROR(
+ x->e_mbd.error_info, td->mb.sb_stats_cache,
+ (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_stats_cache)));
+ av1_backup_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row,
+ mi_col);
+ assert(x->rdmult_delta_qindex == x->delta_qindex);
+
+ const int best_qp_diff =
+ sb_qp_sweep(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, sms_root,
+ td->mb.sb_stats_cache) -
+ x->rdmult_delta_qindex;
+
+ sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_root, &dummy_rdc,
+ mi_row, mi_col, best_qp_diff);
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ const int backup_current_qindex =
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+ av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
+ av1_restore_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row,
+ mi_col);
+
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
+ backup_current_qindex;
+ aom_free(td->mb.sb_stats_cache);
+ td->mb.sb_stats_cache = NULL;
+ }
+ if (num_passes == 1) {
+#if CONFIG_PARTITION_SEARCH_ORDER
+ if (cpi->ext_part_controller.ready && !frame_is_intra_only(cm)) {
+ av1_reset_part_sf(&cpi->sf.part_sf);
+ av1_reset_sf_for_ext_part(cpi);
+ RD_STATS this_rdc;
+ av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row,
+ mi_col, sb_size, &this_rdc);
+ } else {
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root,
+ NULL, SB_SINGLE_PASS, NULL);
+ }
+#else
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
+ SB_SINGLE_PASS, NULL);
+#endif // CONFIG_PARTITION_SEARCH_ORDER
+ } else {
+ // First pass
+ AOM_CHECK_MEM_ERROR(
+ x->e_mbd.error_info, td->mb.sb_fp_stats,
+ (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_fp_stats)));
+ av1_backup_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row,
+ mi_col);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
+ SB_DRY_PASS, NULL);
+
+ // Second pass
+ init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row,
+ mi_col, 0);
+ av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
+ av1_reset_simple_motion_tree_partition(sms_root, sb_size);
+
+ av1_restore_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row,
+ mi_col);
+
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
+ SB_WET_PASS, NULL);
+ aom_free(td->mb.sb_fp_stats);
+ td->mb.sb_fp_stats = NULL;
+ }
+
+    // Reset to 0 so that it is not mistakenly used elsewhere.
+ sb_enc->tpl_data_count = 0;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_partition_time);
+#endif
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ // Update the inter rd model
+ // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
+ cm->tiles.cols == 1 && cm->tiles.rows == 1) {
+ av1_inter_mode_data_fit(tile_data, x->rdmult);
+ }
+}
+
+// Check if the cost update level of the mode, coeff, and dv symbols is
+// tile-level or off.
+static AOM_INLINE int is_mode_coeff_dv_upd_freq_tile_or_off(
+ const AV1_COMP *const cpi) {
+ const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
+
+ return (inter_sf->coeff_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
+ inter_sf->mode_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
+ cpi->sf.intra_sf.dv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+}
+
+// When row-mt is enabled and the cost update frequencies are set to off/tile,
+// processing of the current SB can start even before processing of the
+// top-right SB is finished. This function checks whether it is sufficient to
+// wait only for the top SB to finish processing before the current SB starts.
+static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) {
+ const MODE mode = cpi->oxcf.mode;
+ if (mode == GOOD) return 0;
+
+ if (mode == ALLINTRA)
+ return is_mode_coeff_dv_upd_freq_tile_or_off(cpi);
+ else if (mode == REALTIME)
+ return (is_mode_coeff_dv_upd_freq_tile_or_off(cpi) &&
+ cpi->sf.inter_sf.mv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+ else
+ return 0;
+}
+
+/*!\brief Calculate source SAD at superblock level using 64x64 block source SAD
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+static AOM_INLINE uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row,
+ int mi_col) {
+ if (cpi->src_sad_blk_64x64 == NULL) return UINT64_MAX;
+
+ const AV1_COMMON *const cm = &cpi->common;
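+  // A 128x128 superblock spans a 2x2 grid of 64x64 SAD blocks; a 64x64
+  // superblock maps 1:1 onto a single entry.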
+ const int blk_64x64_in_mis = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int num_blk_64x64_cols =
+ (cm->mi_params.mi_cols + blk_64x64_in_mis - 1) / blk_64x64_in_mis;
+ const int num_blk_64x64_rows =
+ (cm->mi_params.mi_rows + blk_64x64_in_mis - 1) / blk_64x64_in_mis;
+ const int blk_64x64_col_index = mi_col / blk_64x64_in_mis;
+ const int blk_64x64_row_index = mi_row / blk_64x64_in_mis;
+ uint64_t curr_sb_sad = UINT64_MAX;
+ const uint64_t *const src_sad_blk_64x64_data =
+ &cpi->src_sad_blk_64x64[blk_64x64_col_index +
+ blk_64x64_row_index * num_blk_64x64_cols];
+ if (cm->seq_params->sb_size == BLOCK_128X128 &&
+ blk_64x64_col_index + 1 < num_blk_64x64_cols &&
+ blk_64x64_row_index + 1 < num_blk_64x64_rows) {
+ // Calculate SB source SAD by accumulating source SAD of 64x64 blocks in the
+ // superblock
+ curr_sb_sad = src_sad_blk_64x64_data[0] + src_sad_blk_64x64_data[1] +
+ src_sad_blk_64x64_data[num_blk_64x64_cols] +
+ src_sad_blk_64x64_data[num_blk_64x64_cols + 1];
+ } else if (cm->seq_params->sb_size == BLOCK_64X64) {
+ curr_sb_sad = src_sad_blk_64x64_data[0];
+ }
+ return curr_sb_sad;
+}
+
+/*!\brief Determine whether grading content can be skipped based on SAD stats
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+static AOM_INLINE bool is_calc_src_content_needed(AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ int mi_row, int mi_col) {
+ if (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+ return true;
+ const uint64_t curr_sb_sad = get_sb_source_sad(cpi, mi_row, mi_col);
+ if (curr_sb_sad == UINT64_MAX) return true;
+ if (curr_sb_sad == 0) {
+ x->content_state_sb.source_sad_nonrd = kZeroSad;
+ return false;
+ }
+ AV1_COMMON *const cm = &cpi->common;
+ bool do_calc_src_content = true;
+
+ if (cpi->oxcf.speed < 9) return do_calc_src_content;
+
+ // TODO(yunqing): Tune/validate the thresholds for 128x128 SB size.
+ if (AOMMIN(cm->width, cm->height) < 360) {
+    // Derive the average 64x64 block source SAD from the SB source SAD.
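+    // For 128x128 SBs, curr_sb_sad is the sum of four 64x64 SADs, so
+    // (curr_sb_sad + 2) >> 2 is the rounded average.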
+ const uint64_t avg_64x64_blk_sad =
+ (cm->seq_params->sb_size == BLOCK_128X128) ? ((curr_sb_sad + 2) >> 2)
+ : curr_sb_sad;
+
+    // The thresholds are determined based on the kLowSad and kHighSad
+    // thresholds and test results.
+ const uint64_t thresh_low = 15000;
+ const uint64_t thresh_high = 40000;
+
+ if (avg_64x64_blk_sad > thresh_low && avg_64x64_blk_sad < thresh_high) {
+ do_calc_src_content = false;
+ // Note: set x->content_state_sb.source_sad_rd as well if this is extended
+ // to RTC rd path.
+ x->content_state_sb.source_sad_nonrd = kMedSad;
+ }
+ }
+
+ return do_calc_src_content;
+}
+
+/*!\brief Determine whether grading content is needed based on speed features
+ * and frame stats
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+// TODO(any): consolidate sfs to make interface cleaner
+static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cm->current_frame.frame_type == KEY_FRAME ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) {
+ assert(x->content_state_sb.source_sad_nonrd == kMedSad);
+ assert(x->content_state_sb.source_sad_rd == kMedSad);
+ return;
+ }
+ bool calc_src_content = false;
+
+ if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
+ if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0) {
+ calc_src_content = is_calc_src_content_needed(cpi, x, mi_row, mi_col);
+ } else {
+ x->content_state_sb.source_sad_nonrd = kZeroSad;
+ }
+ } else if ((cpi->sf.rt_sf.var_part_based_on_qidx >= 1) &&
+ (cm->width * cm->height <= 352 * 288)) {
+ if (cpi->rc.frame_source_sad > 0)
+ calc_src_content = true;
+ else
+ x->content_state_sb.source_sad_rd = kZeroSad;
+ }
+ if (calc_src_content)
+ av1_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
+}
+
+/*!\brief Encode a superblock row by breaking it into superblocks
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Do partition and mode search for an sb row: one row of superblocks filling up
+ * the width of the current tile.
+ */
+static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, int mi_row,
+ TokenExtra **tp) {
+ AV1_COMMON *const cm = &cpi->common;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+ bool row_mt_enabled = mt_info->row_mt_enabled;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int mib_size = cm->seq_params->mib_size;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+ const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_row_time);
+#endif
+
+ // Initialize the left context for the new SB row
+ av1_zero_left_context(xd);
+
+  // Reset delta for quantizer and loop filter at the beginning of every tile
+ if (mi_row == tile_info->mi_row_start || row_mt_enabled) {
+ if (cm->delta_q_info.delta_q_present_flag)
+ xd->current_base_qindex = cm->quant_params.base_qindex;
+ if (cm->delta_q_info.delta_lf_present_flag) {
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+
+ reset_thresh_freq_fact(x);
+
+ // Code each SB in the row
+ for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0;
+ mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) {
+    // In realtime/allintra mode, when the frequency of cost updates is
+    // off/tile, wait for the top superblock to finish encoding. Otherwise,
+    // wait for the top-right superblock to finish encoding.
+ enc_row_mt->sync_read_ptr(
+ row_mt_sync, sb_row, sb_col_in_tile - delay_wait_for_top_right_sb(cpi));
+
+#if CONFIG_MULTITHREAD
+ if (row_mt_enabled) {
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ const bool row_mt_exit = enc_row_mt->row_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (row_mt_exit) return;
+ }
+#endif
+
+ const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled;
+ if (update_cdf && (tile_info->mi_row_start != mi_row)) {
+      if (tile_info->mi_col_start == mi_col) {
+ // restore frame context at the 1st column sb
+ memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx));
+ } else {
+ // update context
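+        // Blend the running left context with the CDF context saved by the
+        // SB row above: prefer the top-right SB's context when one exists in
+        // this tile, otherwise fall back to the top SB's.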
+ int wt_left = AVG_CDF_WEIGHT_LEFT;
+ int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT;
+ if (tile_info->mi_col_end > (mi_col + mib_size))
+ av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile,
+ wt_left, wt_tr);
+ else
+ av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1,
+ wt_left, wt_tr);
+ }
+ }
+
+ // Update the rate cost tables for some symbols
+ av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col);
+
+ // Reset color coding related parameters
+ av1_zero(x->color_sensitivity_sb);
+ av1_zero(x->color_sensitivity_sb_g);
+ av1_zero(x->color_sensitivity_sb_alt);
+ av1_zero(x->color_sensitivity);
+ x->content_state_sb.source_sad_nonrd = kMedSad;
+ x->content_state_sb.source_sad_rd = kMedSad;
+ x->content_state_sb.lighting_change = 0;
+ x->content_state_sb.low_sumdiff = 0;
+ x->force_zeromv_skip_for_sb = 0;
+ x->sb_me_block = 0;
+ x->sb_me_partition = 0;
+ x->sb_me_mv.as_int = 0;
+
+ if (cpi->oxcf.mode == ALLINTRA) {
+ x->intra_sb_rdmult_modifier = 128;
+ }
+
+ xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv;
+ x->source_variance = UINT_MAX;
+ td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
+
+ // Get segment id and skip flag
+ const struct segmentation *const seg = &cm->seg;
+ int seg_skip = 0;
+ if (seg->enabled) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+ const uint8_t segment_id =
+ map ? get_segment_id(&cm->mi_params, map, sb_size, mi_row, mi_col)
+ : 0;
+ seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ }
+
+ produce_gradients_for_sb(cpi, x, sb_size, mi_row, mi_col);
+
+ init_src_var_info_of_4x4_sub_blocks(cpi, x->src_var_info_of_4x4_sub_blocks,
+ sb_size);
+
+    // Grade the temporal variation of the SB; the grade will be used to
+    // decide the fast mode search strategy for coding blocks.
+ grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
+
+ // encode the superblock
+ if (use_nonrd_mode) {
+ encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip);
+ } else {
+ encode_rd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip);
+ }
+
+ // Update the top-right context in row_mt coding
+ if (update_cdf && (tile_info->mi_row_end > (mi_row + mib_size))) {
+ if (sb_cols_in_tile == 1)
+ memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx));
+ else if (sb_col_in_tile >= 1)
+ memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx,
+ sizeof(*xd->tile_ctx));
+ }
+ enc_row_mt->sync_write_ptr(row_mt_sync, sb_row, sb_col_in_tile,
+ sb_cols_in_tile);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_row_time);
+#endif
+}
+
+static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+  // Copy data over into macroblock data structures.
+ av1_setup_src_planes(x, cpi->source, 0, 0, num_planes,
+ cm->seq_params->sb_size);
+
+ av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y, num_planes);
+}
+
+void av1_alloc_tile_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+
+ av1_row_mt_mem_dealloc(cpi);
+
+ aom_free(cpi->tile_data);
+ cpi->allocated_tiles = 0;
+ enc_row_mt->allocated_tile_cols = 0;
+ enc_row_mt->allocated_tile_rows = 0;
+
+ CHECK_MEM_ERROR(
+ cm, cpi->tile_data,
+ aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+
+ cpi->allocated_tiles = tile_cols * tile_rows;
+ enc_row_mt->allocated_tile_cols = tile_cols;
+ enc_row_mt->allocated_tile_rows = tile_rows;
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ const int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+ av1_zero(this_tile->row_mt_sync);
+ this_tile->row_ctx = NULL;
+ }
+ }
+}
+
+void av1_init_tile_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int tile_col, tile_row;
+ TokenInfo *const token_info = &cpi->token_info;
+ TokenExtra *pre_tok = token_info->tile_tok[0][0];
+ TokenList *tplist = token_info->tplist[0][0];
+ unsigned int tile_tok = 0;
+ int tplist_count = 0;
+
+ if (!is_stat_generation_stage(cpi) &&
+ cm->features.allow_screen_content_tools) {
+ // Number of tokens for which token info needs to be allocated.
+ unsigned int tokens_required =
+ get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols,
+ MAX_SB_SIZE_LOG2, num_planes);
+    // Allocate/reallocate memory for token-related info if the number of
+    // tokens required is more than the number of tokens already allocated.
+    // This could occur in the following cases:
+    // 1) The memory has not yet been allocated.
+    // 2) The frame dimensions have changed.
+ const bool realloc_tokens = tokens_required > token_info->tokens_allocated;
+ if (realloc_tokens) {
+ free_token_info(token_info);
+ alloc_token_info(cm, token_info, tokens_required);
+ pre_tok = token_info->tile_tok[0][0];
+ tplist = token_info->tplist[0][0];
+ }
+ }
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ TileInfo *const tile_info = &tile_data->tile_info;
+ av1_tile_init(tile_info, cm, tile_row, tile_col);
+ tile_data->firstpass_top_mv = kZeroMv;
+ tile_data->abs_sum_level = 0;
+
+ if (is_token_info_allocated(token_info)) {
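+        // Carve this tile's slice out of the shared token buffer: the tile
+        // starts where the previous tile's worst-case allocation ended.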
+ token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
+ pre_tok = token_info->tile_tok[tile_row][tile_col];
+ tile_tok = allocated_tokens(
+ tile_info, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2,
+ num_planes);
+ token_info->tplist[tile_row][tile_col] = tplist + tplist_count;
+ tplist = token_info->tplist[tile_row][tile_col];
+ tplist_count = av1_get_sb_rows_in_tile(cm, tile_info);
+ }
+ tile_data->allow_update_cdf = !cm->tiles.large_scale;
+ tile_data->allow_update_cdf = tile_data->allow_update_cdf &&
+ !cm->features.disable_cdf_update &&
+ !delay_wait_for_top_right_sb(cpi);
+ tile_data->tctx = *cm->fc;
+ }
+ }
+}
+
+// Populate the start palette token info prior to encoding an SB row.
+static AOM_INLINE void get_token_start(AV1_COMP *cpi, const TileInfo *tile_info,
+ int tile_row, int tile_col, int mi_row,
+ TokenExtra **tp) {
+ const TokenInfo *token_info = &cpi->token_info;
+ if (!is_token_info_allocated(token_info)) return;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TokenList *const tplist = cpi->token_info.tplist[tile_row][tile_col];
+ const int sb_row_in_tile =
+ (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
+
+ get_start_tok(cpi, tile_row, tile_col, mi_row, tp,
+ cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes);
+ assert(tplist != NULL);
+ tplist[sb_row_in_tile].start = *tp;
+}
+
+// Populate the token count after encoding an SB row.
+static AOM_INLINE void populate_token_count(AV1_COMP *cpi,
+ const TileInfo *tile_info,
+ int tile_row, int tile_col,
+ int mi_row, TokenExtra *tok) {
+ const TokenInfo *token_info = &cpi->token_info;
+ if (!is_token_info_allocated(token_info)) return;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TokenList *const tplist = token_info->tplist[tile_row][tile_col];
+ const int sb_row_in_tile =
+ (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
+ const int tile_mb_cols =
+ (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+ const int num_mb_rows_in_sb =
+ ((1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
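+  // The count is the number of tokens actually written for this SB row; the
+  // assert below checks it stays within the worst-case allocation.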
+ tplist[sb_row_in_tile].count =
+ (unsigned int)(tok - tplist[sb_row_in_tile].start);
+
+ assert((unsigned int)(tok - tplist[sb_row_in_tile].start) <=
+ get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
+ cm->seq_params->mib_size_log2 + MI_SIZE_LOG2,
+ num_planes));
+
+ (void)num_planes;
+ (void)tile_mb_cols;
+ (void)num_mb_rows_in_sb;
+}
+
+/*!\brief Encode a superblock row
+ *
+ * \ingroup partition_search
+ */
+void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col, int mi_row) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ TokenExtra *tok = NULL;
+
+ get_token_start(cpi, tile_info, tile_row, tile_col, mi_row, &tok);
+
+ encode_sb_row(cpi, td, this_tile, mi_row, &tok);
+
+ populate_token_count(cpi, tile_info, tile_row, tile_col, mi_row, tok);
+}
+
+/*!\brief Encode a tile
+ *
+ * \ingroup partition_search
+ */
+void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+
+ if (!cpi->sf.rt_sf.use_nonrd_pick_mode) av1_inter_mode_data_init(this_tile);
+
+ av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
+ tile_info->mi_col_end, tile_row);
+ av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
+ &td->mb.e_mbd);
+
+ if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra)
+ cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
+
+ if (td->mb.txfm_search_info.mb_rd_record != NULL) {
+ av1_crc32c_calculator_init(
+ &td->mb.txfm_search_info.mb_rd_record->crc_calculator);
+ }
+
+ for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+ mi_row += cm->seq_params->mib_size) {
+ av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
+ }
+ this_tile->abs_sum_level = td->abs_sum_level;
+}
+
+/*!\brief Break one frame into tiles and encode the tiles
+ *
+ * \ingroup partition_search
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+static AOM_INLINE void encode_tiles(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int tile_col, tile_row;
+
+ MACROBLOCK *const mb = &cpi->td.mb;
+ assert(IMPLIES(cpi->tile_data == NULL,
+ cpi->allocated_tiles < tile_cols * tile_rows));
+ if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi);
+
+ av1_init_tile_data(cpi);
+ av1_alloc_mb_data(cpi, mb);
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
+ cpi->td.intrabc_used = 0;
+ cpi->td.deltaq_used = 0;
+ cpi->td.abs_sum_level = 0;
+ cpi->td.rd_counts.seg_tmp_pred_cost[0] = 0;
+ cpi->td.rd_counts.seg_tmp_pred_cost[1] = 0;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+ cpi->td.mb.tile_pb_ctx = &this_tile->tctx;
+ av1_init_rtc_counters(&cpi->td.mb);
+ cpi->td.mb.palette_pixels = 0;
+ av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+ if (!frame_is_intra_only(&cpi->common))
+ av1_accumulate_rtc_counters(cpi, &cpi->td.mb);
+ cpi->palette_pixel_num += cpi->td.mb.palette_pixels;
+ cpi->intrabc_used |= cpi->td.intrabc_used;
+ cpi->deltaq_used |= cpi->td.deltaq_used;
+ }
+ }
+
+ av1_dealloc_mb_data(mb, av1_num_planes(cm));
+}
+
+// Set the relative distance of a reference frame w.r.t. the current frame.
+static AOM_INLINE void set_rel_frame_dist(
+ const AV1_COMMON *const cm, RefFrameDistanceInfo *const ref_frame_dist_info,
+ const int ref_frame_flags) {
+ MV_REFERENCE_FRAME ref_frame;
+ int min_past_dist = INT32_MAX, min_future_dist = INT32_MAX;
+ ref_frame_dist_info->nearest_past_ref = NONE_FRAME;
+ ref_frame_dist_info->nearest_future_ref = NONE_FRAME;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = 0;
+ if (ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ int dist = av1_encoder_get_relative_dist(
+ cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME],
+ cm->current_frame.display_order_hint);
+ ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = dist;
+ // Get the nearest ref_frame in the past
+ if (abs(dist) < min_past_dist && dist < 0) {
+ ref_frame_dist_info->nearest_past_ref = ref_frame;
+ min_past_dist = abs(dist);
+ }
+ // Get the nearest ref_frame in the future
+ if (dist < min_future_dist && dist > 0) {
+ ref_frame_dist_info->nearest_future_ref = ref_frame;
+ min_future_dist = dist;
+ }
+ }
+ }
+}
+
+static INLINE int refs_are_one_sided(const AV1_COMMON *cm) {
+ assert(!frame_is_intra_only(cm));
+
+ int one_sided_refs = 1;
+ const int cur_display_order_hint = cm->current_frame.display_order_hint;
+ for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+ if (buf == NULL) continue;
+ if (av1_encoder_get_relative_dist(buf->display_order_hint,
+ cur_display_order_hint) > 0) {
+ one_sided_refs = 0; // bwd reference
+ break;
+ }
+ }
+ return one_sided_refs;
+}
+
+static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm,
+ int ref_order_hint[2]) {
+ const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
+ ref_order_hint[0] = ref_order_hint[1] = 0;
+ if (!skip_mode_info->skip_mode_allowed) return;
+
+ const RefCntBuffer *const buf_0 =
+ get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0);
+ const RefCntBuffer *const buf_1 =
+ get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1);
+ assert(buf_0 != NULL && buf_1 != NULL);
+
+ ref_order_hint[0] = buf_0->order_hint;
+ ref_order_hint[1] = buf_1->order_hint;
+}
+
+static int check_skip_mode_enabled(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ av1_setup_skip_mode_allowed(cm);
+ if (!cm->current_frame.skip_mode_info.skip_mode_allowed) return 0;
+
+  // Turn off skip mode if the temporal distances of the reference pair to the
+  // current frame differ by more than 1 frame.
+ const int cur_offset = (int)cm->current_frame.order_hint;
+ int ref_offset[2];
+ get_skip_mode_ref_offsets(cm, ref_offset);
+ const int cur_to_ref0 = get_relative_dist(&cm->seq_params->order_hint_info,
+ cur_offset, ref_offset[0]);
+ const int cur_to_ref1 = abs(get_relative_dist(
+ &cm->seq_params->order_hint_info, cur_offset, ref_offset[1]));
+ if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0;
+
+ // High Latency: Turn off skip mode if all refs are fwd.
+ if (cpi->all_one_sided_refs && cpi->oxcf.gf_cfg.lag_in_frames > 0) return 0;
+
+ const int ref_frame[2] = {
+ cm->current_frame.skip_mode_info.ref_frame_idx_0 + LAST_FRAME,
+ cm->current_frame.skip_mode_info.ref_frame_idx_1 + LAST_FRAME
+ };
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[0]]) ||
+ !(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]]))
+ return 0;
+
+ return 1;
+}
+
+static AOM_INLINE void set_default_interp_skip_flags(
+ const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) {
+ const int num_planes = av1_num_planes(cm);
+ interp_search_flags->default_interp_skip_flags =
+ (num_planes == 1) ? INTERP_SKIP_LUMA_EVAL_CHROMA
+ : INTERP_SKIP_LUMA_SKIP_CHROMA;
+}
+
+static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) {
+ if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp ||
+ cpi->sf.inter_sf.disable_onesided_comp) &&
+ cpi->all_one_sided_refs) {
+ // Disable all compound references
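+    // The mask sets bits [REF_FRAMES, MODE_CTX_REF_FRAMES), i.e. every
+    // compound reference index.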
+ cpi->prune_ref_frame_mask = (1 << MODE_CTX_REF_FRAMES) - (1 << REF_FRAMES);
+ } else if (!cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ cpi->sf.inter_sf.selective_ref_frame >= 2) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int cur_frame_display_order_hint =
+ cm->current_frame.display_order_hint;
+ unsigned int *ref_display_order_hint =
+ cm->cur_frame->ref_display_order_hint;
+ const int arf2_dist = av1_encoder_get_relative_dist(
+ ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME],
+ cur_frame_display_order_hint);
+ const int bwd_dist = av1_encoder_get_relative_dist(
+ ref_display_order_hint[BWDREF_FRAME - LAST_FRAME],
+ cur_frame_display_order_hint);
+
+ for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) {
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_idx);
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) ||
+ !(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]])) {
+ continue;
+ }
+
+ if (!cpi->all_one_sided_refs) {
+ int ref_dist[2];
+ for (int i = 0; i < 2; ++i) {
+ ref_dist[i] = av1_encoder_get_relative_dist(
+ ref_display_order_hint[rf[i] - LAST_FRAME],
+ cur_frame_display_order_hint);
+ }
+
+ // One-sided compound is used only when all reference frames are
+ // one-sided.
+ if ((ref_dist[0] > 0) == (ref_dist[1] > 0)) {
+ cpi->prune_ref_frame_mask |= 1 << ref_idx;
+ }
+ }
+
+ if (cpi->sf.inter_sf.selective_ref_frame >= 4 &&
+ (rf[0] == ALTREF2_FRAME || rf[1] == ALTREF2_FRAME) &&
+ (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) {
+ // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references.
+ if (arf2_dist > 0 && bwd_dist > 0 && bwd_dist <= arf2_dist) {
+ // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer
+ // reference to the current frame than ALTREF2_FRAME
+ cpi->prune_ref_frame_mask |= 1 << ref_idx;
+ }
+ }
+ }
+ }
+}
+
+static int allow_deltaq_mode(AV1_COMP *cpi) {
+#if !CONFIG_REALTIME_ONLY
+ AV1_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ int sbs_wide = mi_size_wide[sb_size];
+ int sbs_high = mi_size_high[sb_size];
+
+ int64_t delta_rdcost = 0;
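+  // Accumulate the estimated rdcost change from objective delta-q over every
+  // superblock; enable delta-q only if the net change is a saving.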
+ for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sbs_high) {
+ for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sbs_wide) {
+ int64_t this_delta_rdcost = 0;
+ av1_get_q_for_deltaq_objective(cpi, &cpi->td, &this_delta_rdcost, sb_size,
+ mi_row, mi_col);
+ delta_rdcost += this_delta_rdcost;
+ }
+ }
+ return delta_rdcost < 0;
+#else
+ (void)cpi;
+ return 1;
+#endif // !CONFIG_REALTIME_ONLY
+}
+
+#define FORCE_ZMV_SKIP_128X128_BLK_DIFF 10000
+#define FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF 4
+
+// Populates block level thresholds for force zeromv-skip decision
+static void populate_thresh_to_force_zeromv_skip(AV1_COMP *cpi) {
+ if (cpi->sf.rt_sf.part_early_exit_zeromv == 0) return;
+
+  // The thresholds for forcing the zeromv-skip decision are as below:
+  // For 128x128 blocks, the threshold is 10000 and the per pixel threshold is
+  // 0.6103. For 64x64 blocks, the threshold is 5000 and the per pixel
+  // threshold is 1.221, allowing slightly higher error for smaller blocks.
+  //   per_pixel_thresh(64x64) / per_pixel_thresh(128x128)
+  //     = sqrt(area(64x64) / area(128x128)) = sqrt(1/4) = 1/2
+  // Thus, the per pixel thresholds for blocks of size 32x32, 16x16, ... can
+  // be chosen as 2.442, 4.884, .... As the per pixel error tends to be higher
+  // for small blocks, these thresholds are clipped to 4.
+ const unsigned int thresh_exit_128x128_part = FORCE_ZMV_SKIP_128X128_BLK_DIFF;
+ const int num_128x128_pix =
+ block_size_wide[BLOCK_128X128] * block_size_high[BLOCK_128X128];
+
+ for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+ const int num_block_pix = block_size_wide[bsize] * block_size_high[bsize];
+
+ // Calculate the threshold for zeromv-skip decision based on area of the
+ // partition
+ unsigned int thresh_exit_part_blk =
+ (unsigned int)(thresh_exit_128x128_part *
+ sqrt((double)num_block_pix / num_128x128_pix) +
+ 0.5);
+ thresh_exit_part_blk = AOMMIN(
+ thresh_exit_part_blk,
+ (unsigned int)(FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF * num_block_pix));
+ cpi->zeromv_skip_thresh_exit_part[bsize] = thresh_exit_part_blk;
+ }
+}
+
+static void free_block_hash_buffers(uint32_t *block_hash_values[2][2],
+ int8_t *is_block_same[2][3]) {
+ for (int k = 0; k < 2; ++k) {
+ for (int j = 0; j < 2; ++j) {
+ aom_free(block_hash_values[k][j]);
+ }
+
+ for (int j = 0; j < 3; ++j) {
+ aom_free(is_block_same[k][j]);
+ }
+ }
+}
+
+/*!\brief Encoder setup (only for the current frame), encoding, and
+ * reconstruction for a single frame
+ *
+ * \ingroup high_level_algo
+ */
+static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ FeatureFlags *const features = &cm->features;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+#if CONFIG_FPMT_TEST
+ FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs;
+ FrameProbInfo *const temp_frame_probs_simulation =
+ &cpi->ppi->temp_frame_probs_simulation;
+#endif
+ FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+ IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const DELTAQ_MODE deltaq_mode = oxcf->q_cfg.deltaq_mode;
+ int i;
+
+ if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ mi_params->setup_mi(mi_params);
+ }
+
+ set_mi_offsets(mi_params, xd, 0, 0);
+
+ av1_zero(*td->counts);
+ av1_zero(rdc->tx_type_used);
+ av1_zero(rdc->obmc_used);
+ av1_zero(rdc->warped_used);
+ av1_zero(rdc->seg_tmp_pred_cost);
+
+ // Reset the flag.
+ cpi->intrabc_used = 0;
+ // Need to disable intrabc when superres is selected
+ if (av1_superres_scaled(cm)) {
+ features->allow_intrabc = 0;
+ }
+
+ features->allow_intrabc &= (oxcf->kf_cfg.enable_intrabc);
+
+ if (features->allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int warped_probability =
+#if CONFIG_FPMT_TEST
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE
+ ? temp_frame_probs->warped_probs[update_type]
+ :
+#endif // CONFIG_FPMT_TEST
+ frame_probs->warped_probs[update_type];
+ if (warped_probability < cpi->sf.inter_sf.prune_warped_prob_thresh)
+ features->allow_warped_motion = 0;
+ }
+
+ int hash_table_created = 0;
+ if (!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) &&
+ !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ // TODO(any): move this outside of the recoding loop to avoid recalculating
+ // the hash table.
+ // add to hash table
+ const int pic_width = cpi->source->y_crop_width;
+ const int pic_height = cpi->source->y_crop_height;
+ uint32_t *block_hash_values[2][2] = { { NULL } };
+ int8_t *is_block_same[2][3] = { { NULL } };
+ int k, j;
+ bool error = false;
+
+ for (k = 0; k < 2 && !error; ++k) {
+ for (j = 0; j < 2; ++j) {
+ block_hash_values[k][j] = (uint32_t *)aom_malloc(
+ sizeof(*block_hash_values[0][0]) * pic_width * pic_height);
+ if (!block_hash_values[k][j]) {
+ error = true;
+ break;
+ }
+ }
+
+ for (j = 0; j < 3 && !error; ++j) {
+ is_block_same[k][j] = (int8_t *)aom_malloc(
+ sizeof(*is_block_same[0][0]) * pic_width * pic_height);
+ if (!is_block_same[k][j]) error = true;
+ }
+ }
+
+ av1_hash_table_init(intrabc_hash_info);
+ if (error ||
+ !av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) {
+ free_block_hash_buffers(block_hash_values, is_block_same);
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating intrabc_hash_table and buffers");
+ }
+ hash_table_created = 1;
+ av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source,
+ block_hash_values[0], is_block_same[0]);
+    // Hash data generated for screen content is used for intraBC ME.
+ const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize];
+ const int max_sb_size =
+ (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2));
+ int src_idx = 0;
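+    // Build hash values bottom-up: hashes for size 2*N blocks are derived
+    // from the size-N hashes, ping-ponging between the two buffer sets.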
+ for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) {
+ const int dst_idx = !src_idx;
+ av1_generate_block_hash_value(
+ intrabc_hash_info, cpi->source, size, block_hash_values[src_idx],
+ block_hash_values[dst_idx], is_block_same[src_idx],
+ is_block_same[dst_idx]);
+ if (size >= min_alloc_size) {
+ if (!av1_add_to_hash_map_by_row_with_precal_data(
+ &intrabc_hash_info->intrabc_hash_table,
+ block_hash_values[dst_idx], is_block_same[dst_idx][2],
+ pic_width, pic_height, size)) {
+ error = true;
+ break;
+ }
+ }
+ }
+
+ free_block_hash_buffers(block_hash_values, is_block_same);
+
+ if (error) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error adding data to intrabc_hash_table");
+ }
+ }
+
+ const CommonQuantParams *quant_params = &cm->quant_params;
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
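+    // A segment is lossless only when its effective qindex and every DC/AC
+    // delta-q offset are zero.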
+ const int qindex =
+ cm->seg.enabled ? av1_get_qindex(&cm->seg, i, quant_params->base_qindex)
+ : quant_params->base_qindex;
+ xd->lossless[i] =
+ qindex == 0 && quant_params->y_dc_delta_q == 0 &&
+ quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 &&
+ quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0;
+ if (xd->lossless[i]) cpi->enc_seg.has_lossless_segment = 1;
+ xd->qindex[i] = qindex;
+ if (xd->lossless[i]) {
+ cpi->optimize_seg_arr[i] = NO_TRELLIS_OPT;
+ } else {
+ cpi->optimize_seg_arr[i] = cpi->sf.rd_sf.optimize_coefficients;
+ }
+ }
+ features->coded_lossless = is_coded_lossless(cm, xd);
+ features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm);
+
+  // Fix the delta q resolution for the moment.
+  cm->delta_q_info.delta_q_res = 0;
+ if (cpi->use_ducky_encode) {
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_DUCKY_ENCODE;
+ } else if (cpi->oxcf.q_cfg.aq_mode != CYCLIC_REFRESH_AQ) {
+ if (deltaq_mode == DELTA_Q_OBJECTIVE)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE;
+ else if (deltaq_mode == DELTA_Q_PERCEPTUAL)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ else if (deltaq_mode == DELTA_Q_PERCEPTUAL_AI)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ else if (deltaq_mode == DELTA_Q_USER_RATING_BASED)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ else if (deltaq_mode == DELTA_Q_HDR)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ // Set delta_q_present_flag before it is used for the first time
+ cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES;
+ cm->delta_q_info.delta_q_present_flag = deltaq_mode != NO_DELTA_Q;
+
+    // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q
+    // is used for ineligible frames. That effectively turns off row_mt
+    // usage. Note that currently only altref frames are eligible for
+    // objective delta_q and tpl.
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (cm->delta_q_info.delta_q_present_flag) {
+ if (deltaq_mode == DELTA_Q_OBJECTIVE &&
+ gf_group->update_type[cpi->gf_frame_index] == LF_UPDATE)
+ cm->delta_q_info.delta_q_present_flag = 0;
+
+ if (deltaq_mode == DELTA_Q_OBJECTIVE &&
+ cm->delta_q_info.delta_q_present_flag) {
+ cm->delta_q_info.delta_q_present_flag &= allow_deltaq_mode(cpi);
+ }
+ }
+
+ // Reset delta_q_used flag
+ cpi->deltaq_used = 0;
+
+ cm->delta_q_info.delta_lf_present_flag =
+ cm->delta_q_info.delta_q_present_flag &&
+ oxcf->tool_cfg.enable_deltalf_mode;
+ cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+
+ // update delta_q_present_flag and delta_lf_present_flag based on
+ // base_qindex
+ cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0;
+ cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0;
+ } else if (cpi->cyclic_refresh->apply_cyclic_refresh ||
+ cpi->svc.number_temporal_layers == 1) {
+ cpi->cyclic_refresh->actual_num_seg1_blocks = 0;
+ cpi->cyclic_refresh->actual_num_seg2_blocks = 0;
+ }
+ cpi->rc.cnt_zeromv = 0;
+
+ av1_frame_init_quantizer(cpi);
+ init_encode_frame_mb_context(cpi);
+ set_default_interp_skip_flags(cm, &cpi->interp_search_flags);
+
+ if (cm->prev_frame && cm->prev_frame->seg.enabled)
+ cm->last_frame_seg_map = cm->prev_frame->seg_map;
+ else
+ cm->last_frame_seg_map = NULL;
+ if (features->allow_intrabc || features->coded_lossless) {
+ av1_set_default_ref_deltas(cm->lf.ref_deltas);
+ av1_set_default_mode_deltas(cm->lf.mode_deltas);
+ } else if (cm->prev_frame) {
+ memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+ memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
+ }
+ memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES);
+ memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+
+ cpi->all_one_sided_refs =
+ frame_is_intra_only(cm) ? 0 : refs_are_one_sided(cm);
+
+ cpi->prune_ref_frame_mask = 0;
+ // Figure out which ref frames can be skipped at frame level.
+ setup_prune_ref_frame_mask(cpi);
+
+ x->txfm_search_info.txb_split_count = 0;
+#if CONFIG_SPEED_STATS
+ x->txfm_search_info.tx_search_count = 0;
+#endif // CONFIG_SPEED_STATS
+
+#if !CONFIG_REALTIME_ONLY
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_compute_global_motion_time);
+#endif
+ av1_compute_global_motion_facade(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_compute_global_motion_time);
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_setup_motion_field_time);
+#endif
+ av1_calculate_ref_frame_side(cm);
+ if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_setup_motion_field_time);
+#endif
+
+ cm->current_frame.skip_mode_info.skip_mode_flag =
+ check_skip_mode_enabled(cpi);
+
+ // Initialization of skip mode cost depends on the value of
+ // 'skip_mode_flag'. This initialization happens in the function
+ // av1_fill_mode_rates(), which is in turn called in
+ // av1_initialize_rd_consts(). Thus, av1_initialize_rd_consts()
+ // has to be called after 'skip_mode_flag' is initialized.
+ av1_initialize_rd_consts(cpi);
+ av1_set_sad_per_bit(cpi, &x->sadperbit, quant_params->base_qindex);
+ populate_thresh_to_force_zeromv_skip(cpi);
+
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
+ mt_info->row_mt_enabled = 0;
+ mt_info->pack_bs_mt_enabled = AOMMIN(mt_info->num_mod_workers[MOD_PACK_BS],
+ cm->tiles.cols * cm->tiles.rows) > 1;
+
+ if (oxcf->row_mt && (mt_info->num_workers > 1)) {
+ mt_info->row_mt_enabled = 1;
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
+ av1_encode_tiles_row_mt(cpi);
+ } else {
+ if (AOMMIN(mt_info->num_workers, cm->tiles.cols * cm->tiles.rows) > 1) {
+ av1_encode_tiles_mt(cpi);
+ } else {
+ // Preallocate the pc_tree for realtime coding to reduce the cost of
+ // memory allocation.
+ const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+ if (use_nonrd_mode) {
+ td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ td->pc_root = NULL;
+ }
+
+ encode_tiles(cpi);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ }
+ }
+
+ // If intrabc is allowed but never selected, reset the allow_intrabc flag.
+ if (features->allow_intrabc && !cpi->intrabc_used) {
+ features->allow_intrabc = 0;
+ }
+ if (features->allow_intrabc) {
+ cm->delta_q_info.delta_lf_present_flag = 0;
+ }
+
+ if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) {
+ cm->delta_q_info.delta_q_present_flag = 0;
+ }
+
+ // Set the transform size appropriately before bitstream creation
+ const MODE_EVAL_TYPE eval_type =
+ cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch
+ ? WINNER_MODE_EVAL
+ : DEFAULT_EVAL;
+ const TX_SIZE_SEARCH_METHOD tx_search_type =
+ cpi->winner_mode_params.tx_size_search_methods[eval_type];
+ assert(oxcf->txfm_cfg.enable_tx64 || tx_search_type != USE_LARGESTALL);
+ features->tx_mode = select_tx_mode(cm, tx_search_type);
+
+  // Retain the frame level probability update conditions for parallel frames.
+  // These conditions will be consumed during the postencode stage to update
+  // the probabilities.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ cpi->do_update_frame_probs_txtype[cpi->num_frame_recode] =
+ cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats;
+ cpi->do_update_frame_probs_obmc[cpi->num_frame_recode] =
+ (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX);
+ cpi->do_update_frame_probs_warp[cpi->num_frame_recode] =
+ (features->allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0);
+ cpi->do_update_frame_probs_interpfilter[cpi->num_frame_recode] =
+ (cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
+ features->interp_filter == SWITCHABLE);
+ }
+
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats ||
+ ((cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh !=
+ INT_MAX) &&
+ (cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != 0))) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ for (i = 0; i < TX_SIZES_ALL; i++) {
+ int sum = 0;
+ int j;
+ int left = MAX_TX_TYPE_PROB;
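+      // The averaged probabilities must sum to MAX_TX_TYPE_PROB: `left`
+      // tracks the remaining mass, and any rounding residue is folded into
+      // tx type 0 (the last one processed).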
+
+ for (j = 0; j < TX_TYPES; j++)
+ sum += cpi->td.rd_counts.tx_type_used[i][j];
+
+ for (j = TX_TYPES - 1; j >= 0; j--) {
+ int update_txtype_frameprobs = 1;
+ const int new_prob =
+ sum ? MAX_TX_TYPE_PROB * cpi->td.rd_counts.tx_type_used[i][j] / sum
+ : (j ? 0 : MAX_TX_TYPE_PROB);
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] ==
+ 0) {
+ int prob =
+ (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ temp_frame_probs_simulation->tx_type_probs[update_type][i][j] =
+ prob;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->tx_type_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation
+ ->tx_type_probs[update_type_idx][i][j];
+ }
+ }
+ update_txtype_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_txtype_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode]
+ .tx_type_probs[update_type][i][j] = new_prob;
+ }
+ if (update_txtype_frameprobs) {
+ int prob =
+ (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->tx_type_probs[update_type][i][j] = prob;
+ }
+ }
+ }
+ }
+
+ if (cm->seg.enabled) {
+ cm->seg.temporal_update = 1;
+ if (rdc->seg_tmp_pred_cost[0] < rdc->seg_tmp_pred_cost[1])
+ cm->seg.temporal_update = 0;
+ }
+
+ if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ int sum = 0;
+ int update_obmc_frameprobs = 1;
+ for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j];
+
+ const int new_prob =
+ sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0;
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ temp_frame_probs_simulation->obmc_probs[update_type][i] =
+ (temp_frame_probs_simulation->obmc_probs[update_type][i] +
+ new_prob) >>
+ 1;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->obmc_probs[update_type_idx][i] =
+ temp_frame_probs_simulation->obmc_probs[update_type_idx][i];
+ }
+ }
+ update_obmc_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_obmc_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode].obmc_probs[update_type][i] =
+ new_prob;
+ }
+ if (update_obmc_frameprobs) {
+ frame_probs->obmc_probs[update_type][i] =
+ (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+ }
+ }
+ }
+
+ if (features->allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int update_warp_frameprobs = 1;
+ int sum = 0;
+ for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i];
+ const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0;
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ temp_frame_probs_simulation->warped_probs[update_type] =
+ (temp_frame_probs_simulation->warped_probs[update_type] +
+ new_prob) >>
+ 1;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->warped_probs[update_type_idx] =
+ temp_frame_probs_simulation->warped_probs[update_type_idx];
+ }
+ }
+ update_warp_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_warp_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode].warped_probs[update_type] =
+ new_prob;
+ }
+ if (update_warp_frameprobs) {
+ frame_probs->warped_probs[update_type] =
+ (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+ }
+ }
+
+ if (cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
+ features->interp_filter == SWITCHABLE) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ int sum = 0;
+ int j;
+ int left = 1536;
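+      // 1536 is the total probability mass for switchable filters; as with
+      // the tx type probs above, `left` folds rounding residue into index 0.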
+
+ for (j = 0; j < SWITCHABLE_FILTERS; j++) {
+ sum += cpi->td.counts->switchable_interp[i][j];
+ }
+
+ for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+ int update_interpfilter_frameprobs = 1;
+ const int new_prob =
+ sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum
+ : (j ? 0 : 1536);
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] ==
+ 0) {
+ int prob = (temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] = prob;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type_idx][i][j];
+ }
+ }
+ update_interpfilter_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_interpfilter_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode]
+ .switchable_interp_probs[update_type][i][j] = new_prob;
+ }
+ if (update_interpfilter_frameprobs) {
+ int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+ }
+ }
+ }
+ }
+ if (hash_table_created) {
+ av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table);
+ }
+}
+
+/*!\brief Setup reference frame buffers and encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+void av1_encode_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ FeatureFlags *const features = &cm->features;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ // Indicates whether or not to use a default reduced set for ext-tx
+ // rather than the potential full set of 16 transforms
+ features->reduced_tx_set_used = oxcf->txfm_cfg.reduced_tx_type_set;
+
+ // Make sure segment_id is no larger than last_active_segid.
+ if (cm->seg.enabled && cm->seg.update_map) {
+ const int mi_rows = cm->mi_params.mi_rows;
+ const int mi_cols = cm->mi_params.mi_cols;
+ const int last_active_segid = cm->seg.last_active_segid;
+ uint8_t *map = cpi->enc_seg.map;
+ for (int mi_row = 0; mi_row < mi_rows; ++mi_row) {
+ for (int mi_col = 0; mi_col < mi_cols; ++mi_col) {
+ map[mi_col] = AOMMIN(map[mi_col], last_active_segid);
+ }
+ map += mi_cols;
+ }
+ }
+
+ av1_setup_frame_buf_refs(cm);
+ enforce_max_ref_frames(cpi, &cpi->ref_frame_flags,
+ cm->cur_frame->ref_display_order_hint,
+ cm->current_frame.display_order_hint);
+ set_rel_frame_dist(&cpi->common, &cpi->ref_frame_dist_info,
+ cpi->ref_frame_flags);
+ av1_setup_frame_sign_bias(cm);
+
+  // If global motion is enabled, then every buffer that is used as either
+  // a source or a ref frame should have an image pyramid allocated.
+  // Check here so that issues can be caught early in debug mode.
+#if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+ if (cpi->image_pyramid_levels > 0) {
+ assert(cpi->source->y_pyramid);
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf != NULL) {
+ assert(buf->buf.y_pyramid);
+ }
+ }
+ }
+#endif // !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_reset_frame(av1_num_planes(cm));
+#endif
+
+ rdc->newmv_or_intra_blocks = 0;
+ cpi->palette_pixel_num = 0;
+
+ if (cpi->sf.hl_sf.frame_parameter_update ||
+ cpi->sf.rt_sf.use_comp_ref_nonrd) {
+ if (frame_is_intra_only(cm))
+ current_frame->reference_mode = SINGLE_REFERENCE;
+ else
+ current_frame->reference_mode = REFERENCE_MODE_SELECT;
+
+ features->interp_filter = SWITCHABLE;
+ if (cm->tiles.large_scale) features->interp_filter = EIGHTTAP_REGULAR;
+
+ features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+ features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc);
+
+ rdc->compound_ref_used_flag = 0;
+ rdc->skip_mode_used_flag = 0;
+
+ encode_frame_internal(cpi);
+
+ if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
+ // Use a flag that includes 4x4 blocks
+ if (rdc->compound_ref_used_flag == 0) {
+ current_frame->reference_mode = SINGLE_REFERENCE;
+#if CONFIG_ENTROPY_STATS
+ av1_zero(cpi->td.counts->comp_inter);
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+    // Re-check the skip mode status, as the reference mode may have
+    // changed.
+ SkipModeInfo *const skip_mode_info = &current_frame->skip_mode_info;
+ if (frame_is_intra_only(cm) ||
+ current_frame->reference_mode == SINGLE_REFERENCE) {
+ skip_mode_info->skip_mode_allowed = 0;
+ skip_mode_info->skip_mode_flag = 0;
+ }
+ if (skip_mode_info->skip_mode_flag && rdc->skip_mode_used_flag == 0)
+ skip_mode_info->skip_mode_flag = 0;
+
+ if (!cm->tiles.large_scale) {
+ if (features->tx_mode == TX_MODE_SELECT &&
+ cpi->td.mb.txfm_search_info.txb_split_count == 0)
+ features->tx_mode = TX_MODE_LARGEST;
+ }
+ } else {
+    // This is needed if the real-time speed setting is changed on the fly
+    // from one using compound prediction to one using single reference.
+ if (current_frame->reference_mode == REFERENCE_MODE_SELECT)
+ current_frame->reference_mode = SINGLE_REFERENCE;
+ encode_frame_internal(cpi);
+ }
+}
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
new file mode 100644
index 0000000000..ce32fb47e6
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#include "av1/encoder/global_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DELTA_Q_PERCEPTUAL_MODULATION \
+ 1 // 0: variance based
+ // 1: wavelet AC energy based
+
+struct macroblock;
+struct yv12_buffer_config;
+struct AV1_COMP;
+struct ThreadData;
+
+void av1_init_rtc_counters(struct macroblock *const x);
+
+void av1_accumulate_rtc_counters(struct AV1_COMP *cpi,
+ const struct macroblock *const x);
+
+void av1_setup_src_planes(struct macroblock *x,
+ const struct yv12_buffer_config *src, int mi_row,
+ int mi_col, const int num_planes, BLOCK_SIZE bsize);
+
+void av1_encode_frame(struct AV1_COMP *cpi);
+
+void av1_alloc_tile_data(struct AV1_COMP *cpi);
+void av1_init_tile_data(struct AV1_COMP *cpi);
+void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
+ int tile_col);
+void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col, int mi_row);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_
diff --git a/third_party/aom/av1/encoder/encodeframe_utils.c b/third_party/aom/av1/encoder/encodeframe_utils.c
new file mode 100644
index 0000000000..949837184a
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe_utils.c
@@ -0,0 +1,1775 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common_data.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/rdopt.h"
+
+void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int *const rdmult) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const BLOCK_SIZE bsize_base = BLOCK_16X16;
+ const int num_mi_w = mi_size_wide[bsize_base];
+ const int num_mi_h = mi_size_high[bsize_base];
+ const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+ const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+ int row, col;
+ double num_of_mi = 0.0;
+ double geom_mean_of_scale = 1.0;
+
+ // To avoid overflow of 'geom_mean_of_scale', bsize_base must be at least
+ // BLOCK_8X8.
+ //
+ // For bsize=BLOCK_128X128 and bsize_base=BLOCK_8X8, the loop below would
+ // iterate 256 times. Considering the maximum value of
+ // cpi->ssim_rdmult_scaling_factors (see av1_set_mb_ssim_rdmult_scaling()),
+ // geom_mean_of_scale can go up to 4.8323^256, which is within DBL_MAX
+ // (maximum value a double data type can hold). If bsize_base is modified to
+ // BLOCK_4X4 (minimum possible block size), geom_mean_of_scale can go up
+ // to 4.8323^1024 and exceed DBL_MAX, resulting in data overflow.
+ assert(bsize_base >= BLOCK_8X8);
+ assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM);
+
+ for (row = mi_row / num_mi_w;
+ row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+ for (col = mi_col / num_mi_h;
+ col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ assert(cpi->ssim_rdmult_scaling_factors[index] != 0.0);
+ geom_mean_of_scale *= cpi->ssim_rdmult_scaling_factors[index];
+ num_of_mi += 1.0;
+ }
+ }
+ geom_mean_of_scale = pow(geom_mean_of_scale, (1.0 / num_of_mi));
+
+ *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+ *rdmult = AOMMAX(*rdmult, 0);
+ av1_set_error_per_bit(errorperbit, *rdmult);
+}
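+
+// Editor's illustrative sketch (not part of upstream libaom): the SSIM
+// rdmult scaling above is a plain geometric mean of the per-16x16 scaling
+// factors covering the block. The unused helper below shows the same
+// computation on a flat array of n > 0 factors; the name is hypothetical,
+// and math.h (pow) is assumed available, as in the function above.
+static AOM_INLINE int illustrative_geom_mean_rdmult(int rdmult,
+                                                    const double *factors,
+                                                    int n) {
+  double product = 1.0;
+  for (int i = 0; i < n; ++i) product *= factors[i];  // factors must be > 0
+  const double geom_mean = pow(product, 1.0 / n);
+  // Round to nearest, matching the (+ 0.5) convention used above.
+  return (int)((double)rdmult * geom_mean + 0.5);
+}
+// E.g. factors {1.2, 0.8} give sqrt(0.96) ~= 0.98, shrinking rdmult
+// slightly; av1_get_hier_tpl_rdmult() below computes the same kind of mean
+// in the log domain (sum of logs, then exp) to avoid overflow.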
+
+#if CONFIG_SALIENCY_MAP
+void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi,
+ int *errorperbit, const BLOCK_SIZE bsize,
+ const int mi_row, const int mi_col,
+ int *const rdmult) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_mi_w = mi_size_wide[bsize];
+ const int num_mi_h = mi_size_high[bsize];
+ const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+
+ *rdmult =
+ (int)(*rdmult * cpi->sm_scaling_factor[(mi_row / num_mi_h) * num_cols +
+ (mi_col / num_mi_w)]);
+
+ *rdmult = AOMMAX(*rdmult, 0);
+ av1_set_error_per_bit(errorperbit, *rdmult);
+}
+#endif
+
+// TODO(angiebird): Move these functions to tpl_model.c
+#if !CONFIG_REALTIME_ONLY
+// Return the end column for the current superblock, in units of TPL blocks.
+static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col,
+ int num_mi_w) {
+ // Find the start column of this superblock.
+ const int sb_mi_col_start = (mi_col >> cm->seq_params->mib_size_log2)
+ << cm->seq_params->mib_size_log2;
+ // Same but in superres upscaled dimension.
+ const int sb_mi_col_start_sr =
+ coded_to_superres_mi(sb_mi_col_start, cm->superres_scale_denominator);
+ // Width of this superblock in mi units.
+ const int sb_mi_width = mi_size_wide[cm->seq_params->sb_size];
+ // Same but in superres upscaled dimension.
+ const int sb_mi_width_sr =
+ coded_to_superres_mi(sb_mi_width, cm->superres_scale_denominator);
+ // Superblock end in mi units.
+ const int sb_mi_end = sb_mi_col_start_sr + sb_mi_width_sr;
+ // Superblock end in TPL units.
+ return (sb_mi_end + num_mi_w - 1) / num_mi_w;
+}
+
+int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ int deltaq_rdmult = set_rdmult(cpi, x, -1);
+ if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
+ if (cm->superres_scale_denominator != SCALE_NUMERATOR) return deltaq_rdmult;
+ if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
+ if (x->rb == 0) return deltaq_rdmult;
+
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ int tpl_stride = tpl_frame->stride;
+ double intra_cost_base = 0;
+ double mc_dep_cost_base = 0;
+ double cbcmp_base = 0;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += step) {
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+
+ TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+
+ double cbcmp = (double)this_stats->srcrf_dist;
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ intra_cost_base += log(dist_scaled) * cbcmp;
+ mc_dep_cost_base += log(3 * dist_scaled + mc_dep_delta) * cbcmp;
+ cbcmp_base += cbcmp;
+ }
+ }
+
+ if (cbcmp_base == 0) return deltaq_rdmult;
+
+ double rk = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base);
+ deltaq_rdmult = (int)(deltaq_rdmult * (rk / x->rb));
+
+ return AOMMAX(deltaq_rdmult, 1);
+}
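+
+// Editor's note (illustrative): rk above is a srcrf_dist-weighted geometric
+// mean ratio computed in the log domain. With weights w_i = srcrf_dist_i,
+// a_i = recrf_dist_i << RDDIV_BITS and b_i = 3 * a_i + mc_dep_delta_i:
+//
+//   rk = exp((sum_i w_i * log(a_i) - sum_i w_i * log(b_i)) / sum_i w_i)
+//      = prod_i (a_i / b_i) ^ (w_i / sum_i w_i)
+//
+// so blocks with larger source distortion dominate the estimate.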
+
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int orig_rdmult) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ const int deltaq_rdmult = set_rdmult(cpi, x, -1);
+ if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
+ if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index))
+ return deltaq_rdmult;
+ if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
+
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int block_mi_width_sr =
+ coded_to_superres_mi(mi_size_wide[bsize], cm->superres_scale_denominator);
+
+ const BLOCK_SIZE bsize_base = BLOCK_16X16;
+ const int num_mi_w = mi_size_wide[bsize_base];
+ const int num_mi_h = mi_size_high[bsize_base];
+ const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const int num_bcols = (block_mi_width_sr + num_mi_w - 1) / num_mi_w;
+ const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  // This is required because the end column of the superblock may be off by 1
+  // in the superres case.
+ const int sb_bcol_end = get_superblock_tpl_column_end(cm, mi_col, num_mi_w);
+ int row, col;
+ double base_block_count = 0.0;
+ double geom_mean_of_scale = 0.0;
+ for (row = mi_row / num_mi_w;
+ row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+ for (col = mi_col_sr / num_mi_h;
+ col < num_cols && col < mi_col_sr / num_mi_h + num_bcols &&
+ col < sb_bcol_end;
+ ++col) {
+ const int index = row * num_cols + col;
+ geom_mean_of_scale += log(cpi->ppi->tpl_sb_rdmult_scaling_factors[index]);
+ base_block_count += 1.0;
+ }
+ }
+ geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count);
+ int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5);
+ rdmult = AOMMAX(rdmult, 0);
+ av1_set_error_per_bit(&x->errorperbit, rdmult);
+#if !CONFIG_RD_COMMAND
+ if (bsize == cm->seq_params->sb_size) {
+ const int rdmult_sb = set_rdmult(cpi, x, -1);
+ assert(rdmult_sb == rdmult);
+ (void)rdmult_sb;
+ }
+#endif // !CONFIG_RD_COMMAND
+ return rdmult;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts,
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+
+ // Only allow the 3 valid SWITCHABLE_FILTERS.
+ assert(filter < SWITCHABLE_FILTERS);
+ ++counts->switchable_interp[ctx][filter];
+ }
+}
+
+// This function will copy the best reference mode information from
+// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT.
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+ MB_MODE_INFO_EXT *mbmi_ext,
+ const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) {
+ memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+ sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+ memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+ sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+ mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+ mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+ memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
+ sizeof(mbmi_ext->global_mvs));
+}
+
+void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
+ const PICK_MODE_CONTEXT *const ctx, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+ int i, x_idx, y;
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const MB_MODE_INFO *const mi = &ctx->mic;
+ MB_MODE_INFO *const mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int bw = mi_size_wide[mi->bsize];
+ const int bh = mi_size_high[mi->bsize];
+ const int mis = mi_params->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ assert(mi->bsize == bsize);
+
+ *mi_addr = *mi;
+ copy_mbmi_ext_frame_to_mbmi_ext(&x->mbmi_ext, &ctx->mbmi_ext_best,
+ av1_ref_frame_type(ctx->mic.ref_frame));
+
+ memcpy(txfm_info->blk_skip, ctx->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+
+ txfm_info->skip_txfm = ctx->rd_stats.skip_txfm;
+
+ xd->tx_type_map = ctx->tx_type_map;
+ xd->tx_type_map_stride = mi_size_wide[bsize];
+  // If not dry_run, copy the transform type data into the frame-level buffer.
+  // The encoder will fetch the tx types when writing the bitstream.
+ if (!dry_run) {
+ const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+ uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx;
+ const int mi_stride = mi_params->mi_stride;
+ for (int blk_row = 0; blk_row < bh; ++blk_row) {
+ av1_copy_array(tx_type_map + blk_row * mi_stride,
+ xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw);
+ }
+ xd->tx_type_map = tx_type_map;
+ xd->tx_type_map_stride = mi_stride;
+ }
+
+  // If segmentation is in use
+ if (seg->enabled) {
+    // For in-frame complexity AQ, copy the segment id from the segment map.
+ if (cpi->oxcf.q_cfg.aq_mode == COMPLEXITY_AQ) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+ mi_addr->segment_id =
+ map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0;
+ }
+    // Else, for cyclic refresh mode, update the segment map, set the segment
+    // id, and then update the quantizer.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ !cpi->rc.rtc_external_ratectrl) {
+ av1_cyclic_refresh_update_segment(cpi, x, mi_row, mi_col, bsize,
+ ctx->rd_stats.rate, ctx->rd_stats.dist,
+ txfm_info->skip_txfm, dry_run);
+ }
+ if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
+ mi_addr->uv_mode = UV_DC_PRED;
+
+ if (!dry_run && !mi_addr->skip_txfm) {
+ int cdf_num;
+ const uint8_t spatial_pred = av1_get_spatial_seg_pred(
+ cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4);
+ const uint8_t coded_id = av1_neg_interleave(
+ mi_addr->segment_id, spatial_pred, seg->last_active_segid + 1);
+ int64_t spatial_cost = x->mode_costs.spatial_pred_cost[cdf_num][coded_id];
+ td->rd_counts.seg_tmp_pred_cost[0] += spatial_cost;
+
+ const int pred_segment_id =
+ cm->last_frame_seg_map
+ ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row,
+ mi_col)
+ : 0;
+ const int use_tmp_pred = pred_segment_id == mi_addr->segment_id;
+ const uint8_t tmp_pred_ctx = av1_get_pred_context_seg_id(xd);
+ td->rd_counts.seg_tmp_pred_cost[1] +=
+ x->mode_costs.tmp_pred_cost[tmp_pred_ctx][use_tmp_pred];
+ if (!use_tmp_pred) {
+ td->rd_counts.seg_tmp_pred_cost[1] += spatial_cost;
+ }
+ }
+ }
+
+ // Count zero motion vector.
+ if (!dry_run && !frame_is_intra_only(cm)) {
+ const MV mv = mi->mv[0].as_mv;
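+    // Editor's note: mv components are in 1/8-pel units, so the
+    // |row| < 8 && |col| < 8 check below treats any motion under one full
+    // pixel as zero motion for the cnt_zeromv statistic.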
+ if (is_inter_block(mi) && mi->ref_frame[0] == LAST_FRAME &&
+ abs(mv.row) < 8 && abs(mv.col) < 8) {
+ const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+ // Accumulate low_content_frame.
+ for (int mi_y = 0; mi_y < ymis; mi_y += 2) x->cnt_zeromv += bw << 1;
+ }
+ }
+
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ p[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+  // Restore the coding context of the MB to what was in place when the
+  // mode was picked for it.
+
+ const int cols =
+ AOMMIN((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width, mi_width);
+ const int rows = AOMMIN(
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height, mi_height);
+ for (y = 0; y < rows; y++) {
+ for (x_idx = 0; x_idx < cols; x_idx++) xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+ if (cpi->oxcf.q_cfg.aq_mode)
+ av1_init_plane_quantizers(cpi, x, mi_addr->segment_id, 0);
+
+ if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+ {
+ unsigned int *const mode_chosen_counts =
+ (unsigned int *)cpi->mode_chosen_counts; // Cast const away.
+ if (frame_is_intra_only(cm)) {
+ static const int kf_mode_index[] = {
+ THR_DC /*DC_PRED*/,
+ THR_V_PRED /*V_PRED*/,
+ THR_H_PRED /*H_PRED*/,
+ THR_D45_PRED /*D45_PRED*/,
+ THR_D135_PRED /*D135_PRED*/,
+ THR_D113_PRED /*D113_PRED*/,
+ THR_D157_PRED /*D157_PRED*/,
+ THR_D203_PRED /*D203_PRED*/,
+ THR_D67_PRED /*D67_PRED*/,
+ THR_SMOOTH, /*SMOOTH_PRED*/
+ THR_SMOOTH_V, /*SMOOTH_V_PRED*/
+ THR_SMOOTH_H, /*SMOOTH_H_PRED*/
+ THR_PAETH /*PAETH_PRED*/,
+ };
+ ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
+ } else {
+      // Note how often each mode is chosen as best.
+ ++mode_chosen_counts[ctx->best_mode_index];
+ }
+ }
+#endif
+ if (!frame_is_intra_only(cm)) {
+ if (is_inter_block(mi) && cm->features.interp_filter == SWITCHABLE) {
+ // When the frame interp filter is SWITCHABLE, several cases that always
+ // use the default type (EIGHTTAP_REGULAR) are described in
+ // av1_is_interp_needed(). Here, we should keep the counts for all
+ // applicable blocks, so the frame filter resetting decision in
+ // fix_interp_filter() is made correctly.
+ update_filter_type_count(td->counts, xd, mi_addr);
+ }
+ }
+
+ const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
+ if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
+ av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+}
+
+void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+ PREDICTION_MODE mode, int16_t mode_context) {
+ (void)counts;
+
+ int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+ if (mode == NEWMV) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->newmv_mode[mode_ctx][0];
+#endif
+ update_cdf(fc->newmv_cdf[mode_ctx], 0, 2);
+ return;
+ }
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->newmv_mode[mode_ctx][1];
+#endif
+ update_cdf(fc->newmv_cdf[mode_ctx], 1, 2);
+
+ mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ if (mode == GLOBALMV) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->zeromv_mode[mode_ctx][0];
+#endif
+ update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2);
+ return;
+ }
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->zeromv_mode[mode_ctx][1];
+#endif
+ update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2);
+
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+#if CONFIG_ENTROPY_STATS
+ ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
+#endif
+ update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2);
+}
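+
+// Editor's note (illustrative): the early returns above mirror the
+// tree-structured way these symbols are coded: first a binary "is NEWMV?"
+// symbol, then (if not) "is GLOBALMV?", then NEARESTMV vs NEARMV, each with
+// its own context extracted from a separate bit-field of mode_context.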
+
+static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ FRAME_COUNTS *counts) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+ (void)counts;
+
+ if (mbmi->mode == DC_PRED) {
+ const int n = pmi->palette_size[0];
+ const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0];
+#endif
+ update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx],
+ n > 0, 2);
+ if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+ update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx],
+ n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+ }
+ }
+
+ if (mbmi->uv_mode == UV_DC_PRED) {
+ const int n = pmi->palette_size[1];
+ const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0];
+#endif
+ update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2);
+
+ if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+ update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx],
+ n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+ }
+ }
+}
+
+void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+ MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi, const int intraonly) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const PREDICTION_MODE y_mode = mbmi->mode;
+ (void)counts;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+
+ if (intraonly) {
+#if CONFIG_ENTROPY_STATS
+ const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[above];
+ const int left_ctx = intra_mode_context[left];
+ ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
+ } else {
+#if CONFIG_ENTROPY_STATS
+ ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
+ }
+
+ if (av1_filter_intra_allowed(cm, mbmi)) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra;
+#if CONFIG_ENTROPY_STATS
+ ++counts->filter_intra[mbmi->bsize][use_filter_intra_mode];
+ if (use_filter_intra_mode) {
+ ++counts
+ ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
+ }
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->filter_intra_cdfs[mbmi->bsize], use_filter_intra_mode, 2);
+ if (use_filter_intra_mode) {
+ update_cdf(fc->filter_intra_mode_cdf,
+ mbmi->filter_intra_mode_info.filter_intra_mode,
+ FILTER_INTRA_MODES);
+ }
+ }
+ if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->angle_delta[mbmi->mode - V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
+#endif
+ update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
+ mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
+ 2 * MAX_ANGLE_DELTA + 1);
+ }
+
+ if (!xd->is_chroma_ref) return;
+
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->uv_mode[cfl_allowed][y_mode][uv_mode];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
+ UV_INTRA_MODES - !cfl_allowed);
+ if (uv_mode == UV_CFL_PRED) {
+ const int8_t joint_sign = mbmi->cfl_alpha_signs;
+ const uint8_t idx = mbmi->cfl_alpha_idx;
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_sign[joint_sign];
+#endif
+ update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
+ if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
+#endif
+ update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
+ }
+ if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
+#endif
+ update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
+ }
+ }
+ const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ if (av1_is_directional_mode(intra_mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->angle_delta[intra_mode - V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
+#endif
+ update_cdf(fc->angle_delta_cdf[intra_mode - V_PRED],
+ mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
+ 2 * MAX_ANGLE_DELTA + 1);
+ }
+ if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+ update_palette_cdf(xd, mbmi, counts);
+ }
+}
+
+void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int p;
+ const int num_4x4_blocks_wide = mi_size_wide[bsize];
+ const int num_4x4_blocks_high = mi_size_high[bsize];
+ int mi_width = mi_size_wide[bsize];
+ int mi_height = mi_size_high[bsize];
+ for (p = 0; p < num_planes; p++) {
+ int tx_col = mi_col;
+ int tx_row = mi_row & MAX_MIB_MASK;
+ memcpy(
+ xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+ ctx->a + num_4x4_blocks_wide * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+ xd->plane[p].subsampling_x);
+ memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+ ctx->l + num_4x4_blocks_high * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+ xd->plane[p].subsampling_y);
+ }
+ memcpy(xd->above_partition_context + mi_col, ctx->sa,
+ sizeof(*xd->above_partition_context) * mi_width);
+ memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl,
+ sizeof(xd->left_partition_context[0]) * mi_height);
+ xd->above_txfm_context = ctx->p_ta;
+ xd->left_txfm_context = ctx->p_tl;
+ memcpy(xd->above_txfm_context, ctx->ta,
+ sizeof(*xd->above_txfm_context) * mi_width);
+ memcpy(xd->left_txfm_context, ctx->tl,
+ sizeof(*xd->left_txfm_context) * mi_height);
+}
+
+void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ int p;
+ int mi_width = mi_size_wide[bsize];
+ int mi_height = mi_size_high[bsize];
+
+  // Buffer the above/left context information of the block being searched.
+ for (p = 0; p < num_planes; ++p) {
+ int tx_col = mi_col;
+ int tx_row = mi_row & MAX_MIB_MASK;
+ memcpy(
+ ctx->a + mi_width * p,
+ xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+ (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x);
+ memcpy(ctx->l + mi_height * p,
+ xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+ (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y);
+ }
+ memcpy(ctx->sa, xd->above_partition_context + mi_col,
+ sizeof(*xd->above_partition_context) * mi_width);
+ memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK),
+ sizeof(xd->left_partition_context[0]) * mi_height);
+ memcpy(ctx->ta, xd->above_txfm_context,
+ sizeof(*xd->above_txfm_context) * mi_width);
+ memcpy(ctx->tl, xd->left_txfm_context,
+ sizeof(*xd->left_txfm_context) * mi_height);
+ ctx->p_ta = xd->above_txfm_context;
+ ctx->p_tl = xd->left_txfm_context;
+}
+
+static void set_partial_sb_partition(const AV1_COMMON *const cm,
+ MB_MODE_INFO *mi, int bh_in, int bw_in,
+ int mi_rows_remaining,
+ int mi_cols_remaining, BLOCK_SIZE bsize,
+ MB_MODE_INFO **mib) {
+ int bh = bh_in;
+ int r, c;
+ for (r = 0; r < cm->seq_params->mib_size; r += bh) {
+ int bw = bw_in;
+ for (c = 0; c < cm->seq_params->mib_size; c += bw) {
+ const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c);
+ const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c);
+ mib[grid_index] = mi + mi_index;
+ mib[grid_index]->bsize = find_partition_size(
+ bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
+ }
+ }
+}
+
+// This function attempts to set all mode info entries in a given superblock
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed in which case this code attempts to choose the largest
+// allowable partition.
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ MB_MODE_INFO **mib, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int mi_rows_remaining = tile->mi_row_end - mi_row;
+ const int mi_cols_remaining = tile->mi_col_end - mi_col;
+ MB_MODE_INFO *const mi_upper_left =
+ mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col);
+ int bh = mi_size_high[bsize];
+ int bw = mi_size_wide[bsize];
+
+ assert(bsize >= mi_params->mi_alloc_bsize &&
+ "Attempted to use bsize < mi_params->mi_alloc_bsize");
+ assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
+
+ // Apply the requested partition size to the SB if it is all "in image"
+ if ((mi_cols_remaining >= cm->seq_params->mib_size) &&
+ (mi_rows_remaining >= cm->seq_params->mib_size)) {
+ for (int block_row = 0; block_row < cm->seq_params->mib_size;
+ block_row += bh) {
+ for (int block_col = 0; block_col < cm->seq_params->mib_size;
+ block_col += bw) {
+ const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col);
+ const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col);
+ mib[grid_index] = mi_upper_left + mi_index;
+ mib[grid_index]->bsize = bsize;
+ }
+ }
+ } else {
+ // Else this is a partial SB.
+ set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
+ mi_cols_remaining, bsize, mib);
+ }
+}
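+
+// Editor's note (worked example): if BLOCK_64X64 (16x16 mi) is requested but
+// only, say, 8 mi rows remain below mi_row, the partial-SB path above calls
+// find_partition_size() to pick the largest size that still fits the
+// remaining rows and columns, so every mi entry gets a valid bsize.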
+
+int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ assert(bsize >= BLOCK_8X8);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ for (int i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= cm->mi_params.mi_rows) ||
+ (mi_col + x_idx >= cm->mi_params.mi_cols))
+ return 0;
+ if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) !=
+ PARTITION_NONE &&
+ subsize != BLOCK_8X8)
+ return 0;
+ }
+ return 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
+int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int orig_rdmult) {
+ AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ int64_t intra_cost = 0;
+ int64_t mc_dep_cost = 0;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ int tpl_stride = tpl_frame->stride;
+
+ if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) {
+ return orig_rdmult;
+ }
+ if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+ return orig_rdmult;
+ }
+
+#ifndef NDEBUG
+ int mi_count = 0;
+#endif
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int step = 1 << block_mis_log2;
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+ mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+#ifndef NDEBUG
+ mi_count++;
+#endif
+ }
+ }
+ assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+ double beta = 1.0;
+ if (mc_dep_cost > 0 && intra_cost > 0) {
+ const double r0 = cpi->rd.r0;
+ const double rk = (double)intra_cost / mc_dep_cost;
+ beta = (r0 / rk);
+ }
+
+ int rdmult = av1_get_adaptive_rdmult(cpi, beta);
+
+ rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2);
+ rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2);
+
+ rdmult = AOMMAX(1, rdmult);
+
+ return rdmult;
+}
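+
+// Editor's note (worked example, values hypothetical): with r0 = 0.04 and
+// rk = 0.08 the superblock propagates less than the frame average, so
+// beta = r0 / rk = 0.5 and av1_get_adaptive_rdmult() returns a larger
+// lambda; the result is clamped to [orig_rdmult / 2, orig_rdmult * 3 / 2]
+// above so one superblock cannot drift too far from the frame-level rdmult.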
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+ int top_edge = 0;
+ int bottom_edge = cpi->common.mi_params.mi_rows;
+ int is_active_h_edge = 0;
+
+ // For two pass account for any formatting bars detected.
+ if (is_stat_consumption_stage_twopass(cpi)) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+ &cpi->ppi->twopass, cm->current_frame.display_order_hint);
+ if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs, not mi units.
+ // The image edge is in the following MB row.
+ top_edge += (int)(this_frame_stats->inactive_zone_rows * 4);
+
+ bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4);
+ bottom_edge = AOMMAX(top_edge, bottom_edge);
+ }
+
+ if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+ ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+ is_active_h_edge = 1;
+ }
+ return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+ int left_edge = 0;
+ int right_edge = cpi->common.mi_params.mi_cols;
+ int is_active_v_edge = 0;
+
+ // For two pass account for any formatting bars detected.
+ if (is_stat_consumption_stage_twopass(cpi)) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+ &cpi->ppi->twopass, cm->current_frame.display_order_hint);
+ if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs, not mi units.
+    // The image edge is in the following MB column.
+ left_edge += (int)(this_frame_stats->inactive_zone_cols * 4);
+
+ right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4);
+ right_edge = AOMMAX(left_edge, right_edge);
+ }
+
+ if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+ ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+ is_active_v_edge = 1;
+ }
+ return is_active_v_edge;
+}
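+
+// Editor's note (worked example): inactive_zone_* is measured in 16x16
+// macroblocks and one MB spans four 4x4 mi units, hence the "* 4" above.
+// E.g. two rows of detected letterbox bars give top_edge = 8 mi, and a
+// superblock is flagged as an active edge when that boundary falls inside
+// [mi_row, mi_row + mi_step).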
+
+void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, SuperBlockEnc *sb_enc) {
+ sb_enc->tpl_data_count = 0;
+
+ if (!cpi->oxcf.algo_cfg.enable_tpl_model) return;
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) return;
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE)
+ return;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int gf_group_index = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ if (!av1_tpl_stats_ready(tpl_data, gf_group_index)) return;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ int tpl_stride = tpl_frame->stride;
+
+ int mi_count = 0;
+ int count = 0;
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  // mi_cols_sr is mi_cols in the superres case.
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+  // The TPL storage unit size is not the same as the motion estimation unit
+  // size. Always use the motion estimation size here to avoid fetching the
+  // same inter/intra cost repeatedly.
+ const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
+ assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]);
+ const int row_step = mi_size_high[tpl_bsize];
+ const int col_step_sr = coded_to_superres_mi(mi_size_wide[tpl_bsize],
+ cm->superres_scale_denominator);
+
+ // Stride is only based on SB size, and we fill in values for every 16x16
+ // block in a SB.
+ sb_enc->tpl_stride = (mi_col_end_sr - mi_col_sr) / col_step_sr;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ assert(count < MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+ // Handle partial SB, so that no invalid values are used later.
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) {
+ sb_enc->tpl_inter_cost[count] = INT64_MAX;
+ sb_enc->tpl_intra_cost[count] = INT64_MAX;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ sb_enc->tpl_mv[count][i].as_int = INVALID_MV;
+ }
+ count++;
+ continue;
+ }
+
+ TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+ sb_enc->tpl_inter_cost[count] = this_stats->inter_cost
+ << TPL_DEP_COST_SCALE_LOG2;
+ sb_enc->tpl_intra_cost[count] = this_stats->intra_cost
+ << TPL_DEP_COST_SCALE_LOG2;
+ memcpy(sb_enc->tpl_mv[count], this_stats->mv, sizeof(this_stats->mv));
+ mi_count++;
+ count++;
+ }
+ }
+
+ assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+ sb_enc->tpl_data_count = mi_count;
+}
+
+// analysis_type 0: Use mc_dep_cost and intra_cost
+// analysis_type 1: Use count of best inter predictor chosen
+// analysis_type 2: Use cost reduction from intra to inter for best inter
+// predictor chosen
+int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td,
+ int64_t *delta_dist, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ double intra_cost = 0;
+ double mc_dep_reg = 0;
+ double mc_dep_cost = 0;
+ double cbcmp_base = 1;
+ double srcrf_dist = 0;
+ double srcrf_sse = 0;
+ double srcrf_rate = 0;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+ const int base_qindex = cm->quant_params.base_qindex;
+
+ if (tpl_idx >= MAX_TPL_FRAME_IDX) return base_qindex;
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ int tpl_stride = tpl_frame->stride;
+ if (!tpl_frame->is_valid) return base_qindex;
+
+#ifndef NDEBUG
+ int mi_count = 0;
+#endif
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int step = 1 << block_mis_log2;
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+ double cbcmp = (double)this_stats->srcrf_dist;
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ intra_cost += log(dist_scaled) * cbcmp;
+ mc_dep_cost += log(dist_scaled + mc_dep_delta) * cbcmp;
+ mc_dep_reg += log(3 * dist_scaled + mc_dep_delta) * cbcmp;
+ srcrf_dist += (double)(this_stats->srcrf_dist << RDDIV_BITS);
+ srcrf_sse += (double)(this_stats->srcrf_sse << RDDIV_BITS);
+ srcrf_rate += (double)(this_stats->srcrf_rate << TPL_DEP_COST_SCALE_LOG2);
+#ifndef NDEBUG
+ mi_count++;
+#endif
+ cbcmp_base += cbcmp;
+ }
+ }
+ assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+ int offset = 0;
+ double beta = 1.0;
+ double rk;
+ if (mc_dep_cost > 0 && intra_cost > 0) {
+ const double r0 = cpi->rd.r0;
+ rk = exp((intra_cost - mc_dep_cost) / cbcmp_base);
+ td->mb.rb = exp((intra_cost - mc_dep_reg) / cbcmp_base);
+ beta = (r0 / rk);
+ assert(beta > 0.0);
+ } else {
+ return base_qindex;
+ }
+ offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
+
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
+ offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+ int qindex = cm->quant_params.base_qindex + offset;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+
+ int frm_qstep = av1_dc_quant_QTX(base_qindex, 0, cm->seq_params->bit_depth);
+ int sbs_qstep =
+ av1_dc_quant_QTX(base_qindex, offset, cm->seq_params->bit_depth);
+
+ if (delta_dist) {
+ double sbs_dist = srcrf_dist * pow((double)sbs_qstep / frm_qstep, 2.0);
+ double sbs_rate = srcrf_rate * ((double)frm_qstep / sbs_qstep);
+ sbs_dist = AOMMIN(sbs_dist, srcrf_sse);
+ *delta_dist = (int64_t)((sbs_dist - srcrf_dist) / rk);
+ *delta_dist += RDCOST(tpl_frame->base_rdmult, 4 * 256, 0);
+ *delta_dist += RDCOST(tpl_frame->base_rdmult, sbs_rate - srcrf_rate, 0);
+ }
+ return qindex;
+}
+
+#if !DISABLE_HDR_LUMA_DELTAQ
+// Offset table defined in Table 3 of the T-REC-H.Sup15 document.
+static const int hdr_thres[HDR_QP_LEVELS + 1] = { 0, 301, 367, 434, 501, 567,
+ 634, 701, 767, 834, 1024 };
+
+static const int hdr10_qp_offset[HDR_QP_LEVELS] = { 3, 2, 1, 0, -1,
+ -2, -3, -4, -5, -6 };
+#endif
+
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ assert(cm->seq_params->bit_depth == AOM_BITS_10);
+
+#if DISABLE_HDR_LUMA_DELTAQ
+ (void)x;
+ (void)bsize;
+ (void)mi_row;
+ (void)mi_col;
+ return cm->quant_params.base_qindex;
+#else
+  // Calculate the pixel average.
+ const int block_luma_avg = av1_log_block_avg(cpi, x, bsize, mi_row, mi_col);
+  // Adjust the offset based on the average of the pixel block.
+ int offset = 0;
+ for (int i = 0; i < HDR_QP_LEVELS; i++) {
+ if (block_luma_avg >= hdr_thres[i] && block_luma_avg < hdr_thres[i + 1]) {
+ offset = (int)(hdr10_qp_offset[i] * QP_SCALE_FACTOR);
+ break;
+ }
+ }
+
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
+ offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+ int qindex = cm->quant_params.base_qindex + offset;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+
+ return qindex;
+#endif
+}
+#endif // !CONFIG_REALTIME_ONLY
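+
+// Editor's note (worked example): for a 10-bit block_luma_avg of 400, the
+// bucket scan in av1_get_q_for_hdr() lands in [367, 434), so the offset is
+// hdr10_qp_offset[2] * QP_SCALE_FACTOR = 1 * QP_SCALE_FACTOR before
+// clamping; brighter blocks (towards 1024) get increasingly negative
+// offsets, i.e. finer quantization.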
+
+void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
+ BLOCK_SIZE bsize) {
+ if (sms_tree == NULL) return;
+ sms_tree->partitioning = PARTITION_NONE;
+
+ if (bsize >= BLOCK_8X8) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int idx = 0; idx < 4; ++idx)
+ av1_reset_simple_motion_tree_partition(sms_tree->split[idx], subsize);
+ }
+}
+
+// Record the ref frames that have been selected by square partition blocks.
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+ BLOCK_SIZE bsize, int mib_size,
+ int mi_row, int mi_col) {
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+ const int sb_size_mask = mib_size - 1;
+ const int mi_row_in_sb = mi_row & sb_size_mask;
+ const int mi_col_in_sb = mi_col & sb_size_mask;
+ const int mi_size = mi_size_wide[bsize];
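+  // Editor's note: the mask is a fixed 32x32 grid of mi units; 32 mi spans
+  // the largest superblock (128x128 pixels at 4x4 pixels per mi), hence the
+  // constant i * 32 + j stride below.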
+ for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
+ for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
+ x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
+ }
+ }
+}
+
+static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr,
+ int num_cdfs, int cdf_stride, int nsymbs,
+ int wt_left, int wt_tr) {
+ for (int i = 0; i < num_cdfs; i++) {
+ for (int j = 0; j <= nsymbs; j++) {
+ cdf_ptr_left[i * cdf_stride + j] =
+ (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left +
+ (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr +
+ ((wt_left + wt_tr) / 2)) /
+ (wt_left + wt_tr));
+ assert(cdf_ptr_left[i * cdf_stride + j] >= 0 &&
+ cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP);
+ }
+ }
+}
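+
+// Editor's note (worked example): the averaging above is a per-entry
+// weighted mean with round-to-nearest. E.g. wt_left = 3, wt_tr = 1 and CDF
+// entries 12000 (left) and 20000 (top-right) give
+//   (12000 * 3 + 20000 * 1 + 2) / 4 = 14000,
+// biasing the merged CDF towards the left neighbor.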
+
+#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \
+ AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs))
+
+#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride) \
+ do { \
+ aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left; \
+ aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr; \
+ int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob); \
+ int num_cdfs = array_size / cdf_stride; \
+ avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \
+ wt_left, wt_tr); \
+ } while (0)
+
+static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left,
+ int wt_tr) {
+ AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4);
+ for (int i = 0; i < 2; i++) {
+ AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf,
+ MV_CLASSES);
+ AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf,
+ nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE);
+ AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE);
+ AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2);
+ AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf,
+ nmv_tr->comps[i].class0_hp_cdf, 2);
+ AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2);
+ AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf,
+ CLASS0_SIZE);
+ AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2);
+ }
+}
+
+// When row-based multi-threading is enabled in the encoder, we always keep a
+// top-right sync, so we can average the top-right SB's CDFs with the left
+// SB's CDFs and use the result for the current SB's encoding to improve
+// performance. This function performs that CDF averaging and is used only
+// when row-mt is enabled in the encoder.
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+ int wt_left, int wt_tr) {
+ AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2);
+ AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2);
+ AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11);
+ AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3);
+ AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4);
+ AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE);
+ AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2);
+ AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2);
+ AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2);
+ AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2);
+ AVERAGE_CDF(ctx_left->inter_compound_mode_cdf,
+ ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+ AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf,
+ MASKED_COMPOUND_TYPES);
+ AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16);
+ AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2);
+ AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2);
+ AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf,
+ INTERINTRA_MODES);
+ AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES);
+ AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2);
+ AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf,
+ PALETTE_SIZES);
+ AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf,
+ PALETTE_SIZES);
+ for (int j = 0; j < PALETTE_SIZES; j++) {
+ int nsymbs = j + PALETTE_MIN_SIZE;
+ AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j],
+ ctx_tr->palette_y_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j],
+ ctx_tr->palette_uv_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ }
+ AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2);
+ AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2);
+ AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2);
+ AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2);
+ AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2);
+ AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2);
+ AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2);
+ AVERAGE_CDF(ctx_left->skip_txfm_cdfs, ctx_tr->skip_txfm_cdfs, 2);
+ AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2);
+ avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr);
+ avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr);
+ AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2);
+ AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2);
+ AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf,
+ ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+ AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2);
+ AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf,
+ FILTER_INTRA_MODES);
+ AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf,
+ RESTORE_SWITCHABLE_TYPES);
+ AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2);
+ AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2);
+ AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES);
+ AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0],
+ UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES));
+ AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES);
+ for (int i = 0; i < PARTITION_CONTEXTS; i++) {
+ if (i < 4) {
+ AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4,
+ CDF_SIZE(10));
+ } else if (i < 16) {
+ AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10);
+ } else {
+ AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8,
+ CDF_SIZE(10));
+ }
+ }
+ AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf,
+ SWITCHABLE_FILTERS);
+ AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES);
+ AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf,
+ 2 * MAX_ANGLE_DELTA + 1);
+ AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH,
+ CDF_SIZE(MAX_TX_DEPTH + 1));
+ AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1);
+ AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1);
+ for (int i = 0; i < FRAME_LF_COUNT; i++) {
+ AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i],
+ DELTA_LF_PROBS + 1);
+ }
+ AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2,
+ CDF_SIZE(TX_TYPES));
+ AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS);
+ AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf,
+ CFL_ALPHABET_SIZE);
+}
+
+// Check neighbor blocks' motion information.
+static int check_neighbor_blocks(MB_MODE_INFO **mi, int mi_stride,
+ const TileInfo *const tile_info, int mi_row,
+ int mi_col) {
+ int is_above_low_motion = 1;
+ int is_left_low_motion = 1;
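+  // MVs are stored in 1/8-pel units, so thr = 24 corresponds to a motion of
+  // 3 pixels.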
+ const int thr = 24;
+
+ // Check above block.
+ if (mi_row > tile_info->mi_row_start) {
+ const MB_MODE_INFO *above_mbmi = mi[-mi_stride];
+ const int_mv above_mv = above_mbmi->mv[0];
+ if (above_mbmi->mode >= INTRA_MODE_END &&
+ (abs(above_mv.as_mv.row) > thr || abs(above_mv.as_mv.col) > thr))
+ is_above_low_motion = 0;
+ }
+
+ // Check left block.
+ if (mi_col > tile_info->mi_col_start) {
+ const MB_MODE_INFO *left_mbmi = mi[-1];
+ const int_mv left_mv = left_mbmi->mv[0];
+ if (left_mbmi->mode >= INTRA_MODE_END &&
+ (abs(left_mv.as_mv.row) > thr || abs(left_mv.as_mv.col) > thr))
+ is_left_low_motion = 0;
+ }
+
+ return (is_above_low_motion && is_left_low_motion);
+}
+
+// Fast check of this block's motion: returns 1 if the zero-mv collocated
+// block is clearly the best match, i.e. the block is low-motion.
+static int fast_detect_non_zero_motion(AV1_COMP *cpi, const uint8_t *src_y,
+ int src_ystride,
+ const uint8_t *last_src_y,
+ int last_src_ystride, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const BLOCK_SIZE bsize = cm->seq_params->sb_size;
+ unsigned int blk_sad = INT_MAX;
+ if (cpi->src_sad_blk_64x64 != NULL) {
+ const int sb_size_by_mb = (bsize == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols =
+ (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sbi_col = mi_col / sb_size_by_mb;
+ const int sbi_row = mi_row / sb_size_by_mb;
+ blk_sad = (unsigned int)cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+ } else {
+ blk_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+ last_src_ystride);
+ }
+
+ // Search 4 1-away points.
+ const uint8_t *const search_pos[4] = {
+ last_src_y - last_src_ystride,
+ last_src_y - 1,
+ last_src_y + 1,
+ last_src_y + last_src_ystride,
+ };
+ unsigned int sad_arr[4];
+ cpi->ppi->fn_ptr[bsize].sdx4df(src_y, src_ystride, search_pos,
+ last_src_ystride, sad_arr);
+
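+  // Scale the zero-mv SAD by 5/8: accept zero motion only if it is clearly
+  // better than all four 1-pixel-away candidates, i.e. the collocated block
+  // is a strong local minimum of the SAD surface.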
+ blk_sad = (blk_sad * 5) >> 3;
+ return (blk_sad < sad_arr[0] && blk_sad < sad_arr[1] &&
+ blk_sad < sad_arr[2] && blk_sad < sad_arr[3]);
+}
+
+// Grade the temporal variation of the source by comparing the current sb and
+// its collocated block in the last frame.
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+ int mi_row, int mi_col) {
+ if (cpi->last_source->y_width != cpi->source->y_width ||
+ cpi->last_source->y_height != cpi->source->y_height)
+ return;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return;
+#endif
+
+ unsigned int tmp_sse;
+ unsigned int tmp_variance;
+ const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
+ uint8_t *src_y = cpi->source->y_buffer;
+ const int src_ystride = cpi->source->y_stride;
+ const int src_offset = src_ystride * (mi_row << 2) + (mi_col << 2);
+ uint8_t *last_src_y = cpi->last_source->y_buffer;
+ const int last_src_ystride = cpi->last_source->y_stride;
+ const int last_src_offset = last_src_ystride * (mi_row << 2) + (mi_col << 2);
+ uint64_t avg_source_sse_threshold_verylow = 10000; // ~1.5*1.5*(64*64)
+ uint64_t avg_source_sse_threshold_low[2] = { 100000, // ~5*5*(64*64)
+ 36000 }; // ~3*3*(64*64)
+
+ uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64)
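+  // The SSE thresholds above correspond to average per-pixel differences of
+  // roughly 1.5, 3, 5 and 15 for a 64x64 (4096-pixel) block.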
+ if (cpi->sf.rt_sf.increase_source_sad_thresh) {
+ avg_source_sse_threshold_high = avg_source_sse_threshold_high << 1;
+ avg_source_sse_threshold_low[0] = avg_source_sse_threshold_low[0] << 1;
+ avg_source_sse_threshold_verylow = avg_source_sse_threshold_verylow << 1;
+ }
+  uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / (64 * 64)) ~1.5
+ src_y += src_offset;
+ last_src_y += last_src_offset;
+ tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+ last_src_ystride, &tmp_sse);
+ // rd thresholds
+ if (tmp_sse < avg_source_sse_threshold_low[1])
+ x->content_state_sb.source_sad_rd = kLowSad;
+
+ // nonrd thresholds
+ if (tmp_sse == 0) {
+ x->content_state_sb.source_sad_nonrd = kZeroSad;
+ return;
+ }
+ if (tmp_sse < avg_source_sse_threshold_verylow)
+ x->content_state_sb.source_sad_nonrd = kVeryLowSad;
+ else if (tmp_sse < avg_source_sse_threshold_low[0])
+ x->content_state_sb.source_sad_nonrd = kLowSad;
+ else if (tmp_sse > avg_source_sse_threshold_high)
+ x->content_state_sb.source_sad_nonrd = kHighSad;
+
+ // Detect large lighting change.
+ // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
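+  // (variance = sse - sum * sum / 4096 for a 64x64 block), which estimates
+  // n * mean^2. A mean shift that is large relative to the variance suggests
+  // a global brightness change rather than object motion.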
+ if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh)
+ x->content_state_sb.lighting_change = 1;
+ if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1))
+ x->content_state_sb.low_sumdiff = 1;
+
+ if (!cpi->sf.rt_sf.use_rtc_tf || cpi->rc.high_source_sad ||
+ cpi->rc.frame_source_sad > 20000 || cpi->svc.number_spatial_layers > 1)
+ return;
+
+ // In-place temporal filter. If psnr calculation is enabled, we store the
+ // source for that.
+ AV1_COMMON *const cm = &cpi->common;
+ // Calculate n*mean^2
+ const unsigned int nmean2 = tmp_sse - tmp_variance;
+ const int ac_q_step = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
+ cm->seq_params->bit_depth);
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const int avg_q_step = av1_ac_quant_QTX(p_rc->avg_frame_qindex[INTER_FRAME],
+ 0, cm->seq_params->bit_depth);
+
+ const unsigned int threshold =
+ (cpi->sf.rt_sf.use_rtc_tf == 1)
+ ? (clamp(avg_q_step, 250, 1000)) * ac_q_step
+ : 250 * ac_q_step;
+
+ // TODO(yunqing): use a weighted sum instead of averaging in filtering.
+ if (tmp_variance <= threshold && nmean2 <= 15) {
+ // Check neighbor blocks. If neighbor blocks aren't low-motion blocks,
+ // skip temporal filtering for this block.
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ const int is_neighbor_blocks_low_motion = check_neighbor_blocks(
+ mi, cm->mi_params.mi_stride, tile_info, mi_row, mi_col);
+ if (!is_neighbor_blocks_low_motion) return;
+
+ // Only consider 64x64 SB for now. Need to extend to 128x128 for large SB
+ // size.
+ // Test several nearby points. If non-zero mv exists, don't do temporal
+ // filtering.
+ const int is_this_blk_low_motion = fast_detect_non_zero_motion(
+ cpi, src_y, src_ystride, last_src_y, last_src_ystride, mi_row, mi_col);
+
+ if (!is_this_blk_low_motion) return;
+
+ const int shift_x[2] = { 0, cpi->source->subsampling_x };
+ const int shift_y[2] = { 0, cpi->source->subsampling_y };
+ const uint8_t h = block_size_high[bsize];
+ const uint8_t w = block_size_wide[bsize];
+
+ for (int plane = 0; plane < av1_num_planes(cm); ++plane) {
+ uint8_t *src = cpi->source->buffers[plane];
+ const int src_stride = cpi->source->strides[plane != 0];
+ uint8_t *last_src = cpi->last_source->buffers[plane];
+ const int last_src_stride = cpi->last_source->strides[plane != 0];
+ src += src_stride * (mi_row << (2 - shift_y[plane != 0])) +
+ (mi_col << (2 - shift_x[plane != 0]));
+ last_src += last_src_stride * (mi_row << (2 - shift_y[plane != 0])) +
+ (mi_col << (2 - shift_x[plane != 0]));
+
+ for (int i = 0; i < (h >> shift_y[plane != 0]); ++i) {
+ for (int j = 0; j < (w >> shift_x[plane != 0]); ++j) {
+ src[j] = (last_src[j] + src[j]) >> 1;
+ }
+ src += src_stride;
+ last_src += last_src_stride;
+ }
+ }
+ }
+}
+
+// Memset the mbmi structures covering the current superblock to 0.
+void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
+ int mi_row, int mi_col) {
+ // size of sb in unit of mi (BLOCK_4X4)
+ const int sb_size_mi = mi_size_wide[sb_size];
+ const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+ // size of sb in unit of allocated mi size
+ const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d;
+ assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 &&
+ "mi is not allocated as a multiple of sb!");
+ assert(mi_params->mi_stride % sb_size_mi == 0 &&
+ "mi_grid_base is not allocated as a multiple of sb!");
+
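+  // mi_grid_base and tx_type_map are addressed per 4x4 unit, while mi_alloc
+  // is addressed at mi_alloc_bsize granularity; hence the separate indices
+  // and the modulo check inside the loop below.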
+ const int mi_rows = mi_size_high[sb_size];
+ for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) {
+ assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) <
+ mi_params->mi_stride);
+ const int mi_grid_idx =
+ get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col);
+ const int alloc_mi_idx =
+ get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col);
+ memset(&mi_params->mi_grid_base[mi_grid_idx], 0,
+ sb_size_mi * sizeof(*mi_params->mi_grid_base));
+ memset(&mi_params->tx_type_map[mi_grid_idx], 0,
+ sb_size_mi * sizeof(*mi_params->tx_type_map));
+ if (cur_mi_row % mi_alloc_size_1d == 0) {
+ memset(&mi_params->mi_alloc[alloc_mi_idx], 0,
+ sb_size_alloc_mi * sizeof(*mi_params->mi_alloc));
+ }
+ }
+}
+
+void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
+ ThreadData *td, const TileDataEnc *tile_data,
+ int mi_row, int mi_col) {
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const TileInfo *tile_info = &tile_data->tile_info;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes);
+
+ sb_fp_stats->rd_count = td->rd_counts;
+ sb_fp_stats->split_count = x->txfm_search_info.txb_split_count;
+
+ sb_fp_stats->fc = *td->counts;
+
+  // Don't copy in the row_mt case, otherwise we run into a data race. There is
+  // no behavior change in the row_mt case.
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
+ sizeof(sb_fp_stats->inter_mode_rd_models));
+ }
+
+ memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact,
+ sizeof(sb_fp_stats->thresh_freq_fact));
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ sb_fp_stats->current_qindex =
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+ memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts,
+ sizeof(sb_fp_stats->mode_chosen_counts));
+#endif // CONFIG_INTERNAL_STATS
+}
+
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+ ThreadData *td, TileDataEnc *tile_data, int mi_row,
+ int mi_col) {
+ MACROBLOCK *x = &td->mb;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+ av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size,
+ num_planes);
+
+ td->rd_counts = sb_fp_stats->rd_count;
+ x->txfm_search_info.txb_split_count = sb_fp_stats->split_count;
+
+ *td->counts = sb_fp_stats->fc;
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
+ sizeof(sb_fp_stats->inter_mode_rd_models));
+ }
+
+ memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact,
+ sizeof(sb_fp_stats->thresh_freq_fact));
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
+ sb_fp_stats->current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+ memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts,
+ sizeof(sb_fp_stats->mode_chosen_counts));
+#endif // CONFIG_INTERNAL_STATS
+}
+
+/*! Checks whether to skip updating the entropy cost based on tile info.
+ *
+ * This function contains the common code used to skip the cost update of coeff,
+ * mode, mv and dv symbols.
+ */
+static int skip_cost_update(const SequenceHeader *seq_params,
+ const TileInfo *const tile_info, const int mi_row,
+ const int mi_col,
+ INTERNAL_COST_UPDATE_TYPE upd_level) {
+ if (upd_level == INTERNAL_COST_UPD_SB) return 0;
+ if (upd_level == INTERNAL_COST_UPD_OFF) return 1;
+
+  // upd_level triggers a cost update at most once per SB row in a tile.
+ if (mi_col != tile_info->mi_col_start) return 1;
+
+ if (upd_level == INTERNAL_COST_UPD_SBROW_SET) {
+ const int mib_size_log2 = seq_params->mib_size_log2;
+ const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+ const int sb_size = seq_params->mib_size * MI_SIZE;
+ const int tile_height =
+ (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+    // When upd_level = INTERNAL_COST_UPD_SBROW_SET, the cost update happens
+    // once every 2 SB rows for SB size 128 and once every 4 SB rows for SB
+    // size 64. However, at smaller resolutions the updates would not be
+    // equally spaced, so they are made equally spaced by computing the number
+    // of SB rows after which each cost update should happen.
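+    // For example (illustrative numbers): with sb_size 64 and a 360-pixel
+    // tall tile, update_freq_num_rows = 64 * 4 = 256, so
+    // num_updates_per_tile = ceil(360 / 256) = 2, num_rows_update_per_tile =
+    // 2 * 64 = 128, and num_sb_rows_per_update = ceil(360 / 128) = 3: the two
+    // updates land at sb_row 0 and 3, evenly spaced across the 6 SB rows.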
+ const int sb_size_update_freq_map[2] = { 2, 4 };
+ const int update_freq_sb_rows =
+ sb_size_update_freq_map[sb_size != MAX_SB_SIZE];
+ const int update_freq_num_rows = sb_size * update_freq_sb_rows;
+ // Round-up the division result to next integer.
+ const int num_updates_per_tile =
+ (tile_height + update_freq_num_rows - 1) / update_freq_num_rows;
+ const int num_rows_update_per_tile = num_updates_per_tile * sb_size;
+ // Round-up the division result to next integer.
+ const int num_sb_rows_per_update =
+ (tile_height + num_rows_update_per_tile - 1) / num_rows_update_per_tile;
+ if ((sb_row % num_sb_rows_per_update) != 0) return 1;
+ }
+ return 0;
+}
+
+// Checks for skip status of mv cost update.
+static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info,
+ const int mi_row, const int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ // For intra frames, mv cdfs are not updated during the encode. Hence, the mv
+ // cost calculation is skipped in this case.
+ if (frame_is_intra_only(cm)) return 1;
+
+ return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+ cpi->sf.inter_sf.mv_cost_upd_level);
+}
+
+// Checks for skip status of dv cost update.
+static int skip_dv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info,
+ const int mi_row, const int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+  // IntraBC is only applicable to intra frames, so skip the update if IntraBC
+  // is not allowed.
+ if (!av1_allow_intrabc(cm) || is_stat_generation_stage(cpi)) {
+ return 1;
+ }
+
+ return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+ cpi->sf.intra_sf.dv_cost_upd_level);
+}
+
+// Update the rate costs of some symbols at the frequency directed by the
+// speed features.
+void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile_info, const int mi_row,
+ const int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ if (cm->features.disable_cdf_update) {
+ return;
+ }
+
+ switch (cpi->sf.inter_sf.coeff_cost_upd_level) {
+ case INTERNAL_COST_UPD_OFF:
+ case INTERNAL_COST_UPD_TILE: // Tile level
+ break;
+ case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile
+ case INTERNAL_COST_UPD_SBROW: // SB row level in tile
+ case INTERNAL_COST_UPD_SB: // SB level
+ if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+ cpi->sf.inter_sf.coeff_cost_upd_level))
+ break;
+ av1_fill_coeff_costs(&x->coeff_costs, xd->tile_ctx, num_planes);
+ break;
+ default: assert(0);
+ }
+
+ switch (cpi->sf.inter_sf.mode_cost_upd_level) {
+ case INTERNAL_COST_UPD_OFF:
+ case INTERNAL_COST_UPD_TILE: // Tile level
+ break;
+ case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile
+ case INTERNAL_COST_UPD_SBROW: // SB row level in tile
+ case INTERNAL_COST_UPD_SB: // SB level
+ if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+ cpi->sf.inter_sf.mode_cost_upd_level))
+ break;
+ av1_fill_mode_rates(cm, &x->mode_costs, xd->tile_ctx);
+ break;
+ default: assert(0);
+ }
+
+ switch (cpi->sf.inter_sf.mv_cost_upd_level) {
+ case INTERNAL_COST_UPD_OFF:
+ case INTERNAL_COST_UPD_TILE: // Tile level
+ break;
+ case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile
+ case INTERNAL_COST_UPD_SBROW: // SB row level in tile
+ case INTERNAL_COST_UPD_SB: // SB level
+ // Checks for skip status of mv cost update.
+ if (skip_mv_cost_update(cpi, tile_info, mi_row, mi_col)) break;
+ av1_fill_mv_costs(&xd->tile_ctx->nmvc,
+ cm->features.cur_frame_force_integer_mv,
+ cm->features.allow_high_precision_mv, x->mv_costs);
+ break;
+ default: assert(0);
+ }
+
+ switch (cpi->sf.intra_sf.dv_cost_upd_level) {
+ case INTERNAL_COST_UPD_OFF:
+ case INTERNAL_COST_UPD_TILE: // Tile level
+ break;
+ case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile
+ case INTERNAL_COST_UPD_SBROW: // SB row level in tile
+ case INTERNAL_COST_UPD_SB: // SB level
+ // Checks for skip status of dv cost update.
+ if (skip_dv_cost_update(cpi, tile_info, mi_row, mi_col)) break;
+ av1_fill_dv_costs(&xd->tile_ctx->ndvc, x->dv_costs);
+ break;
+ default: assert(0);
+ }
+}
+
+void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ aom_free(mb->plane[plane].src_diff);
+ mb->plane[plane].src_diff = NULL;
+ }
+}
+
+void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb) {
+ const int num_planes = av1_num_planes(cm);
+#ifndef NDEBUG
+ for (int plane = 0; plane < num_planes; ++plane) {
+ assert(!mb->plane[plane].src_diff);
+ }
+#endif
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling_xy =
+ plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
+ : 0;
+ const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
+ CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff,
+ (int16_t *)aom_memalign(
+ 32, sizeof(*mb->plane[plane].src_diff) * sb_size));
+ }
+}
diff --git a/third_party/aom/av1/encoder/encodeframe_utils.h b/third_party/aom/av1/encoder/encodeframe_utils.h
new file mode 100644
index 0000000000..14c71b8802
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe_utils.h
@@ -0,0 +1,595 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
+
+#include "aom_ports/aom_timer.h"
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define WRITE_FEATURE_TO_FILE 0
+
+#define FEATURE_SIZE_SMS_SPLIT_FAST 6
+#define FEATURE_SIZE_SMS_SPLIT 17
+#define FEATURE_SIZE_SMS_PRUNE_PART 25
+#define FEATURE_SIZE_SMS_TERM_NONE 28
+#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
+#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
+#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
+
+#define FEATURE_SMS_NONE_FLAG 1
+#define FEATURE_SMS_SPLIT_FLAG (1 << 1)
+#define FEATURE_SMS_RECT_FLAG (1 << 2)
+
+#define FEATURE_SMS_PRUNE_PART_FLAG \
+ (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG)
+#define FEATURE_SMS_SPLIT_MODEL_FLAG \
+ (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
+
+// Number of sub-partitions in rectangular partition types.
+#define SUB_PARTITIONS_RECT 2
+
+// Number of sub-partitions in split partition type.
+#define SUB_PARTITIONS_SPLIT 4
+
+// Number of sub-partitions in AB partition types.
+#define SUB_PARTITIONS_AB 3
+
+// Number of sub-partitions in 4-way partition types.
+#define SUB_PARTITIONS_PART4 4
+
+// 4part partition types.
+enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES);
+
+// AB partition types.
+enum {
+ HORZ_A = 0,
+ HORZ_B,
+ VERT_A,
+ VERT_B,
+ NUM_AB_PARTS
+} UENUM1BYTE(AB_PART_TYPE);
+
+// Rectangular partition types.
+enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE);
+
+// Structure to keep win flags for HORZ and VERT partition evaluations.
+typedef struct {
+ int rect_part_win[NUM_RECT_PARTS];
+} RD_RECT_PART_WIN_INFO;
+
+enum { PICK_MODE_RD = 0, PICK_MODE_NONRD };
+
+enum {
+ SB_SINGLE_PASS, // Single pass encoding: all ctxs get updated normally
+ SB_DRY_PASS, // First pass of multi-pass: does not update the ctxs
+ SB_WET_PASS // Second pass of multi-pass: finalize and update the ctx
+} UENUM1BYTE(SB_MULTI_PASS_MODE);
+
+typedef struct {
+ ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
+ ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
+ PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+ PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+ TXFM_CONTEXT *p_ta;
+ TXFM_CONTEXT *p_tl;
+ TXFM_CONTEXT ta[MAX_MIB_SIZE];
+ TXFM_CONTEXT tl[MAX_MIB_SIZE];
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+// This struct is used to store the statistics used by sb-level multi-pass
+// encoding. Currently, this is only used to make a copy of the state before we
+// perform the first pass.
+typedef struct SB_FIRST_PASS_STATS {
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_COUNTS rd_count;
+
+ int split_count;
+ FRAME_COUNTS fc;
+ InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+ int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+ int current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+ unsigned int mode_chosen_counts[MAX_MODES];
+#endif // CONFIG_INTERNAL_STATS
+} SB_FIRST_PASS_STATS;
+
+// This structure contains block size related
+// variables for use in rd_pick_partition().
+typedef struct {
+  // Half of the block width, used to determine the block edge.
+ int mi_step;
+
+ // Block row and column indices.
+ int mi_row;
+ int mi_col;
+
+ // Block edge row and column indices.
+ int mi_row_edge;
+ int mi_col_edge;
+
+ // Block width of current partition block.
+ int width;
+
+ // Block width of minimum partition size allowed.
+ int min_partition_size_1d;
+
+ // Flag to indicate if partition is 8x8 or higher size.
+ int bsize_at_least_8x8;
+
+ // Indicates edge blocks in frame.
+ int has_rows;
+ int has_cols;
+
+ // Block size of current partition.
+ BLOCK_SIZE bsize;
+
+ // Size of current sub-partition.
+ BLOCK_SIZE subsize;
+
+ // Size of split partition.
+ BLOCK_SIZE split_bsize2;
+} PartitionBlkParams;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+typedef struct PartitionTimingStats {
+  // Tracks the number of partition decisions made in the current call to \ref
+ // av1_rd_pick_partition
+ int partition_decisions[EXT_PARTITION_TYPES];
+  // Tracks the number of partition blocks searched in the current call to \ref
+ // av1_rd_pick_partition
+ int partition_attempts[EXT_PARTITION_TYPES];
+ // Tracks the time spent on each partition search in the current call to \ref
+ // av1_rd_pick_partition
+ int64_t partition_times[EXT_PARTITION_TYPES];
+ // Tracks the rdcost spent on each partition search in the current call to
+ // \ref av1_rd_pick_partition
+ int64_t partition_rdcost[EXT_PARTITION_TYPES];
+ // Timer used to time the partitions.
+ struct aom_usec_timer timer;
+ // Whether the timer is on
+ int timer_is_on;
+} PartitionTimingStats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+// Structure holding state variables for partition search.
+typedef struct {
+ // Intra partitioning related info.
+ PartitionSearchInfo *intra_part_info;
+
+ // Parameters related to partition block size.
+ PartitionBlkParams part_blk_params;
+
+ // Win flags for HORZ and VERT partition evaluations.
+ RD_RECT_PART_WIN_INFO split_part_rect_win[SUB_PARTITIONS_SPLIT];
+
+ // RD cost for the current block of given partition type.
+ RD_STATS this_rdc;
+
+ // RD cost summed across all blocks of partition type.
+ RD_STATS sum_rdc;
+
+ // Array holding partition type cost.
+ int tmp_partition_cost[PARTITION_TYPES];
+
+ // Pointer to partition cost buffer
+ int *partition_cost;
+
+ // RD costs for different partition types.
+ int64_t none_rd;
+ int64_t split_rd[SUB_PARTITIONS_SPLIT];
+ // RD costs for rectangular partitions.
+ // rect_part_rd[0][i] is the RD cost of ith partition index of PARTITION_HORZ.
+ // rect_part_rd[1][i] is the RD cost of ith partition index of PARTITION_VERT.
+ int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT];
+
+ // Flags indicating if the corresponding partition was winner or not.
+ // Used to bypass similar blocks during AB partition evaluation.
+ int is_split_ctx_is_ready[2];
+ int is_rect_ctx_is_ready[NUM_RECT_PARTS];
+
+ // If true, skips the rest of partition evaluation at the current bsize level.
+ int terminate_partition_search;
+
+ // If false, skips rdopt on PARTITION_NONE.
+ int partition_none_allowed;
+
+ // If partition_rect_allowed[HORZ] is false, skips searching PARTITION_HORZ,
+  // PARTITION_HORZ_A, PARTITION_HORZ_B, PARTITION_HORZ_4. Same holds for VERT.
+ int partition_rect_allowed[NUM_RECT_PARTS];
+
+ // If false, skips searching rectangular partition unless some logic related
+ // to edge detection holds.
+ int do_rectangular_split;
+
+ // If false, skips searching PARTITION_SPLIT.
+ int do_square_split;
+
+ // If true, prunes the corresponding PARTITION_HORZ/PARTITION_VERT. Note that
+ // this does not directly affect the extended partitions, so this can be used
+ // to prune out PARTITION_HORZ/PARTITION_VERT while still allowing rdopt of
+  // PARTITION_HORZ_A, PARTITION_HORZ_B, PARTITION_HORZ_4, etc.
+ int prune_rect_part[NUM_RECT_PARTS];
+
+ // Chroma subsampling in x and y directions.
+ int ss_x;
+ int ss_y;
+
+ // Partition plane context index.
+ int pl_ctx_idx;
+
+ // This flag will be set if best partition is found from the search.
+ bool found_best_partition;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats part_timing_stats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+} PartitionSearchState;
+
+static AOM_INLINE void av1_disable_square_split_partition(
+ PartitionSearchState *part_state) {
+ part_state->do_square_split = 0;
+}
+
+// Disables all possible rectangular splits. This includes the AB and 4-way
+// partitions, as they depend on the corresponding partition_rect_allowed.
+static AOM_INLINE void av1_disable_rect_partitions(
+ PartitionSearchState *part_state) {
+ part_state->do_rectangular_split = 0;
+ part_state->partition_rect_allowed[HORZ] = 0;
+ part_state->partition_rect_allowed[VERT] = 0;
+}
+
+// Disables all possible splits so that only PARTITION_NONE *might* be allowed.
+static AOM_INLINE void av1_disable_all_splits(
+ PartitionSearchState *part_state) {
+ av1_disable_square_split_partition(part_state);
+ av1_disable_rect_partitions(part_state);
+}
+
+static AOM_INLINE void av1_set_square_split_only(
+ PartitionSearchState *part_state) {
+ part_state->partition_none_allowed = 0;
+ part_state->do_square_split = 1;
+ av1_disable_rect_partitions(part_state);
+}
+
+static AOM_INLINE bool av1_blk_has_rows_and_cols(
+ const PartitionBlkParams *blk_params) {
+ return blk_params->has_rows && blk_params->has_cols;
+}
+
+static AOM_INLINE bool av1_is_whole_blk_in_frame(
+ const PartitionBlkParams *blk_params,
+ const CommonModeInfoParams *mi_params) {
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+ return mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+ mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
+}
+
+static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi,
+ int dual_filter) {
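+  // When dual_filter is disabled, one filter covers both directions, so only
+  // the cdf for direction 0 is updated.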
+ for (int dir = 0; dir < 2; ++dir) {
+ if (dir && !dual_filter) break;
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+ update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
+ SWITCHABLE_FILTERS);
+ }
+}
+
+static AOM_INLINE int set_rdmult(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, int segment_id) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const CommonQuantParams *quant_params = &cm->quant_params;
+ const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+ const FRAME_UPDATE_TYPE update_type =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+
+ int qindex;
+ if (segment_id >= 0) {
+ qindex = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
+ } else {
+ qindex = quant_params->base_qindex + x->rdmult_delta_qindex +
+ quant_params->y_dc_delta_q;
+ }
+
+ return av1_compute_rd_mult(
+ qindex, bit_depth, update_type, layer_depth, boost_index, frame_type,
+ cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi));
+}
+
+static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) {
+ return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p,
+ int frm) {
+ assert(frm >= 0);
+ if (frm < 0 ||
+ p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) {
+ return NULL;
+ }
+
+ return &p->stats_buf_ctx->stats_in_start[frm];
+}
+
+int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int orig_rdmult);
+
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step);
+
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step);
+
+void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, SuperBlockEnc *sb_enc);
+
+int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td,
+ int64_t *delta_dist, BLOCK_SIZE bsize,
+ int mi_row, int mi_col);
+
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col);
+
+int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col);
+
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int orig_rdmult);
+#endif // !CONFIG_REALTIME_ONLY
+
+void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int *const rdmult);
+
+#if CONFIG_SALIENCY_MAP
+void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi,
+ int *errorperbit, const BLOCK_SIZE bsize,
+ const int mi_row, const int mi_col,
+ int *const rdmult);
+#endif
+
+void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
+ const PICK_MODE_CONTEXT *const ctx, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run);
+
+void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+ PREDICTION_MODE mode, int16_t mode_context);
+
+void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+ MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi, const int intraonly);
+
+void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes);
+
+void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes);
+
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ MB_MODE_INFO **mib, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+
+int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+
+void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
+ BLOCK_SIZE bsize);
+
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+ BLOCK_SIZE bsize, int mib_size,
+ int mi_row, int mi_col);
+
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+ int wt_left, int wt_tr);
+
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+ int mi_row, int mi_col);
+
+void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
+ int mi_row, int mi_col);
+
+void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
+ ThreadData *td, const TileDataEnc *tile_data,
+ int mi_row, int mi_col);
+
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+ ThreadData *td, TileDataEnc *tile_data, int mi_row,
+ int mi_col);
+
+void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile_info, const int mi_row,
+ const int mi_col);
+
+void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes);
+
+static AOM_INLINE void av1_dealloc_mb_data(struct macroblock *mb,
+ int num_planes) {
+ aom_free(mb->txfm_search_info.mb_rd_record);
+ mb->txfm_search_info.mb_rd_record = NULL;
+
+ aom_free(mb->inter_modes_info);
+ mb->inter_modes_info = NULL;
+
+ av1_dealloc_src_diff_buf(mb, num_planes);
+
+ aom_free(mb->e_mbd.seg_mask);
+ mb->e_mbd.seg_mask = NULL;
+
+ aom_free(mb->winner_mode_stats);
+ mb->winner_mode_stats = NULL;
+
+ aom_free(mb->dqcoeff_buf);
+ mb->dqcoeff_buf = NULL;
+}
+
+static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi,
+ struct macroblock *mb) {
+ const SPEED_FEATURES *sf = &cpi->sf;
+ // The winner_mode_stats buffer is not required in these cases.
+ if (is_stat_generation_stage(cpi) ||
+ (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode) ||
+ (sf->winner_mode_sf.multi_winner_mode_type == MULTI_WINNER_MODE_OFF))
+ return;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const int winner_mode_count =
+ winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type];
+ CHECK_MEM_ERROR(cm, mb->winner_mode_stats,
+ (WinnerModeStats *)aom_malloc(
+ winner_mode_count * sizeof(mb->winner_mode_stats[0])));
+}
+
+void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb);
+
+static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi,
+ struct macroblock *mb) {
+ const AV1_COMMON *cm = &cpi->common;
+ const SPEED_FEATURES *sf = &cpi->sf;
+ if (!sf->rt_sf.use_nonrd_pick_mode) {
+ // Memory for mb_rd_record is allocated only when use_mb_rd_hash sf is
+ // enabled.
+ if (sf->rd_sf.use_mb_rd_hash)
+ CHECK_MEM_ERROR(cm, mb->txfm_search_info.mb_rd_record,
+ (MB_RD_RECORD *)aom_malloc(sizeof(MB_RD_RECORD)));
+ if (!frame_is_intra_only(cm))
+ CHECK_MEM_ERROR(
+ cm, mb->inter_modes_info,
+ (InterModesInfo *)aom_malloc(sizeof(*mb->inter_modes_info)));
+ }
+
+ av1_alloc_src_diff_buf(cm, mb);
+
+ CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask,
+ (uint8_t *)aom_memalign(
+ 16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0])));
+
+ allocate_winner_mode_stats(cpi, mb);
+
+ const int max_sb_square_y = 1
+ << num_pels_log2_lookup[cm->seq_params->sb_size];
+ CHECK_MEM_ERROR(
+ cm, mb->dqcoeff_buf,
+ (tran_low_t *)aom_memalign(32, max_sb_square_y * sizeof(tran_low_t)));
+}
+
+// This function computes the number of reference frames to be disabled based
+// on the selective_ref_frame speed feature.
+static AOM_INLINE unsigned int get_num_refs_to_disable(
+ const AV1_COMP *cpi, const int *ref_frame_flags,
+ const unsigned int *ref_display_order_hint,
+ unsigned int cur_frame_display_index) {
+ unsigned int num_refs_to_disable = 0;
+ if (cpi->sf.inter_sf.selective_ref_frame >= 3) {
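+    // One reference is disabled; per disable_order, LAST3_FRAME is the first
+    // to be dropped by enforce_max_ref_frames() below.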
+ num_refs_to_disable++;
+ if (cpi->sf.inter_sf.selective_ref_frame >= 6) {
+ // Disable LAST2_FRAME and ALTREF2_FRAME
+ num_refs_to_disable += 2;
+ } else if (cpi->sf.inter_sf.selective_ref_frame == 5 &&
+ *ref_frame_flags & av1_ref_frame_flag_list[LAST2_FRAME]) {
+ const int last2_frame_dist = av1_encoder_get_relative_dist(
+ ref_display_order_hint[LAST2_FRAME - LAST_FRAME],
+ cur_frame_display_index);
+ // Disable LAST2_FRAME if it is a temporally distant frame
+ if (abs(last2_frame_dist) > 2) {
+ num_refs_to_disable++;
+ }
+#if !CONFIG_REALTIME_ONLY
+ else if (is_stat_consumption_stage_twopass(cpi)) {
+ const FIRSTPASS_STATS *const this_frame_stats =
+ read_one_frame_stats(&cpi->ppi->twopass, cur_frame_display_index);
+ const double coded_error_per_mb = this_frame_stats->coded_error;
+ // Disable LAST2_FRAME if the coded error of the current frame based on
+ // first pass stats is very low.
+ if (coded_error_per_mb < 100.0) num_refs_to_disable++;
+ }
+#endif  // !CONFIG_REALTIME_ONLY
+ }
+ }
+ return num_refs_to_disable;
+}
+
+static INLINE int get_max_allowed_ref_frames(
+ const AV1_COMP *cpi, const int *ref_frame_flags,
+ const unsigned int *ref_display_order_hint,
+ unsigned int cur_frame_display_index) {
+ const unsigned int max_reference_frames =
+ cpi->oxcf.ref_frm_cfg.max_reference_frames;
+ const unsigned int num_refs_to_disable = get_num_refs_to_disable(
+ cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+ const unsigned int max_allowed_refs_for_given_speed =
+ INTER_REFS_PER_FRAME - num_refs_to_disable;
+ return AOMMIN(max_allowed_refs_for_given_speed, max_reference_frames);
+}
+
+// Enforce the number of references for each arbitrary frame based on user
+// options and speed.
+static AOM_INLINE void enforce_max_ref_frames(
+ AV1_COMP *cpi, int *ref_frame_flags,
+ const unsigned int *ref_display_order_hint,
+ unsigned int cur_frame_display_index) {
+ MV_REFERENCE_FRAME ref_frame;
+ int total_valid_refs = 0;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ total_valid_refs++;
+ }
+ }
+
+ const int max_allowed_refs = get_max_allowed_ref_frames(
+ cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+
+ for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
+ const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
+
+ if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) {
+ continue;
+ }
+
+ switch (ref_frame_to_disable) {
+ case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break;
+ case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break;
+ case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break;
+      case GOLDEN_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break;
+ default: assert(0);
+ }
+ --total_valid_refs;
+ }
+ assert(total_valid_refs <= max_allowed_refs);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
new file mode 100644
index 0000000000..c78761dd98
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -0,0 +1,866 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/txb_rdopt.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+
+void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride) {
+ assert(rows >= 4 && cols >= 4);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (bd_info.use_highbitdepth_buf) {
+ aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
+ pred8, pred_stride);
+ return;
+ }
+#endif
+ (void)bd_info;
+ aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+ pred_stride);
+}
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const int tx1d_width = tx_size_wide[tx_size];
+ const int tx1d_height = tx_size_high[tx_size];
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
+ int16_t *src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
+ av1_subtract_block(bd_info, tx1d_height, tx1d_width, src_diff, diff_stride,
+ src, src_stride, dst, dst_stride);
+}
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+
+ av1_subtract_block(bd_info, bh, bw, p->src_diff, bw, p->src.buf,
+ p->src.stride, pd->dst.buf, pd->dst.stride);
+}
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const int eob = p->eobs[block];
+ const int segment_id = xd->mi[0]->segment_id;
+
+ if (eob == 0 || !cpi->optimize_seg_arr[segment_id] ||
+ xd->lossless[segment_id]) {
+ *rate_cost = av1_cost_skip_txb(&x->coeff_costs, txb_ctx, plane, tx_size);
+ return eob;
+ }
+
+ return av1_optimize_txb(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+ rate_cost, cpi->oxcf.algo_cfg.sharpness);
+}
+
+// Hyper-parameters for dropout optimization, based on the following logic.
+// TODO(yjshen): These settings are tuned by experiments. They may still be
+// optimized for better performance.
+// (1) Coefficients which are large enough will ALWAYS be kept.
+const tran_low_t DROPOUT_COEFF_MAX = 2; // Max dropout-able coefficient.
+// (2) Continuous coefficients will ALWAYS be kept. Here rigorous continuity is
+// NOT required. For example, `5 0 0 0 7` is treated as two continuous
+// coefficients if three zeros do not fulfill the dropout condition.
+const int DROPOUT_CONTINUITY_MAX = 2; // Max dropout-able continuous coeff.
+// (3) Dropout operation is NOT applicable to blocks with large or small
+// quantization index.
+const int DROPOUT_Q_MAX = 128;
+const int DROPOUT_Q_MIN = 16;
+// (4) Recall that dropout optimization will forcibly set some quantized
+// coefficients to zero. The key logic on determining whether a coefficient
+// should be dropped is to check the number of continuous zeros before AND
+// after this coefficient. The exact number of zeros for judgement depends
+// on block size and quantization index. More concretely, block size
+// determines the base number of zeros, while quantization index determines
+// the multiplier. Intuitively, larger block requires more zeros and larger
+// quantization index also requires more zeros (more information is lost
+// when using larger quantization index).
+const int DROPOUT_BEFORE_BASE_MAX = 32; // Max base number for leading zeros.
+const int DROPOUT_BEFORE_BASE_MIN = 16; // Min base number for leading zeros.
+const int DROPOUT_AFTER_BASE_MAX = 32; // Max base number for trailing zeros.
+const int DROPOUT_AFTER_BASE_MIN = 16; // Min base number for trailing zeros.
+const int DROPOUT_MULTIPLIER_MAX = 8; // Max multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_MIN = 2; // Min multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_Q_BASE = 32; // Base Q to compute multiplier.
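+// As an illustrative example: for a 16x16 transform block at qindex 64,
+// base_size = 16 and multiplier = CLIP(64 / 32, 2, 8) = 2, so a small
+// coefficient (magnitude <= DROPOUT_COEFF_MAX) is dropped only when at least
+// 2 * 16 = 32 zeros precede and follow it in scan order.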
+
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+ TX_TYPE tx_type, int qindex) {
+ const int tx_width = tx_size_wide[tx_size];
+ const int tx_height = tx_size_high[tx_size];
+
+ // Early return if `qindex` is out of range.
+ if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) {
+ return;
+ }
+
+ // Compute number of zeros used for dropout judgement.
+ const int base_size = AOMMAX(tx_width, tx_height);
+ const int multiplier = CLIP(qindex / DROPOUT_MULTIPLIER_Q_BASE,
+ DROPOUT_MULTIPLIER_MIN, DROPOUT_MULTIPLIER_MAX);
+ const int dropout_num_before =
+ multiplier *
+ CLIP(base_size, DROPOUT_BEFORE_BASE_MIN, DROPOUT_BEFORE_BASE_MAX);
+ const int dropout_num_after =
+ multiplier *
+ CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX);
+
+ av1_dropout_qcoeff_num(mb, plane, block, tx_size, tx_type, dropout_num_before,
+ dropout_num_after);
+}
+
+void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ int dropout_num_before, int dropout_num_after) {
+ const struct macroblock_plane *const p = &mb->plane[plane];
+ tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+ tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ const int max_eob = av1_get_max_eob(tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+
+ // Early return if there are not enough non-zero coefficients.
+ if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before ||
+ max_eob <= dropout_num_before + dropout_num_after) {
+ return;
+ }
+
+ int count_zeros_before = 0;
+ int count_zeros_after = 0;
+ int count_nonzeros = 0;
+ // Index of the first non-zero coefficient after sufficient number of
+  // continuous zeros. If it equals `-1`, the number of leading zeros hasn't
+  // reached `dropout_num_before`.
+ int idx = -1;
+ int eob = 0; // New end of block.
+
+ for (int i = 0; i < p->eobs[block]; ++i) {
+ const int scan_idx = scan_order->scan[i];
+ if (abs(qcoeff[scan_idx]) > DROPOUT_COEFF_MAX) {
+ // Keep large coefficients.
+ count_zeros_before = 0;
+ count_zeros_after = 0;
+ idx = -1;
+ eob = i + 1;
+ } else if (qcoeff[scan_idx] == 0) { // Count zeros.
+ if (idx == -1) {
+ ++count_zeros_before;
+ } else {
+ ++count_zeros_after;
+ }
+ } else { // Count non-zeros.
+ if (count_zeros_before >= dropout_num_before) {
+ idx = (idx == -1) ? i : idx;
+ ++count_nonzeros;
+ } else {
+ count_zeros_before = 0;
+ eob = i + 1;
+ }
+ }
+
+ // Handle continuity.
+ if (count_nonzeros > DROPOUT_CONTINUITY_MAX) {
+ count_zeros_before = 0;
+ count_zeros_after = 0;
+ count_nonzeros = 0;
+ idx = -1;
+ eob = i + 1;
+ }
+
+ // Handle the trailing zeros after original end of block.
+ if (idx != -1 && i == p->eobs[block] - 1) {
+ count_zeros_after += (max_eob - p->eobs[block]);
+ }
+
+ // Set redundant coefficients to zeros if needed.
+ if (count_zeros_after >= dropout_num_after) {
+ for (int j = idx; j <= i; ++j) {
+ qcoeff[scan_order->scan[j]] = 0;
+ dqcoeff[scan_order->scan[j]] = 0;
+ }
+ count_zeros_before += (i - idx + 1);
+ count_zeros_after = 0;
+ count_nonzeros = 0;
+ } else if (i == p->eobs[block] - 1) {
+ eob = i + 1;
+ }
+ }
+
+ if (eob != p->eobs[block]) {
+ p->eobs[block] = eob;
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+ }
+}
+
+// Settings for optimization type. NOTE: To set optimization type for all intra
+// frames, both `KEY_BLOCK_OPT_TYPE` and `INTRA_BLOCK_OPT_TYPE` should be set.
+// TODO(yjshen): These settings are hard-coded and look okay for now. They
+// should be made configurable later.
+// Blocks of key frames ONLY.
+const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+// Blocks of intra frames (key frames EXCLUSIVE).
+const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+// Blocks of inter frames. (NOTE: Dropout optimization is DISABLED by default
+// if trellis optimization is on for inter frames.)
+const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+
+enum {
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_HIGHBD = 1,
+ QUANT_FUNC_TYPES = 2
+} UENUM1BYTE(QUANT_FUNC);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static AV1_QUANT_FACADE
+ quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
+ { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
+ { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
+ { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
+ { NULL, NULL }
+ };
+#else
+static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] = {
+ av1_quantize_fp_facade, av1_quantize_b_facade, av1_quantize_dc_facade, NULL
+};
+#endif
+
+// Computes the transform for DC-only blocks.
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+ TxfmParam *txfm_param, int64_t per_px_mean) {
+ assert(per_px_mean != INT64_MAX);
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff = p->coeff + block_offset;
+ const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+ memset(coeff, 0, sizeof(*coeff) * n_coeffs);
+ coeff[0] =
+ (tran_low_t)((per_px_mean * dc_coeff_scale[txfm_param->tx_size]) >> 12);
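+  // Note: dc_coeff_scale[] is presumably a Q12 fixed-point factor (hence the
+  // >> 12) that maps the per-pixel mean to the transform's DC coefficient.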
+}
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam) {
+ av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, txfm_param);
+ av1_quant(x, plane, block, txfm_param, qparam);
+}
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TxfmParam *txfm_param) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff = p->coeff + block_offset;
+ const int diff_stride = block_size_wide[plane_bsize];
+
+ const int src_offset = (blk_row * diff_stride + blk_col);
+ const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2];
+
+ av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param->tx_size, txfm_param->tx_type);
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff = p->coeff + block_offset;
+ tran_low_t *const qcoeff = p->qcoeff + block_offset;
+ tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+ uint16_t *const eob = &p->eobs[block];
+
+ if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+ if (LIKELY(!x->seg_skip_block)) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd](
+ coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#else
+ quant_func_list[qparam->xform_quant_idx](
+ coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#endif
+ } else {
+ av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
+ }
+ }
+  // If use_optimize_b is true, av1_optimize_b() will be called, so the
+  // entropy ctx cannot be updated now (it is updated in optimize_b).
+ if (qparam->use_optimize_b) {
+ p->txb_entropy_ctx[block] = 0;
+ } else {
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+ }
+}
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+ TX_TYPE tx_type, TxfmParam *txfm_param) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ txfm_param->tx_type = tx_type;
+ txfm_param->tx_size = tx_size;
+ txfm_param->lossless = xd->lossless[mbmi->segment_id];
+ txfm_param->tx_set_type = av1_get_ext_tx_set_type(
+ tx_size, is_inter_block(mbmi), cm->features.reduced_tx_set_used);
+
+ txfm_param->bd = xd->bd;
+ txfm_param->is_hbd = is_cur_buf_hbd(xd);
+}
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+ int use_quant_b_adapt, QUANT_PARAM *qparam) {
+ qparam->log_scale = av1_get_tx_scale(tx_size);
+ qparam->tx_size = tx_size;
+
+ qparam->use_quant_b_adapt = use_quant_b_adapt;
+
+  // TODO(bohanli): optimize_b and the quantization idx are related, but the
+  // relationship is buried and complicated across different encoding stages.
+  // There should be a unified function to derive quant_idx, rather than
+  // determining it and passing it in.
+ qparam->use_optimize_b = use_optimize_b;
+ qparam->xform_quant_idx = xform_quant_idx;
+
+ qparam->qmatrix = NULL;
+ qparam->iqmatrix = NULL;
+}
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+ const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+ TX_TYPE tx_type, QUANT_PARAM *qparam) {
+ qparam->qmatrix = av1_get_qmatrix(quant_params, xd, plane, tx_size, tx_type);
+ qparam->iqmatrix =
+ av1_get_iqmatrix(quant_params, xd, plane, tx_size, tx_type);
+}
+
+static void encode_block(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg,
+ RUN_TYPE dry_run) {
+ (void)dry_run;
+ struct encode_b_args *const args = arg;
+ const AV1_COMP *const cpi = args->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ uint8_t *dst;
+ ENTROPY_CONTEXT *a, *l;
+ int dummy_rate_cost = 0;
+
+ const int bw = mi_size_wide[plane_bsize];
+ dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
+
+ a = &args->ta[blk_col];
+ l = &args->tl[blk_row];
+
+ TX_TYPE tx_type = DCT_DCT;
+ const int blk_skip_idx = blk_row * bw + blk_col;
+ if (!is_blk_skip(x->txfm_search_info.blk_skip, plane, blk_skip_idx) &&
+ !mbmi->skip_mode) {
+ tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run);
+ int quant_idx;
+ if (use_trellis)
+ quant_idx = AV1_XFORM_QUANT_FP;
+ else
+ quant_idx =
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+ av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+ av1_setup_quant(tx_size, use_trellis, quant_idx,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ // Whether trellis or dropout optimization is required for inter frames.
+ const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+ INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+ const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+ INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+
+ if (quant_param.use_optimize_b && do_trellis) {
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+ &dummy_rate_cost);
+ }
+ if (!quant_param.use_optimize_b && do_dropout) {
+ av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+ cm->quant_params.base_qindex);
+ }
+ } else {
+ p->eobs[block] = 0;
+ p->txb_entropy_ctx[block] = 0;
+ }
+
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+ if (p->eobs[block]) {
+ // As long as any YUV plane has non-zero quantized transform coefficients,
+ // mbmi->skip_txfm flag is set to 0.
+ mbmi->skip_txfm = 0;
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ pd->dst.stride, p->eobs[block],
+ cm->features.reduced_tx_set_used);
+ } else {
+ // Only when YUV planes all have zero quantized transform coefficients,
+ // mbmi->skip_txfm flag is set to 1.
+ mbmi->skip_txfm &= 1;
+ }
+
+  // TODO(debargha, jingning): Temporarily disable the txk_type check for the
+  // eob=0 case. It is possible that a collision in the hash index could cause
+  // the assertion to fail. To further optimize the rate-distortion
+  // performance, we need to revisit this part and enable this assert again.
+ if (p->eobs[block] == 0 && plane == 0) {
+#if 0
+ if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+ args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
+ // TODO(jingning,angiebird,huisu@google.com): enable txk_check when
+ // enable_optimize_b is true to detect potential RD bug.
+ const uint8_t disable_txk_check = args->enable_optimize_b;
+ if (!disable_txk_check) {
+        assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] ==
+               DCT_DCT);
+ }
+ }
+#endif
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ }
+
+#if CONFIG_MISMATCH_DEBUG
+ if (dry_run == OUTPUT_ENABLED) {
+ int pixel_c, pixel_r;
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ int blk_w = block_size_wide[bsize];
+ int blk_h = block_size_high[bsize];
+ mi_to_pixel_loc(&pixel_c, &pixel_r, xd->mi_col, xd->mi_row, blk_col,
+ blk_row, pd->subsampling_x, pd->subsampling_y);
+ mismatch_record_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
+ plane, pixel_c, pixel_r, blk_w, blk_h,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+#endif
+}
+
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg, RUN_TYPE dry_run) {
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+ if (!plane) {
+ assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] &&
+ tx_size_high[tx_size] >= tx_size_high[plane_tx_size]);
+ }
+
+ if (tx_size == plane_tx_size || plane) {
+ encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg,
+ dry_run);
+ } else {
+ assert(tx_size < TX_SIZES_ALL);
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
+ assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
+ // This is the square transform block partition entry point.
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsh * bsw;
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < row_end; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += bsw) {
+ const int offsetc = blk_col + col;
+
+ encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
+ arg, dry_run);
+ block += step;
+ }
+ }
+ }
+}
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+  // Block and transform sizes, in log2 of the number of 4x4 blocks ("*_b"):
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
+  // The transform size varies per plane; look it up in a common way.
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ // Call visit() directly with zero offsets if the current block size is the
+ // same as the transform block size.
+ if (plane_bsize == tx_bsize) {
+ visit(plane, 0, 0, 0, plane_bsize, tx_size, arg);
+ return;
+ }
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+
+ // If mb_to_right_edge is < 0 we are in a situation in which
+ // the current block size extends into the UMV and we won't
+ // visit the sub blocks that are wholly within the UMV.
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+ const int mu_blocks_wide =
+ AOMMIN(mi_size_wide[max_unit_bsize], max_blocks_wide);
+ const int mu_blocks_high =
+ AOMMIN(mi_size_high[max_unit_bsize], max_blocks_high);
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ int i = 0;
+ for (int r = 0; r < max_blocks_high; r += mu_blocks_high) {
+ const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (int c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
+ const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
+ for (int blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
+ for (int blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
+ visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
+ i += step;
+ }
+ }
+ }
+ }
+ // Check if visit() is invoked at least once.
+ assert(i >= 1);
+}
+
+typedef struct encode_block_pass1_args {
+ AV1_COMP *cpi;
+ MACROBLOCK *x;
+} encode_block_pass1_args;
+
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ encode_block_pass1_args *args = (encode_block_pass1_args *)arg;
+ AV1_COMP *cpi = args->cpi;
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+
+ uint8_t *dst;
+ dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
+
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+
+ av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
+ &quant_param);
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT,
+ &quant_param);
+
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ if (p->eobs[block] > 0) {
+ txfm_param.eob = p->eobs[block];
+ if (txfm_param.is_hbd) {
+ av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+ return;
+ }
+ av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+ }
+}
+
+void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) {
+ encode_block_pass1_args args = { cpi, x };
+ av1_subtract_plane(x, bsize, 0);
+ av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+ encode_block_pass1, &args);
+}
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ // In the current encoder implementation, for inter blocks,
+ // only when YUV planes all have zero quantized transform coefficients,
+ // mbmi->skip_txfm flag is set to 1.
+ // For intra blocks, this flag is set to 0 since skipped blocks are so rare
+ // that transmitting skip_txfm = 1 is very expensive.
+  // mbmi->skip_txfm is initialized to 1, and will be modified in
+  // encode_block() based on transform, quantization, and (if enabled) trellis
+  // optimization.
+ mbmi->skip_txfm = 1;
+ if (x->txfm_search_info.skip_txfm) return;
+
+ struct optimize_ctx ctx;
+ struct encode_b_args arg = {
+ cpi, x, &ctx, NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]
+ };
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int subsampling_x = pd->subsampling_x;
+ const int subsampling_y = pd->subsampling_y;
+ if (plane && !xd->is_chroma_ref) break;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ const int bw = mi_size_wide[txb_size];
+ const int bh = mi_size_high[txb_size];
+ int block = 0;
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]);
+ av1_subtract_plane(x, plane_bsize, plane);
+ arg.ta = ctx.ta[plane];
+ arg.tl = ctx.tl[plane];
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, subsampling_x, subsampling_y);
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+ mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
+
+ for (int idy = 0; idy < mi_height; idy += mu_blocks_high) {
+ for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+ int blk_row, blk_col;
+ const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
+ const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
+ for (blk_row = idy; blk_row < unit_height; blk_row += bh) {
+ for (blk_col = idx; blk_col < unit_width; blk_col += bw) {
+ encode_block_inter(plane, block, blk_row, blk_col, plane_bsize,
+ max_tx_size, &arg, dry_run);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void encode_block_intra_and_set_context(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ arg);
+
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *x = args->x;
+ ENTROPY_CONTEXT *a = &args->ta[blk_col];
+ ENTROPY_CONTEXT *l = &args->tl[blk_row];
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+}
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct encode_b_args *const args = arg;
+ const AV1_COMP *const cpi = args->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ uint16_t *eob = &p->eobs[block];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ int dummy_rate_cost = 0;
+
+ av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+
+ TX_TYPE tx_type = DCT_DCT;
+ const int bw = mi_size_wide[plane_bsize];
+ if (plane == 0 && is_blk_skip(x->txfm_search_info.blk_skip, plane,
+ blk_row * bw + blk_col)) {
+ *eob = 0;
+ p->txb_entropy_ctx[block] = 0;
+ } else {
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+
+ const ENTROPY_CONTEXT *a = &args->ta[blk_col];
+ const ENTROPY_CONTEXT *l = &args->tl[blk_row];
+ tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ const int use_trellis =
+ is_trellis_used(args->enable_optimize_b, args->dry_run);
+ int quant_idx;
+ if (use_trellis)
+ quant_idx = AV1_XFORM_QUANT_FP;
+ else
+ quant_idx =
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+
+ av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+ av1_setup_quant(tx_size, use_trellis, quant_idx,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ // Whether trellis or dropout optimization is required for key frames and
+ // intra frames.
+ const bool do_trellis = (frame_is_intra_only(cm) &&
+ (KEY_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+ KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+ (!frame_is_intra_only(cm) &&
+ (INTRA_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+ INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+ const bool do_dropout = (frame_is_intra_only(cm) &&
+ (KEY_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+ KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+ (!frame_is_intra_only(cm) &&
+ (INTRA_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+ INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+
+ if (quant_param.use_optimize_b && do_trellis) {
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+ &dummy_rate_cost);
+ }
+ if (do_dropout) {
+ av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+ cm->quant_params.base_qindex);
+ }
+ }
+
+ if (*eob) {
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ dst_stride, *eob,
+ cm->features.reduced_tx_set_used);
+ }
+
+  // TODO(jingning): Temporarily disable the txk_type check for the eob=0
+  // case. It is possible that a collision in the hash index could cause
+  // the assertion to fail. To further optimize the rate-distortion
+  // performance, we need to revisit this part and enable this assert again.
+ if (*eob == 0 && plane == 0) {
+#if 0
+ if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ
+ && args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
+    assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] ==
+           DCT_DCT);
+ }
+#endif
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ }
+
+ // For intra mode, skipped blocks are so rare that transmitting
+ // skip_txfm = 1 is very expensive.
+ mbmi->skip_txfm = 0;
+
+ if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+ cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+ }
+}
+
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
+ TRELLIS_OPT_TYPE enable_optimize_b) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ if (plane && !xd->is_chroma_ref) return;
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
+ ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
+ struct encode_b_args arg = {
+ cpi, x, NULL, ta, tl, dry_run, enable_optimize_b
+ };
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ if (enable_optimize_b) {
+ av1_get_entropy_contexts(plane_bsize, pd, ta, tl);
+ }
+ av1_foreach_transformed_block_in_plane(
+ xd, plane_bsize, plane, encode_block_intra_and_set_context, &arg);
+}
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
new file mode 100644
index 0000000000..f97bf8f517
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMB_H_
+#define AOM_AV1_ENCODER_ENCODEMB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+ AV1_XFORM_QUANT_FP = 0,
+ AV1_XFORM_QUANT_B = 1,
+ AV1_XFORM_QUANT_DC = 2,
+ AV1_XFORM_QUANT_SKIP_QUANT,
+ AV1_XFORM_QUANT_TYPES,
+} UENUM1BYTE(AV1_XFORM_QUANT);
+
+// TODO(any): Merge OPT_TYPE and TRELLIS_OPT_TYPE
+// Available optimization types to optimize the quantized coefficients.
+enum {
+ NONE_OPT = 0, // No optimization.
+ TRELLIS_OPT = 1, // Trellis optimization. See `av1_optimize_b()`.
+ DROPOUT_OPT = 2, // Dropout optimization. See `av1_dropout_qcoeff()`.
+ TRELLIS_DROPOUT_OPT = 3 // Perform dropout after trellis optimization.
+} UENUM1BYTE(OPT_TYPE);
+
+enum {
+ NO_TRELLIS_OPT, // No trellis optimization
+ FULL_TRELLIS_OPT, // Trellis optimization in all stages
+ FINAL_PASS_TRELLIS_OPT, // Trellis optimization in only the final encode pass
+ NO_ESTIMATE_YRD_TRELLIS_OPT // Disable trellis in estimate_yrd_for_sb
+} UENUM1BYTE(TRELLIS_OPT_TYPE);
+
+struct optimize_ctx {
+ ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];
+};
+
+struct encode_b_args {
+ const struct AV1_COMP *cpi;
+ MACROBLOCK *x;
+ struct optimize_ctx *ctx;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ RUN_TYPE dry_run;
+ TRELLIS_OPT_TYPE enable_optimize_b;
+};
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg);
+
+void av1_encode_sby_pass1(struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize);
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+ TX_TYPE tx_type, TxfmParam *txfm_param);
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+ int use_quant_b_adapt, QUANT_PARAM *qparam);
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+ const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+ TX_TYPE tx_type, QUANT_PARAM *qparam);
+
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+ TxfmParam *txfm_param, int64_t per_px_mean);
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam);
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TxfmParam *txfm_param);
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam);
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost);
+
+// This function can be used as (i) a further optimization to reduce the
+// redundancy of quantized coefficients (a.k.a., `qcoeff`) after trellis
+// optimization, or (ii) an alternative to trellis optimization in high-speed
+// compression mode (e.g., real-time mode under speed-6) due to its low time
+// complexity. The rationale behind it is to drop out possibly redundant
+// quantized coefficients that sit among a run of zeros. NOTE: This algorithm
+// is not as accurate as trellis optimization since the hyper-parameters are
+// hard-coded instead of dynamically searched. More adaptive logic may improve
+// the performance. This function should be applied to all or part of the
+// block cells.
+// Inputs:
+// mb: Pointer to the MACROBLOCK to perform dropout on.
+// plane: Index of the plane to which the target block belongs.
+// block: Index of the target block.
+// tx_size: Transform size of the target block.
+// tx_type: Transform type of the target block. This field is particularly
+// used to find out the scan order of the block.
+// qindex: Quantization index used for target block. In general, all blocks
+// in a same plane share the same quantization index. This field is
+// particularly used to determine how many zeros should be used to
+// drop out a coefficient.
+// Returns:
+// Nothing will be returned, but `qcoeff`, `dqcoeff`, `eob`, as well as
+// `txb_entropy_ctx`, which `mb` points to, may be modified by this function.
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+ TX_TYPE tx_type, int qindex);
+// Same as above, with the number of zeroes needed before/after a coeff to drop
+// it explicitly passed in, instead of being derived from qindex.
+void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ int dropout_num_before, int dropout_num_after);
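+// A minimal usage sketch (illustrative only; `mb`, `plane`, `block`,
+// `tx_size` and `tx_type` are assumed to come from an encoding context such
+// as encode_block() in encodemb.c, and `qindex` from
+// cm->quant_params.base_qindex):
+//
+//   av1_dropout_qcoeff(mb, plane, block, tx_size, tx_type, qindex);
+//
+// After the call, isolated small coefficients may have been zeroed out and
+// `qcoeff`, `dqcoeff`, `eob` and `txb_entropy_ctx` updated accordingly.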
+
+void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride);
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size);
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane);
+
+static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block,
+ TX_SIZE tx_size, ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l) {
+ const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block];
+ memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a));
+ memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l));
+}
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
+ TRELLIS_OPT_TYPE enable_optimize_b);
+
+static INLINE int is_trellis_used(TRELLIS_OPT_TYPE optimize_b,
+ RUN_TYPE dry_run) {
+ if (optimize_b == NO_TRELLIS_OPT) return false;
+ if (optimize_b == FINAL_PASS_TRELLIS_OPT && dry_run != OUTPUT_ENABLED)
+ return false;
+ return true;
+}
+
+// Scaling terms (precision of 12 bits) to perform tx-size specific
+// normalization that is used in DCT_DCT forward transform.
+// For transform blocks of 1:2 and 2:1 - sqrt(2) normalization is used
+// For transform blocks of 1:4 and 4:1 - factor of 2 is used
+// For transform blocks TX_8x8 and below - an additional factor of 2 is used
+// For transform blocks max(width,height)=64 - currently not supported
+
+static const uint16_t dc_coeff_scale[TX_SIZES_ALL] = {
+ 1024, 2048, 4096, 4096, 0, 1448, 1448, 2896, 2896, 2896,
+ 2896, 0, 0, 2048, 2048, 4096, 4096, 0, 0
+};
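+// For reference: at the stated 12-bit precision, 2896 ~= 4096 / sqrt(2) and
+// 1448 ~= 2048 / sqrt(2), i.e. the sqrt(2) normalization applied on top of
+// the corresponding square-block scale, while the zero entries correspond to
+// the unsupported max(width, height) == 64 transform sizes.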
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEMB_H_
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
new file mode 100644
index 0000000000..7cae72c159
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+static void update_mv_component_stats(int comp, nmv_component *mvcomp,
+ MvSubpelPrecision precision) {
+ assert(comp != 0);
+ int offset;
+ const int sign = comp < 0;
+ const int mag = sign ? -comp : comp;
+ const int mv_class = av1_get_mv_class(mag - 1, &offset);
+ const int d = offset >> 3; // int mv data
+ const int fr = (offset >> 1) & 3; // fractional mv data
+ const int hp = offset & 1; // high precision mv data
+
+ // Sign
+ update_cdf(mvcomp->sign_cdf, sign, 2);
+
+ // Class
+ update_cdf(mvcomp->classes_cdf, mv_class, MV_CLASSES);
+
+ // Integer bits
+ if (mv_class == MV_CLASS_0) {
+ update_cdf(mvcomp->class0_cdf, d, CLASS0_SIZE);
+ } else {
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ for (int i = 0; i < n; ++i)
+ update_cdf(mvcomp->bits_cdf[i], (d >> i) & 1, 2);
+ }
+ // Fractional bits
+ if (precision > MV_SUBPEL_NONE) {
+ aom_cdf_prob *fp_cdf =
+ mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf;
+ update_cdf(fp_cdf, fr, MV_FP_SIZE);
+ }
+
+ // High precision bit
+ if (precision > MV_SUBPEL_LOW_PRECISION) {
+ aom_cdf_prob *hp_cdf =
+ mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf;
+ update_cdf(hp_cdf, hp, 2);
+ }
+}
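+/* Worked example of the decomposition above, assuming the usual AV1
+ * constants CLASS0_SIZE == 2 and CLASS0_BITS == 1: for comp == 19 (i.e.
+ * 19/8 pel), mag - 1 == 18, so mv_class == MV_CLASS_1 (class base 16) and
+ * offset == 2, giving d == 0 (integer bits), fr == 1 (fractional bits) and
+ * hp == 0 (high precision bit). */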
+
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+ MvSubpelPrecision precision) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+
+ update_cdf(mvctx->joints_cdf, j, MV_JOINTS);
+
+ if (mv_joint_vertical(j))
+ update_mv_component_stats(diff.row, &mvctx->comps[0], precision);
+
+ if (mv_joint_horizontal(j))
+ update_mv_component_stats(diff.col, &mvctx->comps[1], precision);
+}
+
+static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
+ MvSubpelPrecision precision) {
+ assert(comp != 0);
+ int offset;
+ const int sign = comp < 0;
+ const int mag = sign ? -comp : comp;
+ const int mv_class = av1_get_mv_class(mag - 1, &offset);
+ const int d = offset >> 3; // int mv data
+ const int fr = (offset >> 1) & 3; // fractional mv data
+ const int hp = offset & 1; // high precision mv data
+
+ // Sign
+ aom_write_symbol(w, sign, mvcomp->sign_cdf, 2);
+
+ // Class
+ aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES);
+
+ // Integer bits
+ if (mv_class == MV_CLASS_0) {
+ aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE);
+ } else {
+ int i;
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ for (i = 0; i < n; ++i)
+ aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2);
+ }
+ // Fractional bits
+ if (precision > MV_SUBPEL_NONE) {
+ aom_write_symbol(
+ w, fr,
+ mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+ MV_FP_SIZE);
+ }
+
+ // High precision bit
+ if (precision > MV_SUBPEL_LOW_PRECISION)
+ aom_write_symbol(
+ w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf,
+ 2);
+}
+
+/* TODO(siekyleb@amazon.com): This function writes MV_VALS ints or 128 KiB. This
+ * is more than most L1D caches and is a significant chunk of L2. Write
+ * SIMD that uses streaming writes to avoid loading all of that into L1, or
+ * just don't update the larger component costs every time this is called
+ * (or both).
+ */
+void av1_build_nmv_component_cost_table(int *mvcost,
+ const nmv_component *const mvcomp,
+ MvSubpelPrecision precision) {
+ int i, j, v, o, mantissa;
+ int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+ int bits_cost[MV_OFFSET_BITS][2];
+ int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE] = { 0 },
+ fp_cost[MV_FP_SIZE] = { 0 };
+ int class0_hp_cost[2] = { 0 }, hp_cost[2] = { 0 };
+
+ av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
+ av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
+ av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL);
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL);
+ }
+
+ if (precision > MV_SUBPEL_NONE) {
+ for (i = 0; i < CLASS0_SIZE; ++i)
+ av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i],
+ NULL);
+ av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+ }
+
+ if (precision > MV_SUBPEL_LOW_PRECISION) {
+ av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
+ av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
+ }
+
+  // Instead of accumulating the cost of each vector component's bits
+  // individually, compute the costs based on smaller vectors: costs for
+  // [2^exp, 2 * 2^exp - 1] are calculated from those for [0, 2^exp - 1].
+  // Offsets are maintained to 1) swap the class cost of a value treated as a
+  // complete vector component for the cost of its highest set bit when it is
+  // treated as a mantissa (significand), and 2) account for the leading
+  // zeros implied by the current exponent.
+
+ // Cost offsets
+ int cost_swap[MV_OFFSET_BITS] = { 0 };
+ // Delta to convert positive vector to negative vector costs
+ int negate_sign = sign_cost[1] - sign_cost[0];
+
+ // Initialize with offsets to swap the class costs with the costs of the
+ // highest set bit.
+ for (i = 1; i < MV_OFFSET_BITS; ++i) {
+ cost_swap[i] = bits_cost[i - 1][1];
+ if (i > CLASS0_BITS) cost_swap[i] -= class_cost[i - CLASS0_BITS];
+ }
+
+  // Seed the fractional costs onto the output (overwritten later).
+ for (o = 0; o < MV_FP_SIZE; ++o) {
+ int hp;
+ for (hp = 0; hp < 2; ++hp) {
+ v = 2 * o + hp + 1;
+ mvcost[v] = fp_cost[o] + hp_cost[hp] + sign_cost[0];
+ }
+ }
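+  // With the usual MV_FP_SIZE == 4 this seeds mvcost[1..8], i.e. the costs
+  // of the eighth-pel magnitudes up to one full pel.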
+
+ mvcost[0] = 0;
+ // Fill the costs for each exponent's vectors, using the costs set in the
+ // previous exponents.
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ const int exponent = (2 * MV_FP_SIZE) << i;
+
+ int class = 0;
+ if (i >= CLASS0_BITS) {
+ class = class_cost[i - CLASS0_BITS + 1];
+ }
+
+ // Iterate through mantissas, keeping track of the location
+ // of the highest set bit for the mantissa.
+ // To be clear: in the outer loop, the position of the highest set bit
+ // (exponent) is tracked and, in this loop, the highest set bit of the
+ // mantissa is tracked.
+ mantissa = 0;
+ for (j = 0; j <= i; ++j) {
+ for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) {
+ int cost = mvcost[mantissa + 1] + class + cost_swap[j];
+ v = exponent + mantissa + 1;
+ mvcost[v] = cost;
+ mvcost[-v] = cost + negate_sign;
+ }
+ cost_swap[j] += bits_cost[i][0];
+ }
+ }
+
+ // Special case to avoid buffer overrun
+ {
+ int exponent = (2 * MV_FP_SIZE) << MV_OFFSET_BITS;
+ int class = class_cost[MV_CLASSES - 1];
+ mantissa = 0;
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) {
+ int cost = mvcost[mantissa + 1] + class + cost_swap[j];
+ v = exponent + mantissa + 1;
+ mvcost[v] = cost;
+ mvcost[-v] = cost + negate_sign;
+ }
+ }
+ // At this point: mantissa = exponent >> 1
+
+ // Manually calculate the final cost offset
+ int cost_swap_hi =
+ bits_cost[MV_OFFSET_BITS - 1][1] - class_cost[MV_CLASSES - 2];
+ for (; mantissa < exponent - 1; ++mantissa) {
+ int cost = mvcost[mantissa + 1] + class + cost_swap_hi;
+ v = exponent + mantissa + 1;
+ mvcost[v] = cost;
+ mvcost[-v] = cost + negate_sign;
+ }
+ }
+
+ // Fill costs for class0 vectors, overwriting previous placeholder values
+ // used for calculating the costs of the larger vectors.
+ for (i = 0; i < CLASS0_SIZE; ++i) {
+ const int top = i * 2 * MV_FP_SIZE;
+ for (o = 0; o < MV_FP_SIZE; ++o) {
+ int hp;
+ int cost = class0_fp_cost[i][o] + class_cost[0] + class0_cost[i];
+ for (hp = 0; hp < 2; ++hp) {
+ v = top + 2 * o + hp + 1;
+ mvcost[v] = cost + class0_hp_cost[hp] + sign_cost[0];
+ mvcost[-v] = cost + class0_hp_cost[hp] + sign_cost[1];
+ }
+ }
+ }
+}
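+/* Note: the table is written with negative indices (mvcost[-v]) for negative
+ * components, so callers are expected to pass a pointer into the middle of
+ * the allocation. An illustrative sketch only:
+ *
+ *   int buf[MV_VALS];            // MV_VALS == 2 * MV_MAX + 1
+ *   int *mvcost = buf + MV_MAX;  // valid indices: -MV_MAX .. MV_MAX
+ *   av1_build_nmv_component_cost_table(mvcost, &ctx->comps[0], precision);
+ */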
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv,
+ const MV *ref, nmv_context *mvctx, int usehp) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+ // If the mv_diff is zero, then we should have used near or nearest instead.
+ assert(j != MV_JOINT_ZERO);
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ usehp = MV_SUBPEL_NONE;
+ }
+ aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS);
+ if (mv_joint_vertical(j))
+ encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
+
+ if (mv_joint_horizontal(j))
+ encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+ // If auto_mv_step_size is enabled then keep track of the largest
+ // motion vector component used.
+ if (cpi->sf.mv_sf.auto_mv_step_size) {
+ int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3;
+ td->max_mv_magnitude = AOMMAX(maxv, td->max_mv_magnitude);
+ }
+}
+
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx) {
+ // DV and ref DV should not have sub-pel.
+ assert((mv->col & 7) == 0);
+ assert((mv->row & 7) == 0);
+ assert((ref->col & 7) == 0);
+ assert((ref->row & 7) == 0);
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+
+ aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS);
+ if (mv_joint_vertical(j))
+ encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE);
+
+ if (mv_joint_horizontal(j))
+ encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE);
+}
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *ctx,
+ MvSubpelPrecision precision) {
+ av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL);
+ av1_build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision);
+ av1_build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision);
+}
+
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ int ref_mv_idx,
+ const MB_MODE_INFO_EXT *mbmi_ext) {
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const CANDIDATE_MV *curr_ref_mv_stack =
+ mbmi_ext->ref_mv_stack[ref_frame_type];
+
+ if (ref_frame[1] > INTRA_FRAME) {
+ assert(ref_idx == 0 || ref_idx == 1);
+ return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv
+ : curr_ref_mv_stack[ref_mv_idx].this_mv;
+ }
+
+ assert(ref_idx == 0);
+ return ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type]
+ ? curr_ref_mv_stack[ref_mv_idx].this_mv
+ : mbmi_ext->global_mvs[ref_frame_type];
+}
+
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) {
+ assert(has_second_ref(mbmi));
+ ref_mv_idx += 1;
+ }
+ return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx,
+ &x->mbmi_ext);
+}
+
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv *nearest_mv, int_mv *near_mv,
+ int is_integer) {
+ const int ref_idx = 0;
+ MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+ *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext);
+ lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer);
+ *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext);
+ lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer);
+}
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
new file mode 100644
index 0000000000..c39001a5a2
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMV_H_
+#define AOM_AV1_ENCODER_ENCODEMV_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv,
+ const MV *ref, nmv_context *mvctx, int usehp);
+
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+ MvSubpelPrecision precision);
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *mvctx,
+ MvSubpelPrecision precision);
+void av1_build_nmv_component_cost_table(int *mvcost,
+ const nmv_component *const mvcomp,
+ MvSubpelPrecision precision);
+
+void av1_update_mv_count(ThreadData *td);
+
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx);
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx);
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ int ref_mv_idx,
+ const MB_MODE_INFO_EXT *mbmi_ext);
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv *nearest_mv, int_mv *near_mv,
+ int is_integer);
+
+static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
+ // row: Z col: Z | MV_JOINT_ZERO (0)
+ // row: Z col: NZ | MV_JOINT_HNZVZ (1)
+ // row: NZ col: Z | MV_JOINT_HZVNZ (2)
+ // row: NZ col: NZ | MV_JOINT_HNZVNZ (3)
+ return (!!mv->col) | ((!!mv->row) << 1);
+}
+
+static INLINE int av1_mv_class_base(MV_CLASS_TYPE c) {
+ return c ? CLASS0_SIZE << (c + 2) : 0;
+}
+
+// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0.
+static INLINE uint8_t av1_log_in_base_2(unsigned int n) {
+ // get_msb() is only valid when n != 0.
+ return n == 0 ? 0 : get_msb(n);
+}
+
+static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
+ assert(z >= 0);
+ const MV_CLASS_TYPE c = (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3);
+ assert(c <= MV_CLASS_10);
+ if (offset) *offset = z - av1_mv_class_base(c);
+ return c;
+}
+
+static INLINE int av1_check_newmv_joint_nonzero(const AV1_COMMON *cm,
+ MACROBLOCK *const x) {
+ (void)cm;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ if (this_mode == NEW_NEWMV) {
+ const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+ const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
+ if (mbmi->mv[0].as_int == ref_mv_0.as_int ||
+ mbmi->mv[1].as_int == ref_mv_1.as_int) {
+ return 0;
+ }
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
+ if (mbmi->mv[1].as_int == ref_mv_1.as_int) {
+ return 0;
+ }
+ } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
+ const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+ if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
+ return 0;
+ }
+ } else if (this_mode == NEWMV) {
+ const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+ if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
+ return 0;
+ }
+ }
+ return 1;
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEMV_H_
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
new file mode 100644
index 0000000000..4732ad435b
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -0,0 +1,5409 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <time.h>
+#include <stdlib.h>
+
+#include "av1/common/scale.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aomcx.h"
+
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/filter.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/dwt.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/hash_motion.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/mv_prec.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#if CONFIG_SALIENCY_MAP
+#include "av1/encoder/saliency_map.h"
+#endif
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
+
+#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
+
+// #define OUTPUT_YUV_REC
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#define FILE_NAME_LEN 100
+#endif
+
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file = NULL;
+#endif
+
+static INLINE void Scale2Ratio(AOM_SCALING_MODE mode, int *hr, int *hs) {
+ switch (mode) {
+ case AOME_NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case AOME_FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case AOME_THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ case AOME_THREEFOUR:
+ *hr = 3;
+ *hs = 4;
+ break;
+ case AOME_ONEFOUR:
+ *hr = 1;
+ *hs = 4;
+ break;
+ case AOME_ONEEIGHT:
+ *hr = 1;
+ *hs = 8;
+ break;
+ case AOME_ONETWO:
+ *hr = 1;
+ *hs = 2;
+ break;
+ case AOME_TWOTHREE:
+ *hr = 2;
+ *hs = 3;
+ break;
+ case AOME_ONETHREE:
+ *hr = 1;
+ *hs = 3;
+ break;
+ default:
+ *hr = 1;
+ *hs = 1;
+ assert(0);
+ break;
+ }
+}
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ if (rows == mi_params->mb_rows && cols == mi_params->mb_cols) {
+ unsigned char *const active_map_4x4 = cpi->active_map.map;
+ const int mi_rows = mi_params->mi_rows;
+ const int mi_cols = mi_params->mi_cols;
+ const int row_scale = mi_size_high_log2[BLOCK_16X16];
+ const int col_scale = mi_size_wide_log2[BLOCK_16X16];
+ cpi->active_map.update = 0;
+ assert(mi_rows % 2 == 0);
+ assert(mi_cols % 2 == 0);
+ if (new_map_16x16) {
+ for (int r = 0; r < (mi_rows >> row_scale); ++r) {
+ for (int c = 0; c < (mi_cols >> col_scale); ++c) {
+ const uint8_t val = new_map_16x16[r * cols + c]
+ ? AM_SEGMENT_ID_ACTIVE
+ : AM_SEGMENT_ID_INACTIVE;
+ active_map_4x4[(2 * r + 0) * mi_cols + (c + 0)] = val;
+ active_map_4x4[(2 * r + 0) * mi_cols + (c + 1)] = val;
+ active_map_4x4[(2 * r + 1) * mi_cols + (c + 0)] = val;
+ active_map_4x4[(2 * r + 1) * mi_cols + (c + 1)] = val;
+ }
+ }
+ cpi->active_map.enabled = 1;
+ }
+ return 0;
+ }
+
+ return -1;
+}
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ if (rows == mi_params->mb_rows && cols == mi_params->mb_cols &&
+ new_map_16x16) {
+ unsigned char *const seg_map_8x8 = cpi->enc_seg.map;
+ const int mi_rows = mi_params->mi_rows;
+ const int mi_cols = mi_params->mi_cols;
+ const int row_scale = mi_size_high_log2[BLOCK_16X16];
+ const int col_scale = mi_size_wide_log2[BLOCK_16X16];
+ assert(mi_rows % 2 == 0);
+ assert(mi_cols % 2 == 0);
+
+ memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+ if (cpi->active_map.enabled) {
+ for (int r = 0; r < (mi_rows >> row_scale); ++r) {
+ for (int c = 0; c < (mi_cols >> col_scale); ++c) {
+ // Cyclic refresh segments are considered active despite not having
+ // AM_SEGMENT_ID_ACTIVE
+ uint8_t temp = 0;
+ temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 0)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 1)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 0)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 1)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ new_map_16x16[r * cols + c] |= temp;
+ }
+ }
+ }
+ return 0;
+ }
+
+ return -1;
+}
+
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage) {
+ bool is_allintra = usage == ALLINTRA;
+
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_me_luts();
+ if (!is_allintra) av1_init_wedge_masks();
+ if (!is_allintra || end_usage != AOM_Q) av1_rc_init_minq_luts();
+}
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate) {
+ cpi->framerate = framerate < 0.1 ? 30 : framerate;
+ av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
+}
+
+double av1_get_compression_ratio(const AV1_COMMON *const cm,
+ size_t encoded_frame_size) {
+ const int upscaled_width = cm->superres_upscaled_width;
+ const int height = cm->height;
+ const int64_t luma_pic_size = (int64_t)upscaled_width * height;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ const int pic_size_profile_factor =
+ profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
+ encoded_frame_size =
+ (encoded_frame_size > 129 ? encoded_frame_size - 128 : 1);
+ const int64_t uncompressed_frame_size =
+ (luma_pic_size * pic_size_profile_factor) >> 3;
+ return (double)uncompressed_frame_size / encoded_frame_size;
+}
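+// Worked example: for a 1920x1080 PROFILE_0 stream, luma_pic_size is
+// 2,073,600, so uncompressed_frame_size = (2,073,600 * 15) >> 3 = 3,888,000
+// bytes; a 100,000-byte coded frame (counted as 99,872 after the fixed
+// 128-byte deduction above) gives a ratio of roughly 38.9.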
+
+static void auto_tile_size_balancing(AV1_COMMON *const cm, int num_sbs,
+ int num_tiles_lg, int tile_col_row) {
+ CommonTileParams *const tiles = &cm->tiles;
+ int i, start_sb;
+ int size_sb = num_sbs >> num_tiles_lg;
+ int res_sbs = num_sbs - (size_sb << num_tiles_lg);
+ int num_tiles = 1 << num_tiles_lg;
+ int inc_index = num_tiles - res_sbs;
+
+ tiles->uniform_spacing = 0;
+
+ for (i = 0, start_sb = 0; start_sb < num_sbs && i < MAX_TILE_COLS; ++i) {
+ if (i == inc_index) ++size_sb;
+ if (tile_col_row)
+ tiles->col_start_sb[i] = start_sb;
+ else
+ tiles->row_start_sb[i] = start_sb;
+
+ start_sb += AOMMIN(size_sb, tiles->max_width_sb);
+ }
+
+ if (tile_col_row) {
+ tiles->cols = i;
+ tiles->col_start_sb[i] = num_sbs;
+ } else {
+ tiles->rows = i;
+ tiles->row_start_sb[i] = num_sbs;
+ }
+}
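+// Worked example: with num_sbs == 19 and num_tiles_lg == 2 (4 tiles),
+// size_sb == 4, res_sbs == 3 and inc_index == 1, so the tile starts become
+// 0, 4, 9, 14 (widths 4, 5, 5, 5): the leftover SBs are spread over the last
+// res_sbs tiles (assuming tiles->max_width_sb does not bind).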
+
+static void set_tile_info(AV1_COMMON *const cm,
+ const TileConfig *const tile_cfg) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ CommonTileParams *const tiles = &cm->tiles;
+ int i, start_sb;
+
+ av1_get_tile_limits(cm);
+
+ int sb_cols =
+ CEIL_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2);
+ // configure tile columns
+ if (tile_cfg->tile_width_count == 0 || tile_cfg->tile_height_count == 0) {
+ tiles->uniform_spacing = 1;
+ tiles->log2_cols = AOMMAX(tile_cfg->tile_columns, tiles->min_log2_cols);
+ // Add a special case to handle super resolution
+ sb_cols = coded_to_superres_mi(sb_cols, cm->superres_scale_denominator);
+ int min_log2_cols = 0;
+ for (; (tiles->max_width_sb << min_log2_cols) <= sb_cols; ++min_log2_cols) {
+ }
+ tiles->log2_cols = AOMMAX(tiles->log2_cols, min_log2_cols);
+
+ tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols);
+ } else if (tile_cfg->tile_widths[0] < 0) {
+ auto_tile_size_balancing(cm, sb_cols, tile_cfg->tile_columns, 1);
+ } else {
+ int size_sb, j = 0;
+ tiles->uniform_spacing = 0;
+ for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
+ tiles->col_start_sb[i] = start_sb;
+ size_sb = tile_cfg->tile_widths[j++];
+ if (j >= tile_cfg->tile_width_count) j = 0;
+ start_sb += AOMMIN(size_sb, tiles->max_width_sb);
+ }
+ tiles->cols = i;
+ tiles->col_start_sb[i] = sb_cols;
+ }
+ av1_calculate_tile_cols(seq_params, mi_params->mi_rows, mi_params->mi_cols,
+ tiles);
+
+ // configure tile rows
+ int sb_rows =
+ CEIL_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2);
+ if (tiles->uniform_spacing) {
+ tiles->log2_rows = AOMMAX(tile_cfg->tile_rows, tiles->min_log2_rows);
+ tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows);
+ } else if (tile_cfg->tile_heights[0] < 0) {
+ auto_tile_size_balancing(cm, sb_rows, tile_cfg->tile_rows, 0);
+ } else {
+ int size_sb, j = 0;
+ for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
+ tiles->row_start_sb[i] = start_sb;
+ size_sb = tile_cfg->tile_heights[j++];
+ if (j >= tile_cfg->tile_height_count) j = 0;
+ start_sb += AOMMIN(size_sb, tiles->max_height_sb);
+ }
+ tiles->rows = i;
+ tiles->row_start_sb[i] = sb_rows;
+ }
+ av1_calculate_tile_rows(seq_params, mi_params->mi_rows, tiles);
+}
+
+void av1_update_frame_size(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ // Setup mi_params here in case we need more mi's.
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ mi_params->set_mb_mi(mi_params, cm->width, cm->height,
+ cpi->sf.part_sf.default_min_partition_size);
+
+ av1_init_macroblockd(cm, xd);
+
+ if (!cpi->ppi->seq_params_locked)
+ set_sb_size(cm->seq_params,
+ av1_select_sb_size(&cpi->oxcf, cm->width, cm->height,
+ cpi->ppi->number_spatial_layers));
+
+ set_tile_info(cm, &cpi->oxcf.tile_cfg);
+}
+
+static INLINE int does_level_match(int width, int height, double fps,
+ int lvl_width, int lvl_height,
+ double lvl_fps, int lvl_dim_mult) {
+ const int64_t lvl_luma_pels = (int64_t)lvl_width * lvl_height;
+ const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps;
+ const int64_t luma_pels = (int64_t)width * height;
+ const double display_sample_rate = luma_pels * fps;
+ return luma_pels <= lvl_luma_pels &&
+ display_sample_rate <= lvl_display_sample_rate &&
+ width <= lvl_width * lvl_dim_mult &&
+ height <= lvl_height * lvl_dim_mult;
+}
+
+static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width,
+ int height, double init_framerate) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ const AV1LevelParams *const level_params = &ppi->level_params;
+ // TODO(any): This is a placeholder function that only addresses dimensions
+ // and max display sample rates.
+ // Need to add checks for max bit rate, max decoded luma sample rate, header
+ // rate, etc. that are not covered by this function.
+ AV1_LEVEL level = SEQ_LEVEL_MAX;
+ if (does_level_match(width, height, init_framerate, 512, 288, 30.0, 4)) {
+ level = SEQ_LEVEL_2_0;
+ } else if (does_level_match(width, height, init_framerate, 704, 396, 30.0,
+ 4)) {
+ level = SEQ_LEVEL_2_1;
+ } else if (does_level_match(width, height, init_framerate, 1088, 612, 30.0,
+ 4)) {
+ level = SEQ_LEVEL_3_0;
+ } else if (does_level_match(width, height, init_framerate, 1376, 774, 30.0,
+ 4)) {
+ level = SEQ_LEVEL_3_1;
+ } else if (does_level_match(width, height, init_framerate, 2048, 1152, 30.0,
+ 3)) {
+ level = SEQ_LEVEL_4_0;
+ } else if (does_level_match(width, height, init_framerate, 2048, 1152, 60.0,
+ 3)) {
+ level = SEQ_LEVEL_4_1;
+ } else if (does_level_match(width, height, init_framerate, 4096, 2176, 30.0,
+ 2)) {
+ level = SEQ_LEVEL_5_0;
+ } else if (does_level_match(width, height, init_framerate, 4096, 2176, 60.0,
+ 2)) {
+ level = SEQ_LEVEL_5_1;
+ } else if (does_level_match(width, height, init_framerate, 4096, 2176, 120.0,
+ 2)) {
+ level = SEQ_LEVEL_5_2;
+ } else if (does_level_match(width, height, init_framerate, 8192, 4352, 30.0,
+ 2)) {
+ level = SEQ_LEVEL_6_0;
+ } else if (does_level_match(width, height, init_framerate, 8192, 4352, 60.0,
+ 2)) {
+ level = SEQ_LEVEL_6_1;
+ } else if (does_level_match(width, height, init_framerate, 8192, 4352, 120.0,
+ 2)) {
+ level = SEQ_LEVEL_6_2;
+ }
+#if CONFIG_CWG_C013
+  // TODO(bohanli): the target level currently only works for the 0th
+  // operating point, so scalable coding is not supported.
+ else if (level_params->target_seq_level_idx[0] >= SEQ_LEVEL_7_0 &&
+ level_params->target_seq_level_idx[0] <= SEQ_LEVEL_8_3) {
+ // Only use level 7.x to 8.x when explicitly asked to.
+ if (does_level_match(width, height, init_framerate, 16384, 8704, 30.0, 2)) {
+ level = SEQ_LEVEL_7_0;
+ } else if (does_level_match(width, height, init_framerate, 16384, 8704,
+ 60.0, 2)) {
+ level = SEQ_LEVEL_7_1;
+ } else if (does_level_match(width, height, init_framerate, 16384, 8704,
+ 120.0, 2)) {
+ level = SEQ_LEVEL_7_2;
+ } else if (does_level_match(width, height, init_framerate, 32768, 17408,
+ 30.0, 2)) {
+ level = SEQ_LEVEL_8_0;
+ } else if (does_level_match(width, height, init_framerate, 32768, 17408,
+ 60.0, 2)) {
+ level = SEQ_LEVEL_8_1;
+ } else if (does_level_match(width, height, init_framerate, 32768, 17408,
+ 120.0, 2)) {
+ level = SEQ_LEVEL_8_2;
+ }
+ }
+#endif
+
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ assert(is_valid_seq_level_idx(level_params->target_seq_level_idx[i]) ||
+ level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS);
+ // If a higher target level is specified, it is then used rather than the
+ // inferred one from resolution and framerate.
+ seq_params->seq_level_idx[i] =
+ level_params->target_seq_level_idx[i] < SEQ_LEVELS &&
+ level_params->target_seq_level_idx[i] > level
+ ? level_params->target_seq_level_idx[i]
+ : level;
+ // Set the maximum parameters for bitrate and buffer size for this profile,
+ // level, and tier
+ seq_params->op_params[i].bitrate = av1_max_level_bitrate(
+ seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]);
+ // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
+ // check
+ if (seq_params->op_params[i].bitrate == 0)
+ aom_internal_error(
+ &ppi->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "AV1 does not support this combination of profile, level, and tier.");
+    // Buffer size in bits is bitrate in bits/s * 1 s
+ seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate;
+ }
+}
+
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
+ const AV1EncoderConfig *oxcf,
+ int disable_frame_id_numbers) {
+ SequenceHeader *const seq = &ppi->seq_params;
+ const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+ const ToolCfg *const tool_cfg = &oxcf->tool_cfg;
+
+ seq->still_picture =
+ !tool_cfg->force_video_mode && (oxcf->input_cfg.limit == 1);
+ seq->reduced_still_picture_hdr =
+ seq->still_picture && !tool_cfg->full_still_picture_hdr;
+ seq->force_screen_content_tools = 2;
+ seq->force_integer_mv = 2;
+ seq->order_hint_info.enable_order_hint = tool_cfg->enable_order_hint;
+ seq->frame_id_numbers_present_flag =
+ !seq->reduced_still_picture_hdr &&
+ !oxcf->tile_cfg.enable_large_scale_tile &&
+ tool_cfg->error_resilient_mode && !disable_frame_id_numbers;
+ if (seq->reduced_still_picture_hdr) {
+ seq->order_hint_info.enable_order_hint = 0;
+ seq->force_screen_content_tools = 2;
+ seq->force_integer_mv = 2;
+ }
+ seq->order_hint_info.order_hint_bits_minus_1 =
+ seq->order_hint_info.enable_order_hint
+ ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1
+ : -1;
+
+ seq->max_frame_width = frm_dim_cfg->forced_max_frame_width
+ ? frm_dim_cfg->forced_max_frame_width
+ : frm_dim_cfg->width;
+ seq->max_frame_height = frm_dim_cfg->forced_max_frame_height
+ ? frm_dim_cfg->forced_max_frame_height
+ : frm_dim_cfg->height;
+ seq->num_bits_width =
+ (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1;
+ seq->num_bits_height =
+ (seq->max_frame_height > 1) ? get_msb(seq->max_frame_height - 1) + 1 : 1;
+ assert(seq->num_bits_width <= 16);
+ assert(seq->num_bits_height <= 16);
+
+ seq->frame_id_length = FRAME_ID_LENGTH;
+ seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+
+ seq->enable_dual_filter = tool_cfg->enable_dual_filter;
+ seq->order_hint_info.enable_dist_wtd_comp =
+ oxcf->comp_type_cfg.enable_dist_wtd_comp;
+ seq->order_hint_info.enable_dist_wtd_comp &=
+ seq->order_hint_info.enable_order_hint;
+ seq->order_hint_info.enable_ref_frame_mvs = tool_cfg->ref_frame_mvs_present;
+ seq->order_hint_info.enable_ref_frame_mvs &=
+ seq->order_hint_info.enable_order_hint;
+ seq->enable_superres = oxcf->superres_cfg.enable_superres;
+ seq->enable_cdef = tool_cfg->cdef_control != CDEF_NONE ? 1 : 0;
+ seq->enable_restoration = tool_cfg->enable_restoration;
+ seq->enable_warped_motion = oxcf->motion_mode_cfg.enable_warped_motion;
+ seq->enable_interintra_compound = tool_cfg->enable_interintra_comp;
+ seq->enable_masked_compound = oxcf->comp_type_cfg.enable_masked_comp;
+ seq->enable_intra_edge_filter = oxcf->intra_mode_cfg.enable_intra_edge_filter;
+ seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra;
+
+ set_bitstream_level_tier(ppi, frm_dim_cfg->width, frm_dim_cfg->height,
+ oxcf->input_cfg.init_framerate);
+
+ if (seq->operating_points_cnt_minus_1 == 0) {
+ seq->operating_point_idc[0] = 0;
+ } else {
+    // Set operating_point_idc[] such that the i=0 point corresponds to the
+    // highest quality operating point (all layers), and subsequent
+    // operating points (i > 0) are lower quality, corresponding to skipping
+    // the decoding of enhancement layers (temporal first).
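+    // For example, with 2 spatial and 3 temporal layers, the i=0 point gets
+    // spatial mask 0b11 (bits 8..15) and temporal mask 0b111 (bits 0..7),
+    // i.e. operating_point_idc[0] = 0x307; later points drop layers from
+    // these masks.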
+ int i = 0;
+ assert(seq->operating_points_cnt_minus_1 ==
+ (int)(ppi->number_spatial_layers * ppi->number_temporal_layers - 1));
+ for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) {
+ for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) {
+ seq->operating_point_idc[i] =
+ (~(~0u << (ppi->number_spatial_layers - sl)) << 8) |
+ ~(~0u << (ppi->number_temporal_layers - tl));
+ i++;
+ }
+ }
+ }
+}
+
+static void init_config_sequence(struct AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+ const ColorCfg *const color_cfg = &oxcf->color_cfg;
+
+ ppi->use_svc = 0;
+ ppi->number_spatial_layers = 1;
+ ppi->number_temporal_layers = 1;
+
+ seq_params->profile = oxcf->profile;
+ seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
+ seq_params->use_highbitdepth = oxcf->use_highbitdepth;
+ seq_params->color_primaries = color_cfg->color_primaries;
+ seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
+ seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
+ seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
+ seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
+ seq_params->color_range = color_cfg->color_range;
+ seq_params->timing_info_present = dec_model_cfg->timing_info_present;
+ seq_params->timing_info.num_units_in_display_tick =
+ dec_model_cfg->timing_info.num_units_in_display_tick;
+ seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
+ seq_params->timing_info.equal_picture_interval =
+ dec_model_cfg->timing_info.equal_picture_interval;
+ seq_params->timing_info.num_ticks_per_picture =
+ dec_model_cfg->timing_info.num_ticks_per_picture;
+
+ seq_params->display_model_info_present_flag =
+ dec_model_cfg->display_model_info_present_flag;
+ seq_params->decoder_model_info_present_flag =
+ dec_model_cfg->decoder_model_info_present_flag;
+ if (dec_model_cfg->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ seq_params->decoder_model_info.num_units_in_decoding_tick =
+ dec_model_cfg->num_units_in_decoding_tick;
+ ppi->buffer_removal_time_present = 1;
+ av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
+ av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
+ } else if (seq_params->timing_info_present &&
+ seq_params->timing_info.equal_picture_interval &&
+ !seq_params->decoder_model_info_present_flag) {
+ // set the decoder model parameters in resource availability mode
+ av1_set_resource_availability_parameters(&seq_params->op_params[0]);
+ } else {
+ seq_params->op_params[0].initial_display_delay =
+ 10; // Default value (not signaled)
+ }
+
+ if (seq_params->monochrome) {
+ seq_params->subsampling_x = 1;
+ seq_params->subsampling_y = 1;
+ } else if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ seq_params->subsampling_x = 0;
+ seq_params->subsampling_y = 0;
+ } else {
+ if (seq_params->profile == 0) {
+ seq_params->subsampling_x = 1;
+ seq_params->subsampling_y = 1;
+ } else if (seq_params->profile == 1) {
+ seq_params->subsampling_x = 0;
+ seq_params->subsampling_y = 0;
+ } else {
+ if (seq_params->bit_depth == AOM_BITS_12) {
+ seq_params->subsampling_x = oxcf->input_cfg.chroma_subsampling_x;
+ seq_params->subsampling_y = oxcf->input_cfg.chroma_subsampling_y;
+ } else {
+ seq_params->subsampling_x = 1;
+ seq_params->subsampling_y = 0;
+ }
+ }
+ }
+ av1_change_config_seq(ppi, oxcf, NULL);
+}
+
+static void init_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+
+ cpi->oxcf = *oxcf;
+ cpi->framerate = oxcf->input_cfg.init_framerate;
+
+ cm->width = oxcf->frm_dim_cfg.width;
+ cm->height = oxcf->frm_dim_cfg.height;
+ cpi->is_dropped_frame = false;
+
+ alloc_compressor_data(cpi);
+
+ cpi->data_alloc_width = cm->width;
+ cpi->data_alloc_height = cm->height;
+ cpi->frame_size_related_setup_done = false;
+
+ // Single thread case: use counts in common.
+ cpi->td.counts = &cpi->counts;
+
+ // Init SVC parameters.
+ cpi->svc.number_spatial_layers = 1;
+ cpi->svc.number_temporal_layers = 1;
+ cm->spatial_layer_id = 0;
+ cm->temporal_layer_id = 0;
+ // Init rtc_ref parameters.
+ cpi->ppi->rtc_ref.set_ref_frame_config = 0;
+ cpi->ppi->rtc_ref.non_reference_frame = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[0] = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[1] = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[2] = 0;
+
+  // av1_change_config() performs all of the setup that is shared with
+  // later reconfiguration.
+ av1_change_config(cpi, oxcf, false);
+
+ cpi->ref_frame_flags = 0;
+
+ // Reset resize pending flags
+ resize_pending_params->width = 0;
+ resize_pending_params->height = 0;
+
+  // Set up the identity scale factor.
+ av1_setup_scale_factors_for_frame(&cm->sf_identity, 1, 1, 1, 1);
+
+ init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx);
+
+ av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+}
+
+void av1_change_config_seq(struct AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf,
+ bool *is_sb_size_changed) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+ const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+ const ColorCfg *const color_cfg = &oxcf->color_cfg;
+
+ if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
+ seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
+ seq_params->color_primaries = color_cfg->color_primaries;
+ seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
+ seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
+ seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
+ seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
+ seq_params->color_range = color_cfg->color_range;
+
+ assert(IMPLIES(seq_params->profile <= PROFILE_1,
+ seq_params->bit_depth <= AOM_BITS_10));
+
+ seq_params->timing_info_present = dec_model_cfg->timing_info_present;
+ seq_params->timing_info.num_units_in_display_tick =
+ dec_model_cfg->timing_info.num_units_in_display_tick;
+ seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
+ seq_params->timing_info.equal_picture_interval =
+ dec_model_cfg->timing_info.equal_picture_interval;
+ seq_params->timing_info.num_ticks_per_picture =
+ dec_model_cfg->timing_info.num_ticks_per_picture;
+
+ seq_params->display_model_info_present_flag =
+ dec_model_cfg->display_model_info_present_flag;
+ seq_params->decoder_model_info_present_flag =
+ dec_model_cfg->decoder_model_info_present_flag;
+ if (dec_model_cfg->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ seq_params->decoder_model_info.num_units_in_decoding_tick =
+ dec_model_cfg->num_units_in_decoding_tick;
+ ppi->buffer_removal_time_present = 1;
+ av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
+ av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
+ } else if (seq_params->timing_info_present &&
+ seq_params->timing_info.equal_picture_interval &&
+ !seq_params->decoder_model_info_present_flag) {
+ // set the decoder model parameters in resource availability mode
+ av1_set_resource_availability_parameters(&seq_params->op_params[0]);
+ } else {
+ seq_params->op_params[0].initial_display_delay =
+ 10; // Default value (not signaled)
+ }
+
+ av1_update_film_grain_parameters_seq(ppi, oxcf);
+
+ int sb_size = seq_params->sb_size;
+ // Superblock size should not be updated after the first key frame.
+ if (!ppi->seq_params_locked) {
+ set_sb_size(seq_params, av1_select_sb_size(oxcf, frm_dim_cfg->width,
+ frm_dim_cfg->height,
+ ppi->number_spatial_layers));
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
+ seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
+ }
+ if (is_sb_size_changed != NULL && sb_size != seq_params->sb_size)
+ *is_sb_size_changed = true;
+
+ // Init sequence level coding tools
+ // This should not be called after the first key frame.
+ if (!ppi->seq_params_locked) {
+ seq_params->operating_points_cnt_minus_1 =
+ (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1)
+ ? ppi->number_spatial_layers * ppi->number_temporal_layers - 1
+ : 0;
+ av1_init_seq_coding_tools(
+ ppi, oxcf, ppi->use_svc || ppi->rtc_ref.set_ref_frame_config);
+ }
+ seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ highbd_set_var_fns(ppi);
+#endif
+
+ set_primary_rc_buffer_sizes(oxcf, ppi);
+}
+
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+ bool is_sb_size_changed) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1LevelParams *const level_params = &cpi->ppi->level_params;
+ RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ FeatureFlags *const features = &cm->features;
+
+  // In the case of LAP, lag_in_frames is set according to the number of lap
+  // buffers calculated at init time. Store LAP's lag_in_frames here and
+  // restore it below to prevent it from being overridden by the new config.
+ int lap_lag_in_frames = -1;
+ if (cpi->ppi->lap_enabled && cpi->compressor_stage == LAP_STAGE) {
+ lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames;
+ }
+
+ cpi->oxcf = *oxcf;
+
+ av1_update_film_grain_parameters(cpi, oxcf);
+
+ // When user provides superres_mode = AOM_SUPERRES_AUTO, we still initialize
+ // superres mode for current encoding = AOM_SUPERRES_NONE. This is to ensure
+ // that any analysis (e.g. TPL) happening outside the main encoding loop still
+ // happens at full resolution.
+ // This value will later be set appropriately just before main encoding loop.
+ cpi->superres_mode = oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO
+ ? AOM_SUPERRES_NONE
+ : oxcf->superres_cfg.superres_mode; // default
+ x->e_mbd.bd = (int)seq_params->bit_depth;
+ x->e_mbd.global_motion = cm->global_motion;
+
+ memcpy(level_params->target_seq_level_idx, cpi->oxcf.target_seq_level_idx,
+ sizeof(level_params->target_seq_level_idx));
+ level_params->keep_level_stats = 0;
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ if (level_params->target_seq_level_idx[i] < SEQ_LEVELS ||
+ level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS) {
+ level_params->keep_level_stats |= 1u << i;
+ if (!level_params->level_info[i]) {
+ CHECK_MEM_ERROR(cm, level_params->level_info[i],
+ aom_calloc(1, sizeof(*level_params->level_info[i])));
+ }
+ }
+ }
+
+ // TODO(huisu@): level targeting currently only works for the 0th operating
+ // point, so scalable coding is not supported yet.
+ if (level_params->target_seq_level_idx[0] < SEQ_LEVELS) {
+ // Adjust encoder config in order to meet target level.
+ config_target_level(cpi, level_params->target_seq_level_idx[0],
+ seq_params->tier[0]);
+ }
+
+ if (has_no_stats_stage(cpi) && (rc_cfg->mode == AOM_Q)) {
+ p_rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ } else if (!is_one_pass_rt_params(cpi) ||
+ cm->current_frame.frame_number == 0) {
+    // For rtc mode: the logic for setting baseline_gf_interval is done in
+    // av1_get_one_pass_rt_params(), and it should not be reset here in
+    // change_config(), except right after init_config (first frame).
+ p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+ }
+
+ refresh_frame->golden_frame = false;
+ refresh_frame->bwd_ref_frame = false;
+
+ features->refresh_frame_context =
+ (oxcf->tool_cfg.frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->tile_cfg.enable_large_scale_tile)
+ features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ if (x->palette_buffer == NULL) {
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+
+ if (x->tmp_conv_dst == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
+ x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
+ }
+ // The buffers 'tmp_pred_bufs[]' and 'comp_rd_buffer' are used in inter frames
+ // to store intermediate inter mode prediction results and are not required
+ // for allintra encoding mode. Hence, the memory allocations for these buffers
+ // are avoided for allintra encoding mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
+ if (x->comp_rd_buffer.pred0 == NULL)
+ alloc_compound_type_rd_buffers(cm->error, &x->comp_rd_buffer);
+
+ for (int i = 0; i < 2; ++i) {
+ if (x->tmp_pred_bufs[i] == NULL) {
+ CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i],
+ aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*x->tmp_pred_bufs[i])));
+ x->e_mbd.tmp_obmc_bufs[i] = x->tmp_pred_bufs[i];
+ }
+ }
+ }
+
+ av1_reset_segment_features(cm);
+
+ av1_set_high_precision_mv(cpi, 1, 0);
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ p_rc->bits_off_target =
+ AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+ p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size);
+
+  // Set up the frame rate and related rate control parameters.
+ av1_new_framerate(cpi, cpi->framerate);
+
+ // Set absolute upper and lower quality limits
+ rc->worst_quality = rc_cfg->worst_allowed_q;
+ rc->best_quality = rc_cfg->best_allowed_q;
+
+ // If lossless has been requested make sure average Q accumulators are reset.
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ int i;
+ for (i = 0; i < FRAME_TYPES; ++i) {
+ p_rc->avg_frame_qindex[i] = 0;
+ }
+ }
+
+ features->interp_filter =
+ oxcf->tile_cfg.enable_large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
+ features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+ features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc);
+
+ if (frm_dim_cfg->render_width > 0 && frm_dim_cfg->render_height > 0) {
+ cm->render_width = frm_dim_cfg->render_width;
+ cm->render_height = frm_dim_cfg->render_height;
+ } else {
+ cm->render_width = frm_dim_cfg->width;
+ cm->render_height = frm_dim_cfg->height;
+ }
+ cm->width = frm_dim_cfg->width;
+ cm->height = frm_dim_cfg->height;
+
+ if (cm->width > cpi->data_alloc_width ||
+ cm->height > cpi->data_alloc_height || is_sb_size_changed) {
+ av1_free_context_buffers(cm);
+ av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+ av1_free_sms_tree(&cpi->td);
+ av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+ cpi->td.firstpass_ctx = NULL;
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->data_alloc_width = cm->width;
+ cpi->data_alloc_height = cm->height;
+ cpi->frame_size_related_setup_done = false;
+ }
+ av1_update_frame_size(cpi);
+
+ rc->is_src_frame_alt_ref = 0;
+
+ if (!cpi->ppi->rtc_ref.set_ref_frame_config)
+ cpi->ext_flags.refresh_frame.update_pending = 0;
+ cpi->ext_flags.refresh_frame_context_pending = 0;
+
+ if (cpi->ppi->use_svc)
+ av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth);
+
+ check_reset_rc_flag(cpi);
+
+  // Restore the value of lag_in_frames for the LAP stage.
+ if (lap_lag_in_frames != -1) {
+ cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
+ }
+
+#if CONFIG_REALTIME_ONLY
+ assert(!oxcf->tool_cfg.enable_global_motion);
+ cpi->image_pyramid_levels = 0;
+#else
+ if (oxcf->tool_cfg.enable_global_motion) {
+ cpi->image_pyramid_levels =
+ global_motion_pyr_levels[default_global_motion_method];
+ } else {
+ cpi->image_pyramid_levels = 0;
+ }
+#endif // CONFIG_REALTIME_ONLY
+}
+
+static INLINE void init_frame_info(FRAME_INFO *frame_info,
+ const AV1_COMMON *const cm) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ frame_info->frame_width = cm->width;
+ frame_info->frame_height = cm->height;
+ frame_info->mi_cols = mi_params->mi_cols;
+ frame_info->mi_rows = mi_params->mi_rows;
+ frame_info->mb_cols = mi_params->mb_cols;
+ frame_info->mb_rows = mi_params->mb_rows;
+ frame_info->num_mbs = mi_params->MBs;
+ frame_info->bit_depth = seq_params->bit_depth;
+ frame_info->subsampling_x = seq_params->subsampling_x;
+ frame_info->subsampling_y = seq_params->subsampling_y;
+}
+
+static INLINE void init_frame_index_set(FRAME_INDEX_SET *frame_index_set) {
+ frame_index_set->show_frame_count = 0;
+}
+
+static INLINE void update_counters_for_show_frame(AV1_COMP *const cpi) {
+ assert(cpi->common.show_frame);
+ cpi->frame_index_set.show_frame_count++;
+ cpi->common.current_frame.frame_number++;
+}
+
+AV1_PRIMARY *av1_create_primary_compressor(
+ struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+ const AV1EncoderConfig *oxcf) {
+ AV1_PRIMARY *volatile const ppi = aom_memalign(32, sizeof(AV1_PRIMARY));
+ if (!ppi) return NULL;
+ av1_zero(*ppi);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(ppi->error.jmp)) {
+ ppi->error.setjmp = 0;
+ av1_remove_primary_compressor(ppi);
+ return 0;
+ }
+ ppi->error.setjmp = 1;
+
+ ppi->seq_params_locked = 0;
+ ppi->lap_enabled = num_lap_buffers > 0;
+ ppi->output_pkt_list = pkt_list_head;
+ ppi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+ ppi->frames_left = oxcf->input_cfg.limit;
+ ppi->num_fp_contexts = 1;
+
+ init_config_sequence(ppi, oxcf);
+
+#if CONFIG_ENTROPY_STATS
+ av1_zero(ppi->aggregate_fc);
+#endif // CONFIG_ENTROPY_STATS
+
+ av1_primary_rc_init(oxcf, &ppi->p_rc);
+
+  // Scenecut detection mode 2 is used for two pass, and for LAP when
+  // lag_in_frames >= 33.
+ ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2;
+ if (ppi->lap_enabled) {
+ if ((num_lap_buffers <
+ (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) &&
+ num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) {
+      /*
+       * For lag in frames >= 19 and < 33, enable scenecut
+       * with limited future frame prediction.
+       */
+ ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1;
+ } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) {
+ // Disable scenecut when lag_in_frames < 19.
+ ppi->p_rc.enable_scenecut_detection = DISABLE_SCENECUT;
+ }
+ }
+
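+// BFP() binds the SAD, variance, sub-pixel variance and batched (x4d/x3d)
+// SAD kernels that motion search uses for block size BT.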
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+ ppi->fn_ptr[BT].sdf = SDF; \
+ ppi->fn_ptr[BT].sdaf = SDAF; \
+ ppi->fn_ptr[BT].vf = VF; \
+ ppi->fn_ptr[BT].svf = SVF; \
+ ppi->fn_ptr[BT].svaf = SVAF; \
+ ppi->fn_ptr[BT].sdx4df = SDX4DF; \
+ ppi->fn_ptr[BT].jsdaf = JSDAF; \
+ ppi->fn_ptr[BT].jsvaf = JSVAF; \
+ ppi->fn_ptr[BT].sdx3df = SDX3DF;
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+ BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
+ aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
+ aom_sad4x16x4d, aom_sad4x16x3d, aom_dist_wtd_sad4x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x16)
+
+ BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
+ aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
+ aom_sad16x4x4d, aom_sad16x4x3d, aom_dist_wtd_sad16x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x4)
+
+ BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
+ aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
+ aom_sad8x32x4d, aom_sad8x32x3d, aom_dist_wtd_sad8x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x32)
+
+ BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
+ aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
+ aom_sad32x8x4d, aom_sad32x8x3d, aom_dist_wtd_sad32x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x8)
+
+ BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
+ aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
+ aom_sad16x64x4d, aom_sad16x64x3d, aom_dist_wtd_sad16x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x64)
+
+ BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
+ aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
+ aom_sad64x16x4d, aom_sad64x16x3d, aom_dist_wtd_sad64x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x16)
+#endif // !CONFIG_REALTIME_ONLY
+
+ BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
+ aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
+ aom_sad128x128x4d, aom_sad128x128x3d, aom_dist_wtd_sad128x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x128)
+
+ BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
+ aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
+ aom_sad128x64x4d, aom_sad128x64x3d, aom_dist_wtd_sad128x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x64)
+
+ BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
+ aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
+ aom_sad64x128x4d, aom_sad64x128x3d, aom_dist_wtd_sad64x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x128)
+
+ BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
+ aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
+ aom_sad32x16x4d, aom_sad32x16x3d, aom_dist_wtd_sad32x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x16)
+
+ BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
+ aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
+ aom_sad16x32x4d, aom_sad16x32x3d, aom_dist_wtd_sad16x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x32)
+
+ BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
+ aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
+ aom_sad64x32x4d, aom_sad64x32x3d, aom_dist_wtd_sad64x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x32)
+
+ BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
+ aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
+ aom_sad32x64x4d, aom_sad32x64x3d, aom_dist_wtd_sad32x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x64)
+
+ BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
+ aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
+ aom_sad32x32x4d, aom_sad32x32x3d, aom_dist_wtd_sad32x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x32)
+
+ BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
+ aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
+ aom_sad64x64x4d, aom_sad64x64x3d, aom_dist_wtd_sad64x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x64)
+
+ BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
+ aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
+ aom_sad16x16x4d, aom_sad16x16x3d, aom_dist_wtd_sad16x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x16)
+
+ BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
+ aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
+ aom_sad16x8x4d, aom_sad16x8x3d, aom_dist_wtd_sad16x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x8)
+
+ BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
+ aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
+ aom_sad8x16x4d, aom_sad8x16x3d, aom_dist_wtd_sad8x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x16)
+
+ BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
+ aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
+ aom_sad8x8x3d, aom_dist_wtd_sad8x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x8)
+
+ BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
+ aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
+ aom_sad8x4x3d, aom_dist_wtd_sad8x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x4)
+
+ BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
+ aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
+ aom_sad4x8x3d, aom_dist_wtd_sad4x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x8)
+
+ BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
+ aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
+ aom_sad4x4x3d, aom_dist_wtd_sad4x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x4)
+
+#if !CONFIG_REALTIME_ONLY
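+// OBFP() binds the OBMC (overlapped block motion compensation) SAD, variance
+// and sub-pixel variance kernels for block size BT.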
+#define OBFP(BT, OSDF, OVF, OSVF) \
+ ppi->fn_ptr[BT].osdf = OSDF; \
+ ppi->fn_ptr[BT].ovf = OVF; \
+ ppi->fn_ptr[BT].osvf = OSVF;
+
+ OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
+ aom_obmc_sub_pixel_variance128x128)
+ OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
+ aom_obmc_sub_pixel_variance128x64)
+ OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
+ aom_obmc_sub_pixel_variance64x128)
+ OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
+ aom_obmc_sub_pixel_variance64x64)
+ OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
+ aom_obmc_sub_pixel_variance64x32)
+ OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64,
+ aom_obmc_sub_pixel_variance32x64)
+ OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32,
+ aom_obmc_sub_pixel_variance32x32)
+ OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16,
+ aom_obmc_sub_pixel_variance32x16)
+ OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32,
+ aom_obmc_sub_pixel_variance16x32)
+ OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16,
+ aom_obmc_sub_pixel_variance16x16)
+ OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8,
+ aom_obmc_sub_pixel_variance16x8)
+ OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16,
+ aom_obmc_sub_pixel_variance8x16)
+ OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8,
+ aom_obmc_sub_pixel_variance8x8)
+ OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8,
+ aom_obmc_sub_pixel_variance4x8)
+ OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4,
+ aom_obmc_sub_pixel_variance8x4)
+ OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
+ aom_obmc_sub_pixel_variance4x4)
+ OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16,
+ aom_obmc_sub_pixel_variance4x16)
+ OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4,
+ aom_obmc_sub_pixel_variance16x4)
+ OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32,
+ aom_obmc_sub_pixel_variance8x32)
+ OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8,
+ aom_obmc_sub_pixel_variance32x8)
+ OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64,
+ aom_obmc_sub_pixel_variance16x64)
+ OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
+ aom_obmc_sub_pixel_variance64x16)
+#endif // !CONFIG_REALTIME_ONLY
+
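+// MBFP() binds the masked-compound SAD and masked sub-pixel variance kernels
+// for block size BT.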
+#define MBFP(BT, MCSDF, MCSVF) \
+ ppi->fn_ptr[BT].msdf = MCSDF; \
+ ppi->fn_ptr[BT].msvf = MCSVF;
+
+ MBFP(BLOCK_128X128, aom_masked_sad128x128,
+ aom_masked_sub_pixel_variance128x128)
+ MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64)
+ MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128)
+ MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64)
+ MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32)
+ MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64)
+ MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32)
+ MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16)
+ MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32)
+ MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16)
+ MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8)
+ MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16)
+ MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8)
+ MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8)
+ MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
+ MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
+
+#if !CONFIG_REALTIME_ONLY
+ MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
+ MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
+ MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
+ MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
+ MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
+ MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
+#endif
+
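+// SDSFP() binds the downsampled ("skip") SAD kernels, which operate on every
+// other row to cheaply approximate the full SAD.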
+#define SDSFP(BT, SDSF, SDSX4DF) \
+ ppi->fn_ptr[BT].sdsf = SDSF; \
+ ppi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+ SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d)
+ SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d)
+ SDSFP(BLOCK_64X128, aom_sad_skip_64x128, aom_sad_skip_64x128x4d)
+ SDSFP(BLOCK_64X64, aom_sad_skip_64x64, aom_sad_skip_64x64x4d)
+ SDSFP(BLOCK_64X32, aom_sad_skip_64x32, aom_sad_skip_64x32x4d)
+
+ SDSFP(BLOCK_32X64, aom_sad_skip_32x64, aom_sad_skip_32x64x4d)
+ SDSFP(BLOCK_32X32, aom_sad_skip_32x32, aom_sad_skip_32x32x4d)
+ SDSFP(BLOCK_32X16, aom_sad_skip_32x16, aom_sad_skip_32x16x4d)
+
+ SDSFP(BLOCK_16X32, aom_sad_skip_16x32, aom_sad_skip_16x32x4d)
+ SDSFP(BLOCK_16X16, aom_sad_skip_16x16, aom_sad_skip_16x16x4d)
+ SDSFP(BLOCK_16X8, aom_sad_skip_16x8, aom_sad_skip_16x8x4d)
+ SDSFP(BLOCK_8X16, aom_sad_skip_8x16, aom_sad_skip_8x16x4d)
+ SDSFP(BLOCK_8X8, aom_sad_skip_8x8, aom_sad_skip_8x8x4d)
+
+ SDSFP(BLOCK_4X8, aom_sad_skip_4x8, aom_sad_skip_4x8x4d)
+
+#if !CONFIG_REALTIME_ONLY
+ SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d)
+ SDSFP(BLOCK_16X64, aom_sad_skip_16x64, aom_sad_skip_16x64x4d)
+ SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d)
+ SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d)
+ SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d)
+#endif
+#undef SDSFP
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ highbd_set_var_fns(ppi);
+#endif
+
+ {
+    // As cm->mi_params is a part of the frame-level context (cpi), it is
+    // unavailable at this point. mi_params is created as a local temporary
+    // variable, to be passed into the functions used for allocating tpl
+    // buffers. The values in this variable are populated according to the
+    // initial width and height of the frame.
+ CommonModeInfoParams mi_params;
+ enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+ BLOCK_4X4);
+
+ const BLOCK_SIZE bsize = BLOCK_16X16;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params.mi_cols + w - 1) / w;
+ const int num_rows = (mi_params.mi_rows + h - 1) / h;
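+    // One scaling factor is stored per 16x16 area; e.g. a 1920x1080 frame
+    // has mi_cols = 480 and w = 4, giving 120 columns by 68 rows.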
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, ppi->tpl_sb_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*ppi->tpl_sb_rdmult_scaling_factors)));
+
+#if CONFIG_INTERNAL_STATS
+ ppi->b_calculate_blockiness = 1;
+ ppi->b_calculate_consistency = 1;
+
+ for (int i = 0; i <= STAT_ALL; i++) {
+ ppi->psnr[0].stat[i] = 0;
+ ppi->psnr[1].stat[i] = 0;
+
+ ppi->fastssim.stat[i] = 0;
+ ppi->psnrhvs.stat[i] = 0;
+ }
+
+ ppi->psnr[0].worst = 100.0;
+ ppi->psnr[1].worst = 100.0;
+ ppi->worst_ssim = 100.0;
+ ppi->worst_ssim_hbd = 100.0;
+
+ ppi->count[0] = 0;
+ ppi->count[1] = 0;
+ ppi->total_bytes = 0;
+
+ if (ppi->b_calculate_psnr) {
+ ppi->total_sq_error[0] = 0;
+ ppi->total_samples[0] = 0;
+ ppi->total_sq_error[1] = 0;
+ ppi->total_samples[1] = 0;
+ ppi->total_recode_hits = 0;
+ ppi->summed_quality = 0;
+ ppi->summed_weights = 0;
+ ppi->summed_quality_hbd = 0;
+ ppi->summed_weights_hbd = 0;
+ }
+
+ ppi->fastssim.worst = 100.0;
+ ppi->psnrhvs.worst = 100.0;
+
+ if (ppi->b_calculate_blockiness) {
+ ppi->total_blockiness = 0;
+ ppi->worst_blockiness = 0.0;
+ }
+
+ ppi->total_inconsistency = 0;
+ ppi->worst_consistency = 100.0;
+ if (ppi->b_calculate_consistency) {
+ AOM_CHECK_MEM_ERROR(&ppi->error, ppi->ssim_vars,
+ aom_malloc(sizeof(*ppi->ssim_vars) * 4 *
+ mi_params.mi_rows * mi_params.mi_cols));
+ }
+#endif
+ }
+
+ ppi->error.setjmp = 0;
+ return ppi;
+}
+
+AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+ BufferPool *const pool, COMPRESSOR_STAGE stage,
+ int lap_lag_in_frames) {
+ AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
+
+ if (!cpi) return NULL;
+
+ av1_zero(*cpi);
+
+ cpi->ppi = ppi;
+
+ AV1_COMMON *volatile const cm = &cpi->common;
+ cm->seq_params = &ppi->seq_params;
+ cm->error =
+ (struct aom_internal_error_info *)aom_calloc(1, sizeof(*cm->error));
+ if (!cm->error) {
+ aom_free(cpi);
+ return NULL;
+ }
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error->jmp)) {
+ cm->error->setjmp = 0;
+ av1_remove_compressor(cpi);
+ return NULL;
+ }
+
+ cm->error->setjmp = 1;
+ cpi->compressor_stage = stage;
+
+ cpi->do_frame_data_update = true;
+
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ mi_params->free_mi = enc_free_mi;
+ mi_params->setup_mi = enc_setup_mi;
+ mi_params->set_mb_mi =
+ (oxcf->pass == AOM_RC_FIRST_PASS || cpi->compressor_stage == LAP_STAGE)
+ ? stat_stage_set_mb_mi
+ : enc_set_mb_mi;
+
+ mi_params->mi_alloc_bsize = BLOCK_4X4;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(
+ cm, cm->default_frame_context,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
+ memset(cm->fc, 0, sizeof(*cm->fc));
+ memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
+
+ cpi->common.buffer_pool = pool;
+
+ init_config(cpi, oxcf);
+ if (cpi->compressor_stage == LAP_STAGE) {
+ cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
+ }
+
+ av1_rc_init(&cpi->oxcf, &cpi->rc);
+
+ init_frame_info(&cpi->frame_info, cm);
+ init_frame_index_set(&cpi->frame_index_set);
+
+ cm->current_frame.frame_number = 0;
+ cpi->rc.frame_number_encoded = 0;
+ cpi->rc.prev_frame_is_dropped = 0;
+ cpi->rc.max_consec_drop = INT_MAX;
+ cpi->rc.drop_count_consec = 0;
+ cm->current_frame_id = -1;
+ cpi->tile_data = NULL;
+ cpi->last_show_frame_buf = NULL;
+ realloc_segmentation_maps(cpi);
+
+ cpi->refresh_frame.alt_ref_frame = false;
+
+#if CONFIG_SPEED_STATS
+ cpi->tx_search_count = 0;
+#endif // CONFIG_SPEED_STATS
+
+ cpi->time_stamps.first_ts_start = INT64_MAX;
+
+#ifdef OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+#ifdef OUTPUT_YUV_DENOISED
+ yuv_denoised_file = fopen("denoised.yuv", "wb");
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_stat_consumption_stage(cpi)) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz);
+
+ if (!cpi->ppi->lap_enabled) {
+      /* Re-initialize to the stats buffer, populated by the application in
+       * the case of two pass. */
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_start =
+ oxcf->twopass_stats_in.buf;
+ cpi->twopass_frame.stats_in =
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_start;
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_end =
+ &cpi->ppi->twopass.stats_buf_ctx->stats_in_start[packets - 1];
+
+ // The buffer size is packets - 1 because the last packet is total_stats.
+ av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info,
+ oxcf->twopass_stats_in.buf, packets - 1);
+ av1_init_second_pass(cpi);
+ } else {
+ av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, NULL, 0);
+ av1_init_single_pass_lap(cpi);
+ }
+ }
+#endif
+
+ // The buffer "obmc_buffer" is used in inter frames for fast obmc search.
+ // Hence, the memory allocation for the same is avoided for allintra encoding
+ // mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0)
+ alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm->error);
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0])));
+
+ cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
+
+ av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
+ av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
+
+ int max_mi_cols = mi_params->mi_cols;
+ int max_mi_rows = mi_params->mi_rows;
+ if (oxcf->frm_dim_cfg.forced_max_frame_width) {
+ max_mi_cols = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_width);
+ }
+ if (oxcf->frm_dim_cfg.forced_max_frame_height) {
+ max_mi_rows = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_height);
+ }
+
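+  // consec_zero_mv is stored at 8x8 granularity: mi units are 4x4, so one
+  // entry covers a 2x2 group of mi units, hence the division by 4.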
+ const int consec_zero_mv_alloc_size = (max_mi_rows * max_mi_cols) >> 2;
+ CHECK_MEM_ERROR(
+ cm, cpi->consec_zero_mv,
+ aom_calloc(consec_zero_mv_alloc_size, sizeof(*cpi->consec_zero_mv)));
+ cpi->consec_zero_mv_alloc_size = consec_zero_mv_alloc_size;
+
+ cpi->mb_weber_stats = NULL;
+ cpi->mb_delta_q = NULL;
+ cpi->palette_pixel_num = 0;
+ cpi->scaled_last_source_available = 0;
+
+ {
+ const BLOCK_SIZE bsize = BLOCK_16X16;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (max_mi_cols + w - 1) / w;
+ const int num_rows = (max_mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->ssim_rdmult_scaling_factors)));
+ CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->tpl_rdmult_scaling_factors)));
+ }
+
+#if CONFIG_TUNE_VMAF
+ {
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->vmaf_info.rdmult_scaling_factors)));
+ for (int i = 0; i < MAX_ARF_LAYERS; i++) {
+ cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0;
+ cpi->vmaf_info.last_frame_ysse[i] = -1.0;
+ cpi->vmaf_info.last_frame_vmaf[i] = -1.0;
+ }
+ cpi->vmaf_info.original_qindex = -1;
+ cpi->vmaf_info.vmaf_model = NULL;
+ }
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ {
+ const int w = mi_size_wide[butteraugli_rdo_bsize];
+ const int h = mi_size_high[butteraugli_rdo_bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(
+ cm, cpi->butteraugli_info.rdmult_scaling_factors,
+ aom_malloc(num_rows * num_cols *
+ sizeof(*cpi->butteraugli_info.rdmult_scaling_factors)));
+ memset(&cpi->butteraugli_info.source, 0,
+ sizeof(cpi->butteraugli_info.source));
+ memset(&cpi->butteraugli_info.resized_source, 0,
+ sizeof(cpi->butteraugli_info.resized_source));
+ cpi->butteraugli_info.recon_set = false;
+ }
+#endif
+
+#if CONFIG_SALIENCY_MAP
+ {
+ CHECK_MEM_ERROR(cm, cpi->saliency_map,
+ (uint8_t *)aom_calloc(cm->height * cm->width,
+ sizeof(*cpi->saliency_map)));
+    // Buffer initialization is based on MIN_MIB_SIZE_LOG2 to ensure that the
+    // cpi->sm_scaling_factor buffer is allocated big enough, since the actual
+    // superblock size to be used is not known yet.
+ const int min_mi_w_sb = (1 << MIN_MIB_SIZE_LOG2);
+ const int min_mi_h_sb = (1 << MIN_MIB_SIZE_LOG2);
+ const int max_sb_cols =
+ (cm->mi_params.mi_cols + min_mi_w_sb - 1) / min_mi_w_sb;
+ const int max_sb_rows =
+ (cm->mi_params.mi_rows + min_mi_h_sb - 1) / min_mi_h_sb;
+ CHECK_MEM_ERROR(cm, cpi->sm_scaling_factor,
+ (double *)aom_calloc(max_sb_rows * max_sb_cols,
+ sizeof(*cpi->sm_scaling_factor)));
+ }
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ av1_zero(cpi->partition_stats);
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+ // Initialize the members of DeltaQuantParams with INT_MAX to ensure that
+ // the quantizer tables are correctly initialized using the default deltaq
+ // parameters when av1_init_quantizer is called for the first time.
+ DeltaQuantParams *const prev_deltaq_params =
+ &cpi->enc_quant_dequant_params.prev_deltaq_params;
+ prev_deltaq_params->y_dc_delta_q = INT_MAX;
+ prev_deltaq_params->u_dc_delta_q = INT_MAX;
+ prev_deltaq_params->v_dc_delta_q = INT_MAX;
+ prev_deltaq_params->u_ac_delta_q = INT_MAX;
+ prev_deltaq_params->v_ac_delta_q = INT_MAX;
+
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+ av1_qm_init(&cm->quant_params, av1_num_planes(cm));
+
+ av1_loop_filter_init(cm);
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ cm->superres_upscaled_width = oxcf->frm_dim_cfg.width;
+ cm->superres_upscaled_height = oxcf->frm_dim_cfg.height;
+#if !CONFIG_REALTIME_ONLY
+ av1_loop_restoration_precal();
+#endif
+
+ cpi->third_pass_ctx = NULL;
+ if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) {
+ av1_init_thirdpass_ctx(cm, &cpi->third_pass_ctx, NULL);
+ }
+
+ cpi->second_pass_log_stream = NULL;
+ cpi->use_ducky_encode = 0;
+
+ cm->error->setjmp = 0;
+ return cpi;
+}
+
+#if CONFIG_INTERNAL_STATS
+#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+#endif // CONFIG_INTERNAL_STATS
+
+void av1_remove_primary_compressor(AV1_PRIMARY *ppi) {
+ if (!ppi) return;
+#if !CONFIG_REALTIME_ONLY
+ av1_tf_info_free(&ppi->tf_info);
+#endif // !CONFIG_REALTIME_ONLY
+
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ aom_free(ppi->level_params.level_info[i]);
+ }
+ av1_lookahead_destroy(ppi->lookahead);
+
+ aom_free(ppi->tpl_sb_rdmult_scaling_factors);
+ ppi->tpl_sb_rdmult_scaling_factors = NULL;
+
+ TplParams *const tpl_data = &ppi->tpl_data;
+ aom_free(tpl_data->txfm_stats_list);
+
+ for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+ aom_free(tpl_data->tpl_stats_pool[frame]);
+ aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
+ tpl_data->tpl_stats_pool[frame] = NULL;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ av1_tpl_dealloc(&tpl_data->tpl_mt_sync);
+#endif
+
+ av1_terminate_workers(ppi);
+ free_thread_data(ppi);
+
+ aom_free(ppi->p_mt_info.tile_thr_data);
+ ppi->p_mt_info.tile_thr_data = NULL;
+ aom_free(ppi->p_mt_info.workers);
+ ppi->p_mt_info.workers = NULL;
+ ppi->p_mt_info.num_workers = 0;
+
+ aom_free(ppi);
+}
+
+void av1_remove_compressor(AV1_COMP *cpi) {
+ if (!cpi) return;
+#if CONFIG_RATECTRL_LOG
+ if (cpi->oxcf.pass == 3) {
+ rc_log_show(&cpi->rc_log);
+ }
+#endif // CONFIG_RATECTRL_LOG
+
+ AV1_COMMON *cm = &cpi->common;
+ if (cm->current_frame.frame_number > 0) {
+#if CONFIG_SPEED_STATS
+ if (!is_stat_generation_stage(cpi)) {
+ fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count);
+ }
+#endif // CONFIG_SPEED_STATS
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ if (!is_stat_generation_stage(cpi)) {
+ av1_print_fr_partition_timing_stats(&cpi->partition_stats,
+ "fr_part_timing_data.csv");
+ }
+#endif
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ av1_denoiser_free(&(cpi->denoiser));
+#endif
+
+ if (cm->error) {
+ // Help detect use after free of the error detail string.
+ memset(cm->error->detail, 'A', sizeof(cm->error->detail) - 1);
+ cm->error->detail[sizeof(cm->error->detail) - 1] = '\0';
+ aom_free(cm->error);
+ }
+ aom_free(cpi->td.tctx);
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_;
+ pthread_cond_t *const enc_row_mt_cond_ = mt_info->enc_row_mt.cond_;
+ pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+ pthread_mutex_t *const tpl_error_mutex_ = mt_info->tpl_row_mt.mutex_;
+ pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_;
+ if (enc_row_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(enc_row_mt_mutex_);
+ aom_free(enc_row_mt_mutex_);
+ }
+ if (enc_row_mt_cond_ != NULL) {
+ pthread_cond_destroy(enc_row_mt_cond_);
+ aom_free(enc_row_mt_cond_);
+ }
+ if (gm_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(gm_mt_mutex_);
+ aom_free(gm_mt_mutex_);
+ }
+ if (tpl_error_mutex_ != NULL) {
+ pthread_mutex_destroy(tpl_error_mutex_);
+ aom_free(tpl_error_mutex_);
+ }
+ if (pack_bs_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(pack_bs_mt_mutex_);
+ aom_free(pack_bs_mt_mutex_);
+ }
+#endif
+ av1_row_mt_mem_dealloc(cpi);
+
+ if (mt_info->num_workers > 1) {
+ av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
+ av1_loop_filter_dealloc(&mt_info->lf_row_sync);
+ av1_cdef_mt_dealloc(&mt_info->cdef_sync);
+#if !CONFIG_REALTIME_ONLY
+ av1_loop_restoration_dealloc(&mt_info->lr_row_sync);
+ av1_tf_mt_dealloc(&mt_info->tf_sync);
+#endif
+ }
+
+ av1_free_thirdpass_ctx(cpi->third_pass_ctx);
+
+ av1_close_second_pass_log(cpi);
+
+ dealloc_compressor_data(cpi);
+
+ av1_ext_part_delete(&cpi->ext_part_controller);
+
+ av1_remove_common(cm);
+
+ aom_free(cpi);
+
+#ifdef OUTPUT_YUV_REC
+ fclose(yuv_rec_file);
+#endif
+
+#ifdef OUTPUT_YUV_DENOISED
+ fclose(yuv_denoised_file);
+#endif
+}
+
+static void generate_psnr_packet(AV1_COMP *cpi) {
+ struct aom_codec_cx_pkt pkt;
+ int i;
+ PSNR_STATS psnr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr,
+ bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+#endif
+
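+  // Index 0 holds the combined value across all planes; indices 1..3 are the
+  // Y, U and V planes.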
+ for (i = 0; i < 4; ++i) {
+ pkt.data.psnr.samples[i] = psnr.samples[i];
+ pkt.data.psnr.sse[i] = psnr.sse[i];
+ pkt.data.psnr.psnr[i] = psnr.psnr[i];
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+ (in_bit_depth < bit_depth)) {
+ for (i = 0; i < 4; ++i) {
+ pkt.data.psnr.samples_hbd[i] = psnr.samples_hbd[i];
+ pkt.data.psnr.sse_hbd[i] = psnr.sse_hbd[i];
+ pkt.data.psnr.psnr_hbd[i] = psnr.psnr_hbd[i];
+ }
+ }
+#endif
+
+ pkt.kind = AOM_CODEC_PSNR_PKT;
+ aom_codec_pkt_list_add(cpi->ppi->output_pkt_list, &pkt);
+}
+
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) {
+ if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1;
+
+ *ext_ref_frame_flags = ref_frame_flags;
+ return 0;
+}
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
+ if (cfg) {
+ aom_yv12_copy_frame(cfg, sd, num_planes);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
+ if (cfg) {
+ aom_yv12_copy_frame(sd, cfg, num_planes);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+#ifdef OUTPUT_YUV_REC
+void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+ uint8_t *src = s->y_buffer;
+ int h = cm->height;
+ if (yuv_rec_file == NULL) return;
+ if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+ do {
+ fwrite(src16, s->y_width, 2, yuv_rec_file);
+ src16 += s->y_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+ return;
+ }
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_rec_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+}
+#endif // OUTPUT_YUV_REC
+
+void av1_set_mv_search_params(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
+ const int max_mv_def = AOMMAX(cm->width, cm->height);
+
+ // Default based on max resolution.
+ mv_search_params->mv_step_param = av1_init_search_range(max_mv_def);
+
+ if (cpi->sf.mv_sf.auto_mv_step_size) {
+ if (frame_is_intra_only(cm)) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ mv_search_params->max_mv_magnitude = max_mv_def;
+ } else {
+ // Use adaptive mv steps based on previous frame stats for show frames and
+ // internal arfs.
+ FRAME_UPDATE_TYPE cur_update_type =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+ int use_auto_mv_step =
+ (cm->show_frame || cur_update_type == INTNL_ARF_UPDATE) &&
+ mv_search_params->max_mv_magnitude != -1 &&
+ cpi->sf.mv_sf.auto_mv_step_size >= 2;
+ if (use_auto_mv_step) {
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ mv_search_params->mv_step_param = av1_init_search_range(
+ AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude));
+ }
+ // Reset max_mv_magnitude based on update flag.
+ if (cpi->do_frame_data_update) mv_search_params->max_mv_magnitude = -1;
+ }
+ }
+}
+
+void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ if (cm->seq_params->force_screen_content_tools != 2) {
+ features->allow_screen_content_tools = features->allow_intrabc =
+ cm->seq_params->force_screen_content_tools;
+ return;
+ }
+
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ features->allow_screen_content_tools = 1;
+ features->allow_intrabc = cpi->oxcf.mode == REALTIME ? 0 : 1;
+ cpi->is_screen_content_type = 1;
+ cpi->use_screen_content_tools = 1;
+ return;
+ }
+
+ if (cpi->oxcf.mode == REALTIME) {
+ features->allow_screen_content_tools = features->allow_intrabc = 0;
+ return;
+ }
+
+  // Screen content tools are not evaluated in non-RD encoding mode when the
+  // content type has not been set explicitly, i.e., when
+  // cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN, use_nonrd_pick_mode = 1
+  // and hybrid_intra_pickmode = 0. Hence, screen content detection is
+  // disabled here.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ !cpi->sf.rt_sf.hybrid_intra_pickmode) {
+ features->allow_screen_content_tools = features->allow_intrabc = 0;
+ return;
+ }
+
+ // Estimate if the source frame is screen content, based on the portion of
+ // blocks that have few luma colors.
+ const uint8_t *src = cpi->unfiltered_source->y_buffer;
+ assert(src != NULL);
+ const int use_hbd = cpi->unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int stride = cpi->unfiltered_source->y_stride;
+ const int width = cpi->unfiltered_source->y_width;
+ const int height = cpi->unfiltered_source->y_height;
+ const int64_t area = (int64_t)width * height;
+ const int bd = cm->seq_params->bit_depth;
+ const int blk_w = 16;
+ const int blk_h = 16;
+ // These threshold values are selected experimentally.
+ const int color_thresh = 4;
+ const unsigned int var_thresh = 0;
+ // Counts of blocks with no more than color_thresh colors.
+ int64_t counts_1 = 0;
+ // Counts of blocks with no more than color_thresh colors and variance larger
+ // than var_thresh.
+ int64_t counts_2 = 0;
+
+ for (int r = 0; r + blk_h <= height; r += blk_h) {
+ for (int c = 0; c + blk_w <= width; c += blk_w) {
+ int count_buf[1 << 8]; // Maximum (1 << 8) bins for hbd path.
+ const uint8_t *const this_src = src + r * stride + c;
+ int n_colors;
+ if (use_hbd)
+ av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, NULL,
+ count_buf, &n_colors, NULL);
+ else
+ av1_count_colors(this_src, stride, blk_w, blk_h, count_buf, &n_colors);
+ if (n_colors > 1 && n_colors <= color_thresh) {
+ ++counts_1;
+ struct buf_2d buf;
+ buf.stride = stride;
+ buf.buf = (uint8_t *)this_src;
+ const unsigned int var = av1_get_perpixel_variance(
+ cpi, xd, &buf, BLOCK_16X16, AOM_PLANE_Y, use_hbd);
+ if (var > var_thresh) ++counts_2;
+ }
+ }
+ }
+
+ // The threshold values are selected experimentally.
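+  // i.e. allow screen content tools when low-color blocks cover more than
+  // 1/10 of the frame area, and allow IntraBC when high-variance low-color
+  // blocks cover more than 1/12 of it.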
+ features->allow_screen_content_tools = counts_1 * blk_h * blk_w * 10 > area;
+  // IntraBC would force loop filters off, so we use stricter rules that also
+  // require that the block has high variance.
+ features->allow_intrabc = features->allow_screen_content_tools &&
+ counts_2 * blk_h * blk_w * 12 > area;
+ cpi->use_screen_content_tools = features->allow_screen_content_tools;
+ cpi->is_screen_content_type =
+ features->allow_intrabc || (counts_1 * blk_h * blk_w * 10 > area * 4 &&
+ counts_2 * blk_h * blk_w * 30 > area);
+}
+
+static void init_motion_estimation(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
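+  // Round the width up to a multiple of 8 before deriving the luma stride.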
+ const int aligned_width = (cm->width + 7) & ~7;
+ const int y_stride =
+ aom_calc_y_stride(aligned_width, cpi->oxcf.border_in_pixels);
+ const int y_stride_src = ((cpi->oxcf.frm_dim_cfg.width != cm->width ||
+ cpi->oxcf.frm_dim_cfg.height != cm->height) ||
+ av1_superres_scaled(cm))
+ ? y_stride
+ : cpi->ppi->lookahead->buf->img.y_stride;
+ int fpf_y_stride =
+ cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride : y_stride;
+
+  // Update if search_site_cfg is uninitialized or the current frame has a new
+  // stride.
+ const int should_update =
+ !mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride ||
+ !mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][DIAMOND].stride ||
+ (y_stride !=
+ mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride);
+
+ if (!should_update) {
+ return;
+ }
+
+ // Initialization of search_site_cfg for NUM_DISTINCT_SEARCH_METHODS.
+ for (SEARCH_METHODS i = DIAMOND; i < NUM_DISTINCT_SEARCH_METHODS; i++) {
+ const int level = ((i == NSTEP_8PT) || (i == CLAMPED_DIAMOND)) ? 1 : 0;
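+    // level 1 selects the NSTEP_8PT / CLAMPED_DIAMOND variants of the base
+    // search-site layouts.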
+ av1_init_motion_compensation[i](
+ &mv_search_params->search_site_cfg[SS_CFG_SRC][i], y_stride, level);
+ av1_init_motion_compensation[i](
+ &mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][i], y_stride_src,
+ level);
+ }
+
+ // First pass search site config initialization.
+ av1_init_motion_fpf(&mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND],
+ fpf_y_stride);
+ for (SEARCH_METHODS i = NSTEP; i < NUM_DISTINCT_SEARCH_METHODS; i++) {
+ memcpy(&mv_search_params->search_site_cfg[SS_CFG_FPF][i],
+ &mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND],
+ sizeof(search_site_config));
+ }
+}
+
+static void init_ref_frame_bufs(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int i;
+ if (cm->cur_frame) {
+ cm->cur_frame->ref_count--;
+ cm->cur_frame = NULL;
+ }
+ for (i = 0; i < REF_FRAMES; ++i) {
+ if (cm->ref_frame_map[i]) {
+ cm->ref_frame_map[i]->ref_count--;
+ cm->ref_frame_map[i] = NULL;
+ }
+ }
+#ifndef NDEBUG
+ BufferPool *const pool = cm->buffer_pool;
+ for (i = 0; i < pool->num_frame_bufs; ++i) {
+ assert(pool->frame_bufs[i].ref_count == 0);
+ }
+#endif
+}
+
+// TODO(chengchen): consider renaming this function, as it is needed by the
+// encoder to set up critical parameters but no longer deals with the initial
+// width.
+aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+ int subsampling_x, int subsampling_y) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+
+ if (!cpi->frame_size_related_setup_done ||
+ seq_params->use_highbitdepth != use_highbitdepth ||
+ seq_params->subsampling_x != subsampling_x ||
+ seq_params->subsampling_y != subsampling_y) {
+ seq_params->subsampling_x = subsampling_x;
+ seq_params->subsampling_y = subsampling_y;
+ seq_params->use_highbitdepth = use_highbitdepth;
+
+ av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+ av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed);
+
+ if (!is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+ if (!av1_tf_info_alloc(&cpi->ppi->tf_info, cpi))
+ return AOM_CODEC_MEM_ERROR;
+#endif // !CONFIG_REALTIME_ONLY
+ }
+ init_ref_frame_bufs(cpi);
+
+ init_motion_estimation(cpi); // TODO(agrange) This can be removed.
+
+ cpi->initial_mbs = cm->mi_params.MBs;
+ cpi->frame_size_related_setup_done = true;
+ }
+ return AOM_CODEC_OK;
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ !cpi->denoiser.frame_buffer_initialized) {
+ if (av1_denoiser_alloc(
+ cm, &cpi->svc, &cpi->denoiser, cpi->ppi->use_svc,
+ cpi->oxcf.noise_sensitivity, cm->width, cm->height,
+ cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+ cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate denoiser");
+ }
+}
+#endif
+
+// Returns 1 if the requested width or height was <= 0.
+static int set_size_literal(AV1_COMP *cpi, int width, int height) {
+ AV1_COMMON *cm = &cpi->common;
+ aom_codec_err_t err = av1_check_initial_width(
+ cpi, cm->seq_params->use_highbitdepth, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y);
+ if (err != AOM_CODEC_OK) {
+ aom_internal_error(cm->error, err, "av1_check_initial_width() failed");
+ }
+
+ if (width <= 0 || height <= 0) return 1;
+
+ cm->width = width;
+ cm->height = height;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+
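+  // Reallocate the per-frame data only when the new dimensions exceed what
+  // was previously allocated; shrinking reuses the existing buffers.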
+ if (cm->width > cpi->data_alloc_width ||
+ cm->height > cpi->data_alloc_height) {
+ av1_free_context_buffers(cm);
+ av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+ av1_free_sms_tree(&cpi->td);
+ av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+ cpi->td.firstpass_ctx = NULL;
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->data_alloc_width = cm->width;
+ cpi->data_alloc_height = cm->height;
+ cpi->frame_size_related_setup_done = false;
+ }
+ alloc_mb_mode_info_buffers(cpi);
+ av1_update_frame_size(cpi);
+
+ return 0;
+}
+
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int ref_frame;
+
+ if (width != cm->width || height != cm->height) {
+ // There has been a change in the encoded frame size
+ set_size_literal(cpi, width, height);
+ // Recalculate 'all_lossless' in case super-resolution was (un)selected.
+ cm->features.all_lossless =
+ cm->features.coded_lossless && !av1_superres_scaled(cm);
+
+ av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ // Reset the denoiser on the resized frame.
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ av1_denoiser_free(&(cpi->denoiser));
+ setup_denoiser_buffer(cpi);
+ }
+#endif
+ }
+ if (is_stat_consumption_stage(cpi)) {
+ av1_set_target_rate(cpi, cm->width, cm->height);
+ }
+
+ alloc_frame_mvs(cm, cm->cur_frame);
+
+ // Allocate above context buffers
+ CommonContexts *const above_contexts = &cm->above_contexts;
+ if (above_contexts->num_planes < av1_num_planes(cm) ||
+ above_contexts->num_mi_cols < cm->mi_params.mi_cols ||
+ above_contexts->num_tile_rows < cm->tiles.rows) {
+ av1_free_above_context_buffers(above_contexts);
+ if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
+ cm->mi_params.mi_cols,
+ av1_num_planes(cm)))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+
+ AV1EncoderConfig *oxcf = &cpi->oxcf;
+ oxcf->border_in_pixels = av1_get_enc_border_size(
+ av1_is_resize_needed(oxcf), oxcf->kf_cfg.key_freq_max == 0,
+ cm->seq_params->sb_size);
+
+ // Reset the frame pointers to the current frame size.
+ if (aom_realloc_frame_buffer(
+ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+ if (!is_stat_generation_stage(cpi)) av1_init_cdef_worker(cpi);
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ for (int i = 0; i < num_planes; ++i)
+ cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+
+ const bool is_sgr_enabled = !cpi->sf.lpf_sf.disable_sgr_filter;
+ av1_alloc_restoration_buffers(cm, is_sgr_enabled);
+ // Store the allocated restoration buffers in MT object.
+ if (cpi->ppi->p_mt_info.num_workers > 1) {
+ av1_init_lr_mt_buffers(cpi);
+ }
+ }
+#endif
+
+ init_motion_estimation(cpi);
+
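+  // Set up scale factors that map each reference frame's size to the current
+  // coded size. A reference is usable only if its scaling ratio falls within
+  // the range permitted by av1_is_valid_scale().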
+ int has_valid_ref_frame = 0;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf != NULL) {
+ struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+ av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width,
+ buf->buf.y_crop_height, cm->width,
+ cm->height);
+ has_valid_ref_frame |= av1_is_valid_scale(sf);
+ if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes);
+ }
+ }
+ if (!frame_is_intra_only(cm) && !has_valid_ref_frame) {
+ aom_internal_error(
+ cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Can't find at least one reference frame with valid size");
+ }
+
+ av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+ cm->width, cm->height);
+
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+static INLINE int extend_borders_mt(const AV1_COMP *cpi,
+ MULTI_THREADED_MODULES stage, int plane) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cpi->mt_info.num_mod_workers[stage] < 2) return 0;
+ switch (stage) {
+ // TODO(deepa.kg@ittiam.com): When cdef and loop-restoration are disabled,
+ // multi-thread frame border extension along with loop filter frame.
+ // As loop-filtering of a superblock row modifies the pixels of the
+ // above superblock row, border extension requires that loop filtering
+ // of the current and above superblock row is complete.
+ case MOD_LPF: return 0;
+ case MOD_CDEF:
+ return is_cdef_used(cm) && !cpi->ppi->rtc_ref.non_reference_frame &&
+ !is_restoration_used(cm) && !av1_superres_scaled(cm);
+ case MOD_LR:
+ return is_restoration_used(cm) &&
+ (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE);
+ default: assert(0);
+ }
+ return 0;
+}
+
+/*!\brief Select and apply CDEF filters and switchable restoration filters
+ *
+ * \ingroup high_level_algo
+ */
+static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
+ MACROBLOCKD *xd, int use_restoration,
+ int use_cdef,
+ unsigned int skip_apply_postproc_filters) {
+#if !CONFIG_REALTIME_ONLY
+ if (use_restoration)
+ av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0);
+#else
+ (void)use_restoration;
+#endif
+
+ if (use_cdef) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, cdef_time);
+#endif
+ const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF];
+ // Find CDEF parameters
+ av1_cdef_search(cpi);
+
+ // Apply the filter
+ if ((skip_apply_postproc_filters & SKIP_APPLY_CDEF) == 0) {
+ assert(!cpi->ppi->rtc_ref.non_reference_frame);
+ if (num_workers > 1) {
+ // Extension of frame borders is multi-threaded along with cdef.
+ const int do_extend_border =
+ extend_borders_mt(cpi, MOD_CDEF, /* plane */ 0);
+ av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker,
+ cpi->mt_info.workers, &cpi->mt_info.cdef_sync,
+ num_workers, av1_cdef_init_fb_row_mt,
+ do_extend_border);
+ } else {
+ av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row);
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, cdef_time);
+#endif
+ }
+
+ const int use_superres = av1_superres_scaled(cm);
+ if (use_superres) {
+ if ((skip_apply_postproc_filters & SKIP_APPLY_SUPERRES) == 0) {
+ av1_superres_post_encode(cpi);
+ }
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, loop_restoration_time);
+#endif
+ if (use_restoration) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int num_workers = mt_info->num_mod_workers[MOD_LR];
+ av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
+ av1_pick_filter_restoration(cpi->source, cpi);
+ if ((skip_apply_postproc_filters & SKIP_APPLY_RESTORATION) == 0 &&
+ (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) {
+ if (num_workers > 1) {
+ // Extension of frame borders is multi-threaded along with loop
+ // restoration filter.
+ const int do_extend_border = 1;
+ av1_loop_restoration_filter_frame_mt(
+ &cm->cur_frame->buf, cm, 0, mt_info->workers, num_workers,
+ &mt_info->lr_row_sync, &cpi->lr_ctxt, do_extend_border);
+ } else {
+ av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0,
+ &cpi->lr_ctxt);
+ }
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, loop_restoration_time);
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+}
+
+static void extend_frame_borders(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // TODO(debargha): Fix mv search range on encoder side
+ for (int plane = 0; plane < av1_num_planes(cm); ++plane) {
+ const bool extend_border_done = extend_borders_mt(cpi, MOD_CDEF, plane) ||
+ extend_borders_mt(cpi, MOD_LR, plane);
+ if (!extend_border_done) {
+ const YV12_BUFFER_CONFIG *const ybf = &cm->cur_frame->buf;
+ aom_extend_frame_borders_plane_row(ybf, plane, 0,
+ ybf->crop_heights[plane > 0]);
+ }
+ }
+}
+
+/*!\brief Select and apply deblocking filters, cdef filters, and restoration
+ * filters.
+ *
+ * \ingroup high_level_algo
+ */
+static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int num_workers = mt_info->num_mod_workers[MOD_LPF];
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+ cpi->td.mb.rdmult = cpi->rd.RDMULT;
+
+ assert(IMPLIES(is_lossless_requested(&cpi->oxcf.rc_cfg),
+ cm->features.coded_lossless && cm->features.all_lossless));
+
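+  // Decide which in-loop filters run for this frame. Each filter's parameter
+  // search still runs, but its application may be skipped via
+  // skip_apply_postproc_filters.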
+ const int use_loopfilter =
+ is_loopfilter_used(cm) && !cpi->mt_info.pipeline_lpf_mt_with_enc;
+ const int use_cdef = is_cdef_used(cm);
+ const int use_superres = av1_superres_scaled(cm);
+ const int use_restoration = is_restoration_used(cm);
+
+ const unsigned int skip_apply_postproc_filters =
+ derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef,
+ use_superres, use_restoration);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, loop_filter_time);
+#endif
+ if (use_loopfilter) {
+ av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick);
+ struct loopfilter *lf = &cm->lf;
+ if ((lf->filter_level[0] || lf->filter_level[1]) &&
+ (skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0) {
+ assert(!cpi->ppi->rtc_ref.non_reference_frame);
+      // lpf_opt_level = 1: Enables dual/quad loop-filtering. This is set when
+      // the transform size search depth in inter blocks is limited to one, as
+      // quad loop filtering assumes that all the transform blocks within a
+      // 16x8/8x16/16x16 prediction block are of the same size.
+      // lpf_opt_level = 2: Filters both chroma planes together, in addition
+      // to enabling dual/quad loop-filtering. This is enabled when the lpf
+      // pick method is LPF_PICK_FROM_Q, as the u and v plane filter levels
+      // are equal.
+ int lpf_opt_level = get_lpf_opt_level(&cpi->sf);
+ av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
+ mt_info->workers, num_workers,
+ &mt_info->lf_row_sync, lpf_opt_level);
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, loop_filter_time);
+#endif
+
+ cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef,
+ skip_apply_postproc_filters);
+}
+
+static void update_motion_stat(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ RATE_CONTROL *const rc = &cpi->rc;
+ SVC *const svc = &cpi->svc;
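+  // avg_cnt_zeromv is the percentage of mi units coded with zero motion in
+  // this frame; avg_frame_low_motion is its running average, weighted 3/4
+  // previous value to 1/4 new value.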
+ const int avg_cnt_zeromv =
+ 100 * cpi->rc.cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols);
+ if (!cpi->ppi->use_svc ||
+ (cpi->ppi->use_svc &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+ rc->avg_frame_low_motion =
+ (rc->avg_frame_low_motion == 0)
+ ? avg_cnt_zeromv
+ : (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4;
+ // For SVC: set avg_frame_low_motion (only computed on top spatial layer)
+ // to all lower spatial layers.
+ if (cpi->ppi->use_svc &&
+ svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ for (int i = 0; i < svc->number_spatial_layers - 1; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+ lrc->avg_frame_low_motion = rc->avg_frame_low_motion;
+ }
+ }
+ }
+}
+
+/*!\brief Encode a frame without the recode loop, usually used in one-pass
+ * encoding and realtime coding.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in] cpi Top-level encoder structure
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_without_recode(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg;
+ SVC *const svc = &cpi->svc;
+ const int resize_pending = is_frame_resize_pending(cpi);
+ int top_index = 0, bottom_index = 0, q = 0;
+ YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source;
+ InterpFilter filter_scaler =
+ cpi->ppi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id]
+ : EIGHTTAP_SMOOTH;
+ int phase_scaler = cpi->ppi->use_svc
+ ? svc->downsample_filter_phase[svc->spatial_layer_id]
+ : 0;
+
+ set_size_independent_vars(cpi);
+ av1_setup_frame_size(cpi);
+ cm->prev_frame = get_primary_ref_frame_buf(cm);
+ av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+ av1_set_mv_search_params(cpi);
+
+ if (cm->current_frame.frame_number == 0 &&
+ (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) &&
+ cpi->svc.temporal_layer_id == 0) {
+ const SequenceHeader *seq_params = cm->seq_params;
+ if (aom_alloc_frame_buffer(
+ &cpi->svc.source_last_TL0, cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer for source_last_TL0");
+ }
+ }
+
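+  // For non-SVC, choose the downscaling filter based on the resize ratio.
+  // A phase_scaler of 8 selects a centered (half-sample) sampling offset.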
+ if (!cpi->ppi->use_svc) {
+ phase_scaler = 8;
+ // 2:1 scaling.
+ if ((cm->width << 1) == unscaled->y_crop_width &&
+ (cm->height << 1) == unscaled->y_crop_height) {
+ filter_scaler = BILINEAR;
+ // For lower resolutions use eighttap_smooth.
+ if (cm->width * cm->height <= 320 * 180) filter_scaler = EIGHTTAP_SMOOTH;
+ } else if ((cm->width << 2) == unscaled->y_crop_width &&
+ (cm->height << 2) == unscaled->y_crop_height) {
+ // 4:1 scaling.
+ filter_scaler = EIGHTTAP_SMOOTH;
+ } else if ((cm->width << 2) == 3 * unscaled->y_crop_width &&
+ (cm->height << 2) == 3 * unscaled->y_crop_height) {
+ // 4:3 scaling.
+ filter_scaler = EIGHTTAP_REGULAR;
+ }
+ }
+
+ allocate_gradient_info_for_hog(cpi);
+
+ allocate_src_var_of_4x4_sub_block_buf(cpi);
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION)
+ variance_partition_alloc(cpi);
+
+ if (cm->current_frame.frame_type == KEY_FRAME ||
+ ((sf->inter_sf.extra_prune_warped && cpi->refresh_frame.golden_frame)))
+ copy_frame_prob_info(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ printf("\n Encoding a frame: \n");
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ av1_setup_butteraugli_rdmult(cpi);
+ }
+#endif
+
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, unscaled, &cpi->scaled_source, filter_scaler, phase_scaler, true,
+ false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ if (frame_is_intra_only(cm) || resize_pending != 0) {
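+    // consec_zero_mv stores one counter per 8x8 block; the mi grid is in
+    // 4x4 units, hence the division by 4.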
+ const int current_size =
+ (cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2;
+ if (cpi->consec_zero_mv &&
+ (cpi->consec_zero_mv_alloc_size < current_size)) {
+ aom_free(cpi->consec_zero_mv);
+ cpi->consec_zero_mv_alloc_size = 0;
+ CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
+ aom_malloc(current_size * sizeof(*cpi->consec_zero_mv)));
+ cpi->consec_zero_mv_alloc_size = current_size;
+ }
+ assert(cpi->consec_zero_mv != NULL);
+ memset(cpi->consec_zero_mv, 0, current_size * sizeof(*cpi->consec_zero_mv));
+ }
+
+ if (cpi->scaled_last_source_available) {
+ cpi->last_source = &cpi->scaled_last_source;
+ cpi->scaled_last_source_available = 0;
+ } else if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler,
+ phase_scaler, true, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ if (cpi->sf.rt_sf.use_temporal_noise_estimate) {
+ av1_update_noise_estimate(cpi);
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && cpi->ppi->use_svc)
+ av1_denoiser_reset_on_first_frame(cpi);
+#endif
+
+  // For 1 spatial layer encoding: if the (non-LAST) reference has a
+  // different resolution from the source then disable that reference. This
+  // is to avoid a significant increase in encode time from scaling the
+  // references in av1_scale_references. Note GOLDEN is forced to update on
+  // the (first/trigger) resized frame and ALTREF will be refreshed ~4 frames
+  // later, so both references become available again after a few frames.
+  // For superres: don't disable the golden reference.
+ if (svc->number_spatial_layers == 1) {
+ if (!cpi->oxcf.superres_cfg.enable_superres) {
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) {
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+ cpi->ref_frame_flags ^= AOM_GOLD_FLAG;
+ }
+ }
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) {
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+ cpi->ref_frame_flags ^= AOM_ALT_FLAG;
+ }
+ }
+
+ int scale_references = 0;
+#if CONFIG_FPMT_TEST
+ scale_references =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (scale_references ||
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (!frame_is_intra_only(cm)) {
+ av1_scale_references(cpi, filter_scaler, phase_scaler, 1);
+ }
+ }
+
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+ av1_set_variance_partition_thresholds(cpi, q, 0);
+ av1_setup_frame(cpi);
+
+ // Check if this high_source_sad (scene/slide change) frame should be
+ // encoded at high/max QP, and if so, set the q and adjust some rate
+ // control parameters.
+ if (cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ &&
+ cpi->rc.high_source_sad) {
+ if (av1_encodedframe_overshoot_cbr(cpi, &q)) {
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+ av1_set_variance_partition_thresholds(cpi, q, 0);
+ if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ cm->features.primary_ref_frame == PRIMARY_REF_NONE)
+ av1_setup_frame(cpi);
+ }
+ }
+
+ if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) {
+ suppress_active_map(cpi);
+ av1_cyclic_refresh_setup(cpi);
+ }
+ av1_apply_active_map(cpi);
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+  // This is for the RTC temporal filtering case.
+ if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf &&
+ cm->current_frame.frame_type != KEY_FRAME) {
+ const SequenceHeader *seq_params = cm->seq_params;
+
+ if (cpi->orig_source.buffer_alloc_sz == 0 ||
+ cpi->last_source->y_width != cpi->source->y_width ||
+ cpi->last_source->y_height != cpi->source->y_height) {
+ // Allocate a source buffer to store the true source for psnr calculation.
+ if (aom_alloc_frame_buffer(
+ &cpi->orig_source, cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate scaled buffer");
+ }
+
+ aom_yv12_copy_y(cpi->source, &cpi->orig_source);
+ aom_yv12_copy_u(cpi->source, &cpi->orig_source);
+ aom_yv12_copy_v(cpi->source, &cpi->orig_source);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_encode_frame_time);
+#endif
+
+ // Set the motion vector precision based on mv stats from the last coded
+ // frame.
+ if (!frame_is_intra_only(cm)) av1_pick_and_set_high_precision_mv(cpi, q);
+
+  // Transform / motion compensation: build the reconstructed frame.
+ av1_encode_frame(cpi);
+
+ if (!cpi->rc.rtc_external_ratectrl && !frame_is_intra_only(cm))
+ update_motion_stat(cpi);
+
+ // Adjust the refresh of the golden (longer-term) reference based on QP
+ // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode.
+ if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 &&
+ svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl &&
+ sf->rt_sf.gf_refresh_based_on_qp)
+ av1_adjust_gf_refresh_qp_one_pass_rt(cpi);
+
+ // For non-svc: if scaling is required, copy scaled_source
+ // into scaled_last_source.
+ if (cm->current_frame.frame_number > 1 && !cpi->ppi->use_svc &&
+ cpi->scaled_source.y_buffer != NULL &&
+ cpi->scaled_last_source.y_buffer != NULL &&
+ cpi->scaled_source.y_crop_width == cpi->scaled_last_source.y_crop_width &&
+ cpi->scaled_source.y_crop_height ==
+ cpi->scaled_last_source.y_crop_height &&
+ (cm->width != cpi->unscaled_source->y_crop_width ||
+ cm->height != cpi->unscaled_source->y_crop_height)) {
+ cpi->scaled_last_source_available = 1;
+ aom_yv12_copy_y(&cpi->scaled_source, &cpi->scaled_last_source);
+ aom_yv12_copy_u(&cpi->scaled_source, &cpi->scaled_last_source);
+ aom_yv12_copy_v(&cpi->scaled_source, &cpi->scaled_last_source);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_encode_frame_time);
+#endif
+#if CONFIG_INTERNAL_STATS
+ ++cpi->frame_recode_hits;
+#endif
+
+ return AOM_CODEC_OK;
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Recode loop for encoding one frame. A frame may be encoded multiple
+ * times in order to approach a target bitrate or to adjust the usage of
+ * global motion.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[out]   size            Bitstream size
+ * \param[out]   dest            Bitstream output buffer
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ GlobalMotionInfo *const gm_info = &cpi->gm_info;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+ const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE);
+ // Must allow recode if minimum compression ratio is set.
+ assert(IMPLIES(oxcf->rc_cfg.min_cr > 0, allow_recode));
+
+ set_size_independent_vars(cpi);
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->sf.interp_sf.adaptive_interp_filter_search)
+ cpi->interp_search_flags.interp_filter_search_mask =
+ av1_setup_interp_filter_search_mask(cpi);
+
+ av1_setup_frame_size(cpi);
+
+ if (av1_superres_in_recode_allowed(cpi) &&
+ cpi->superres_mode != AOM_SUPERRES_NONE &&
+ cm->superres_scale_denominator == SCALE_NUMERATOR) {
+ // Superres mode is currently enabled, but the denominator selected will
+ // disable superres. So no need to continue, as we will go through another
+ // recode loop for full-resolution after this anyway.
+ return -1;
+ }
+
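+  // [q_low, q_high] are the active recode search bounds for q. They start at
+  // the rate-control limits for this frame and are narrowed by
+  // recode_loop_update_q() on each iteration.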
+ int top_index = 0, bottom_index = 0;
+ int q = 0, q_low = 0, q_high = 0;
+ av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+ q_low = bottom_index;
+ q_high = top_index;
+
+ av1_set_mv_search_params(cpi);
+
+ allocate_gradient_info_for_hog(cpi);
+
+ allocate_src_var_of_4x4_sub_block_buf(cpi);
+
+ if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION)
+ variance_partition_alloc(cpi);
+
+ if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ printf("\n Encoding a frame: \n");
+#endif
+
+#if !CONFIG_RD_COMMAND
+  // Determine whether to use screen content tools using two fast encoding
+  // passes.
+ if (!cpi->sf.hl_sf.disable_extra_sc_testing && !cpi->use_ducky_encode)
+ av1_determine_sc_tools_with_encoding(cpi, q);
+#endif // !CONFIG_RD_COMMAND
+
+#if CONFIG_TUNE_VMAF
+ if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ av1_vmaf_neg_preprocessing(cpi, cpi->unscaled_source);
+ }
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ cpi->butteraugli_info.recon_set = false;
+ int original_q = 0;
+#endif
+
+ cpi->num_frame_recode = 0;
+
+ // Loop variables
+ int loop = 0;
+ int loop_count = 0;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+ int low_cr_seen = 0;
+ int last_loop_allow_hp = 0;
+
+ do {
+ loop = 0;
+ int do_mv_stats_collection = 1;
+
+    // If the frame was scaled, redo the global motion search if it has
+    // already been done for a different size.
+ if (loop_count > 0 && cpi->source && gm_info->search_done) {
+ if (cpi->source->y_crop_width != cm->width ||
+ cpi->source->y_crop_height != cm->height) {
+ gm_info->search_done = 0;
+ }
+ }
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_source, &cpi->scaled_source, EIGHTTAP_REGULAR, 0,
+ false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ if (oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ if (loop_count == 0) {
+ original_q = q;
+        // TODO(sdeng): different q here does not make a big difference. Use a
+        // faster pass instead.
+ q = 96;
+ av1_setup_butteraugli_source(cpi);
+ } else {
+ q = original_q;
+ }
+ }
+#endif
+
+ if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ int scale_references = 0;
+#if CONFIG_FPMT_TEST
+ scale_references =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (scale_references ||
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (!frame_is_intra_only(cm)) {
+ if (loop_count > 0) {
+ release_scaled_references(cpi);
+ }
+ av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
+ }
+ }
+
+#if CONFIG_TUNE_VMAF
+ if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ cpi->vmaf_info.original_qindex = q;
+ q = av1_get_vmaf_base_qindex(cpi, q);
+ }
+#endif
+
+#if CONFIG_RD_COMMAND
+ RD_COMMAND *rd_command = &cpi->rd_command;
+ RD_OPTION option = rd_command->option_ls[rd_command->frame_index];
+ if (option == RD_OPTION_SET_Q || option == RD_OPTION_SET_Q_RDMULT) {
+ q = rd_command->q_index_ls[rd_command->frame_index];
+ }
+#endif // CONFIG_RD_COMMAND
+
+#if CONFIG_BITRATE_ACCURACY
+#if CONFIG_THREE_PASS
+ if (oxcf->pass == AOM_RC_THIRD_PASS && cpi->vbr_rc_info.ready == 1) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ if (frame_coding_idx < cpi->vbr_rc_info.total_frame_count) {
+ q = cpi->vbr_rc_info.q_index_list[frame_coding_idx];
+ } else {
+ // TODO(angiebird): Investigate why sometimes there is an extra frame
+ // after the last GOP.
+ q = cpi->vbr_rc_info.base_q_index;
+ }
+ }
+#else
+ if (cpi->vbr_rc_info.q_index_list_ready) {
+ q = cpi->vbr_rc_info.q_index_list[cpi->gf_frame_index];
+ }
+#endif // CONFIG_THREE_PASS
+#endif // CONFIG_BITRATE_ACCURACY
+
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ // TODO(angiebird): Move this into a function.
+ if (oxcf->pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ double qstep_ratio = cpi->vbr_rc_info.qstep_ratio_list[frame_coding_idx];
+ FRAME_UPDATE_TYPE update_type =
+ cpi->vbr_rc_info.update_type_list[frame_coding_idx];
+ rc_log_frame_encode_param(&cpi->rc_log, frame_coding_idx, qstep_ratio, q,
+ update_type);
+ }
+#endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+
+ if (cpi->use_ducky_encode) {
+ const DuckyEncodeFrameInfo *frame_info =
+ &cpi->ducky_encode_info.frame_info;
+ if (frame_info->qp_mode == DUCKY_ENCODE_FRAME_MODE_QINDEX) {
+ q = frame_info->q_index;
+ cm->delta_q_info.delta_q_present_flag = frame_info->delta_q_enabled;
+ }
+ }
+
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+
+ av1_set_variance_partition_thresholds(cpi, q, 0);
+
+ // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n",
+ // cm->current_frame.frame_number, cm->show_frame, q,
+ // cm->current_frame.frame_type, cm->superres_scale_denominator);
+
+ if (loop_count == 0) {
+ av1_setup_frame(cpi);
+ } else if (get_primary_ref_frame_buf(cm) == NULL) {
+ // Base q-index may have changed, so we need to assign proper default coef
+ // probs before every iteration.
+ av1_default_coef_probs(cm);
+ av1_setup_frame_contexts(cm);
+ }
+
+ if (q_cfg->aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (q_cfg->aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ }
+
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_encode_frame_time);
+#endif
+ // Set the motion vector precision based on mv stats from the last coded
+ // frame.
+ if (!frame_is_intra_only(cm)) {
+ av1_pick_and_set_high_precision_mv(cpi, q);
+
+      // If the precision has changed during different iterations of the
+      // loop, then we need to reset the global motion vectors.
+ if (loop_count > 0 &&
+ cm->features.allow_high_precision_mv != last_loop_allow_hp) {
+ gm_info->search_done = 0;
+ }
+ last_loop_allow_hp = cm->features.allow_high_precision_mv;
+ }
+
+    // Transform / motion compensation: build the reconstructed frame.
+ av1_encode_frame(cpi);
+
+ // Disable mv_stats collection for parallel frames based on update flag.
+ if (!cpi->do_frame_data_update) do_mv_stats_collection = 0;
+
+ // Reset the mv_stats in case we are interrupted by an intraframe or an
+ // overlay frame.
+ if (cpi->mv_stats.valid && do_mv_stats_collection) av1_zero(cpi->mv_stats);
+
+ // Gather the mv_stats for the next frame
+ if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
+ av1_frame_allows_smart_mv(cpi) && do_mv_stats_collection) {
+ av1_collect_mv_stats(cpi, q);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_encode_frame_time);
+#endif
+
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+ const int do_dummy_pack = 1;
+#else  // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+    // Dummy pack of the bitstream using up-to-date stats to get an
+    // accurate estimate of the output frame size, to determine whether we
+    // need to recode.
+ const int do_dummy_pack =
+ (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF &&
+ oxcf->rc_cfg.mode != AOM_Q) ||
+ oxcf->rc_cfg.min_cr > 0;
+#endif  // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+ if (do_dummy_pack) {
+ av1_finalize_encoded_frame(cpi);
+ int largest_tile_id = 0; // Output from bitstream: unused here
+ rc->coefficient_size = 0;
+ if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+      // Bits used for this frame (*size is in bytes).
+ rc->projected_frame_size = (int)(*size) << 3;
+#if CONFIG_RD_COMMAND
+ PSNR_STATS psnr;
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+ printf("q %d rdmult %d rate %d dist %" PRIu64 "\n", q, cpi->rd.RDMULT,
+ rc->projected_frame_size, psnr.sse[0]);
+ ++rd_command->frame_index;
+ if (rd_command->frame_index == rd_command->frame_count) {
+ return AOM_CODEC_ERROR;
+ }
+#endif // CONFIG_RD_COMMAND
+
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ if (oxcf->pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ rc_log_frame_entropy(&cpi->rc_log, frame_coding_idx,
+ rc->projected_frame_size, rc->coefficient_size);
+ }
+#endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ }
+
+#if CONFIG_TUNE_VMAF
+ if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ q = cpi->vmaf_info.original_qindex;
+ }
+#endif
+ if (allow_recode) {
+ // Update q and decide whether to do a recode loop
+ recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index,
+ bottom_index, &undershoot_seen, &overshoot_seen,
+ &low_cr_seen, loop_count);
+ }
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ if (loop_count == 0 && oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ loop = 1;
+ av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.4);
+ }
+#endif
+
+ if (cpi->use_ducky_encode) {
+ // Ducky encode currently does not support recode loop.
+ loop = 0;
+ }
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+    // Turn off the recode loop when CONFIG_BITRATE_ACCURACY or
+    // CONFIG_RD_COMMAND is on.
+    loop = 0;
+#endif // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+
+ if (loop) {
+ ++loop_count;
+ cpi->num_frame_recode =
+ (cpi->num_frame_recode < (NUM_RECODES_PER_FRAME - 1))
+ ? (cpi->num_frame_recode + 1)
+ : (NUM_RECODES_PER_FRAME - 1);
+#if CONFIG_INTERNAL_STATS
+ ++cpi->frame_recode_hits;
+#endif
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (loop) printf("\n Recoding:");
+#endif
+ } while (loop);
+
+ return AOM_CODEC_OK;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// TODO(jingning, paulwilkins): Set up high grain level to test
+// hardware decoders. Need to adapt the actual noise variance
+// according to the difference between reconstructed frame and the
+// source signal.
+static void set_grain_syn_params(AV1_COMMON *cm) {
+ aom_film_grain_t *film_grain_params = &cm->film_grain_params;
+ film_grain_params->apply_grain = 1;
+ film_grain_params->update_parameters = 1;
+ film_grain_params->random_seed = rand() & 0xffff;
+
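+  // A single scaling point (128, 100) per plane makes the grain scaling
+  // function effectively constant at strength 100, and ar_coeff_lag = 0
+  // disables autoregressive filtering of the grain pattern.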
+ film_grain_params->num_y_points = 1;
+ film_grain_params->scaling_points_y[0][0] = 128;
+ film_grain_params->scaling_points_y[0][1] = 100;
+
+ if (!cm->seq_params->monochrome) {
+ film_grain_params->num_cb_points = 1;
+ film_grain_params->scaling_points_cb[0][0] = 128;
+ film_grain_params->scaling_points_cb[0][1] = 100;
+
+ film_grain_params->num_cr_points = 1;
+ film_grain_params->scaling_points_cr[0][0] = 128;
+ film_grain_params->scaling_points_cr[0][1] = 100;
+ } else {
+ film_grain_params->num_cb_points = 0;
+ film_grain_params->num_cr_points = 0;
+ }
+
+ film_grain_params->chroma_scaling_from_luma = 0;
+
+ film_grain_params->scaling_shift = 1;
+ film_grain_params->ar_coeff_lag = 0;
+ film_grain_params->ar_coeff_shift = 1;
+ film_grain_params->overlap_flag = 1;
+ film_grain_params->grain_scale_shift = 0;
+}
+
+/*!\brief Recode loop or a single loop for encoding one frame, followed by
+ * in-loop deblocking filters, CDEF filters, and restoration filters.
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[out]   size            Bitstream size
+ * \param[out]   dest            Bitstream output buffer
+ * \param[out]   sse             Total distortion of the frame
+ * \param[out]   rate            Total rate of the frame
+ * \param[out]   largest_tile_id Tile id of the largest tile
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest, int64_t *sse,
+ int64_t *rate,
+ int *largest_tile_id) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_with_or_without_recode_time);
+#endif
+ for (int i = 0; i < NUM_RECODES_PER_FRAME; i++) {
+ cpi->do_update_frame_probs_txtype[i] = 0;
+ cpi->do_update_frame_probs_obmc[i] = 0;
+ cpi->do_update_frame_probs_warp[i] = 0;
+ cpi->do_update_frame_probs_interpfilter[i] = 0;
+ }
+
+ cpi->do_update_vbr_bits_off_target_fast = 0;
+ int err;
+#if CONFIG_REALTIME_ONLY
+ err = encode_without_recode(cpi);
+#else
+ if (cpi->sf.hl_sf.recode_loop == DISALLOW_RECODE)
+ err = encode_without_recode(cpi);
+ else
+ err = encode_with_recode_loop(cpi, size, dest);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_with_or_without_recode_time);
+#endif
+ if (err != AOM_CODEC_OK) {
+ if (err == -1) {
+      // Special case, as described in encode_with_recode_loop(): encoding
+      // was skipped.
+ err = AOM_CODEC_OK;
+ if (sse != NULL) *sse = INT64_MAX;
+ if (rate != NULL) *rate = INT64_MAX;
+ *largest_tile_id = 0;
+ }
+ return err;
+ }
+
+#ifdef OUTPUT_YUV_DENOISED
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) {
+ aom_write_yuv_frame(yuv_denoised_file,
+ &cpi->denoiser.running_avg_y[INTRA_FRAME]);
+ }
+#endif
+
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+
+  // Special case code to reduce pulsing when key frames are forced at a
+  // fixed interval. Note the reconstruction error if this is the frame
+  // before the forced key frame.
+ if (cpi->ppi->p_rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (seq_params->use_highbitdepth) {
+ cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ } else {
+ cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ }
+#else
+ cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+ }
+
+ cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
+ cm->cur_frame->buf.transfer_characteristics =
+ seq_params->transfer_characteristics;
+ cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
+ cm->cur_frame->buf.monochrome = seq_params->monochrome;
+ cm->cur_frame->buf.chroma_sample_position =
+ seq_params->chroma_sample_position;
+ cm->cur_frame->buf.color_range = seq_params->color_range;
+ cm->cur_frame->buf.render_width = cm->render_width;
+ cm->cur_frame->buf.render_height = cm->render_height;
+
+ if (!cpi->mt_info.pipeline_lpf_mt_with_enc)
+ set_postproc_filter_default_params(&cpi->common);
+
+ if (!cm->features.allow_intrabc) {
+ loopfilter_frame(cpi, cm);
+ }
+
+ if (cpi->oxcf.mode != ALLINTRA && !cpi->ppi->rtc_ref.non_reference_frame) {
+ extend_frame_borders(cpi);
+ }
+
+#ifdef OUTPUT_YUV_REC
+ aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
+#endif
+
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_FILM) {
+ set_grain_syn_params(cm);
+ }
+
+ av1_finalize_encoded_frame(cpi);
+ // Build the bitstream
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+ cpi->rc.coefficient_size = 0;
+ if (av1_pack_bitstream(cpi, dest, size, largest_tile_id) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+
+ // Compute sse and rate.
+ if (sse != NULL) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ *sse = (seq_params->use_highbitdepth)
+ ? aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf)
+ : aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#else
+ *sse = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+ }
+ if (rate != NULL) {
+ const int64_t bits = (*size << 3);
+ *rate = (bits << 5); // To match scale.
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode) {
+ PSNR_STATS psnr;
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+ DuckyEncodeFrameResult *frame_result = &cpi->ducky_encode_info.frame_result;
+ frame_result->global_order_idx = cm->cur_frame->display_order_hint;
+ frame_result->q_index = cm->quant_params.base_qindex;
+ frame_result->rdmult = cpi->rd.RDMULT;
+ frame_result->rate = (int)(*size) * 8;
+ frame_result->dist = psnr.sse[0];
+ frame_result->psnr = psnr.psnr[0];
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ return AOM_CODEC_OK;
+}
+
+static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest,
+ int *largest_tile_id) {
+ const AV1_COMMON *const cm = &cpi->common;
+ assert(cm->seq_params->enable_superres);
+ assert(av1_superres_in_recode_allowed(cpi));
+ aom_codec_err_t err = AOM_CODEC_OK;
+ av1_save_all_coding_context(cpi);
+
+ int64_t sse1 = INT64_MAX;
+ int64_t rate1 = INT64_MAX;
+ int largest_tile_id1 = 0;
+ int64_t sse2 = INT64_MAX;
+ int64_t rate2 = INT64_MAX;
+ int largest_tile_id2;
+ double proj_rdcost1 = DBL_MAX;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const FRAME_UPDATE_TYPE update_type =
+ gf_group->update_type[cpi->gf_frame_index];
+ const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+
+ // Encode with superres.
+ if (cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_ALL) {
+ SuperResCfg *const superres_cfg = &cpi->oxcf.superres_cfg;
+ int64_t superres_sses[SCALE_NUMERATOR];
+ int64_t superres_rates[SCALE_NUMERATOR];
+ int superres_largest_tile_ids[SCALE_NUMERATOR];
+    // Use superres for key frames and alt-ref frames only.
+ if (update_type != OVERLAY_UPDATE && update_type != INTNL_OVERLAY_UPDATE) {
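+      // With SCALE_NUMERATOR == 8 this sweeps denominators 9..16, i.e.
+      // horizontal scaling factors from 8/9 down to 8/16 (one half).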
+ for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+ ++denom) {
+ superres_cfg->superres_scale_denominator = denom;
+ superres_cfg->superres_kf_scale_denominator = denom;
+ const int this_index = denom - (SCALE_NUMERATOR + 1);
+
+ cpi->superres_mode = AOM_SUPERRES_AUTO; // Super-res on for this loop.
+ err = encode_with_recode_loop_and_filter(
+ cpi, size, dest, &superres_sses[this_index],
+ &superres_rates[this_index],
+ &superres_largest_tile_ids[this_index]);
+ cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res).
+ if (err != AOM_CODEC_OK) return err;
+ restore_all_coding_context(cpi);
+ }
+ // Reset.
+ superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+ superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+ } else {
+ for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+ ++denom) {
+ const int this_index = denom - (SCALE_NUMERATOR + 1);
+ superres_sses[this_index] = INT64_MAX;
+ superres_rates[this_index] = INT64_MAX;
+ }
+ }
+ // Encode without superres.
+ assert(cpi->superres_mode == AOM_SUPERRES_NONE);
+ err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+ &largest_tile_id2);
+ if (err != AOM_CODEC_OK) return err;
+
+    // Note: Both use a common rdmult based on the base qindex of the
+    // full-resolution encode.
+ const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
+ bit_depth, update_type, cm->quant_params.base_qindex);
+
+ // Find the best rdcost among all superres denoms.
+ int best_denom = -1;
+ for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+ ++denom) {
+ const int this_index = denom - (SCALE_NUMERATOR + 1);
+ const int64_t this_sse = superres_sses[this_index];
+ const int64_t this_rate = superres_rates[this_index];
+ const int this_largest_tile_id = superres_largest_tile_ids[this_index];
+ const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ rdmult, this_rate, this_sse, bit_depth);
+ if (this_rdcost < proj_rdcost1) {
+ sse1 = this_sse;
+ rate1 = this_rate;
+ largest_tile_id1 = this_largest_tile_id;
+ proj_rdcost1 = this_rdcost;
+ best_denom = denom;
+ }
+ }
+ const double proj_rdcost2 =
+ RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth);
+ // Re-encode with superres if it's better.
+ if (proj_rdcost1 < proj_rdcost2) {
+ restore_all_coding_context(cpi);
+ // TODO(urvang): We should avoid rerunning the recode loop by saving
+ // previous output+state, or running encode only for the selected 'q' in
+ // previous step.
+ // Again, temporarily force the best denom.
+ superres_cfg->superres_scale_denominator = best_denom;
+ superres_cfg->superres_kf_scale_denominator = best_denom;
+ int64_t sse3 = INT64_MAX;
+ int64_t rate3 = INT64_MAX;
+ cpi->superres_mode =
+ AOM_SUPERRES_AUTO; // Super-res on for this recode loop.
+ err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+ largest_tile_id);
+ cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res).
+ assert(sse1 == sse3);
+ assert(rate1 == rate3);
+ assert(largest_tile_id1 == *largest_tile_id);
+ // Reset.
+ superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+ superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+ } else {
+ *largest_tile_id = largest_tile_id2;
+ }
+ } else {
+ assert(cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_DUAL);
+ cpi->superres_mode =
+ AOM_SUPERRES_AUTO; // Super-res on for this recode loop.
+ err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1,
+ &largest_tile_id1);
+ cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res).
+ if (err != AOM_CODEC_OK) return err;
+ restore_all_coding_context(cpi);
+ // Encode without superres.
+ assert(cpi->superres_mode == AOM_SUPERRES_NONE);
+ err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+ &largest_tile_id2);
+ if (err != AOM_CODEC_OK) return err;
+
+    // Note: Both use a common rdmult based on the base qindex of the
+    // full-resolution encode.
+ const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
+ bit_depth, update_type, cm->quant_params.base_qindex);
+ proj_rdcost1 =
+ RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, bit_depth);
+ const double proj_rdcost2 =
+ RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth);
+ // Re-encode with superres if it's better.
+ if (proj_rdcost1 < proj_rdcost2) {
+ restore_all_coding_context(cpi);
+ // TODO(urvang): We should avoid rerunning the recode loop by saving
+ // previous output+state, or running encode only for the selected 'q' in
+ // previous step.
+ int64_t sse3 = INT64_MAX;
+ int64_t rate3 = INT64_MAX;
+ cpi->superres_mode =
+ AOM_SUPERRES_AUTO; // Super-res on for this recode loop.
+ err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+ largest_tile_id);
+ cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res).
+ assert(sse1 == sse3);
+ assert(rate1 == rate3);
+ assert(largest_tile_id1 == *largest_tile_id);
+ } else {
+ *largest_tile_id = largest_tile_id2;
+ }
+ }
+
+ return err;
+}
+
+// Conditions to disable cdf_update mode in selective mode for real-time.
+// Handles the cases of SVC layers, scene changes, and resizing.
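+// Returns 1 if the CDF update should be disabled for the current frame.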
+static AOM_INLINE int selective_disable_cdf_rtc(const AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ // For single layer.
+ if (cpi->svc.number_spatial_layers == 1 &&
+ cpi->svc.number_temporal_layers == 1) {
+    // Don't disable on intra_only, scene change (high_source_sad = 1),
+    // or resized frame. To avoid quality loss, force enable for ~30 frames
+    // after a key frame or scene/slide change, and after 8 frames since the
+    // last update if frame_source_sad > 0.
+ if (frame_is_intra_only(cm) || is_frame_resize_pending(cpi) ||
+ rc->high_source_sad || rc->frames_since_key < 30 ||
+ (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->counter_encode_maxq_scene_change < 30) ||
+ (cpi->frames_since_last_update > 8 && cpi->rc.frame_source_sad > 0))
+ return 0;
+ else
+ return 1;
+ } else if (cpi->svc.number_temporal_layers > 1) {
+ // Disable only on top temporal enhancement layer for now.
+ return cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1;
+ }
+ return 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
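+// Subtracts the first-pass stats of one frame from the accumulated section
+// totals, field by field.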
+static void subtract_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame -= frame->frame;
+ section->weight -= frame->weight;
+ section->intra_error -= frame->intra_error;
+ section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
+ section->coded_error -= frame->coded_error;
+ section->sr_coded_error -= frame->sr_coded_error;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->intra_skip_pct -= frame->intra_skip_pct;
+ section->inactive_zone_rows -= frame->inactive_zone_rows;
+ section->inactive_zone_cols -= frame->inactive_zone_cols;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->new_mv_count -= frame->new_mv_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ const FIRSTPASS_STATS *const total_stats =
+ twopass->stats_buf_ctx->total_stats;
+
+ if (is_one_pass_rt_params(cpi) ||
+ (cpi->oxcf.q_cfg.deltaq_mode != DELTA_Q_PERCEPTUAL) ||
+ (is_fp_wavelet_energy_invalid(total_stats) == 0))
+ return;
+
+ const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.mi_params.MBs;
+ const YV12_BUFFER_CONFIG *const unfiltered_source = cpi->unfiltered_source;
+ const uint8_t *const src = unfiltered_source->y_buffer;
+ const int hbd = unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int stride = unfiltered_source->y_stride;
+ const BLOCK_SIZE fp_block_size =
+ get_fp_block_size(cpi->is_screen_content_type);
+ const int fp_block_size_width = block_size_wide[fp_block_size];
+ const int fp_block_size_height = block_size_high[fp_block_size];
+ const int num_unit_cols =
+ get_num_blocks(unfiltered_source->y_crop_width, fp_block_size_width);
+ const int num_unit_rows =
+ get_num_blocks(unfiltered_source->y_crop_height, fp_block_size_height);
+ const int num_8x8_cols = num_unit_cols * (fp_block_size_width / 8);
+ const int num_8x8_rows = num_unit_rows * (fp_block_size_height / 8);
+ int64_t frame_avg_wavelet_energy = av1_haar_ac_sad_mxn_uint8_input(
+ src, stride, hbd, num_8x8_rows, num_8x8_cols);
+
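+  // Store the per-MB average wavelet energy on a log scale; log1p() keeps
+  // the result finite when the energy is zero.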
+ cpi->twopass_frame.frame_avg_haar_energy =
+ log1p((double)frame_avg_wavelet_energy / num_mbs);
+}
+#endif
+
+extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc,
+ const char *filename);
+
+/*!\brief Run the final pass encoding for 1-pass/2-pass encoding mode, and pack
+ * the bitstream
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[out]   size            Bitstream size
+ * \param[out]   dest            Bitstream output buffer
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ struct segmentation *const seg = &cm->seg;
+ FeatureFlags *const features = &cm->features;
+ const TileConfig *const tile_cfg = &oxcf->tile_cfg;
+ assert(cpi->source != NULL);
+ cpi->td.mb.e_mbd.cur_buf = cpi->source;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_frame_to_data_rate_time);
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+ calculate_frame_avg_haar_energy(cpi);
+#endif
+
+ // frame type has been decided outside of this function call
+ cm->cur_frame->frame_type = current_frame->frame_type;
+
+ cm->tiles.large_scale = tile_cfg->enable_large_scale_tile;
+ cm->tiles.single_tile_decoding = tile_cfg->enable_single_tile_decoding;
+
+ features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
+ // features->allow_ref_frame_mvs needs to be written into the frame header
+ // while cm->tiles.large_scale is 1, therefore, "cm->tiles.large_scale=1" case
+ // is separated from frame_might_allow_ref_frame_mvs().
+ features->allow_ref_frame_mvs &= !cm->tiles.large_scale;
+
+ features->allow_warped_motion = oxcf->motion_mode_cfg.allow_warped_motion &&
+ frame_might_allow_warped_motion(cm);
+
+ cpi->last_frame_type = current_frame->frame_type;
+
+ if (frame_is_intra_only(cm)) {
+ cpi->frames_since_last_update = 0;
+ }
+
+ if (frame_is_sframe(cm)) {
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+    // An S-frame will wipe out any previously encoded altref, so we cannot
+    // place an overlay frame.
+ gf_group->update_type[gf_group->size] = GF_UPDATE;
+ }
+
+ if (encode_show_existing_frame(cm)) {
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ // TODO(angiebird): Move this into a function.
+ if (oxcf->pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ rc_log_frame_encode_param(
+ &cpi->rc_log, frame_coding_idx, 1, 255,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index]);
+ }
+#endif
+ av1_finalize_encoded_frame(cpi);
+ // Build the bitstream
+ int largest_tile_id = 0; // Output from bitstream: unused here
+ cpi->rc.coefficient_size = 0;
+ if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ if (seq_params->frame_id_numbers_present_flag &&
+ current_frame->frame_type == KEY_FRAME) {
+ // Displaying a forward key-frame, so reset the ref buffer IDs
+ int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+ for (int i = 0; i < REF_FRAMES; i++)
+ cm->ref_frame_id[i] = display_frame_id;
+ }
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ av1_dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+    // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+    // to verify that there is no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ av1_denoiser_update_ref_frame(cpi);
+#endif
+
+ // Since we allocate a spot for the OVERLAY frame in the gf group, we need
+ // to do post-encoding update accordingly.
+ av1_set_target_rate(cpi, cm->width, cm->height);
+
+ if (is_psnr_calc_enabled(cpi)) {
+ cpi->source =
+ realloc_and_scale_source(cpi, cm->cur_frame->buf.y_crop_width,
+ cm->cur_frame->buf.y_crop_height);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode) {
+ PSNR_STATS psnr;
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+ DuckyEncodeFrameResult *frame_result =
+ &cpi->ducky_encode_info.frame_result;
+ frame_result->global_order_idx = cm->cur_frame->display_order_hint;
+ frame_result->q_index = cm->quant_params.base_qindex;
+ frame_result->rdmult = cpi->rd.RDMULT;
+ frame_result->rate = (int)(*size) * 8;
+ frame_result->dist = psnr.sse[0];
+ frame_result->psnr = psnr.psnr[0];
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ update_counters_for_show_frame(cpi);
+ return AOM_CODEC_OK;
+ }
+
+  // Work out whether to use force_integer_mv for this frame.
+ if (!is_stat_generation_stage(cpi) &&
+ cpi->common.features.allow_screen_content_tools &&
+ !frame_is_intra_only(cm) && !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ if (cpi->common.seq_params->force_integer_mv == 2) {
+      // Adaptive mode: see what the previously encoded frame did.
+ if (cpi->unscaled_last_source != NULL) {
+ features->cur_frame_force_integer_mv = av1_is_integer_mv(
+ cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info);
+ } else {
+ cpi->common.features.cur_frame_force_integer_mv = 0;
+ }
+ } else {
+ cpi->common.features.cur_frame_force_integer_mv =
+ cpi->common.seq_params->force_integer_mv;
+ }
+ } else {
+ cpi->common.features.cur_frame_force_integer_mv = 0;
+ }
+
+  // This is used by av1_pack_bitstream, so it also needs to be set here for
+  // the row-mt case, where the encoding code works on a temporary structure.
+  cpi->td.mb.e_mbd.cur_frame_force_integer_mv =
+      features->cur_frame_force_integer_mv;
+
+  // Set default state for segment-based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+  // Set various flags etc. to their special state if this is a key frame.
+ if (frame_is_intra_only(cm) || frame_is_sframe(cm)) {
+ // Reset the loop filter deltas and segmentation map.
+ av1_reset_segment_features(cm);
+
+    // If segmentation is enabled, force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+ }
+ if (tile_cfg->mtu == 0) {
+ cpi->num_tg = tile_cfg->num_tile_groups;
+ } else {
+ // Use a default value for the purposes of weighting costs in probability
+ // updates
+ cpi->num_tg = DEFAULT_MAX_NUM_TG;
+ }
+
+ // For 1 pass CBR mode: check if we are dropping this frame.
+ if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR) {
+ // Always drop for spatial enhancement layer if layer bandwidth is 0.
+ // Otherwise check for frame-dropping based on buffer level in
+ // av1_rc_drop_frame().
+ if ((cpi->svc.spatial_layer_id > 0 &&
+ cpi->oxcf.rc_cfg.target_bandwidth == 0) ||
+ av1_rc_drop_frame(cpi)) {
+ cpi->is_dropped_frame = true;
+ }
+ if (cpi->is_dropped_frame) {
+ av1_setup_frame_size(cpi);
+ av1_set_mv_search_params(cpi);
+ av1_rc_postencode_update_drop_frame(cpi);
+ release_scaled_references(cpi);
+ cpi->ppi->gf_group.is_frame_dropped[cpi->gf_frame_index] = true;
+ // A dropped frame might not be shown but it always takes a slot in the gf
+ // group. Therefore, even when it is not shown, we still need to update
+ // the relevant frame counters.
+ if (cm->show_frame) {
+ update_counters_for_show_frame(cpi);
+ }
+ return AOM_CODEC_OK;
+ }
+ }
+
+ if (oxcf->tune_cfg.tuning == AOM_TUNE_SSIM) {
+ av1_set_mb_ssim_rdmult_scaling(cpi);
+ }
+#if CONFIG_SALIENCY_MAP
+ else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP &&
+ !(cpi->source->flags & YV12_FLAG_HIGHBITDEPTH)) {
+ if (av1_set_saliency_map(cpi) == 0) {
+ return AOM_CODEC_MEM_ERROR;
+ }
+#if !CONFIG_REALTIME_ONLY
+ double motion_ratio = av1_setup_motion_ratio(cpi);
+#else
+ double motion_ratio = 1.0;
+#endif
+ if (av1_setup_sm_rdmult_scaling_factor(cpi, motion_ratio) == 0) {
+ return AOM_CODEC_MEM_ERROR;
+ }
+ }
+#endif
+#if CONFIG_TUNE_VMAF
+ else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+ oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+ oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ av1_set_mb_vmaf_rdmult_scaling(cpi);
+ }
+#endif
+
+ if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI &&
+ cpi->sf.rt_sf.use_nonrd_pick_mode == 0) {
+ av1_init_mb_wiener_var_buffer(cpi);
+ av1_set_mb_wiener_variance(cpi);
+ }
+
+ if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+ av1_init_mb_ur_var_buffer(cpi);
+ av1_set_mb_ur_variance(cpi);
+ }
+
+#if CONFIG_INTERNAL_STATS
+ memset(cpi->mode_chosen_counts, 0,
+ MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ /* Non-normative definition of current_frame_id ("frame counter" with
+ * wraparound) */
+ if (cm->current_frame_id == -1) {
+ int lsb, msb;
+ /* quasi-random initialization of current_frame_id for a key frame */
+ if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
+ msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
+ } else {
+ lsb = cpi->source->y_buffer[0] & 0xff;
+ msb = cpi->source->y_buffer[1] & 0xff;
+ }
+ cm->current_frame_id =
+ ((msb << 8) + lsb) % (1 << seq_params->frame_id_length);
+
+      // S-frames are meant for stitching together streams of different
+      // resolutions, so current_frame_id must be the same across the
+      // different streams of the same content rather than random. 0x37 is an
+      // arbitrarily chosen starting point.
+ if (oxcf->kf_cfg.sframe_dist != 0) cm->current_frame_id = 0x37;
+ } else {
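+      // Increment current_frame_id with wraparound. Adding
+      // (1 << frame_id_length) before the modulo guards against a negative
+      // intermediate value.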
+ cm->current_frame_id =
+ (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) %
+ (1 << seq_params->frame_id_length);
+ }
+ }
+
+ switch (oxcf->algo_cfg.cdf_update_mode) {
+    case 0:  // No CDF update for any frames (4~6% compression loss).
+ features->disable_cdf_update = 1;
+ break;
+ case 1: // Enable CDF update for all frames.
+ if (cpi->sf.rt_sf.disable_cdf_update_non_reference_frame &&
+ cpi->ppi->rtc_ref.non_reference_frame && cpi->rc.frames_since_key > 2)
+ features->disable_cdf_update = 1;
+ else if (cpi->sf.rt_sf.selective_cdf_update)
+ features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+ else
+ features->disable_cdf_update = 0;
+ break;
+ case 2:
+ // Strategically determine at which frames to do CDF update.
+      // Currently, CDF update is enabled only for all-intra and no-show
+      // frames (1.5% compression loss) in good-quality or all-intra mode.
+ if (oxcf->mode == GOOD || oxcf->mode == ALLINTRA) {
+ features->disable_cdf_update =
+ (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
+ } else {
+ features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+ }
+ break;
+ }
+
+ // Disable cdf update for the INTNL_ARF_UPDATE frame with
+ // frame_parallel_level 1.
+ if (!cpi->do_frame_data_update &&
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ assert(cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1);
+ features->disable_cdf_update = 1;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.tool_cfg.enable_global_motion && !frame_is_intra_only(cm)) {
+ // Flush any stale global motion information, which may be left over
+ // from a previous frame
+ aom_invalidate_pyramid(cpi->source->y_pyramid);
+ av1_invalidate_corner_list(cpi->source->corners);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ int largest_tile_id = 0;
+ if (av1_superres_in_recode_allowed(cpi)) {
+ if (encode_with_and_without_superres(cpi, size, dest, &largest_tile_id) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ const aom_superres_mode orig_superres_mode = cpi->superres_mode; // save
+ cpi->superres_mode = cpi->oxcf.superres_cfg.superres_mode;
+ if (encode_with_recode_loop_and_filter(cpi, size, dest, NULL, NULL,
+ &largest_tile_id) != AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ cpi->superres_mode = orig_superres_mode; // restore
+ }
+
+ // Update reference frame ids for reference frames this frame will overwrite
+ if (seq_params->frame_id_numbers_present_flag) {
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if ((current_frame->refresh_frame_flags >> i) & 1) {
+ cm->ref_frame_id[i] = cm->current_frame_id;
+ }
+ }
+ }
+
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cpi->svc.num_encoded_top_layer++;
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ av1_dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ if (cm->seg.enabled) {
+ if (cm->seg.update_map == 0 && cm->last_frame_seg_map) {
+ memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map,
+ cm->cur_frame->mi_cols * cm->cur_frame->mi_rows *
+ sizeof(*cm->cur_frame->seg_map));
+ }
+ }
+
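+  // Scaled references are normally released only by non-parallel
+  // (frame_parallel_level 0) frames; the FPMT simulation path below releases
+  // them for every frame.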
+ int release_scaled_refs = 0;
+#if CONFIG_FPMT_TEST
+ release_scaled_refs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (release_scaled_refs ||
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (frame_is_intra_only(cm) == 0) {
+ release_scaled_references(cpi);
+ }
+ }
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ av1_denoiser_update_ref_frame(cpi);
+#endif
+
+  // NOTE: Save the new show-frame buffer index for --test-code=warn, i.e., to
+  // verify that there is no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
+
+ if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ *cm->fc = cpi->tile_data[largest_tile_id].tctx;
+ av1_reset_cdf_symbol_counters(cm->fc);
+ }
+ if (!cm->tiles.large_scale) {
+ cm->cur_frame->frame_context = *cm->fc;
+ }
+
+ if (tile_cfg->enable_ext_tile_debug) {
+ // (yunqing) This test ensures the correctness of large scale tile coding.
+ if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) {
+ char fn[20] = "./fc";
+ fn[4] = current_frame->frame_number / 100 + '0';
+ fn[5] = (current_frame->frame_number % 100) / 10 + '0';
+ fn[6] = (current_frame->frame_number % 10) + '0';
+ fn[7] = '\0';
+ av1_print_frame_contexts(cm->fc, fn);
+ }
+ }
+
+ cpi->last_frame_type = current_frame->frame_type;
+
+ if (cm->features.disable_cdf_update) {
+ cpi->frames_since_last_update++;
+ } else {
+ cpi->frames_since_last_update = 1;
+ }
+
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cpi->svc.prev_number_spatial_layers = cpi->svc.number_spatial_layers;
+
+ // Clear the one shot update flags for segmentation map and mode/ref loop
+ // filter deltas.
+ cm->seg.update_map = 0;
+ cm->seg.update_data = 0;
+ cm->lf.mode_ref_delta_update = 0;
+
+ if (cm->show_frame) {
+ update_counters_for_show_frame(cpi);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_frame_to_data_rate_time);
+#endif
+
+ return AOM_CODEC_OK;
+}
+
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+ const EncodeFrameInput *const frame_input,
+ const EncodeFrameParams *const frame_params,
+ EncodeFrameResults *const frame_results) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+
+ cpi->unscaled_source = frame_input->source;
+ cpi->source = frame_input->source;
+ cpi->unscaled_last_source = frame_input->last_source;
+
+ current_frame->refresh_frame_flags = frame_params->refresh_frame_flags;
+ cm->features.error_resilient_mode = frame_params->error_resilient_mode;
+ cm->features.primary_ref_frame = frame_params->primary_ref_frame;
+ cm->current_frame.frame_type = frame_params->frame_type;
+ cm->show_frame = frame_params->show_frame;
+ cpi->ref_frame_flags = frame_params->ref_frame_flags;
+ cpi->speed = frame_params->speed;
+ cm->show_existing_frame = frame_params->show_existing_frame;
+ cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show;
+
+ memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
+ REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+ memcpy(&cpi->refresh_frame, &frame_params->refresh_frame,
+ sizeof(cpi->refresh_frame));
+
+ if (current_frame->frame_type == KEY_FRAME &&
+ cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+ current_frame->frame_number = 0;
+ }
+
+ current_frame->order_hint =
+ current_frame->frame_number + frame_params->order_offset;
+
+ current_frame->display_order_hint = current_frame->order_hint;
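+  // Keep the full (unwrapped) value in display_order_hint; the order_hint
+  // coded in the bitstream wraps at 1 << order_hint_bits.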
+ current_frame->order_hint %=
+ (1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1));
+
+ current_frame->pyramid_level = get_true_pyr_level(
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index],
+ current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth);
+
+ if (is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.q_cfg.use_fixed_qp_offsets)
+ av1_noop_first_pass_frame(cpi, frame_input->ts_duration);
+ else
+ av1_first_pass(cpi, frame_input->ts_duration);
+#endif
+ } else if (cpi->oxcf.pass == AOM_RC_ONE_PASS ||
+ cpi->oxcf.pass >= AOM_RC_SECOND_PASS) {
+ if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+#if CONFIG_DENOISE
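+// Runs noise estimation/modeling on the source frame and, when grain is
+// detected, appends the estimated film grain parameters to the encoder's
+// film grain table. Whether the source itself is also denoised is controlled
+// by oxcf.enable_dnl_denoising.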
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+ int block_size, float noise_level,
+ int64_t time_stamp, int64_t end_time) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (!cpi->denoise_and_model) {
+ cpi->denoise_and_model = aom_denoise_and_model_alloc(
+ cm->seq_params->bit_depth, block_size, noise_level);
+ if (!cpi->denoise_and_model) {
+ aom_set_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating denoise and model");
+ return -1;
+ }
+ }
+ if (!cpi->film_grain_table) {
+ cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+ if (!cpi->film_grain_table) {
+ aom_set_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating grain table");
+ return -1;
+ }
+ memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table));
+ }
+ if (aom_denoise_and_model_run(cpi->denoise_and_model, sd,
+ &cm->film_grain_params,
+ cpi->oxcf.enable_dnl_denoising)) {
+ if (cm->film_grain_params.apply_grain) {
+ aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time,
+ &cm->film_grain_params);
+ }
+ }
+ return 0;
+}
+#endif
+
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ int res = 0;
+ const int subsampling_x = sd->subsampling_x;
+ const int subsampling_y = sd->subsampling_y;
+ const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+
+#if CONFIG_TUNE_VMAF
+ if (!is_stat_generation_stage(cpi) &&
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
+ av1_vmaf_frame_preprocessing(cpi, sd);
+ }
+ if (!is_stat_generation_stage(cpi) &&
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+ av1_vmaf_blk_preprocessing(cpi, sd);
+ }
+#endif
+
+#if CONFIG_INTERNAL_STATS
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+
+#if CONFIG_DENOISE
+  // Even if denoise_noise_level is > 0, we do not need to denoise on pass 1
+  // of 2 if enable_dnl_denoising is disabled, since the 2nd pass will encode
+  // the original (non-denoised) frame.
+ if (cpi->oxcf.noise_level > 0 && !(cpi->oxcf.pass == AOM_RC_FIRST_PASS &&
+ !cpi->oxcf.enable_dnl_denoising)) {
+#if !CONFIG_REALTIME_ONLY
+    // Choose a synthetic noise level for still images, for enhanced
+    // perceptual quality, based on an estimated noise level in the source,
+    // but only if the command-line noise level is set to > 0.
+ if (cpi->oxcf.mode == ALLINTRA) {
+ // No noise synthesis if source is very clean.
+ // Uses a low edge threshold to focus on smooth areas.
+ // Increase output noise setting a little compared to measured value.
+ double y_noise_level = 0.0;
+ av1_estimate_noise_level(sd, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+ cm->seq_params->bit_depth, 16);
+ cpi->oxcf.noise_level = (float)(y_noise_level - 0.1);
+ cpi->oxcf.noise_level = (float)AOMMAX(0.0, cpi->oxcf.noise_level);
+ if (cpi->oxcf.noise_level > 0.0) {
+ cpi->oxcf.noise_level += (float)0.5;
+ }
+ cpi->oxcf.noise_level = (float)AOMMIN(5.0, cpi->oxcf.noise_level);
+ }
+#endif
+
+ if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
+ cpi->oxcf.noise_level, time_stamp, end_time) < 0)
+ res = -1;
+ }
+#endif // CONFIG_DENOISE
+
+ if (av1_lookahead_push(cpi->ppi->lookahead, sd, time_stamp, end_time,
+ use_highbitdepth, cpi->image_pyramid_levels,
+ frame_flags)) {
+ aom_set_error(cm->error, AOM_CODEC_ERROR, "av1_lookahead_push() failed");
+ res = -1;
+ }
+#if CONFIG_INTERNAL_STATS
+ aom_usec_timer_mark(&timer);
+ cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer);
+#endif
+
+  // Note: Regarding profile setting, the following checks are added to help
+  // choose a proper profile for the input video. The criterion is that every
+  // bitstream must be designated with the lowest profile that matches its
+  // content, e.g., a bitstream containing 4:4:4 video must be designated as
+  // High Profile in the sequence header, and likewise a bitstream containing
+  // 4:2:2 video must be designated as Professional Profile.
+ if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
+ (subsampling_x != 1 || subsampling_y != 1)) {
+ aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+ "Non-4:2:0 color format requires profile 1 or 2");
+ res = -1;
+ }
+ if ((seq_params->profile == PROFILE_1) &&
+ !(subsampling_x == 0 && subsampling_y == 0)) {
+ aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+ "Profile 1 requires 4:4:4 color format");
+ res = -1;
+ }
+ if ((seq_params->profile == PROFILE_2) &&
+ (seq_params->bit_depth <= AOM_BITS_10) &&
+ !(subsampling_x == 1 && subsampling_y == 0)) {
+ aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+ "Profile 2 bit-depth <= 10 requires 4:2:2 color format");
+ res = -1;
+ }
+
+ return res;
+}
+
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi) {
+ if (!ppi->cpi) return;
+
+ if (ppi->cpi->oxcf.pass != 1 &&
+ ppi->cpi->common.current_frame.frame_number > 0) {
+ fprintf(stderr, "Writing counts.stt\n");
+ FILE *f = fopen("counts.stt", "wb");
+ fwrite(&ppi->aggregate_fc, sizeof(ppi->aggregate_fc), 1, f);
+ fclose(f);
+ }
+}
+#endif // CONFIG_ENTROPY_STATS
+
+#if CONFIG_INTERNAL_STATS
+extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+ int width, int height);
+
+static void adjust_image_stat(double y, double u, double v, double all,
+ ImageStat *s) {
+ s->stat[STAT_Y] += y;
+ s->stat[STAT_U] += u;
+ s->stat[STAT_V] += v;
+ s->stat[STAT_ALL] += all;
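+  // These metrics are higher-is-better, so the worst value seen so far is
+  // the minimum.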
+ s->worst = AOMMIN(s->worst, all);
+}
+
+static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
+ AV1_PRIMARY *const ppi = cpi->ppi;
+ AV1_COMMON *const cm = &cpi->common;
+ double samples = 0.0;
+ const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+
+ if (cpi->ppi->use_svc &&
+ cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+ return;
+
+#if CONFIG_INTER_STATS_ONLY
+ if (cm->current_frame.frame_type == KEY_FRAME) return; // skip key frame
+#endif
+ cpi->bytes += frame_bytes;
+ if (cm->show_frame) {
+ const YV12_BUFFER_CONFIG *orig = cpi->source;
+ const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+ double y, u, v, frame_all;
+
+ ppi->count[0]++;
+ ppi->count[1]++;
+ if (cpi->ppi->b_calculate_psnr) {
+ PSNR_STATS psnr;
+ double weight[2] = { 0.0, 0.0 };
+ double frame_ssim2[2] = { 0.0, 0.0 };
+#if CONFIG_AV1_HIGHBITDEPTH
+ aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(orig, recon, &psnr);
+#endif
+ adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
+ &(ppi->psnr[0]));
+ ppi->total_sq_error[0] += psnr.sse[0];
+ ppi->total_samples[0] += psnr.samples[0];
+ samples = psnr.samples[0];
+
+ aom_calc_ssim(orig, recon, bit_depth, in_bit_depth,
+ cm->seq_params->use_highbitdepth, weight, frame_ssim2);
+
+ ppi->worst_ssim = AOMMIN(ppi->worst_ssim, frame_ssim2[0]);
+ ppi->summed_quality += frame_ssim2[0] * weight[0];
+ ppi->summed_weights += weight[0];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // Compute PSNR based on stream bit depth
+ if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+ (in_bit_depth < bit_depth)) {
+ adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3],
+ psnr.psnr_hbd[0], &ppi->psnr[1]);
+ ppi->total_sq_error[1] += psnr.sse_hbd[0];
+ ppi->total_samples[1] += psnr.samples_hbd[0];
+
+ ppi->worst_ssim_hbd = AOMMIN(ppi->worst_ssim_hbd, frame_ssim2[1]);
+ ppi->summed_quality_hbd += frame_ssim2[1] * weight[1];
+ ppi->summed_weights_hbd += weight[1];
+ }
+#endif
+
+#if 0
+ {
+ FILE *f = fopen("q_used.stt", "a");
+ double y2 = psnr.psnr[1];
+ double u2 = psnr.psnr[2];
+ double v2 = psnr.psnr[3];
+ double frame_psnr2 = psnr.psnr[0];
+ fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+ cm->current_frame.frame_number, y2, u2, v2,
+ frame_psnr2, frame_ssim2);
+ fclose(f);
+ }
+#endif
+ }
+ if (ppi->b_calculate_blockiness) {
+ if (!cm->seq_params->use_highbitdepth) {
+ const double frame_blockiness =
+ av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
+ recon->y_stride, orig->y_width, orig->y_height);
+ ppi->worst_blockiness = AOMMAX(ppi->worst_blockiness, frame_blockiness);
+ ppi->total_blockiness += frame_blockiness;
+ }
+
+ if (ppi->b_calculate_consistency) {
+ if (!cm->seq_params->use_highbitdepth) {
+ const double this_inconsistency = aom_get_ssim_metrics(
+ orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
+ orig->y_width, orig->y_height, ppi->ssim_vars, &ppi->metrics, 1);
+
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const double consistency =
+ aom_sse_to_psnr(samples, peak, ppi->total_inconsistency);
+ if (consistency > 0.0)
+ ppi->worst_consistency =
+ AOMMIN(ppi->worst_consistency, consistency);
+ ppi->total_inconsistency += this_inconsistency;
+ }
+ }
+ }
+
+ frame_all =
+ aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &ppi->fastssim);
+ frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &ppi->psnrhvs);
+ }
+}
+
+void print_internal_stats(AV1_PRIMARY *ppi) {
+ if (!ppi->cpi) return;
+ AV1_COMP *const cpi = ppi->cpi;
+
+ if (ppi->cpi->oxcf.pass != 1 &&
+ ppi->cpi->common.current_frame.frame_number > 0) {
+ char headings[512] = { 0 };
+ char results[512] = { 0 };
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded =
+ (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) /
+ 10000000.000;
+ double total_encode_time =
+ (ppi->total_time_receive_data + ppi->total_time_compress_data) /
+ 1000.000;
+ const double dr =
+ (double)ppi->total_bytes * (double)8 / (double)1000 / time_encoded;
+ const double peak =
+ (double)((1 << ppi->cpi->oxcf.input_cfg.input_bit_depth) - 1);
+ const double target_rate =
+ (double)ppi->cpi->oxcf.rc_cfg.target_bandwidth / 1000;
+ const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+ if (ppi->b_calculate_psnr) {
+ const double total_psnr = aom_sse_to_psnr(
+ (double)ppi->total_samples[0], peak, (double)ppi->total_sq_error[0]);
+ const double total_ssim =
+ 100 * pow(ppi->summed_quality / ppi->summed_weights, 8.0);
+ snprintf(headings, sizeof(headings),
+ "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+ "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+ "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+ "AVPsrnY\tAPsnrCb\tAPsnrCr");
+ snprintf(results, sizeof(results),
+ "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f",
+ dr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr,
+ ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr,
+ total_ssim, total_ssim,
+ ppi->fastssim.stat[STAT_ALL] / ppi->count[0],
+ ppi->psnrhvs.stat[STAT_ALL] / ppi->count[0], ppi->psnr[0].worst,
+ ppi->worst_ssim, ppi->fastssim.worst, ppi->psnrhvs.worst,
+ ppi->psnr[0].stat[STAT_Y] / ppi->count[0],
+ ppi->psnr[0].stat[STAT_U] / ppi->count[0],
+ ppi->psnr[0].stat[STAT_V] / ppi->count[0]);
+
+ if (ppi->b_calculate_blockiness) {
+ SNPRINT(headings, "\t Block\tWstBlck");
+ SNPRINT2(results, "\t%7.3f", ppi->total_blockiness / ppi->count[0]);
+ SNPRINT2(results, "\t%7.3f", ppi->worst_blockiness);
+ }
+
+ if (ppi->b_calculate_consistency) {
+ double consistency =
+ aom_sse_to_psnr((double)ppi->total_samples[0], peak,
+ (double)ppi->total_inconsistency);
+
+ SNPRINT(headings, "\tConsist\tWstCons");
+ SNPRINT2(results, "\t%7.3f", consistency);
+ SNPRINT2(results, "\t%7.3f", ppi->worst_consistency);
+ }
+
+ SNPRINT(headings, "\t Time\tRcErr\tAbsErr");
+ SNPRINT2(results, "\t%8.0f", total_encode_time);
+ SNPRINT2(results, " %7.2f", rate_err);
+ SNPRINT2(results, " %7.2f", fabs(rate_err));
+
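+      // 6:1:1 weighted average of the Y, Cb and Cr PSNRs.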
+ SNPRINT(headings, "\tAPsnr611");
+ SNPRINT2(results, " %7.3f",
+ (6 * ppi->psnr[0].stat[STAT_Y] + ppi->psnr[0].stat[STAT_U] +
+ ppi->psnr[0].stat[STAT_V]) /
+ (ppi->count[0] * 8));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = ppi->cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = ppi->seq_params.bit_depth;
+      // cpi->source->flags is not available here, but total_samples[1] will
+      // be non-zero only if cpi->source->flags & YV12_FLAG_HIGHBITDEPTH was
+      // true in compute_internal_stats().
+ if ((ppi->total_samples[1] > 0) && (in_bit_depth < bit_depth)) {
+ const double peak_hbd = (double)((1 << bit_depth) - 1);
+ const double total_psnr_hbd =
+ aom_sse_to_psnr((double)ppi->total_samples[1], peak_hbd,
+ (double)ppi->total_sq_error[1]);
+ const double total_ssim_hbd =
+ 100 * pow(ppi->summed_quality_hbd / ppi->summed_weights_hbd, 8.0);
+ SNPRINT(headings,
+ "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH"
+ " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH"
+ " AOMSSIMH VPSSIMPH WstSsimH");
+ SNPRINT2(results, "\t%7.3f",
+ ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]);
+ SNPRINT2(results, " %7.3f", total_psnr_hbd);
+ SNPRINT2(results, " %7.3f",
+ ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]);
+ SNPRINT2(results, " %7.3f", total_psnr_hbd);
+ SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_Y] / ppi->count[1]);
+ SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_U] / ppi->count[1]);
+ SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_V] / ppi->count[1]);
+ SNPRINT2(results, " %7.3f", ppi->psnr[1].worst);
+ SNPRINT2(results, " %7.3f", total_ssim_hbd);
+ SNPRINT2(results, " %7.3f", total_ssim_hbd);
+ SNPRINT2(results, " %7.3f", ppi->worst_ssim_hbd);
+ }
+#endif
+ fprintf(f, "%s\n", headings);
+ fprintf(f, "%s\n", results);
+ }
+
+ fclose(f);
+
+ aom_free(ppi->ssim_vars);
+ ppi->ssim_vars = NULL;
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+ if (cpi->common.show_frame && cpi->rc.frames_to_key) {
+#if !CONFIG_REALTIME_ONLY
+ FIRSTPASS_INFO *firstpass_info = &cpi->ppi->twopass.firstpass_info;
+ if (firstpass_info->past_stats_count > FIRSTPASS_INFO_STATS_PAST_MIN) {
+ av1_firstpass_info_move_cur_index_and_pop(firstpass_info);
+ } else {
+      // When there are not enough past stats, we move the current
+      // index without popping the past stats.
+ av1_firstpass_info_move_cur_index(firstpass_info);
+ }
+#endif
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ cpi->rc.frames_to_fwd_kf--;
+ }
+ }
+}
+
+static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is dropped.
+  // We should fix the cpi->common.show_frame flag
+  // instead of checking the other condition to update the counter properly.
+ if (cpi->common.show_frame ||
+ is_frame_droppable(&cpi->ppi->rtc_ref, &cpi->ext_flags.refresh_frame)) {
+ // Decrement count down till next gf
+ if (cpi->rc.frames_till_gf_update_due > 0)
+ cpi->rc.frames_till_gf_update_due--;
+ }
+}
+
+static AOM_INLINE void update_gf_group_index(AV1_COMP *cpi) {
+  // Increment the gf group index ready for the next frame.
+  ++cpi->gf_frame_index;
+  // Reset gf_frame_index in case it reaches MAX_STATIC_GF_GROUP_LENGTH
+  // for real time encoding on the top spatial layer.
+  if (is_one_pass_rt_params(cpi) &&
+      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+      cpi->gf_frame_index == MAX_STATIC_GF_GROUP_LENGTH) {
+    cpi->gf_frame_index = 0;
+  }
+}
+
+static void update_fb_of_context_type(const AV1_COMP *const cpi,
+ int *const fb_of_context_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int current_frame_ref_type = get_current_frame_ref_type(cpi);
+
+ if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ cpi->ext_flags.use_primary_ref_none) {
+ for (int i = 0; i < REF_FRAMES; i++) {
+ fb_of_context_type[i] = -1;
+ }
+ fb_of_context_type[current_frame_ref_type] =
+ cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
+ : get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ }
+
+ if (!encode_show_existing_frame(cm)) {
+ // Refresh fb_of_context_type[]: see encoder.h for explanation
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ // All ref frames are refreshed, pick one that will live long enough
+ fb_of_context_type[current_frame_ref_type] = 0;
+ } else {
+      // If more than one frame is refreshed, it doesn't matter which one we
+      // pick, so pick the first. LST sometimes doesn't refresh any: this is
+      // ok.
+
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (cm->current_frame.refresh_frame_flags & (1 << i)) {
+ fb_of_context_type[current_frame_ref_type] = i;
+ break;
+ }
+ }
+ }
+ }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+ update_keyframe_counters(cpi);
+ update_frames_till_gf_update(cpi);
+ update_gf_group_index(cpi);
+}
+
+static void update_end_of_frame_stats(AV1_COMP *cpi) {
+ if (cpi->do_frame_data_update) {
+ // Store current frame loopfilter levels in ppi, if update flag is set.
+ if (!cpi->common.show_existing_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct loopfilter *const lf = &cm->lf;
+ cpi->ppi->filter_level[0] = lf->filter_level[0];
+ cpi->ppi->filter_level[1] = lf->filter_level[1];
+ cpi->ppi->filter_level_u = lf->filter_level_u;
+ cpi->ppi->filter_level_v = lf->filter_level_v;
+ }
+ }
+ // Store frame level mv_stats from cpi to ppi.
+ cpi->ppi->mv_stats = cpi->mv_stats;
+}
+
+// Updates frame level stats related to global motion
+static AOM_INLINE void update_gm_stats(AV1_COMP *cpi) {
+ FRAME_UPDATE_TYPE update_type =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+ int i, is_gm_present = 0;
+
+ // Check if the current frame has any valid global motion model across its
+ // reference frames
+ for (i = 0; i < REF_FRAMES; i++) {
+ if (cpi->common.global_motion[i].wmtype != IDENTITY) {
+ is_gm_present = 1;
+ break;
+ }
+ }
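+  // valid_gm_model_found[] uses INT32_MAX as a "not yet set" sentinel: the
+  // first frame of each update type overwrites it, and later frames OR into
+  // it.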
+ int update_actual_stats = 1;
+#if CONFIG_FPMT_TEST
+ update_actual_stats =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!update_actual_stats) {
+ if (cpi->ppi->temp_valid_gm_model_found[update_type] == INT32_MAX) {
+ cpi->ppi->temp_valid_gm_model_found[update_type] = is_gm_present;
+ } else {
+ cpi->ppi->temp_valid_gm_model_found[update_type] |= is_gm_present;
+ }
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+ if (cpi->do_frame_data_update == 1 &&
+ !show_existing_between_parallel_frames) {
+ for (i = 0; i < FRAME_UPDATE_TYPES; i++) {
+ cpi->ppi->valid_gm_model_found[i] =
+ cpi->ppi->temp_valid_gm_model_found[i];
+ }
+ }
+ }
+#endif
+ if (update_actual_stats) {
+ if (cpi->ppi->valid_gm_model_found[update_type] == INT32_MAX) {
+ cpi->ppi->valid_gm_model_found[update_type] = is_gm_present;
+ } else {
+ cpi->ppi->valid_gm_model_found[update_type] |= is_gm_present;
+ }
+ }
+}
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+ const AV1_COMP_DATA *const cpi_data) {
+ AV1_PRIMARY *const ppi = cpi->ppi;
+ AV1_COMMON *const cm = &cpi->common;
+
+ update_gm_stats(cpi);
+
+#if !CONFIG_REALTIME_ONLY
+ // Update the total stats remaining structure.
+ if (cpi->twopass_frame.this_frame != NULL &&
+ ppi->twopass.stats_buf_ctx->total_left_stats) {
+ subtract_stats(ppi->twopass.stats_buf_ctx->total_left_stats,
+ cpi->twopass_frame.this_frame);
+ }
+#endif
+
+#if CONFIG_OUTPUT_FRAME_SIZE
+ FILE *f = fopen("frame_sizes.csv", "a");
+ fprintf(f, "%d,", 8 * (int)cpi_data->frame_size);
+ fprintf(f, "%d\n", cm->quant_params.base_qindex);
+ fclose(f);
+#endif // CONFIG_OUTPUT_FRAME_SIZE
+
+ if (!is_stat_generation_stage(cpi) && !cpi->is_dropped_frame) {
+ // Before calling refresh_reference_frames(), copy ppi->ref_frame_map_copy
+ // to cm->ref_frame_map for frame_parallel_level 2 frame in a parallel
+ // encode set of lower layer frames.
+ // TODO(Remya): Move ref_frame_map from AV1_COMMON to AV1_PRIMARY to avoid
+ // copy.
+ if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 2 &&
+ ppi->gf_group.frame_parallel_level[cpi->gf_frame_index - 1] == 1 &&
+ ppi->gf_group.update_type[cpi->gf_frame_index - 1] ==
+ INTNL_ARF_UPDATE) {
+ memcpy(cm->ref_frame_map, ppi->ref_frame_map_copy,
+ sizeof(cm->ref_frame_map));
+ }
+ refresh_reference_frames(cpi);
+ // For frame_parallel_level 1 frame in a parallel encode set of lower layer
+ // frames, store the updated cm->ref_frame_map in ppi->ref_frame_map_copy.
+ if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1 &&
+ ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ memcpy(ppi->ref_frame_map_copy, cm->ref_frame_map,
+ sizeof(cm->ref_frame_map));
+ }
+ av1_rc_postencode_update(cpi, cpi_data->frame_size);
+ }
+
+ if (cpi_data->pop_lookahead == 1) {
+ av1_lookahead_pop(cpi->ppi->lookahead, cpi_data->flush,
+ cpi->compressor_stage);
+ }
+ if (cpi->common.show_frame) {
+ cpi->ppi->ts_start_last_show_frame = cpi_data->ts_frame_start;
+ cpi->ppi->ts_end_last_show_frame = cpi_data->ts_frame_end;
+ }
+ if (ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
+ // Initialize level info. at the beginning of each sequence.
+ if (cm->current_frame.frame_type == KEY_FRAME &&
+ ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+ av1_init_level_info(cpi);
+ }
+ av1_update_level_info(cpi, cpi_data->frame_size, cpi_data->ts_frame_start,
+ cpi_data->ts_frame_end);
+ }
+
+ if (!is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+ if (!has_no_stats_stage(cpi)) av1_twopass_postencode_update(cpi);
+#endif
+ update_fb_of_context_type(cpi, ppi->fb_of_context_type);
+ update_rc_counts(cpi);
+ update_end_of_frame_stats(cpi);
+ }
+
+ if (cpi->oxcf.pass == AOM_RC_THIRD_PASS && cpi->third_pass_ctx) {
+ av1_pop_third_pass_info(cpi->third_pass_ctx);
+ }
+
+ if (ppi->rtc_ref.set_ref_frame_config) {
+ av1_svc_update_buffer_slot_refreshed(cpi);
+ av1_svc_set_reference_was_previous(cpi);
+ }
+
+ if (ppi->use_svc) av1_save_layer_context(cpi);
+
+  // Note: frame_size == 0 indicates a dropped frame, for which PSNR is not
+  // calculated.
+ if (ppi->b_calculate_psnr && cpi_data->frame_size > 0) {
+ if (cm->show_existing_frame ||
+ (!is_stat_generation_stage(cpi) && cm->show_frame)) {
+ generate_psnr_packet(cpi);
+ }
+ }
+
+#if CONFIG_INTERNAL_STATS
+ if (!is_stat_generation_stage(cpi)) {
+ compute_internal_stats(cpi, (int)cpi_data->frame_size);
+ }
+#endif // CONFIG_INTERNAL_STATS
+
+  // Write frame info. Subtract 1 from frame index since it was incremented in
+  // update_rc_counts.
+ av1_write_second_pass_per_frame_info(cpi, cpi->gf_frame_index - 1);
+}
+
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error->jmp)) {
+ cm->error->setjmp = 0;
+ return cm->error->error_code;
+ }
+ cm->error->setjmp = 1;
+
+#if CONFIG_INTERNAL_STATS
+ cpi->frame_recode_hits = 0;
+ cpi->time_compress_data = 0;
+ cpi->bytes = 0;
+#endif
+#if CONFIG_ENTROPY_STATS
+ if (cpi->compressor_stage == ENCODE_STAGE) {
+ av1_zero(cpi->counts);
+ }
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+ assert(cpi->oxcf.max_threads <= 1 &&
+ "bitstream debug tool does not support multithreading");
+ bitstream_queue_record_write();
+
+ if (cm->seq_params->order_hint_info.enable_order_hint) {
+ aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 +
+ cm->show_frame);
+ } else {
+ // This is currently used in RTC encoding. cm->show_frame is always 1.
+ aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number);
+ }
+#endif
+ if (cpi->ppi->use_svc) {
+ av1_one_pass_cbr_svc_start_layer(cpi);
+ }
+
+ cpi->is_dropped_frame = false;
+ cm->showable_frame = 0;
+ cpi_data->frame_size = 0;
+ cpi->available_bs_size = cpi_data->cx_data_sz;
+#if CONFIG_INTERNAL_STATS
+ struct aom_usec_timer cmptimer;
+ aom_usec_timer_start(&cmptimer);
+#endif
+ av1_set_high_precision_mv(cpi, 1, 0);
+
+ // Normal defaults
+ cm->features.refresh_frame_context =
+ oxcf->tool_cfg.frame_parallel_decoding_mode
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->tile_cfg.enable_large_scale_tile)
+ cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ if (assign_cur_frame_new_fb(cm) == NULL) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Failed to allocate new cur_frame");
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ // Accumulate 2nd pass time in 2-pass case or 1 pass time in 1-pass case.
+ if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0)
+ start_timing(cpi, av1_encode_strategy_time);
+#endif
+
+ const int result = av1_encode_strategy(
+ cpi, &cpi_data->frame_size, cpi_data->cx_data, &cpi_data->lib_flags,
+ &cpi_data->ts_frame_start, &cpi_data->ts_frame_end,
+ cpi_data->timestamp_ratio, &cpi_data->pop_lookahead, cpi_data->flush);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0)
+ end_timing(cpi, av1_encode_strategy_time);
+
+ // Print out timing information.
+ // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of
+ // show_existing_frame and lag-in-frames.
+ if ((cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) &&
+ cpi->frame_component_time[0] > 100) {
+ int i;
+ uint64_t frame_total = 0, total = 0;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ FRAME_UPDATE_TYPE frame_update_type =
+ get_frame_update_type(gf_group, cpi->gf_frame_index);
+
+ fprintf(stderr,
+ "\n Frame number: %d, Frame type: %s, Show Frame: %d, Frame Update "
+ "Type: %d, Q: %d\n",
+ cm->current_frame.frame_number,
+ get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame,
+ frame_update_type, cm->quant_params.base_qindex);
+ for (i = 0; i < kTimingComponents; i++) {
+ cpi->component_time[i] += cpi->frame_component_time[i];
+ // Use av1_encode_strategy_time (i = 0) as the total time.
+ if (i == 0) {
+ frame_total = cpi->frame_component_time[0];
+ total = cpi->component_time[0];
+ }
+ fprintf(stderr,
+ " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+ " us [%6.2f%%])\n",
+ get_component_name(i), cpi->frame_component_time[i],
+ (float)((float)cpi->frame_component_time[i] * 100.0 /
+ (float)frame_total),
+ cpi->component_time[i],
+ (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+ cpi->frame_component_time[i] = 0;
+ }
+ }
+#endif
+
+  // Reset the flag to 0 after encoding.
+ cpi->rc.use_external_qp_one_pass = 0;
+
+ if (result == -1) {
+ cm->error->setjmp = 0;
+ // Returning -1 indicates no frame encoded; more input is required
+ return -1;
+ }
+ if (result != AOM_CODEC_OK) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Failed to encode frame");
+ }
+#if CONFIG_INTERNAL_STATS
+ aom_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
+#endif // CONFIG_INTERNAL_STATS
+
+#if CONFIG_SPEED_STATS
+ if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) {
+ cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count;
+ cpi->td.mb.txfm_search_info.tx_search_count = 0;
+ }
+#endif // CONFIG_SPEED_STATS
+
+ cm->error->setjmp = 0;
+ return AOM_CODEC_OK;
+}
+
+// Populates cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set. Also sets the bitmask 'ref_buffers_used_map'.
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map) {
+ AV1_COMMON *cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+ continue;
+ }
+
+ // FPMT does not support scaling yet.
+ assert(ref->y_crop_width == cm->width &&
+ ref->y_crop_height == cm->height);
+
+ RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+ cpi->scaled_ref_buf[ref_frame - 1] = buf;
+ for (int i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) {
+ if (&cm->buffer_pool->frame_bufs[i] == buf) {
+ *ref_buffers_used_map |= (1 << i);
+ }
+ }
+ } else {
+ if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+ }
+ }
+}
+
+// Increments the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map) {
+ for (int i = 0; i < buffer_pool->num_frame_bufs; ++i) {
+ if (ref_buffers_used_map & (1 << i)) {
+ ++buffer_pool->frame_bufs[i].ref_count;
+ }
+ }
+}
+
+// Releases cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set.
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi) {
+ // TODO(isbs): only refresh the necessary frames, rather than all of them
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+ if (buf != NULL) {
+ cpi->scaled_ref_buf[i] = NULL;
+ }
+ }
+}
+
+// Decrements the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map) {
+ for (int i = 0; i < buffer_pool->num_frame_bufs; ++i) {
+ if (ref_buffers_used_map & (1 << i)) {
+ --buffer_pool->frame_bufs[i].ref_count;
+ }
+ }
+}
+
+// Initialize parallel frame contexts with screen content decisions.
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi) {
+ AV1_COMP *const first_cpi = ppi->cpi;
+ for (int i = 1; i < ppi->num_fp_contexts; ++i) {
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[i];
+ cur_cpi->common.features.allow_screen_content_tools =
+ first_cpi->common.features.allow_screen_content_tools;
+ cur_cpi->common.features.allow_intrabc =
+ first_cpi->common.features.allow_intrabc;
+ cur_cpi->use_screen_content_tools = first_cpi->use_screen_content_tools;
+ cur_cpi->is_screen_content_type = first_cpi->is_screen_content_type;
+ }
+}
+
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data) {
+ int cpi_idx = 0;
+
+ // Loop over parallel_cpi to find the cpi that processed the current
+ // gf_frame_index ahead of time.
+ for (int i = 1; i < ppi->num_fp_contexts; i++) {
+ if (ppi->cpi->gf_frame_index == ppi->parallel_cpi[i]->gf_frame_index) {
+ cpi_idx = i;
+ break;
+ }
+ }
+
+ assert(cpi_idx > 0);
+ assert(!ppi->parallel_cpi[cpi_idx]->common.show_existing_frame);
+
+ // Release the previously-used frame-buffer.
+ if (ppi->cpi->common.cur_frame != NULL) {
+ --ppi->cpi->common.cur_frame->ref_count;
+ ppi->cpi->common.cur_frame = NULL;
+ }
+
+ // Swap the appropriate parallel_cpi with the parallel_cpi[0].
+ ppi->cpi = ppi->parallel_cpi[cpi_idx];
+ ppi->parallel_cpi[cpi_idx] = ppi->parallel_cpi[0];
+ ppi->parallel_cpi[0] = ppi->cpi;
+
+ // Copy appropriate parallel_frames_data to local data.
+ {
+ AV1_COMP_DATA *data = &ppi->parallel_frames_data[cpi_idx - 1];
+ assert(data->frame_size > 0);
+ assert(first_cpi_data->cx_data_sz > data->frame_size);
+
+ first_cpi_data->lib_flags = data->lib_flags;
+ first_cpi_data->ts_frame_start = data->ts_frame_start;
+ first_cpi_data->ts_frame_end = data->ts_frame_end;
+ memcpy(first_cpi_data->cx_data, data->cx_data, data->frame_size);
+ first_cpi_data->frame_size = data->frame_size;
+ if (ppi->cpi->common.show_frame) {
+ first_cpi_data->pop_lookahead = 1;
+ }
+ }
+
+ return ppi->cpi;
+}
+
+// Initializes frames belonging to a parallel encode set.
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+ AV1_PRIMARY *const ppi,
+ int *ref_buffers_used_map) {
+ AV1_COMP *const first_cpi = ppi->cpi;
+ GF_GROUP *const gf_group = &ppi->gf_group;
+ int gf_index_start = first_cpi->gf_frame_index;
+ assert(gf_group->frame_parallel_level[gf_index_start] == 1);
+ int parallel_frame_count = 0;
+ int cur_frame_num = first_cpi->common.current_frame.frame_number;
+ int show_frame_count = first_cpi->frame_index_set.show_frame_count;
+ int frames_since_key = first_cpi->rc.frames_since_key;
+ int frames_to_key = first_cpi->rc.frames_to_key;
+ int frames_to_fwd_kf = first_cpi->rc.frames_to_fwd_kf;
+ int cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[gf_index_start];
+ const FIRSTPASS_STATS *stats_in = first_cpi->twopass_frame.stats_in;
+
+ assert(*ref_buffers_used_map == 0);
+
+  // Release the frame buffer previously used by a frame_parallel_level 1
+  // frame.
+ if (first_cpi->common.cur_frame != NULL) {
+ --first_cpi->common.cur_frame->ref_count;
+ first_cpi->common.cur_frame = NULL;
+ }
+
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+ RefFrameMapPair first_ref_frame_map_pairs[REF_FRAMES];
+ init_ref_map_pair(first_cpi, first_ref_frame_map_pairs);
+ memcpy(ref_frame_map_pairs, first_ref_frame_map_pairs,
+ sizeof(RefFrameMapPair) * REF_FRAMES);
+
+ // Store the reference refresh index of frame_parallel_level 1 frame in a
+ // parallel encode set of lower layer frames.
+ if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) {
+ first_cpi->ref_refresh_index = av1_calc_refresh_idx_for_intnl_arf(
+ first_cpi, ref_frame_map_pairs, gf_index_start);
+ assert(first_cpi->ref_refresh_index != INVALID_IDX &&
+ first_cpi->ref_refresh_index < REF_FRAMES);
+ first_cpi->refresh_idx_available = true;
+ // Update ref_frame_map_pairs.
+ ref_frame_map_pairs[first_cpi->ref_refresh_index].disp_order =
+ gf_group->display_idx[gf_index_start];
+ ref_frame_map_pairs[first_cpi->ref_refresh_index].pyr_level =
+ gf_group->layer_depth[gf_index_start];
+ }
+
+ // Set do_frame_data_update flag as false for frame_parallel_level 1 frame.
+ first_cpi->do_frame_data_update = false;
+ if (gf_group->arf_src_offset[gf_index_start] == 0) {
+ first_cpi->time_stamps.prev_ts_start = ppi->ts_start_last_show_frame;
+ first_cpi->time_stamps.prev_ts_end = ppi->ts_end_last_show_frame;
+ }
+
+ av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, first_cpi,
+ gf_index_start, 1, first_cpi->common.remapped_ref_idx);
+
+ av1_scale_references_fpmt(first_cpi, ref_buffers_used_map);
+ parallel_frame_count++;
+
+ // Iterate through the GF_GROUP to find the remaining frame_parallel_level 2
+ // frames which are part of the current parallel encode set and initialize the
+ // required cpi elements.
+ for (int i = gf_index_start + 1; i < gf_group->size; i++) {
+ // Update frame counters if previous frame was show frame or show existing
+ // frame.
+ if (gf_group->arf_src_offset[i - 1] == 0) {
+ cur_frame_num++;
+ show_frame_count++;
+ if (frames_to_fwd_kf <= 0)
+ frames_to_fwd_kf = first_cpi->oxcf.kf_cfg.fwd_kf_dist;
+ if (frames_to_key) {
+ frames_since_key++;
+ frames_to_key--;
+ frames_to_fwd_kf--;
+ }
+ stats_in++;
+ }
+ cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[i];
+ if (gf_group->frame_parallel_level[i] == 2) {
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[parallel_frame_count];
+ AV1_COMP_DATA *cur_cpi_data =
+ &ppi->parallel_frames_data[parallel_frame_count - 1];
+ cur_cpi->gf_frame_index = i;
+ cur_cpi->framerate = first_cpi->framerate;
+ cur_cpi->common.current_frame.frame_number = cur_frame_num;
+ cur_cpi->common.current_frame.frame_type = gf_group->frame_type[i];
+ cur_cpi->frame_index_set.show_frame_count = show_frame_count;
+ cur_cpi->rc.frames_since_key = frames_since_key;
+ cur_cpi->rc.frames_to_key = frames_to_key;
+ cur_cpi->rc.frames_to_fwd_kf = frames_to_fwd_kf;
+ cur_cpi->rc.active_worst_quality = first_cpi->rc.active_worst_quality;
+ cur_cpi->rc.avg_frame_bandwidth = first_cpi->rc.avg_frame_bandwidth;
+ cur_cpi->rc.max_frame_bandwidth = first_cpi->rc.max_frame_bandwidth;
+ cur_cpi->rc.min_frame_bandwidth = first_cpi->rc.min_frame_bandwidth;
+ cur_cpi->rc.intervals_till_gf_calculate_due =
+ first_cpi->rc.intervals_till_gf_calculate_due;
+ cur_cpi->mv_search_params.max_mv_magnitude =
+ first_cpi->mv_search_params.max_mv_magnitude;
+ if (gf_group->update_type[cur_cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ cur_cpi->common.lf.mode_ref_delta_enabled = 1;
+ }
+ cur_cpi->do_frame_data_update = false;
+ // Initialize prev_ts_start and prev_ts_end for show frame(s) and show
+ // existing frame(s).
+ if (gf_group->arf_src_offset[i] == 0) {
+ // Choose source of prev frame.
+ int src_index = gf_group->src_offset[i];
+ struct lookahead_entry *prev_source = av1_lookahead_peek(
+ ppi->lookahead, src_index - 1, cur_cpi->compressor_stage);
+ // Save timestamps of prev frame.
+ cur_cpi->time_stamps.prev_ts_start = prev_source->ts_start;
+ cur_cpi->time_stamps.prev_ts_end = prev_source->ts_end;
+ }
+ cur_cpi->time_stamps.first_ts_start =
+ first_cpi->time_stamps.first_ts_start;
+
+ memcpy(cur_cpi->common.ref_frame_map, first_cpi->common.ref_frame_map,
+ sizeof(first_cpi->common.ref_frame_map));
+ cur_cpi_data->lib_flags = 0;
+ cur_cpi_data->timestamp_ratio = first_cpi_data->timestamp_ratio;
+ cur_cpi_data->flush = first_cpi_data->flush;
+ cur_cpi_data->frame_size = 0;
+ if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) {
+ // If the first frame in a parallel encode set is INTNL_ARF_UPDATE
+ // frame, initialize lib_flags of frame_parallel_level 2 frame in the
+ // set with that of frame_parallel_level 1 frame.
+ cur_cpi_data->lib_flags = first_cpi_data->lib_flags;
+ // Store the reference refresh index of frame_parallel_level 2 frame in
+ // a parallel encode set of lower layer frames.
+ cur_cpi->ref_refresh_index =
+ av1_calc_refresh_idx_for_intnl_arf(cur_cpi, ref_frame_map_pairs, i);
+ cur_cpi->refresh_idx_available = true;
+ // Skip the reference frame which will be refreshed by
+ // frame_parallel_level 1 frame in a parallel encode set of lower layer
+ // frames.
+ cur_cpi->ref_idx_to_skip = first_cpi->ref_refresh_index;
+ } else {
+ cur_cpi->ref_idx_to_skip = INVALID_IDX;
+ cur_cpi->ref_refresh_index = INVALID_IDX;
+ cur_cpi->refresh_idx_available = false;
+ }
+ cur_cpi->twopass_frame.stats_in = stats_in;
+
+ av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, cur_cpi, i,
+ 1, cur_cpi->common.remapped_ref_idx);
+ av1_scale_references_fpmt(cur_cpi, ref_buffers_used_map);
+ parallel_frame_count++;
+ }
+
+ // Set do_frame_data_update to true for the last frame_parallel_level 2
+ // frame in the current parallel encode set.
+ if (i == (gf_group->size - 1) ||
+ (gf_group->frame_parallel_level[i + 1] == 0 &&
+ (gf_group->update_type[i + 1] == ARF_UPDATE ||
+ gf_group->update_type[i + 1] == INTNL_ARF_UPDATE)) ||
+ gf_group->frame_parallel_level[i + 1] == 1) {
+ ppi->parallel_cpi[parallel_frame_count - 1]->do_frame_data_update = true;
+ break;
+ }
+ }
+
+ av1_increment_scaled_ref_counts_fpmt(first_cpi->common.buffer_pool,
+ *ref_buffers_used_map);
+
+ // Return the number of frames in the parallel encode set.
+ return parallel_frame_count;
+}
+
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
+ AV1_COMMON *cm = &cpi->common;
+ if (!cm->show_frame) {
+ return -1;
+ } else {
+ int ret;
+ if (cm->cur_frame != NULL && !cpi->oxcf.algo_cfg.skip_postproc_filtering) {
+ *dest = cm->cur_frame->buf;
+ dest->y_width = cm->width;
+ dest->y_height = cm->height;
+ dest->uv_width = cm->width >> cm->seq_params->subsampling_x;
+ dest->uv_height = cm->height >> cm->seq_params->subsampling_y;
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+ return ret;
+ }
+}
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
+ if (cpi->last_show_frame_buf == NULL ||
+ cpi->oxcf.algo_cfg.skip_postproc_filtering)
+ return -1;
+
+ *frame = cpi->last_show_frame_buf->buf;
+ return 0;
+}
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd) {
+ const int num_planes = av1_num_planes(cm);
+ if (!equal_dimensions_and_border(new_frame, sd))
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ else
+ aom_yv12_copy_frame(new_frame, sd, num_planes);
+
+ return cm->error->error_code;
+}
+
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+ ResizePendingParams *resize_pending_params,
+ AOM_SCALING_MODE horiz_mode,
+ AOM_SCALING_MODE vert_mode) {
+ int hr = 0, hs = 0, vr = 0, vs = 0;
+
+ // Checks for invalid AOM_SCALING_MODE values.
+ if (horiz_mode > AOME_ONETHREE || vert_mode > AOME_ONETHREE) return -1;
+
+ Scale2Ratio(horiz_mode, &hr, &hs);
+ Scale2Ratio(vert_mode, &vr, &vs);
+
+  // Always round up to the next whole number (ceiling division).
+ resize_pending_params->width = (hs - 1 + oxcf->frm_dim_cfg.width * hr) / hs;
+ resize_pending_params->height = (vs - 1 + oxcf->frm_dim_cfg.height * vr) / vs;
+
+ if (horiz_mode != AOME_NORMAL || vert_mode != AOME_NORMAL) {
+ oxcf->resize_cfg.resize_mode = RESIZE_FIXED;
+ oxcf->algo_cfg.enable_tpl_model = 0;
+ }
+ return 0;
+}
+
+int av1_get_quantizer(AV1_COMP *cpi) {
+ return cpi->common.quant_params.base_qindex;
+}
+
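+// Converts a frame's OBUs from the Section 5 layout (each OBU header has
+// obu_has_size_field set) to the length-delimited layout used by Annex B:
+// each OBU is prefixed with a leb128-coded size of (header + payload) and
+// the obu_has_size_field bit is cleared. The conversion is done in place.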
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) {
+ size_t output_size = 0;
+ size_t total_bytes_read = 0;
+ size_t remaining_size = *frame_size;
+ uint8_t *buff_ptr = buffer;
+
+  // Go through each OBU.
+ while (total_bytes_read < *frame_size) {
+ uint8_t saved_obu_header[2];
+ uint64_t obu_payload_size;
+ size_t length_of_payload_size;
+ size_t length_of_obu_size;
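+    // Bit 2 of the first header byte is the obu_extension_flag; if set, the
+    // header carries an extra extension byte.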
+ uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 2 : 1;
+ size_t obu_bytes_read = obu_header_size; // bytes read for current obu
+
+ // save the obu header (1 or 2 bytes)
+ memmove(saved_obu_header, buff_ptr, obu_header_size);
+ // clear the obu_has_size_field
+ saved_obu_header[0] = saved_obu_header[0] & (~0x2);
+
+ // get the payload_size and length of payload_size
+ if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size,
+ &obu_payload_size, &length_of_payload_size) != 0) {
+ return AOM_CODEC_ERROR;
+ }
+ obu_bytes_read += length_of_payload_size;
+
+    // calculate how many bytes the leb128-coded size of (header + payload)
+    // will occupy
+ length_of_obu_size =
+ aom_uleb_size_in_bytes((uint64_t)(obu_header_size + obu_payload_size));
+
+ // move the rest of data to new location
+ memmove(buff_ptr + length_of_obu_size + obu_header_size,
+ buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read);
+ obu_bytes_read += (size_t)obu_payload_size;
+
+ // write the new obu size
+ const uint64_t obu_size = obu_header_size + obu_payload_size;
+ size_t coded_obu_size;
+ if (aom_uleb_encode(obu_size, sizeof(obu_size), buff_ptr,
+ &coded_obu_size) != 0) {
+ return AOM_CODEC_ERROR;
+ }
+
+ // write the saved (modified) obu_header following obu size
+ memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size);
+
+ total_bytes_read += obu_bytes_read;
+ remaining_size -= obu_bytes_read;
+ buff_ptr += length_of_obu_size + obu_size;
+ output_size += length_of_obu_size + (size_t)obu_size;
+ }
+
+ *frame_size = output_size;
+ return AOM_CODEC_OK;
+}
+
+static void rtc_set_updates_ref_frame_config(
+ ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags,
+ RTC_REF *const rtc_ref) {
+ ext_refresh_frame_flags->update_pending = 1;
+ ext_refresh_frame_flags->last_frame = rtc_ref->refresh[rtc_ref->ref_idx[0]];
+ ext_refresh_frame_flags->golden_frame = rtc_ref->refresh[rtc_ref->ref_idx[3]];
+ ext_refresh_frame_flags->bwd_ref_frame =
+ rtc_ref->refresh[rtc_ref->ref_idx[4]];
+ ext_refresh_frame_flags->alt2_ref_frame =
+ rtc_ref->refresh[rtc_ref->ref_idx[5]];
+ ext_refresh_frame_flags->alt_ref_frame =
+ rtc_ref->refresh[rtc_ref->ref_idx[6]];
+ rtc_ref->non_reference_frame = 1;
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (rtc_ref->refresh[i] == 1) {
+ rtc_ref->non_reference_frame = 0;
+ break;
+ }
+ }
+}
+
+static int rtc_set_references_external_ref_frame_config(AV1_COMP *cpi) {
+ // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ int ref = AOM_REFFRAME_ALL;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (!cpi->ppi->rtc_ref.reference[i]) ref ^= (1 << i);
+ }
+ return ref;
+}
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
+ // TODO(yunqingwang): For what references to use, external encoding flags
+ // should be consistent with internal reference frame selection. Need to
+  // ensure that there is no conflict between the two. In the AV1 encoder, the
+  // priority ranking of the 7 reference frames is: LAST, ALTREF, LAST2,
+  // LAST3, GOLDEN, BWDREF, ALTREF2.
+
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+ &ext_flags->refresh_frame;
+ ext_flags->ref_frame_flags = AOM_REFFRAME_ALL;
+ if (flags &
+ (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+ AOM_EFLAG_NO_REF_ARF2)) {
+ int ref = AOM_REFFRAME_ALL;
+
+ if (flags & AOM_EFLAG_NO_REF_LAST) ref ^= AOM_LAST_FLAG;
+ if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG;
+ if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG;
+
+ if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG;
+
+ if (flags & AOM_EFLAG_NO_REF_ARF) {
+ ref ^= AOM_ALT_FLAG;
+ ref ^= AOM_BWD_FLAG;
+ ref ^= AOM_ALT2_FLAG;
+ } else {
+ if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG;
+ if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG;
+ }
+
+ av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
+ } else {
+ if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+ int ref = rtc_set_references_external_ref_frame_config(cpi);
+ av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
+ }
+ }
+
+ if (flags &
+ (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) {
+ int upd = AOM_REFFRAME_ALL;
+
+ // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag.
+ if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG;
+
+ if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG;
+
+ if (flags & AOM_EFLAG_NO_UPD_ARF) {
+ upd ^= AOM_ALT_FLAG;
+ upd ^= AOM_BWD_FLAG;
+ upd ^= AOM_ALT2_FLAG;
+ }
+
+ ext_refresh_frame_flags->last_frame = (upd & AOM_LAST_FLAG) != 0;
+ ext_refresh_frame_flags->golden_frame = (upd & AOM_GOLD_FLAG) != 0;
+ ext_refresh_frame_flags->alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
+ ext_refresh_frame_flags->bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
+ ext_refresh_frame_flags->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
+ ext_refresh_frame_flags->update_pending = 1;
+ } else {
+ if (cpi->ppi->rtc_ref.set_ref_frame_config)
+ rtc_set_updates_ref_frame_config(ext_refresh_frame_flags,
+ &cpi->ppi->rtc_ref);
+ else
+ ext_refresh_frame_flags->update_pending = 0;
+ }
+
+ ext_flags->use_ref_frame_mvs = cpi->oxcf.tool_cfg.enable_ref_frame_mvs &
+ ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0);
+ ext_flags->use_error_resilient = cpi->oxcf.tool_cfg.error_resilient_mode |
+ ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0);
+ ext_flags->use_s_frame =
+ cpi->oxcf.kf_cfg.enable_sframe | ((flags & AOM_EFLAG_SET_S_FRAME) != 0);
+ ext_flags->use_primary_ref_none =
+ (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0;
+
+ if (flags & AOM_EFLAG_NO_UPD_ENTROPY) {
+ update_entropy(&ext_flags->refresh_frame_context,
+ &ext_flags->refresh_frame_context_pending, 0);
+ }
+}
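+
+// Usage sketch (illustrative): calling av1_apply_encoding_flags() with
+// flags = AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_UPD_GF removes AOM_LAST_FLAG
+// from ext_flags->ref_frame_flags, clears golden_frame in the refresh flags,
+// and sets update_pending to 1.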
+
+aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) {
+ if (!ppi) return NULL;
+
+ uint8_t header_buf[512] = { 0 };
+ const uint32_t sequence_header_size =
+ av1_write_sequence_header_obu(&ppi->seq_params, &header_buf[0]);
+ assert(sequence_header_size <= sizeof(header_buf));
+ if (sequence_header_size == 0) return NULL;
+
+ const size_t obu_header_size = 1;
+ const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size);
+ const size_t payload_offset = obu_header_size + size_field_size;
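+  // Resulting layout (illustrative): [OBU header (1 byte)][leb128 size field]
+  // [sequence header payload]. The payload is written at offset 0 first, then
+  // moved up to make room for the header and the size field.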
+
+ if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
+ memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
+
+ if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count,
+ OBU_SEQUENCE_HEADER, 0,
+ &header_buf[0]) != obu_header_size) {
+ return NULL;
+ }
+
+ size_t coded_size_field_size = 0;
+ if (aom_uleb_encode(sequence_header_size, size_field_size,
+ &header_buf[obu_header_size],
+ &coded_size_field_size) != 0) {
+ return NULL;
+ }
+ assert(coded_size_field_size == size_field_size);
+
+ aom_fixed_buf_t *global_headers =
+ (aom_fixed_buf_t *)malloc(sizeof(*global_headers));
+ if (!global_headers) return NULL;
+
+ const size_t global_header_buf_size =
+ obu_header_size + size_field_size + sequence_header_size;
+
+ global_headers->buf = malloc(global_header_buf_size);
+ if (!global_headers->buf) {
+ free(global_headers);
+ return NULL;
+ }
+
+ memcpy(global_headers->buf, &header_buf[0], global_header_buf_size);
+ global_headers->sz = global_header_buf_size;
+ return global_headers;
+}
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
new file mode 100644
index 0000000000..5f6f67eda8
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -0,0 +1,4512 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares top-level encoder structures and functions.
+ */
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/timing.h"
+
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/external_partition.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/level.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/svc_layercontext.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/bitstream.h"
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/variance.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/noise_model.h"
+#endif
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+#if CONFIG_AV1_TEMPORAL_DENOISING
+#include "av1/encoder/av1_temporal_denoiser.h"
+#endif
+#if CONFIG_TUNE_BUTTERAUGLI
+#include "av1/encoder/tune_butteraugli.h"
+#endif
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(yunqing, any): Added suppression tag to quiet Doxygen warnings. Need to
+// adjust it while we work on documentation.
+/*!\cond */
+// Number of frames required to test for scene cut detection
+#define SCENE_CUT_KEY_TEST_INTERVAL 16
+
+// Lookahead index threshold to enable temporal filtering for second arf.
+#define TF_LOOKAHEAD_IDX_THR 7
+
+#define HDR_QP_LEVELS 10
+#define CHROMA_CB_QP_SCALE 1.04
+#define CHROMA_CR_QP_SCALE 1.04
+#define CHROMA_QP_SCALE -0.46
+#define CHROMA_QP_OFFSET 9.26
+#define QP_SCALE_FACTOR 2.0
+#define DISABLE_HDR_LUMA_DELTAQ 1
+
+// Rational number with an int64 numerator
+// This structure holds a fractional value
+typedef struct aom_rational64 {
+ int64_t num; // fraction numerator
+ int den; // fraction denominator
+} aom_rational64_t;  // alias for struct aom_rational64
+
+enum {
+ // Good Quality Fast Encoding. The encoder balances quality with the amount of
+ // time it takes to encode the output. Speed setting controls how fast.
+ GOOD,
+ // Realtime Fast Encoding. Will force some restrictions on bitrate
+ // constraints.
+ REALTIME,
+ // All intra mode. All the frames are coded as intra frames.
+ ALLINTRA
+} UENUM1BYTE(MODE);
+
+enum {
+ FRAMEFLAGS_KEY = 1 << 0,
+ FRAMEFLAGS_GOLDEN = 1 << 1,
+ FRAMEFLAGS_BWDREF = 1 << 2,
+ // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME
+ FRAMEFLAGS_ALTREF = 1 << 3,
+ FRAMEFLAGS_INTRAONLY = 1 << 4,
+ FRAMEFLAGS_SWITCH = 1 << 5,
+ FRAMEFLAGS_ERROR_RESILIENT = 1 << 6,
+} UENUM1BYTE(FRAMETYPE_FLAGS);
+
+#if CONFIG_FPMT_TEST
+enum {
+ PARALLEL_ENCODE = 0,
+ PARALLEL_SIMULATION_ENCODE,
+ NUM_FPMT_TEST_ENCODES
+} UENUM1BYTE(FPMT_TEST_ENC_CFG);
+#endif // CONFIG_FPMT_TEST
+// 0 level frames are sometimes used for rate control purposes, but for
+// reference mapping purposes, the minimum level should be 1.
+#define MIN_PYR_LEVEL 1
+static INLINE int get_true_pyr_level(int frame_level, int frame_order,
+ int max_layer_depth) {
+ if (frame_order == 0) {
+ // Keyframe case
+ return MIN_PYR_LEVEL;
+ } else if (frame_level == MAX_ARF_LAYERS) {
+ // Leaves
+ return max_layer_depth;
+ } else if (frame_level == (MAX_ARF_LAYERS + 1)) {
+ // Altrefs
+ return MIN_PYR_LEVEL;
+ }
+ return AOMMAX(MIN_PYR_LEVEL, frame_level);
+}
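+
+// Worked example (illustrative): with max_layer_depth = 4, a keyframe
+// (frame_order == 0) maps to MIN_PYR_LEVEL, a leaf frame
+// (frame_level == MAX_ARF_LAYERS) maps to 4, and an altref
+// (frame_level == MAX_ARF_LAYERS + 1) maps back to MIN_PYR_LEVEL.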
+
+enum {
+ NO_AQ = 0,
+ VARIANCE_AQ = 1,
+ COMPLEXITY_AQ = 2,
+ CYCLIC_REFRESH_AQ = 3,
+ AQ_MODE_COUNT // This should always be the last member of the enum
+} UENUM1BYTE(AQ_MODE);
+enum {
+ NO_DELTA_Q = 0,
+ DELTA_Q_OBJECTIVE = 1, // Modulation to improve objective quality
+ DELTA_Q_PERCEPTUAL = 2, // Modulation to improve video perceptual quality
+ DELTA_Q_PERCEPTUAL_AI = 3, // Perceptual quality opt for all intra mode
+ DELTA_Q_USER_RATING_BASED = 4, // User rating based delta q mode
+ DELTA_Q_HDR = 5, // QP adjustment based on HDR block pixel average
+ DELTA_Q_MODE_COUNT // This should always be the last member of the enum
+} UENUM1BYTE(DELTAQ_MODE);
+
+enum {
+ RESIZE_NONE = 0, // No frame resizing allowed.
+ RESIZE_FIXED = 1, // All frames are coded at the specified scale.
+ RESIZE_RANDOM = 2, // All frames are coded at a random scale.
+ RESIZE_DYNAMIC = 3, // Frames coded at lower scale based on rate control.
+ RESIZE_MODES
+} UENUM1BYTE(RESIZE_MODE);
+
+enum {
+ SS_CFG_SRC = 0,
+ SS_CFG_LOOKAHEAD = 1,
+ SS_CFG_FPF = 2,
+ SS_CFG_TOTAL = 3
+} UENUM1BYTE(SS_CFG_OFFSET);
+
+enum {
+ DISABLE_SCENECUT, // For LAP, lag_in_frames < 19
+ ENABLE_SCENECUT_MODE_1, // For LAP, lag_in_frames >=19 and < 33
+ ENABLE_SCENECUT_MODE_2 // For twopass and LAP - lag_in_frames >=33
+} UENUM1BYTE(SCENECUT_MODE);
+
+#define MAX_VBR_CORPUS_COMPLEXITY 10000
+
+typedef enum {
+ MOD_FP, // First pass
+ MOD_TF, // Temporal filtering
+ MOD_TPL, // TPL
+ MOD_GME, // Global motion estimation
+ MOD_ENC, // Encode stage
+ MOD_LPF, // Deblocking loop filter
+ MOD_CDEF_SEARCH, // CDEF search
+ MOD_CDEF, // CDEF frame
+ MOD_LR, // Loop restoration filtering
+ MOD_PACK_BS, // Pack bitstream
+ MOD_FRAME_ENC, // Frame Parallel encode
+ MOD_AI, // All intra
+ NUM_MT_MODULES
+} MULTI_THREADED_MODULES;
+
+/*!\endcond */
+
+/*!\enum COST_UPDATE_TYPE
+ * \brief This enum controls how often the entropy costs should be updated.
+ * \warning In case of any modifications/additions done to the enum
+ * COST_UPDATE_TYPE, the enum INTERNAL_COST_UPDATE_TYPE needs to be updated as
+ * well.
+ */
+typedef enum {
+ COST_UPD_SB, /*!< Update every sb. */
+  COST_UPD_SBROW, /*!< Update every sb row inside a tile. */
+ COST_UPD_TILE, /*!< Update every tile. */
+ COST_UPD_OFF, /*!< Turn off cost updates. */
+ NUM_COST_UPDATE_TYPES, /*!< Number of cost update types. */
+} COST_UPDATE_TYPE;
+
+/*!\enum LOOPFILTER_CONTROL
+ * \brief This enum controls to which frames loopfilter is applied.
+ */
+typedef enum {
+ LOOPFILTER_NONE = 0, /*!< Disable loopfilter on all frames. */
+ LOOPFILTER_ALL = 1, /*!< Enable loopfilter for all frames. */
+ LOOPFILTER_REFERENCE = 2, /*!< Disable loopfilter on non reference frames. */
+ LOOPFILTER_SELECTIVELY =
+ 3, /*!< Disable loopfilter on frames with low motion. */
+} LOOPFILTER_CONTROL;
+
+/*!\enum SKIP_APPLY_POSTPROC_FILTER
+ * \brief This enum controls the application of post-processing filters on a
+ * reconstructed frame.
+ */
+typedef enum {
+ SKIP_APPLY_RESTORATION = 1 << 0,
+ SKIP_APPLY_SUPERRES = 1 << 1,
+ SKIP_APPLY_CDEF = 1 << 2,
+ SKIP_APPLY_LOOPFILTER = 1 << 3,
+} SKIP_APPLY_POSTPROC_FILTER;
+
+/*!
+ * \brief Encoder config related to resize.
+ */
+typedef struct {
+ /*!
+ * Indicates the frame resize mode to be used by the encoder.
+ */
+ RESIZE_MODE resize_mode;
+ /*!
+ * Indicates the denominator for resize of inter frames, assuming 8 as the
+ * numerator. Its value ranges between 8-16.
+ */
+ uint8_t resize_scale_denominator;
+ /*!
+ * Indicates the denominator for resize of key frames, assuming 8 as the
+ * numerator. Its value ranges between 8-16.
+ */
+ uint8_t resize_kf_scale_denominator;
+} ResizeCfg;
+
+/*!
+ * \brief Encoder config for coding block partitioning.
+ */
+typedef struct {
+ /*!
+   * Flag to indicate if rectangular partitions should be enabled.
+ */
+ bool enable_rect_partitions;
+ /*!
+ * Flag to indicate if AB partitions should be enabled.
+ */
+ bool enable_ab_partitions;
+ /*!
+ * Flag to indicate if 1:4 / 4:1 partitions should be enabled.
+ */
+ bool enable_1to4_partitions;
+ /*!
+   * Indicates the minimum partition size that should be allowed. Neither the
+   * width nor the height of a partition can be smaller than
+   * min_partition_size.
+ */
+ BLOCK_SIZE min_partition_size;
+ /*!
+   * Indicates the maximum partition size that should be allowed. Neither the
+   * width nor the height of a partition can be larger than
+   * max_partition_size.
+ */
+ BLOCK_SIZE max_partition_size;
+} PartitionCfg;
+
+/*!
+ * \brief Encoder flags for intra prediction.
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if intra edge filtering process should be enabled.
+ */
+ bool enable_intra_edge_filter;
+ /*!
+ * Flag to indicate if recursive filtering based intra prediction should be
+ * enabled.
+ */
+ bool enable_filter_intra;
+ /*!
+ * Flag to indicate if smooth intra prediction modes should be enabled.
+ */
+ bool enable_smooth_intra;
+ /*!
+ * Flag to indicate if PAETH intra prediction mode should be enabled.
+ */
+ bool enable_paeth_intra;
+ /*!
+ * Flag to indicate if CFL uv intra mode should be enabled.
+ */
+ bool enable_cfl_intra;
+ /*!
+ * Flag to indicate if directional modes should be enabled.
+ */
+ bool enable_directional_intra;
+ /*!
+ * Flag to indicate if the subset of directional modes from D45 to D203 intra
+ * should be enabled. Has no effect if directional modes are disabled.
+ */
+ bool enable_diagonal_intra;
+ /*!
+ * Flag to indicate if delta angles for directional intra prediction should be
+ * enabled.
+ */
+ bool enable_angle_delta;
+ /*!
+   * Flag to indicate whether to automatically turn off several intra coding
+ * tools.
+ * This flag is only used when "--deltaq-mode=3" is true.
+ * When set to 1, the encoder will analyze the reconstruction quality
+ * as compared to the source image in the preprocessing pass.
+   * If the reconstruction quality is considered high enough, we disable
+ * the following intra coding tools, for better encoding speed:
+ * "--enable_smooth_intra",
+ * "--enable_paeth_intra",
+ * "--enable_cfl_intra",
+ * "--enable_diagonal_intra".
+ */
+ bool auto_intra_tools_off;
+} IntraModeCfg;
+
+/*!
+ * \brief Encoder flags for transform sizes and types.
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if 64-pt transform should be enabled.
+ */
+ bool enable_tx64;
+ /*!
+ * Flag to indicate if flip and identity transform types should be enabled.
+ */
+ bool enable_flip_idtx;
+ /*!
+ * Flag to indicate if rectangular transform should be enabled.
+ */
+ bool enable_rect_tx;
+ /*!
+ * Flag to indicate whether or not to use a default reduced set for ext-tx
+ * rather than the potential full set of 16 transforms.
+ */
+ bool reduced_tx_type_set;
+ /*!
+ * Flag to indicate if transform type for intra blocks should be limited to
+ * DCT_DCT.
+ */
+ bool use_intra_dct_only;
+ /*!
+ * Flag to indicate if transform type for inter blocks should be limited to
+ * DCT_DCT.
+ */
+ bool use_inter_dct_only;
+ /*!
+ * Flag to indicate if intra blocks should use default transform type
+ * (mode-dependent) only.
+ */
+ bool use_intra_default_tx_only;
+ /*!
+ * Flag to indicate if transform size search should be enabled.
+ */
+ bool enable_tx_size_search;
+} TxfmSizeTypeCfg;
+
+/*!
+ * \brief Encoder flags for compound prediction modes.
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if distance-weighted compound type should be enabled.
+ */
+ bool enable_dist_wtd_comp;
+ /*!
+ * Flag to indicate if masked (wedge/diff-wtd) compound type should be
+ * enabled.
+ */
+ bool enable_masked_comp;
+ /*!
+ * Flag to indicate if smooth interintra mode should be enabled.
+ */
+ bool enable_smooth_interintra;
+ /*!
+ * Flag to indicate if difference-weighted compound type should be enabled.
+ */
+ bool enable_diff_wtd_comp;
+ /*!
+ * Flag to indicate if inter-inter wedge compound type should be enabled.
+ */
+ bool enable_interinter_wedge;
+ /*!
+ * Flag to indicate if inter-intra wedge compound type should be enabled.
+ */
+ bool enable_interintra_wedge;
+} CompoundTypeCfg;
+
+/*!
+ * \brief Encoder config related to frame super-resolution.
+ */
+typedef struct {
+ /*!
+ * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH
+ * mode is used for inter frames.
+ */
+ int superres_qthresh;
+ /*!
+ * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH
+ * mode is used for key frames.
+ */
+ int superres_kf_qthresh;
+ /*!
+ * Indicates the denominator of the fraction that specifies the ratio between
+ * the superblock width before and after upscaling for inter frames. The
+ * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
+ */
+ uint8_t superres_scale_denominator;
+ /*!
+ * Indicates the denominator of the fraction that specifies the ratio between
+ * the superblock width before and after upscaling for key frames. The
+ * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
+ */
+ uint8_t superres_kf_scale_denominator;
+ /*!
+ * Indicates the Super-resolution mode to be used by the encoder.
+ */
+ aom_superres_mode superres_mode;
+ /*!
+ * Flag to indicate if super-resolution should be enabled for the sequence.
+ */
+ bool enable_superres;
+} SuperResCfg;
+
+/*!
+ * \brief Encoder config related to the coding of key frames.
+ */
+typedef struct {
+ /*!
+ * Indicates the minimum distance to a key frame.
+ */
+ int key_freq_min;
+
+ /*!
+ * Indicates the maximum distance to a key frame.
+ */
+ int key_freq_max;
+
+ /*!
+ * Indicates if temporal filtering should be applied on keyframe.
+ */
+ int enable_keyframe_filtering;
+
+ /*!
+ * Indicates the number of frames after which a frame may be coded as an
+ * S-Frame.
+ */
+ int sframe_dist;
+
+ /*!
+ * Indicates how an S-Frame should be inserted.
+   * 1: the considered frame will be made into an S-Frame only if it is an
+   *    altref frame.
+   * 2: the next altref frame will be made into an S-Frame.
+ */
+ int sframe_mode;
+
+ /*!
+   * Indicates if the encoder should autodetect scene cuts and set the
+   * keyframes.
+ */
+ bool auto_key;
+
+ /*!
+ * Indicates the forward key frame distance.
+ */
+ int fwd_kf_dist;
+
+ /*!
+ * Indicates if forward keyframe reference should be enabled.
+ */
+ bool fwd_kf_enabled;
+
+ /*!
+ * Indicates if S-Frames should be enabled for the sequence.
+ */
+ bool enable_sframe;
+
+ /*!
+ * Indicates if intra block copy prediction mode should be enabled or not.
+ */
+ bool enable_intrabc;
+} KeyFrameCfg;
+
+/*!
+ * \brief Encoder rate control configuration parameters
+ */
+typedef struct {
+ /*!\cond */
+ // BUFFERING PARAMETERS
+ /*!\endcond */
+ /*!
+ * Indicates the amount of data that will be buffered by the decoding
+ * application prior to beginning playback, and is expressed in units of
+   * time (milliseconds).
+ */
+ int64_t starting_buffer_level_ms;
+ /*!
+ * Indicates the amount of data that the encoder should try to maintain in the
+   * decoder's buffer, and is expressed in units of time (milliseconds).
+ */
+ int64_t optimal_buffer_level_ms;
+ /*!
+ * Indicates the maximum amount of data that may be buffered by the decoding
+   * application, and is expressed in units of time (milliseconds).
+ */
+ int64_t maximum_buffer_size_ms;
+
+ /*!
+ * Indicates the bandwidth to be used in bits per second.
+ */
+ int64_t target_bandwidth;
+
+ /*!
+ * Indicates average complexity of the corpus in single pass vbr based on
+ * LAP. 0 indicates that corpus complexity vbr mode is disabled.
+ */
+ unsigned int vbr_corpus_complexity_lap;
+ /*!
+ * Indicates the maximum allowed bitrate for any intra frame as % of bitrate
+ * target.
+ */
+ unsigned int max_intra_bitrate_pct;
+ /*!
+ * Indicates the maximum allowed bitrate for any inter frame as % of bitrate
+ * target.
+ */
+ unsigned int max_inter_bitrate_pct;
+ /*!
+ * Indicates the percentage of rate boost for golden frame in CBR mode.
+ */
+ unsigned int gf_cbr_boost_pct;
+ /*!
+ * min_cr / 100 indicates the target minimum compression ratio for each
+ * frame.
+ */
+ unsigned int min_cr;
+ /*!
+ * Indicates the frame drop threshold.
+ */
+ int drop_frames_water_mark;
+ /*!
+ * under_shoot_pct indicates the tolerance of the VBR algorithm to
+ * undershoot and is used as a trigger threshold for more aggressive
+   * adaptation of Q. Its value can range from 0-100.
+ */
+ int under_shoot_pct;
+ /*!
+ * over_shoot_pct indicates the tolerance of the VBR algorithm to overshoot
+ * and is used as a trigger threshold for more aggressive adaptation of Q.
+   * Its value can range from 0-1000.
+ */
+ int over_shoot_pct;
+ /*!
+ * Indicates the maximum qindex that can be used by the quantizer i.e. the
+ * worst quality qindex.
+ */
+ int worst_allowed_q;
+ /*!
+ * Indicates the minimum qindex that can be used by the quantizer i.e. the
+ * best quality qindex.
+ */
+ int best_allowed_q;
+ /*!
+ * Indicates the Constant/Constrained Quality level.
+ */
+ int cq_level;
+ /*!
+ * Indicates if the encoding mode is vbr, cbr, constrained quality or
+ * constant quality.
+ */
+ enum aom_rc_mode mode;
+ /*!
+ * Indicates the bias (expressed on a scale of 0 to 100) for determining
+ * target size for the current frame. The value 0 indicates the optimal CBR
+ * mode value should be used, and 100 indicates the optimal VBR mode value
+ * should be used.
+ */
+ int vbrbias;
+ /*!
+ * Indicates the minimum bitrate to be used for a single frame as a percentage
+ * of the target bitrate.
+ */
+ int vbrmin_section;
+ /*!
+ * Indicates the maximum bitrate to be used for a single frame as a percentage
+ * of the target bitrate.
+ */
+ int vbrmax_section;
+} RateControlCfg;
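+
+/*!\cond */
+// Illustrative sketch only (hypothetical helper; values are examples, not
+// encoder defaults): one plausible way the buffering and quality-bound fields
+// above could be populated for a low-latency CBR stream.
+static INLINE void example_fill_cbr_rc_cfg(RateControlCfg *rc_cfg) {
+  rc_cfg->mode = AOM_CBR;                  // constant bitrate mode
+  rc_cfg->target_bandwidth = 1000000;      // 1 Mbps
+  rc_cfg->starting_buffer_level_ms = 600;  // decoder pre-buffers 600 ms
+  rc_cfg->optimal_buffer_level_ms = 600;
+  rc_cfg->maximum_buffer_size_ms = 1000;
+  rc_cfg->under_shoot_pct = 50;  // trigger stronger Q adaptation at 50%
+  rc_cfg->over_shoot_pct = 50;
+  rc_cfg->best_allowed_q = 4;     // best-quality qindex bound
+  rc_cfg->worst_allowed_q = 255;  // worst-quality qindex bound
+}
+/*!\endcond */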
+
+/*!\cond */
+typedef struct {
+ // Indicates the number of frames lag before encoding is started.
+ int lag_in_frames;
+ // Indicates the minimum gf/arf interval to be used.
+ int min_gf_interval;
+ // Indicates the maximum gf/arf interval to be used.
+ int max_gf_interval;
+ // Indicates the minimum height for GF group pyramid structure to be used.
+ int gf_min_pyr_height;
+ // Indicates the maximum height for GF group pyramid structure to be used.
+ int gf_max_pyr_height;
+ // Indicates if automatic set and use of altref frames should be enabled.
+ bool enable_auto_arf;
+ // Indicates if automatic set and use of (b)ackward (r)ef (f)rames should be
+ // enabled.
+ bool enable_auto_brf;
+} GFConfig;
+
+typedef struct {
+ // Indicates the number of tile groups.
+ unsigned int num_tile_groups;
+ // Indicates the MTU size for a tile group. If mtu is non-zero,
+ // num_tile_groups is set to DEFAULT_MAX_NUM_TG.
+ unsigned int mtu;
+ // Indicates the number of tile columns in log2.
+ int tile_columns;
+ // Indicates the number of tile rows in log2.
+ int tile_rows;
+ // Indicates the number of widths in the tile_widths[] array.
+ int tile_width_count;
+ // Indicates the number of heights in the tile_heights[] array.
+ int tile_height_count;
+ // Indicates the tile widths, and may be empty.
+ int tile_widths[MAX_TILE_COLS];
+ // Indicates the tile heights, and may be empty.
+ int tile_heights[MAX_TILE_ROWS];
+ // Indicates if large scale tile coding should be used.
+ bool enable_large_scale_tile;
+ // Indicates if single tile decoding mode should be enabled.
+ bool enable_single_tile_decoding;
+ // Indicates if EXT_TILE_DEBUG should be enabled.
+ bool enable_ext_tile_debug;
+} TileConfig;
+
+typedef struct {
+ // Indicates the width of the input frame.
+ int width;
+ // Indicates the height of the input frame.
+ int height;
+ // If forced_max_frame_width is non-zero then it is used to force the maximum
+ // frame width written in write_sequence_header().
+ int forced_max_frame_width;
+  // If forced_max_frame_height is non-zero then it is used to force the
+  // maximum frame height written in write_sequence_header().
+ int forced_max_frame_height;
+ // Indicates the frame width after applying both super-resolution and resize
+ // to the coded frame.
+ int render_width;
+ // Indicates the frame height after applying both super-resolution and resize
+ // to the coded frame.
+ int render_height;
+} FrameDimensionCfg;
+
+typedef struct {
+ // Indicates if warped motion should be enabled.
+ bool enable_warped_motion;
+ // Indicates if warped motion should be evaluated or not.
+ bool allow_warped_motion;
+ // Indicates if OBMC motion should be enabled.
+ bool enable_obmc;
+} MotionModeCfg;
+
+typedef struct {
+ // Timing info for each frame.
+ aom_timing_info_t timing_info;
+ // Indicates the number of time units of a decoding clock.
+ uint32_t num_units_in_decoding_tick;
+ // Indicates if decoder model information is present in the coded sequence
+ // header.
+ bool decoder_model_info_present_flag;
+ // Indicates if display model information is present in the coded sequence
+ // header.
+ bool display_model_info_present_flag;
+ // Indicates if timing info for each frame is present.
+ bool timing_info_present;
+} DecoderModelCfg;
+
+typedef struct {
+ // Indicates the update frequency for coeff costs.
+ COST_UPDATE_TYPE coeff;
+ // Indicates the update frequency for mode costs.
+ COST_UPDATE_TYPE mode;
+ // Indicates the update frequency for mv costs.
+ COST_UPDATE_TYPE mv;
+ // Indicates the update frequency for dv costs.
+ COST_UPDATE_TYPE dv;
+} CostUpdateFreq;
+
+typedef struct {
+ // Indicates the maximum number of reference frames allowed per frame.
+ unsigned int max_reference_frames;
+ // Indicates if the reduced set of references should be enabled.
+ bool enable_reduced_reference_set;
+ // Indicates if one-sided compound should be enabled.
+ bool enable_onesided_comp;
+} RefFrameCfg;
+
+typedef struct {
+ // Indicates the color space that should be used.
+ aom_color_primaries_t color_primaries;
+ // Indicates the characteristics of transfer function to be used.
+ aom_transfer_characteristics_t transfer_characteristics;
+ // Indicates the matrix coefficients to be used for the transfer function.
+ aom_matrix_coefficients_t matrix_coefficients;
+ // Indicates the chroma 4:2:0 sample position info.
+ aom_chroma_sample_position_t chroma_sample_position;
+ // Indicates if a limited color range or full color range should be used.
+ aom_color_range_t color_range;
+} ColorCfg;
+
+typedef struct {
+ // Indicates if extreme motion vector unit test should be enabled or not.
+ unsigned int motion_vector_unit_test;
+ // Indicates if superblock multipass unit test should be enabled or not.
+ unsigned int sb_multipass_unit_test;
+} UnitTestCfg;
+
+typedef struct {
+ // Indicates the file path to the VMAF model.
+ const char *vmaf_model_path;
+ // Indicates the path to the film grain parameters.
+ const char *film_grain_table_filename;
+ // Indicates the visual tuning metric.
+ aom_tune_metric tuning;
+ // Indicates if the current content is screen or default type.
+ aom_tune_content content;
+ // Indicates the film grain parameters.
+ int film_grain_test_vector;
+ // Indicates the in-block distortion metric to use.
+ aom_dist_metric dist_metric;
+} TuneCfg;
+
+typedef struct {
+ // Indicates the framerate of the input video.
+ double init_framerate;
+ // Indicates the bit-depth of the input video.
+ unsigned int input_bit_depth;
+ // Indicates the maximum number of frames to be encoded.
+ unsigned int limit;
+  // Indicates the chroma subsampling x value.
+  unsigned int chroma_subsampling_x;
+  // Indicates the chroma subsampling y value.
+ unsigned int chroma_subsampling_y;
+} InputCfg;
+
+typedef struct {
+ // If true, encoder will use fixed QP offsets, that are either:
+ // - Given by the user, and stored in 'fixed_qp_offsets' array, OR
+ // - Picked automatically from cq_level.
+ int use_fixed_qp_offsets;
+ // Indicates the minimum flatness of the quantization matrix.
+ int qm_minlevel;
+ // Indicates the maximum flatness of the quantization matrix.
+ int qm_maxlevel;
+ // Indicates if adaptive quantize_b should be enabled.
+ int quant_b_adapt;
+ // Indicates the Adaptive Quantization mode to be used.
+ AQ_MODE aq_mode;
+ // Indicates the delta q mode to be used.
+ DELTAQ_MODE deltaq_mode;
+ // Indicates the delta q mode strength.
+ DELTAQ_MODE deltaq_strength;
+ // Indicates if delta quantization should be enabled in chroma planes.
+ bool enable_chroma_deltaq;
+  // Indicates if delta quantization should be enabled for HDR video.
+ bool enable_hdr_deltaq;
+ // Indicates if encoding with quantization matrices should be enabled.
+ bool using_qm;
+} QuantizationCfg;
+
+/*!\endcond */
+/*!
+ * \brief Algorithm configuration parameters.
+ */
+typedef struct {
+ /*!
+ * Controls the level at which rate-distortion optimization of transform
+ * coefficients favours sharpness in the block. Has no impact on RD when set
+ * to zero (default). For values 1-7, eob and skip block optimization are
+ * avoided and rdmult is adjusted in favour of block sharpness.
+ */
+ int sharpness;
+
+ /*!
+ * Indicates the trellis optimization mode of quantized coefficients.
+ * 0: disabled
+ * 1: enabled
+ * 2: enabled for rd search
+   * 3: enabled for estimate yrd search
+ */
+ int disable_trellis_quant;
+
+ /*!
+ * The maximum number of frames used to create an arf.
+ */
+ int arnr_max_frames;
+
+ /*!
+ * The temporal filter strength for arf used when creating ARFs.
+ */
+ int arnr_strength;
+
+ /*!
+ * Indicates the CDF update mode
+ * 0: no update
+   * 1: update on every frame (default)
+ * 2: selectively update
+ */
+ uint8_t cdf_update_mode;
+
+ /*!
+ * Indicates if RDO based on frame temporal dependency should be enabled.
+ */
+ bool enable_tpl_model;
+
+ /*!
+ * Indicates if coding of overlay frames for filtered ALTREF frames is
+ * enabled.
+ */
+ bool enable_overlay;
+
+ /*!
+ * Controls loop filtering
+ * 0: Loop filter is disabled for all frames
+ * 1: Loop filter is enabled for all frames
+ * 2: Loop filter is disabled for non-reference frames
+   * 3: Loop filter is disabled for frames with low motion
+ */
+ LOOPFILTER_CONTROL loopfilter_control;
+
+ /*!
+ * Indicates if the application of post-processing filters should be skipped
+ * on reconstructed frame.
+ */
+ bool skip_postproc_filtering;
+} AlgoCfg;
+/*!\cond */
+
+typedef struct {
+ // Indicates the codec bit-depth.
+ aom_bit_depth_t bit_depth;
+ // Indicates the superblock size that should be used by the encoder.
+ aom_superblock_size_t superblock_size;
+ // Indicates if loopfilter modulation should be enabled.
+ bool enable_deltalf_mode;
+ // Indicates how CDEF should be applied.
+ CDEF_CONTROL cdef_control;
+ // Indicates if loop restoration filter should be enabled.
+ bool enable_restoration;
+ // When enabled, video mode should be used even for single frame input.
+ bool force_video_mode;
+ // Indicates if the error resiliency features should be enabled.
+ bool error_resilient_mode;
+ // Indicates if frame parallel decoding feature should be enabled.
+ bool frame_parallel_decoding_mode;
+ // Indicates if the input should be encoded as monochrome.
+ bool enable_monochrome;
+ // When enabled, the encoder will use a full header even for still pictures.
+ // When disabled, a reduced header is used for still pictures.
+ bool full_still_picture_hdr;
+ // Indicates if dual interpolation filters should be enabled.
+ bool enable_dual_filter;
+ // Indicates if frame order hint should be enabled or not.
+ bool enable_order_hint;
+ // Indicates if ref_frame_mvs should be enabled at the sequence level.
+ bool ref_frame_mvs_present;
+ // Indicates if ref_frame_mvs should be enabled at the frame level.
+ bool enable_ref_frame_mvs;
+ // Indicates if interintra compound mode is enabled.
+ bool enable_interintra_comp;
+ // Indicates if global motion should be enabled.
+ bool enable_global_motion;
+ // Indicates if palette should be enabled.
+ bool enable_palette;
+} ToolCfg;
+
+/*!\endcond */
+/*!
+ * \brief Main encoder configuration data structure.
+ */
+typedef struct AV1EncoderConfig {
+ /*!\cond */
+ // Configuration related to the input video.
+ InputCfg input_cfg;
+
+ // Configuration related to frame-dimensions.
+ FrameDimensionCfg frm_dim_cfg;
+
+ /*!\endcond */
+ /*!
+ * Encoder algorithm configuration.
+ */
+ AlgoCfg algo_cfg;
+
+ /*!
+ * Configuration related to key-frames.
+ */
+ KeyFrameCfg kf_cfg;
+
+ /*!
+ * Rate control configuration
+ */
+ RateControlCfg rc_cfg;
+ /*!\cond */
+
+ // Configuration related to Quantization.
+ QuantizationCfg q_cfg;
+
+ // Internal frame size scaling.
+ ResizeCfg resize_cfg;
+
+ // Frame Super-Resolution size scaling.
+ SuperResCfg superres_cfg;
+
+ /*!\endcond */
+ /*!
+ * stats_in buffer contains all of the stats packets produced in the first
+ * pass, concatenated.
+ */
+ aom_fixed_buf_t twopass_stats_in;
+ /*!\cond */
+
+ // Configuration related to encoder toolsets.
+ ToolCfg tool_cfg;
+
+ // Configuration related to Group of frames.
+ GFConfig gf_cfg;
+
+ // Tile related configuration parameters.
+ TileConfig tile_cfg;
+
+ // Configuration related to Tune.
+ TuneCfg tune_cfg;
+
+ // Configuration related to color.
+ ColorCfg color_cfg;
+
+ // Configuration related to decoder model.
+ DecoderModelCfg dec_model_cfg;
+
+ // Configuration related to reference frames.
+ RefFrameCfg ref_frm_cfg;
+
+ // Configuration related to unit tests.
+ UnitTestCfg unit_test_cfg;
+
+ // Flags related to motion mode.
+ MotionModeCfg motion_mode_cfg;
+
+ // Flags related to intra mode search.
+ IntraModeCfg intra_mode_cfg;
+
+ // Flags related to transform size/type.
+ TxfmSizeTypeCfg txfm_cfg;
+
+ // Flags related to compound type.
+ CompoundTypeCfg comp_type_cfg;
+
+ // Partition related information.
+ PartitionCfg part_cfg;
+
+ // Configuration related to frequency of cost update.
+ CostUpdateFreq cost_upd_freq;
+
+#if CONFIG_DENOISE
+ // Indicates the noise level.
+ float noise_level;
+  // Indicates the denoiser's block size.
+  int noise_block_size;
+  // Indicates whether to apply denoising to the frame to be encoded.
+ int enable_dnl_denoising;
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ // Noise sensitivity.
+ int noise_sensitivity;
+#endif
+ // Bit mask to specify which tier each of the 32 possible operating points
+ // conforms to.
+ unsigned int tier_mask;
+
+ // Indicates the number of pixels off the edge of a reference frame we're
+ // allowed to go when forming an inter prediction.
+ int border_in_pixels;
+
+ // Indicates the maximum number of threads that may be used by the encoder.
+ int max_threads;
+
+ // Indicates the speed preset to be used.
+ int speed;
+
+  // Indicates the target sequence level index for each operating point (OP).
+ AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+
+ // Indicates the bitstream profile to be used.
+ BITSTREAM_PROFILE profile;
+
+ /*!\endcond */
+ /*!
+   * Indicates the current encoder pass:
+   * AOM_RC_ONE_PASS = One pass encode,
+   * AOM_RC_FIRST_PASS = First pass of multiple-pass,
+   * AOM_RC_SECOND_PASS = Second pass of multiple-pass,
+   * AOM_RC_THIRD_PASS = Third pass of multiple-pass.
+ */
+ enum aom_enc_pass pass;
+ /*!\cond */
+
+ // Total number of encoding passes.
+ int passes;
+
+  // The name of the second pass output file when passes > 2.
+ const char *two_pass_output;
+
+  // The name of the second pass log file when passes > 2.
+ const char *second_pass_log;
+
+ // Indicates if the encoding is GOOD or REALTIME.
+ MODE mode;
+
+ // Indicates if row-based multi-threading should be enabled or not.
+ bool row_mt;
+
+ // Indicates if frame parallel multi-threading should be enabled or not.
+ bool fp_mt;
+
+  // Indicates if 16-bit frame buffers are to be used, i.e., the content is
+  // > 8-bit.
+ bool use_highbitdepth;
+
+  // Indicates the bitstream syntax mode. 0 indicates bitstream is saved as
+  // Section 5 bitstream, while 1 indicates the bitstream is saved in Annex B
+  // format.
+ bool save_as_annexb;
+
+ // The path for partition stats reading and writing, used in the experiment
+ // CONFIG_PARTITION_SEARCH_ORDER.
+ const char *partition_info_path;
+
+ // The flag that indicates whether we use an external rate distribution to
+ // guide adaptive quantization. It requires --deltaq-mode=3. The rate
+ // distribution map file name is stored in |rate_distribution_info|.
+ unsigned int enable_rate_guide_deltaq;
+
+ // The input file of rate distribution information used in all intra mode
+ // to determine delta quantization.
+ const char *rate_distribution_info;
+
+ // Exit the encoder when it fails to encode to a given level.
+ int strict_level_conformance;
+
+ // Max depth for the GOP after a key frame
+ int kf_max_pyr_height;
+
+ // A flag to control if we enable the superblock qp sweep for a given lambda
+ int sb_qp_sweep;
+ /*!\endcond */
+} AV1EncoderConfig;
+
+/*!\cond */
+static INLINE int is_lossless_requested(const RateControlCfg *const rc_cfg) {
+ return rc_cfg->best_allowed_q == 0 && rc_cfg->worst_allowed_q == 0;
+}
+/*!\endcond */
+
+/*!
+ * \brief Encoder-side probabilities for pruning of various AV1 tools
+ */
+typedef struct {
+ /*!
+ * obmc_probs[i][j] is the probability of OBMC being the best motion mode for
+ * jth block size and ith frame update type, averaged over past frames. If
+ * obmc_probs[i][j] < thresh, then OBMC search is pruned.
+ */
+ int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+
+ /*!
+ * warped_probs[i] is the probability of warped motion being the best motion
+ * mode for ith frame update type, averaged over past frames. If
+ * warped_probs[i] < thresh, then warped motion search is pruned.
+ */
+ int warped_probs[FRAME_UPDATE_TYPES];
+
+ /*!
+ * tx_type_probs[i][j][k] is the probability of kth tx_type being the best
+ * for jth transform size and ith frame update type, averaged over past
+ * frames. If tx_type_probs[i][j][k] < thresh, then transform search for that
+ * type is pruned.
+ */
+ int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES];
+
+ /*!
+ * switchable_interp_probs[i][j][k] is the probability of kth interpolation
+ * filter being the best for jth filter context and ith frame update type,
+ * averaged over past frames. If switchable_interp_probs[i][j][k] < thresh,
+ * then interpolation filter search is pruned for that case.
+ */
+ int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+} FrameProbInfo;
+
+/*!\cond */
+
+typedef struct FRAME_COUNTS {
+// Note: This structure should only contain 'unsigned int' fields, or
+// aggregates built solely from 'unsigned int' fields/elements
+#if CONFIG_ENTROPY_STATS
+ unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES];
+ unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+ unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+ unsigned int cfl_sign[CFL_JOINT_SIGNS];
+ unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE];
+ unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+ unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2];
+ unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ unsigned int palette_y_color_index[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ unsigned int palette_uv_color_index[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+ unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2];
+ unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [EOB_COEF_CONTEXTS][2];
+ unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2];
+ unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS]
+ [2];
+ unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2];
+ unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5];
+ unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6];
+ unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7];
+ unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8];
+ unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9];
+ unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10];
+ unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11];
+ unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [LEVEL_CONTEXTS][BR_CDF_SIZE];
+ unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2];
+ unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1];
+ unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+ unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2];
+ unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+ unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+ unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ unsigned int wedge_idx[BLOCK_SIZES_ALL][16];
+ unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+ unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
+ unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
+ unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
+ unsigned int obmc[BLOCK_SIZES_ALL][2];
+ unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+ unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+ unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2];
+ unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2];
+ unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
+ unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
+ unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
+ unsigned int intrabc[2];
+
+ unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+ unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
+ unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
+ unsigned int skip_txfm[SKIP_CONTEXTS][2];
+ unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
+ unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
+ unsigned int delta_q[DELTA_Q_PROBS][2];
+ unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2];
+ unsigned int delta_lf[DELTA_LF_PROBS][2];
+
+ unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+ unsigned int filter_intra_mode[FILTER_INTRA_MODES];
+ unsigned int filter_intra[BLOCK_SIZES_ALL][2];
+ unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES];
+ unsigned int wiener_restore[2];
+ unsigned int sgrproj_restore[2];
+#endif // CONFIG_ENTROPY_STATS
+
+ unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+} FRAME_COUNTS;
+
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+typedef struct {
+ int ready;
+ double a;
+ double b;
+ double dist_mean;
+ double ld_mean;
+ double sse_mean;
+ double sse_sse_mean;
+ double sse_ld_mean;
+ int num;
+ double dist_sum;
+ double ld_sum;
+ double sse_sum;
+ double sse_sse_sum;
+ double sse_ld_sum;
+} InterModeRdModel;
+
+typedef struct {
+ int idx;
+ int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure out what
+// the maximum number of modes is.
+#define MAX_INTER_MODES 1024
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_mode_info, which makes this terribly confusing.
+/*!\endcond */
+/*!
+ * \brief Struct used to hold inter mode data for fast tx search.
+ *
+ * This struct is used to perform a full transform search only on winning
+ * candidates searched with an estimate for transform coding RD.
+ */
+typedef struct inter_modes_info {
+ /*!
+ * The number of inter modes for which data was stored in each of the
+ * following arrays.
+ */
+ int num;
+ /*!
+ * Mode info struct for each of the candidate modes.
+ */
+ MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+ /*!
+ * The rate for each of the candidate modes.
+ */
+ int mode_rate_arr[MAX_INTER_MODES];
+ /*!
+ * The sse of the predictor for each of the candidate modes.
+ */
+ int64_t sse_arr[MAX_INTER_MODES];
+ /*!
+ * The estimated rd of the predictor for each of the candidate modes.
+ */
+ int64_t est_rd_arr[MAX_INTER_MODES];
+ /*!
+ * The rate and mode index for each of the candidate modes.
+ */
+ RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+ /*!
+ * The full rd stats for each of the candidate modes.
+ */
+ RD_STATS rd_cost_arr[MAX_INTER_MODES];
+ /*!
+ * The full rd stats of luma only for each of the candidate modes.
+ */
+ RD_STATS rd_cost_y_arr[MAX_INTER_MODES];
+ /*!
+ * The full rd stats of chroma only for each of the candidate modes.
+ */
+ RD_STATS rd_cost_uv_arr[MAX_INTER_MODES];
+} InterModesInfo;
+
+/*!\cond */
+typedef struct {
+  // TODO(kyslov): consider changing to 64-bit
+
+  // This struct is used for computing variance in choose_partitioning(), where
+  // the max number of samples within a superblock is 32x32 (with 4x4 avg).
+  // With 8-bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 *
+  // 32 * 32 = 2^26). For high bitdepth we need to consider changing this to
+  // 64 bits.
+ uint32_t sum_square_error;
+ int32_t sum_error;
+ int log2_count;
+ int variance;
+} VPartVar;
+
+typedef struct {
+ VPartVar none;
+ VPartVar horz[2];
+ VPartVar vert[2];
+} VPVariance;
+
+typedef struct {
+ VPVariance part_variances;
+ VPartVar split[4];
+} VP4x4;
+
+typedef struct {
+ VPVariance part_variances;
+ VP4x4 split[4];
+} VP8x8;
+
+typedef struct {
+ VPVariance part_variances;
+ VP8x8 split[4];
+} VP16x16;
+
+typedef struct {
+ VPVariance part_variances;
+ VP16x16 split[4];
+} VP32x32;
+
+typedef struct {
+ VPVariance part_variances;
+ VP32x32 split[4];
+} VP64x64;
+
+typedef struct {
+ VPVariance part_variances;
+ VP64x64 *split;
+} VP128x128;
+
+/*!\endcond */
+
+/*!
+ * \brief Thresholds for variance based partitioning.
+ */
+typedef struct {
+ /*!
+ * If block variance > threshold, then that block is forced to split.
+ * thresholds[0] - threshold for 128x128;
+ * thresholds[1] - threshold for 64x64;
+ * thresholds[2] - threshold for 32x32;
+ * thresholds[3] - threshold for 16x16;
+ * thresholds[4] - threshold for 8x8;
+ */
+ int64_t thresholds[5];
+
+ /*!
+ * MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual
+ * minmax > threshold_minmax, the 16x16 is forced to split.
+ */
+ int64_t threshold_minmax;
+} VarBasedPartitionInfo;
+
+/*!
+ * \brief Encoder parameters for synchronization of row based multi-threading
+ */
+typedef struct {
+#if CONFIG_MULTITHREAD
+ /**
+ * \name Synchronization objects for top-right dependency.
+ */
+ /**@{*/
+ pthread_mutex_t *mutex_; /*!< Mutex lock object */
+ pthread_cond_t *cond_; /*!< Condition variable */
+ /**@}*/
+#endif // CONFIG_MULTITHREAD
+ /*!
+   * Buffer to track how many superblocks have completed encoding per row.
+ * num_finished_cols[i] stores the number of superblocks which finished
+ * encoding in the ith superblock row.
+ */
+ int *num_finished_cols;
+ /*!
+ * Denotes the superblock interval at which conditional signalling should
+ * happen. Also denotes the minimum number of extra superblocks of the top row
+ * to be complete to start encoding the current superblock. A value of 1
+ * indicates top-right dependency.
+ */
+ int sync_range;
+ /*!
+ * Denotes the additional number of superblocks in the previous row to be
+ * complete to start encoding the current superblock when intraBC tool is
+ * enabled. This additional top-right delay is required to satisfy the
+ * hardware constraints for intraBC tool when row multithreading is enabled.
+ */
+ int intrabc_extra_top_right_sb_delay;
+ /*!
+ * Number of superblock rows.
+ */
+ int rows;
+ /*!
+ * The superblock row (in units of MI blocks) to be processed next.
+ */
+ int next_mi_row;
+ /*!
+ * Number of threads processing the current tile.
+ */
+ int num_threads_working;
+} AV1EncRowMultiThreadSync;
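+
+// Illustrative dependency (sync_range == 1, intrabc_extra_top_right_sb_delay
+// == 0): encoding of superblock (row r, col c) may begin once superblock
+// (row r - 1, col c + 1) has finished, i.e. the classic top-right dependency.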
+
+/*!\cond */
+
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+ TileInfo tile_info;
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+ FRAME_CONTEXT *row_ctx;
+ uint64_t abs_sum_level;
+ uint8_t allow_update_cdf;
+ InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+ AV1EncRowMultiThreadSync row_mt_sync;
+ MV firstpass_top_mv;
+} TileDataEnc;
+
+typedef struct RD_COUNTS {
+ int compound_ref_used_flag;
+ int skip_mode_used_flag;
+ int tx_type_used[TX_SIZES_ALL][TX_TYPES];
+ int obmc_used[BLOCK_SIZES_ALL][2];
+ int warped_used[2];
+ int newmv_or_intra_blocks;
+ uint64_t seg_tmp_pred_cost[2];
+} RD_COUNTS;
+
+typedef struct ThreadData {
+ MACROBLOCK mb;
+ MvCosts *mv_costs_alloc;
+ IntraBCMVCosts *dv_costs_alloc;
+ RD_COUNTS rd_counts;
+ FRAME_COUNTS *counts;
+ PC_TREE_SHARED_BUFFERS shared_coeff_buf;
+ SIMPLE_MOTION_DATA_TREE *sms_tree;
+ SIMPLE_MOTION_DATA_TREE *sms_root;
+ uint32_t *hash_value_buffer[2][2];
+ OBMCBuffer obmc_buffer;
+ PALETTE_BUFFER *palette_buffer;
+ CompoundTypeRdBuffers comp_rd_buffer;
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint64_t abs_sum_level;
+ uint8_t *tmp_pred_bufs[2];
+ uint8_t *wiener_tmp_pred_buf;
+ int intrabc_used;
+ int deltaq_used;
+ int coefficient_size;
+ int max_mv_magnitude;
+ int interp_filter_selected[SWITCHABLE];
+ FRAME_CONTEXT *tctx;
+ VP64x64 *vt64x64;
+ int32_t num_64x64_blocks;
+ PICK_MODE_CONTEXT *firstpass_ctx;
+ TemporalFilterData tf_data;
+ TplBuffers tpl_tmp_buffers;
+ TplTxfmStats tpl_txfm_stats;
+ GlobalMotionData gm_data;
+ // Pointer to the array of structures to store gradient information of each
+  // pixel in a superblock. The buffer consists of MAX_SB_SQUARE pixel-level
+ // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+ PixelLevelGradientInfo *pixel_gradient_info;
+ // Pointer to the array of structures to store source variance information of
+ // each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to
+ // store source variance and log of source variance of each 4x4 sub-block
+ // for subsequent retrieval.
+ Block4x4VarInfo *src_var_info_of_4x4_sub_blocks;
+ // Pointer to pc tree root.
+ PC_TREE *pc_root;
+} ThreadData;
+
+struct EncWorkerData;
+
+/*!\endcond */
+
+/*!
+ * \brief Encoder data related to row-based multi-threading
+ */
+typedef struct {
+ /*!
+ * Number of tile rows for which row synchronization memory is allocated.
+ */
+ int allocated_tile_rows;
+ /*!
+ * Number of tile cols for which row synchronization memory is allocated.
+ */
+ int allocated_tile_cols;
+ /*!
+ * Number of rows for which row synchronization memory is allocated
+ * per tile. During first-pass/look-ahead stage this equals the
+ * maximum number of macroblock rows in a tile. During encode stage,
+ * this equals the maximum number of superblock rows in a tile.
+ */
+ int allocated_rows;
+ /*!
+ * Number of columns for which entropy context memory is allocated
+ * per tile. During encode stage, this equals the maximum number of
+ * superblock columns in a tile minus 1. The entropy context memory
+ * is not allocated during first-pass/look-ahead stage.
+ */
+ int allocated_cols;
+
+ /*!
+ * thread_id_to_tile_id[i] indicates the tile id assigned to the ith thread.
+ */
+ int thread_id_to_tile_id[MAX_NUM_THREADS];
+
+ /*!
+ * num_tile_cols_done[i] indicates the number of tile columns whose encoding
+ * is complete in the ith superblock row.
+ */
+ int *num_tile_cols_done;
+
+ /*!
+ * Number of superblock rows in a frame for which 'num_tile_cols_done' is
+ * allocated.
+ */
+ int allocated_sb_rows;
+
+ /*!
+ * Initialized to false, set to true by the worker thread that encounters an
+ * error in order to abort the processing of other worker threads.
+ */
+ bool row_mt_exit;
+
+ /*!
+ * Initialized to false, set to true during first pass encoding by the worker
+ * thread that encounters an error in order to abort the processing of other
+ * worker threads.
+ */
+ bool firstpass_mt_exit;
+
+ /*!
+ * Initialized to false, set to true in cal_mb_wiener_var_hook() by the worker
+ * thread that encounters an error in order to abort the processing of other
+ * worker threads.
+ */
+ bool mb_wiener_mt_exit;
+
+#if CONFIG_MULTITHREAD
+ /*!
+ * Mutex lock used while dispatching jobs.
+ */
+ pthread_mutex_t *mutex_;
+ /*!
+ * Condition variable used to dispatch loopfilter jobs.
+ */
+ pthread_cond_t *cond_;
+#endif
+
+ /**
+ * \name Row synchronization related function pointers.
+ */
+ /**@{*/
+ /*!
+ * Reader.
+ */
+ void (*sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int);
+ /*!
+ * Writer.
+ */
+ void (*sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int);
+ /**@}*/
+} AV1EncRowMultiThreadInfo;
+
+/*!
+ * \brief Encoder data related to multi-threading for allintra deltaq-mode=3
+ */
+typedef struct {
+#if CONFIG_MULTITHREAD
+ /*!
+ * Mutex lock used while dispatching jobs.
+ */
+ pthread_mutex_t *mutex_;
+ /*!
+ * Condition variable used to dispatch loopfilter jobs.
+ */
+ pthread_cond_t *cond_;
+#endif
+
+ /**
+ * \name Row synchronization related function pointers for all intra mode
+ */
+ /**@{*/
+ /*!
+ * Reader.
+ */
+ void (*intra_sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int);
+ /*!
+ * Writer.
+ */
+ void (*intra_sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int);
+ /**@}*/
+} AV1EncAllIntraMultiThreadInfo;
+
+/*!
+ * \brief Max number of recodes used to track the frame probabilities.
+ */
+#define NUM_RECODES_PER_FRAME 10
+
+/*!
+ * \brief Max number of frames that can be encoded in a parallel encode set.
+ */
+#define MAX_PARALLEL_FRAMES 4
+
+/*!
+ * \brief Buffers to be backed up during parallel encode set to be restored
+ * later.
+ */
+typedef struct RestoreStateBuffers {
+ /*!
+ * Backup of original CDEF srcbuf.
+ */
+ uint16_t *cdef_srcbuf;
+
+ /*!
+ * Backup of original CDEF colbuf.
+ */
+ uint16_t *cdef_colbuf[MAX_MB_PLANE];
+
+ /*!
+ * Backup of original LR rst_tmpbuf.
+ */
+ int32_t *rst_tmpbuf;
+
+ /*!
+ * Backup of original LR rlbs.
+ */
+ RestorationLineBuffers *rlbs;
+} RestoreStateBuffers;
+
+/*!
+ * \brief Parameters related to restoration types.
+ */
+typedef struct {
+ /*!
+ * Stores the best coefficients for Wiener restoration.
+ */
+ WienerInfo wiener;
+
+ /*!
+ * Stores the best coefficients for Sgrproj restoration.
+ */
+ SgrprojInfo sgrproj;
+
+ /*!
+ * The rtype to use for this unit given a frame rtype as index. Indices:
+ * WIENER, SGRPROJ, SWITCHABLE.
+ */
+ RestorationType best_rtype[RESTORE_TYPES - 1];
+} RestUnitSearchInfo;
+
+/*!
+ * \brief Structure to hold search parameter per restoration unit and
+ * intermediate buffer of Wiener filter used in pick filter stage of Loop
+ * restoration.
+ */
+typedef struct {
+ /*!
+ * Array of pointers to 'RestUnitSearchInfo' which holds data related to
+ * restoration types.
+ */
+ RestUnitSearchInfo *rusi[MAX_MB_PLANE];
+
+ /*!
+ * Buffer used to hold dgd-avg data during SIMD call of Wiener filter.
+ */
+ int16_t *dgd_avg;
+} AV1LrPickStruct;
+
+/*!
+ * \brief Primary Encoder parameters related to multi-threading.
+ */
+typedef struct PrimaryMultiThreadInfo {
+ /*!
+ * Number of workers created for multi-threading.
+ */
+ int num_workers;
+
+ /*!
+ * Number of workers used for different MT modules.
+ */
+ int num_mod_workers[NUM_MT_MODULES];
+
+ /*!
+ * Synchronization object used to launch job in the worker thread.
+ */
+ AVxWorker *workers;
+
+ /*!
+ * Data specific to each worker in encoder multi-threading.
+ * tile_thr_data[i] stores the worker data of the ith thread.
+ */
+ struct EncWorkerData *tile_thr_data;
+
+ /*!
+ * CDEF row multi-threading data.
+ */
+ AV1CdefWorkerData *cdef_worker;
+
+ /*!
+ * Primary(Level 1) Synchronization object used to launch job in the worker
+ * thread.
+ */
+ AVxWorker *p_workers[MAX_PARALLEL_FRAMES];
+
+ /*!
+ * Number of primary workers created for multi-threading.
+ */
+ int p_num_workers;
+
+ /*!
+ * Tracks the number of workers in encode stage multi-threading.
+ */
+ int prev_num_enc_workers;
+} PrimaryMultiThreadInfo;
+
+/*!
+ * \brief Encoder parameters related to multi-threading.
+ */
+typedef struct MultiThreadInfo {
+ /*!
+ * Number of workers created for multi-threading.
+ */
+ int num_workers;
+
+ /*!
+ * Number of workers used for different MT modules.
+ */
+ int num_mod_workers[NUM_MT_MODULES];
+
+ /*!
+ * Synchronization object used to launch job in the worker thread.
+ */
+ AVxWorker *workers;
+
+ /*!
+ * Data specific to each worker in encoder multi-threading.
+ * tile_thr_data[i] stores the worker data of the ith thread.
+ */
+ struct EncWorkerData *tile_thr_data;
+
+ /*!
+ * When set, indicates that row based multi-threading of the encoder is
+ * enabled.
+ */
+ bool row_mt_enabled;
+
+ /*!
+ * When set, indicates that multi-threading for bitstream packing is enabled.
+ */
+ bool pack_bs_mt_enabled;
+
+ /*!
+ * Encoder row multi-threading data.
+ */
+ AV1EncRowMultiThreadInfo enc_row_mt;
+
+ /*!
+ * Encoder multi-threading data for allintra mode in the preprocessing stage
+ * when --deltaq-mode=3.
+ */
+ AV1EncAllIntraMultiThreadInfo intra_mt;
+
+ /*!
+ * Tpl row multi-threading data.
+ */
+ AV1TplRowMultiThreadInfo tpl_row_mt;
+
+ /*!
+ * Loop Filter multi-threading object.
+ */
+ AV1LfSync lf_row_sync;
+
+ /*!
+ * Loop Restoration multi-threading object.
+ */
+ AV1LrSync lr_row_sync;
+
+ /*!
+ * Pack bitstream multi-threading object.
+ */
+ AV1EncPackBSSync pack_bs_sync;
+
+ /*!
+ * Global Motion multi-threading object.
+ */
+ AV1GlobalMotionSync gm_sync;
+
+ /*!
+ * Temporal Filter multi-threading object.
+ */
+ AV1TemporalFilterSync tf_sync;
+
+ /*!
+ * CDEF search multi-threading object.
+ */
+ AV1CdefSync cdef_sync;
+
+ /*!
+ * Pointer to CDEF row multi-threading data for the frame.
+ */
+ AV1CdefWorkerData *cdef_worker;
+
+ /*!
+ * Buffers to be stored/restored before/after parallel encode.
+ */
+ RestoreStateBuffers restore_state_buf;
+
+ /*!
+ * In multi-threaded realtime encoding with row-mt enabled, pipeline
+ * loop-filtering after encoding.
+ */
+ int pipeline_lpf_mt_with_enc;
+} MultiThreadInfo;
+
+/*!\cond */
+
+typedef struct ActiveMap {
+ int enabled;
+ int update;
+ unsigned char *map;
+} ActiveMap;
+
+/*!\endcond */
+
+/*!
+ * \brief Encoder info used for decision on forcing integer motion vectors.
+ */
+typedef struct {
+ /*!
+ * cs_rate_array[i] is the fraction of blocks in a frame which either match
+ * with the collocated block or are smooth, where i is the rate_index.
+ */
+ double cs_rate_array[32];
+ /*!
+ * rate_index is used to index cs_rate_array.
+ */
+ int rate_index;
+ /*!
+ * rate_size is the total number of entries populated in cs_rate_array.
+ */
+ int rate_size;
+} ForceIntegerMVInfo;
+
+/*!\cond */
+
+#if CONFIG_INTERNAL_STATS
+// types of stats
+enum {
+ STAT_Y,
+ STAT_U,
+ STAT_V,
+ STAT_ALL,
+ NUM_STAT_TYPES // This should always be the last member of the enum
+} UENUM1BYTE(StatType);
+
+typedef struct IMAGE_STAT {
+ double stat[NUM_STAT_TYPES];
+ double worst;
+} ImageStat;
+#endif // CONFIG_INTERNAL_STATS
+
+typedef struct {
+ int ref_count;
+ YV12_BUFFER_CONFIG buf;
+} EncRefCntBuffer;
+
+/*!\endcond */
+
+/*!
+ * \brief Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level
+ *
+ * This is used for bitstream preparation.
+ */
+typedef struct {
+ /*!
+ * frame_base[mi_row * stride + mi_col] stores the mode information of
+ * block (mi_row,mi_col).
+ */
+ MB_MODE_INFO_EXT_FRAME *frame_base;
+ /*!
+ * Size of frame_base buffer.
+ */
+ int alloc_size;
+ /*!
+ * Stride of frame_base buffer.
+ */
+ int stride;
+} MBMIExtFrameBufferInfo;
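+
+/*
+ * Illustrative access (a sketch, with 'info' a hypothetical pointer to this
+ * struct): the buffer is addressed in raster order, so the entry for the
+ * block at (mi_row, mi_col) is
+ *
+ *   info->frame_base[mi_row * info->stride + mi_col]
+ */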
+
+/*!\cond */
+
+#if CONFIG_COLLECT_PARTITION_STATS
+typedef struct FramePartitionTimingStats {
+ int partition_decisions[6][EXT_PARTITION_TYPES];
+ int partition_attempts[6][EXT_PARTITION_TYPES];
+ int64_t partition_times[6][EXT_PARTITION_TYPES];
+
+ int partition_redo;
+} FramePartitionTimingStats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+#include "aom_ports/aom_timer.h"
+// Adjust the following to add new components.
+enum {
+ av1_encode_strategy_time,
+ av1_get_one_pass_rt_params_time,
+ av1_get_second_pass_params_time,
+ denoise_and_encode_time,
+ apply_filtering_time,
+ av1_tpl_setup_stats_time,
+ encode_frame_to_data_rate_time,
+ encode_with_or_without_recode_time,
+ loop_filter_time,
+ cdef_time,
+ loop_restoration_time,
+ av1_pack_bitstream_final_time,
+ av1_encode_frame_time,
+ av1_compute_global_motion_time,
+ av1_setup_motion_field_time,
+ encode_sb_row_time,
+
+ rd_pick_partition_time,
+ rd_use_partition_time,
+ choose_var_based_partitioning_time,
+ av1_prune_partitions_time,
+ none_partition_search_time,
+ split_partition_search_time,
+ rectangular_partition_search_time,
+ ab_partitions_search_time,
+ rd_pick_4partition_time,
+ encode_sb_time,
+
+ rd_pick_sb_modes_time,
+ av1_rd_pick_intra_mode_sb_time,
+ av1_rd_pick_inter_mode_sb_time,
+ set_params_rd_pick_inter_mode_time,
+ skip_inter_mode_time,
+ handle_inter_mode_time,
+ evaluate_motion_mode_for_winner_candidates_time,
+ do_tx_search_time,
+ handle_intra_mode_time,
+ refine_winner_mode_tx_time,
+ av1_search_palette_mode_time,
+ handle_newmv_time,
+ compound_type_rd_time,
+ interpolation_filter_search_time,
+ motion_mode_rd_time,
+
+ nonrd_use_partition_time,
+ pick_sb_modes_nonrd_time,
+ hybrid_intra_mode_search_time,
+ nonrd_pick_inter_mode_sb_time,
+ encode_b_nonrd_time,
+
+ kTimingComponents,
+} UENUM1BYTE(TIMING_COMPONENT);
+
+static INLINE char const *get_component_name(int index) {
+ switch (index) {
+ case av1_encode_strategy_time: return "av1_encode_strategy_time";
+ case av1_get_one_pass_rt_params_time:
+ return "av1_get_one_pass_rt_params_time";
+ case av1_get_second_pass_params_time:
+ return "av1_get_second_pass_params_time";
+ case denoise_and_encode_time: return "denoise_and_encode_time";
+ case apply_filtering_time: return "apply_filtering_time";
+ case av1_tpl_setup_stats_time: return "av1_tpl_setup_stats_time";
+ case encode_frame_to_data_rate_time:
+ return "encode_frame_to_data_rate_time";
+ case encode_with_or_without_recode_time:
+ return "encode_with_or_without_recode_time";
+ case loop_filter_time: return "loop_filter_time";
+ case cdef_time: return "cdef_time";
+ case loop_restoration_time: return "loop_restoration_time";
+ case av1_pack_bitstream_final_time: return "av1_pack_bitstream_final_time";
+ case av1_encode_frame_time: return "av1_encode_frame_time";
+ case av1_compute_global_motion_time:
+ return "av1_compute_global_motion_time";
+ case av1_setup_motion_field_time: return "av1_setup_motion_field_time";
+ case encode_sb_row_time: return "encode_sb_row_time";
+
+ case rd_pick_partition_time: return "rd_pick_partition_time";
+ case rd_use_partition_time: return "rd_use_partition_time";
+ case choose_var_based_partitioning_time:
+ return "choose_var_based_partitioning_time";
+ case av1_prune_partitions_time: return "av1_prune_partitions_time";
+ case none_partition_search_time: return "none_partition_search_time";
+ case split_partition_search_time: return "split_partition_search_time";
+ case rectangular_partition_search_time:
+ return "rectangular_partition_search_time";
+ case ab_partitions_search_time: return "ab_partitions_search_time";
+ case rd_pick_4partition_time: return "rd_pick_4partition_time";
+ case encode_sb_time: return "encode_sb_time";
+
+ case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
+ case av1_rd_pick_intra_mode_sb_time:
+ return "av1_rd_pick_intra_mode_sb_time";
+ case av1_rd_pick_inter_mode_sb_time:
+ return "av1_rd_pick_inter_mode_sb_time";
+ case set_params_rd_pick_inter_mode_time:
+ return "set_params_rd_pick_inter_mode_time";
+ case skip_inter_mode_time: return "skip_inter_mode_time";
+ case handle_inter_mode_time: return "handle_inter_mode_time";
+ case evaluate_motion_mode_for_winner_candidates_time:
+ return "evaluate_motion_mode_for_winner_candidates_time";
+ case do_tx_search_time: return "do_tx_search_time";
+ case handle_intra_mode_time: return "handle_intra_mode_time";
+ case refine_winner_mode_tx_time: return "refine_winner_mode_tx_time";
+ case av1_search_palette_mode_time: return "av1_search_palette_mode_time";
+ case handle_newmv_time: return "handle_newmv_time";
+ case compound_type_rd_time: return "compound_type_rd_time";
+ case interpolation_filter_search_time:
+ return "interpolation_filter_search_time";
+ case motion_mode_rd_time: return "motion_mode_rd_time";
+
+ case nonrd_use_partition_time: return "nonrd_use_partition_time";
+ case pick_sb_modes_nonrd_time: return "pick_sb_modes_nonrd_time";
+ case hybrid_intra_mode_search_time: return "hybrid_intra_mode_search_time";
+ case nonrd_pick_inter_mode_sb_time: return "nonrd_pick_inter_mode_sb_time";
+ case encode_b_nonrd_time: return "encode_b_nonrd_time";
+
+ default: assert(0);
+ }
+ return "error";
+}
+#endif
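+
+/*
+ * Illustrative instrumentation pattern (a sketch): start_timing() and
+ * end_timing() helpers bracket the component being measured, and the
+ * accumulated time is keyed by the enum values above, e.g.
+ *
+ *   start_timing(cpi, av1_encode_frame_time);
+ *   av1_encode_frame(cpi);
+ *   end_timing(cpi, av1_encode_frame_time);
+ *   // Accumulated time is then available in
+ *   // cpi->frame_component_time[av1_encode_frame_time].
+ */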
+
+// The maximum number of internal ARFs except ALTREF_FRAME
+#define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
+
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to global motion search
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if global motion search needs to be rerun.
+ */
+ bool search_done;
+
+ /*!
+ * Array of pointers to the frame buffers holding the reference frames.
+ * ref_buf[i] stores the pointer to the reference frame of the ith
+ * reference frame type.
+ */
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
+
+ /*!
+ * Holds the number of valid reference frames in past and future directions
+ * w.r.t. the current frame. num_ref_frames[i] stores the total number of
+ * valid reference frames in 'i' direction.
+ */
+ int num_ref_frames[MAX_DIRECTIONS];
+
+ /*!
+ * Array of structure which stores the valid reference frames in past and
+ * future directions and their corresponding distance from the source frame.
+ * reference_frames[i][j] holds the jth valid reference frame type in the
+ * direction 'i' and its temporal distance from the source frame.
+ */
+ FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1];
+
+ /**
+ * \name Dimensions for which segment map is allocated.
+ */
+ /**@{*/
+ int segment_map_w; /*!< segment map width */
+ int segment_map_h; /*!< segment map height */
+ /**@}*/
+} GlobalMotionInfo;
+
+/*!
+ * \brief Flags related to interpolation filter search
+ */
+typedef struct {
+ /*!
+ * Stores the default value of the skip flag depending on the chroma format.
+ * Set to 1 for monochrome and 3 for other color formats.
+ */
+ int default_interp_skip_flags;
+ /*!
+ * Filter mask to allow certain interp_filter type.
+ */
+ uint16_t interp_filter_search_mask;
+} InterpSearchFlags;
+
+/*!
+ * \brief Parameters for motion vector search process
+ */
+typedef struct {
+ /*!
+ * Largest MV component used in a frame.
+ * The value from the previous frame is used to set the full pixel search
+ * range for the current frame.
+ */
+ int max_mv_magnitude;
+ /*!
+ * Parameter indicating initial search window to be used in full-pixel search.
+ * Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+ */
+ int mv_step_param;
+ /*!
+ * Pointer to sub-pixel search function.
+ * In encoder: av1_find_best_sub_pixel_tree
+ * av1_find_best_sub_pixel_tree_pruned
+ * av1_find_best_sub_pixel_tree_pruned_more
+ * In MV unit test: av1_return_max_sub_pixel_mv
+ * av1_return_min_sub_pixel_mv
+ */
+ fractional_mv_step_fp *find_fractional_mv_step;
+ /*!
+ * Search site configuration for full-pel MV search.
+ * search_site_cfg[SS_CFG_SRC]: used in tpl, rd/non-rd inter mode loop and
+ * simple motion search.
+ * search_site_cfg[SS_CFG_LOOKAHEAD]: used in intraBC and temporal filter.
+ * search_site_cfg[SS_CFG_FPF]: used during first pass and lookahead.
+ */
+ search_site_config search_site_cfg[SS_CFG_TOTAL][NUM_DISTINCT_SEARCH_METHODS];
+} MotionVectorSearchParams;
+
+/*!
+ * \brief Refresh frame flags for different types of frames.
+ *
+ * If the refresh flag is true for a particular reference frame, after the
+ * current frame is encoded, the reference frame gets refreshed (updated) to
+ * be the current frame. Note: Usually at most one flag will be set to true at
+ * a time. But, for key-frames, all flags are set to true at once.
+ */
+typedef struct {
+ bool golden_frame; /*!< Refresh flag for golden frame */
+ bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+ bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+} RefreshFrameInfo;
+
+/*!
+ * \brief Desired dimensions for an externally triggered resize.
+ *
+ * When resize is triggered externally, the desired dimensions are stored in
+ * this struct until used in the next frame to be coded. These values are
+ * effective only for one frame and are reset after they are used.
+ */
+typedef struct {
+ int width; /*!< Desired resized width */
+ int height; /*!< Desired resized height */
+} ResizePendingParams;
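+
+/*
+ * Illustrative lifecycle (a sketch): the external interface stores the
+ * request, and the next coded frame consumes it.
+ *
+ *   resize_pending_params->width = 640;
+ *   resize_pending_params->height = 360;
+ *   // ...the next frame is coded at (or toward) 640x360, after which both
+ *   // fields are reset so later frames are unaffected.
+ */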
+
+/*!
+ * \brief Reference frame distance related variables.
+ */
+typedef struct {
+ /*!
+ * True relative distance of reference frames w.r.t. the current frame.
+ */
+ int ref_relative_dist[INTER_REFS_PER_FRAME];
+ /*!
+ * The nearest reference w.r.t. current frame in the past.
+ */
+ int8_t nearest_past_ref;
+ /*!
+ * The nearest reference w.r.t. current frame in the future.
+ */
+ int8_t nearest_future_ref;
+} RefFrameDistanceInfo;
+
+/*!
+ * \brief Parameters used for winner mode processing.
+ *
+ * This is a basic two pass approach: in the first pass, we reduce the number of
+ * transform searches based on some thresholds during the rdopt process to find
+ * the "winner mode". In the second pass, we perform a more through tx search
+ * on the winner mode.
+ * There are some arrays in the struct, and their indices are used in the
+ * following manner:
+ * Index 0: Default mode evaluation, winner mode processing is not applicable
+ * (e.g., IntraBC).
+ * Index 1: Mode evaluation.
+ * Index 2: Winner mode evaluation.
+ * Indices 1 and 2 are only used when the respective speed feature is on.
+ */
+typedef struct {
+ /*!
+ * Threshold to determine if trellis optimization is to be enabled
+ * based on:
+ * 0 : dist threshold
+ * 1 : satd threshold
+ * Corresponds to enable_winner_mode_for_coeff_opt speed feature.
+ */
+ unsigned int coeff_opt_thresholds[MODE_EVAL_TYPES][2];
+
+ /*!
+ * Determines the tx size search method during rdopt.
+ * Corresponds to enable_winner_mode_for_tx_size_srch speed feature.
+ */
+ TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
+
+ /*!
+ * Controls how often we should approximate prediction error with tx
+ * coefficients. If it's 0, then never. If 1, then it's during the tx_type
+ * search only. If 2, then always.
+ * Corresponds to tx_domain_dist_level speed feature.
+ */
+ unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
+
+ /*!
+ * Threshold to approximate pixel domain distortion with transform domain
+ * distortion. This is only used if use_transform_domain_distortion is on.
+ * Corresponds to enable_winner_mode_for_use_tx_domain_dist speed feature.
+ */
+ unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+
+ /*!
+ * Controls how often we should try to skip the transform process based on
+ * the result from the DCT.
+ * Corresponds to use_skip_flag_prediction speed feature.
+ */
+ unsigned int skip_txfm_level[MODE_EVAL_TYPES];
+
+ /*!
+ * Predict DC only txfm blocks for default, mode and winner mode evaluation.
+ * Index 0: Default mode evaluation, Winner mode processing is not applicable.
+ * Index 1: Mode evaluation, Index 2: Winner mode evaluation
+ */
+ unsigned int predict_dc_level[MODE_EVAL_TYPES];
+} WinnerModeParams;
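+
+/*
+ * Illustrative lookup (a sketch; 'is_winner_mode' is hypothetical): each
+ * array is indexed by the evaluation type described above, so index 2 is
+ * used during winner mode processing and index 1 during regular mode
+ * evaluation.
+ *
+ *   const int eval_type = is_winner_mode ? 2 : 1;
+ *   const TX_SIZE_SEARCH_METHOD method =
+ *       winner_mode_params->tx_size_search_methods[eval_type];
+ */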
+
+/*!
+ * \brief Frame refresh flags set by the external interface.
+ *
+ * Flags set by external interface to determine which reference buffers are
+ * refreshed by this frame. When set, the encoder will update the particular
+ * reference frame buffer with the contents of the current frame.
+ */
+typedef struct {
+ bool last_frame; /*!< Refresh flag for last frame */
+ bool golden_frame; /*!< Refresh flag for golden frame */
+ bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+ bool alt2_ref_frame; /*!< Refresh flag for alt2-ref frame */
+ bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+ /*!
+ * Flag indicating if the update of refresh frame flags is pending.
+ */
+ bool update_pending;
+} ExtRefreshFrameFlagsInfo;
+
+/*!
+ * \brief Flags signalled by the external interface at frame level.
+ */
+typedef struct {
+ /*!
+ * Bit mask to disable certain reference frame types.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * Frame refresh flags set by the external interface.
+ */
+ ExtRefreshFrameFlagsInfo refresh_frame;
+
+ /*!
+ * Flag to enable the update of frame contexts at the end of a frame decode.
+ */
+ bool refresh_frame_context;
+
+ /*!
+ * Flag to indicate that update of refresh_frame_context from external
+ * interface is pending.
+ */
+ bool refresh_frame_context_pending;
+
+ /*!
+ * Flag to enable temporal MV prediction.
+ */
+ bool use_ref_frame_mvs;
+
+ /*!
+ * Indicates whether the current frame is to be coded as error resilient.
+ */
+ bool use_error_resilient;
+
+ /*!
+ * Indicates whether the current frame is to be coded as s-frame.
+ */
+ bool use_s_frame;
+
+ /*!
+ * Indicates whether the current frame's primary_ref_frame is set to
+ * PRIMARY_REF_NONE.
+ */
+ bool use_primary_ref_none;
+} ExternalFlags;
+
+/*!\cond */
+
+typedef struct {
+ // Some misc info
+ int high_prec;
+ int q;
+ int order;
+
+ // MV counters
+ int inter_count;
+ int intra_count;
+ int default_mvs;
+ int mv_joint_count[4];
+ int last_bit_zero;
+ int last_bit_nonzero;
+
+ // Keep track of the rates
+ int total_mv_rate;
+ int hp_total_mv_rate;
+ int lp_total_mv_rate;
+
+ // Texture info
+ int horz_text;
+ int vert_text;
+ int diag_text;
+
+ // Whether the current struct contains valid data
+ int valid;
+} MV_STATS;
+
+typedef struct WeberStats {
+ int64_t mb_wiener_variance;
+ int64_t src_variance;
+ int64_t rec_variance;
+ int16_t src_pix_max;
+ int16_t rec_pix_max;
+ int64_t distortion;
+ int64_t satd;
+ double max_scale;
+} WeberStats;
+
+typedef struct {
+ struct loopfilter lf;
+ CdefInfo cdef_info;
+ YV12_BUFFER_CONFIG copy_buffer;
+ RATE_CONTROL rc;
+ MV_STATS mv_stats;
+} CODING_CONTEXT;
+
+typedef struct {
+ int frame_width;
+ int frame_height;
+ int mi_rows;
+ int mi_cols;
+ int mb_rows;
+ int mb_cols;
+ int num_mbs;
+ aom_bit_depth_t bit_depth;
+ int subsampling_x;
+ int subsampling_y;
+} FRAME_INFO;
+
+/*!
+ * \brief This structure stores different types of frame indices.
+ */
+typedef struct {
+ int show_frame_count;
+} FRAME_INDEX_SET;
+
+/*!\endcond */
+
+/*!
+ * \brief Segmentation related information for the current frame.
+ */
+typedef struct {
+ /*!
+ * 3-bit number containing the segment affiliation for each 4x4 block in the
+ * frame. map[y * stride + x] contains the segment id of the 4x4 block at
+ * (x,y) position.
+ */
+ uint8_t *map;
+ /*!
+ * Flag to indicate if current frame has lossless segments or not.
+ * 1: frame has at least one lossless segment.
+ * 0: frame has no lossless segments.
+ */
+ bool has_lossless_segment;
+} EncSegmentationInfo;
+
+/*!
+ * \brief Frame time stamps.
+ */
+typedef struct {
+ /*!
+ * Start time stamp of the previous frame
+ */
+ int64_t prev_ts_start;
+ /*!
+ * End time stamp of the previous frame
+ */
+ int64_t prev_ts_end;
+ /*!
+ * Start time stamp of the first frame
+ */
+ int64_t first_ts_start;
+} TimeStamps;
+
+/*!
+ * Pointers to the memory allocated for frame level transform coeff related
+ * info.
+ */
+typedef struct {
+ /*!
+ * Pointer to the transformed coefficients buffer.
+ */
+ tran_low_t *tcoeff;
+ /*!
+ * Pointer to the eobs buffer.
+ */
+ uint16_t *eobs;
+ /*!
+ * Pointer to the entropy_ctx buffer.
+ */
+ uint8_t *entropy_ctx;
+} CoeffBufferPool;
+
+#if !CONFIG_REALTIME_ONLY
+/*!\cond */
+// DUCKY_ENCODE_FRAME_MODE is the C version of EncodeFrameMode
+enum {
+ DUCKY_ENCODE_FRAME_MODE_NONE, // Let native AV1 determine q index and rdmult
+ DUCKY_ENCODE_FRAME_MODE_QINDEX, // DuckyEncode determines q index and AV1
+ // determines rdmult
+ DUCKY_ENCODE_FRAME_MODE_QINDEX_RDMULT, // DuckyEncode determines q index and
+ // rdmult
+} UENUM1BYTE(DUCKY_ENCODE_FRAME_MODE);
+
+enum {
+ DUCKY_ENCODE_GOP_MODE_NONE, // native AV1 decides GOP
+ DUCKY_ENCODE_GOP_MODE_RCL, // rate control lib decides GOP
+} UENUM1BYTE(DUCKY_ENCODE_GOP_MODE);
+
+typedef struct DuckyEncodeFrameInfo {
+ DUCKY_ENCODE_FRAME_MODE qp_mode;
+ DUCKY_ENCODE_GOP_MODE gop_mode;
+ int q_index;
+ int rdmult;
+ // These two arrays are equivalent to std::vector<SuperblockEncodeParameters>
+ int *superblock_encode_qindex;
+ int *superblock_encode_rdmult;
+ int delta_q_enabled;
+} DuckyEncodeFrameInfo;
+
+typedef struct DuckyEncodeFrameResult {
+ int global_order_idx;
+ int q_index;
+ int rdmult;
+ int rate;
+ int64_t dist;
+ double psnr;
+} DuckyEncodeFrameResult;
+
+typedef struct DuckyEncodeInfo {
+ DuckyEncodeFrameInfo frame_info;
+ DuckyEncodeFrameResult frame_result;
+} DuckyEncodeInfo;
+/*!\endcond */
+#endif
+
+/*!\cond */
+typedef struct RTC_REF {
+ /*!
+ * Index mapping for the per-reference arrays below: LAST_FRAME(0),
+ * LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), BWDREF_FRAME(4),
+ * ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ */
+ int reference[INTER_REFS_PER_FRAME];
+ int ref_idx[INTER_REFS_PER_FRAME];
+ int refresh[REF_FRAMES];
+ int set_ref_frame_config;
+ int non_reference_frame;
+ int ref_frame_comp[3];
+ int gld_idx_1layer;
+ /*!
+ * Frame number of the last frame that refreshed the buffer slot.
+ */
+ unsigned int buffer_time_index[REF_FRAMES];
+ /*!
+ * Spatial layer id of the last frame that refreshed the buffer slot.
+ */
+ unsigned char buffer_spatial_layer[REF_FRAMES];
+ /*!
+ * Flag to indicate whether closest reference was the previous frame.
+ */
+ bool reference_was_previous_frame;
+ /*!
+ * Flag to indicate this frame is based on longer term reference only,
+ * for recovery from past loss, and it should be biased for improved coding.
+ */
+ bool bias_recovery_frame;
+} RTC_REF;
+/*!\endcond */
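+
+/*
+ * Illustrative configuration (a sketch): an RTC frame that predicts from
+ * LAST_FRAME (mapped to buffer slot 0) and GOLDEN_FRAME (mapped to buffer
+ * slot 3) and refreshes slot 0 would set, roughly:
+ *
+ *   rtc_ref->set_ref_frame_config = 1;
+ *   rtc_ref->reference[0] = 1;  // LAST_FRAME is used
+ *   rtc_ref->ref_idx[0] = 0;    // ...and maps to buffer slot 0
+ *   rtc_ref->reference[3] = 1;  // GOLDEN_FRAME is used
+ *   rtc_ref->ref_idx[3] = 3;    // ...and maps to buffer slot 3
+ *   rtc_ref->refresh[0] = 1;    // this frame refreshes slot 0
+ */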
+
+/*!
+ * \brief Structure to hold data corresponding to an encoded frame.
+ */
+typedef struct AV1_COMP_DATA {
+ /*!
+ * Buffer to store packed bitstream data of a frame.
+ */
+ unsigned char *cx_data;
+
+ /*!
+ * Allocated size of the cx_data buffer.
+ */
+ size_t cx_data_sz;
+
+ /*!
+ * Size of data written in the cx_data buffer.
+ */
+ size_t frame_size;
+
+ /*!
+ * Flags for the frame.
+ */
+ unsigned int lib_flags;
+
+ /*!
+ * Time stamp for start of frame.
+ */
+ int64_t ts_frame_start;
+
+ /*!
+ * Time stamp for end of frame.
+ */
+ int64_t ts_frame_end;
+
+ /*!
+ * Flag to indicate flush call.
+ */
+ int flush;
+
+ /*!
+ * Time base for sequence.
+ */
+ const aom_rational64_t *timestamp_ratio;
+
+ /*!
+ * Whether to pop the source for this frame from the input buffer queue.
+ */
+ int pop_lookahead;
+
+ /*!
+ * Display order hint of frame whose packed data is in cx_data buffer.
+ */
+ int frame_display_order_hint;
+} AV1_COMP_DATA;
+
+/*!
+ * \brief Top level primary encoder structure
+ */
+typedef struct AV1_PRIMARY {
+ /*!
+ * Array of frame level encoder stage top level structures
+ */
+ struct AV1_COMP *parallel_cpi[MAX_PARALLEL_FRAMES];
+
+ /*!
+ * Array of structures to hold data of frames encoded in a given parallel
+ * encode set.
+ */
+ struct AV1_COMP_DATA parallel_frames_data[MAX_PARALLEL_FRAMES - 1];
+#if CONFIG_FPMT_TEST
+ /*!
+ * Flag which enables/disables simulation path for fpmt unit test.
+ * 0 - FPMT integration
+ * 1 - FPMT simulation
+ */
+ FPMT_TEST_ENC_CFG fpmt_unit_test_cfg;
+
+ /*!
+ * Temporary variable simulating the delayed frame_probability update.
+ */
+ FrameProbInfo temp_frame_probs;
+
+ /*!
+ * Temporary variable holding the updated frame probability across
+ * frames. Copy its value to temp_frame_probs for frame_parallel_level 0
+ * frames or last frame in parallel encode set.
+ */
+ FrameProbInfo temp_frame_probs_simulation;
+
+ /*!
+ * Temporary variable simulating the delayed update of valid global motion
+ * model across frames.
+ */
+ int temp_valid_gm_model_found[FRAME_UPDATE_TYPES];
+#endif // CONFIG_FPMT_TEST
+ /*!
+ * Copy of cm->ref_frame_map maintained to facilitate sequential update of
+ * ref_frame_map by lower layer depth frames encoded ahead of time in a
+ * parallel encode set.
+ */
+ RefCntBuffer *ref_frame_map_copy[REF_FRAMES];
+
+ /*!
+ * Start time stamp of the last encoded show frame
+ */
+ int64_t ts_start_last_show_frame;
+
+ /*!
+ * End time stamp of the last encoded show frame
+ */
+ int64_t ts_end_last_show_frame;
+
+ /*!
+ * Number of frame level contexts(cpis)
+ */
+ int num_fp_contexts;
+
+ /*!
+ * Loopfilter levels of the previous encoded frame.
+ */
+ int filter_level[2];
+
+ /*!
+ * Chrominance component loopfilter level of the previous encoded frame.
+ */
+ int filter_level_u;
+
+ /*!
+ * Chrominance component loopfilter level of the previous encoded frame.
+ */
+ int filter_level_v;
+
+ /*!
+ * Encode stage top level structure
+ * During frame parallel encode, this is the same as parallel_cpi[0]
+ */
+ struct AV1_COMP *cpi;
+
+ /*!
+ * Lookahead processing stage top level structure
+ */
+ struct AV1_COMP *cpi_lap;
+
+ /*!
+ * Look-ahead context.
+ */
+ struct lookahead_ctx *lookahead;
+
+ /*!
+ * Whether sequence parameters have already been transmitted and are locked.
+ * Once locked, av1_change_config cannot change the sequence parameters.
+ */
+ int seq_params_locked;
+
+ /*!
+ * Pointer to internal utility functions that manipulate aom_codec_* data
+ * structures.
+ */
+ struct aom_codec_pkt_list *output_pkt_list;
+
+ /*!
+ * When set, indicates that internal ARFs are enabled.
+ */
+ int internal_altref_allowed;
+
+ /*!
+ * Tells whether an OVERLAY frame shows an existing alt_ref frame.
+ */
+ int show_existing_alt_ref;
+
+ /*!
+ * Information related to a gf group.
+ */
+ GF_GROUP gf_group;
+
+ /*!
+ * Track prior gf group state.
+ */
+ GF_STATE gf_state;
+
+ /*!
+ * Flag indicating whether look ahead processing (LAP) is enabled.
+ */
+ int lap_enabled;
+
+ /*!
+ * Parameters for AV1 bitstream levels.
+ */
+ AV1LevelParams level_params;
+
+ /*!
+ * Calculates PSNR on each frame when set to 1.
+ */
+ int b_calculate_psnr;
+
+ /*!
+ * Number of frames left to be encoded; 0 if no limit is set.
+ */
+ int frames_left;
+
+ /*!
+ * Information related to two pass encoding.
+ */
+ TWO_PASS twopass;
+
+ /*!
+ * Rate control related parameters.
+ */
+ PRIMARY_RATE_CONTROL p_rc;
+
+ /*!
+ * Info and resources used by temporal filtering.
+ */
+ TEMPORAL_FILTER_INFO tf_info;
+
+ /*!
+ * Elements part of the sequence header, that are applicable for all the
+ * frames in the video.
+ */
+ SequenceHeader seq_params;
+
+ /*!
+ * Indicates whether to use SVC.
+ */
+ int use_svc;
+
+ /*!
+ * If true, buffer removal times are present.
+ */
+ bool buffer_removal_time_present;
+
+ /*!
+ * Number of temporal layers: may be > 1 for SVC (scalable video coding).
+ */
+ unsigned int number_temporal_layers;
+
+ /*!
+ * Number of spatial layers: may be > 1 for SVC (scalable vector coding).
+ */
+ unsigned int number_spatial_layers;
+
+ /*!
+ * Code and details about current error status.
+ */
+ struct aom_internal_error_info error;
+
+ /*!
+ * Function pointers to variants of sse/sad/variance computation functions.
+ * fn_ptr[i] indicates the list of function pointers corresponding to block
+ * size i.
+ */
+ aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+ /*!
+ * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+ * the ith 16 x 16 block in raster scan order.
+ */
+ double *tpl_sb_rdmult_scaling_factors;
+
+ /*!
+ * Parameters related to tpl.
+ */
+ TplParams tpl_data;
+
+ /*!
+ * Motion vector stats of the previous encoded frame.
+ */
+ MV_STATS mv_stats;
+
+#if CONFIG_INTERNAL_STATS
+ /*!\cond */
+ uint64_t total_time_receive_data;
+ uint64_t total_time_compress_data;
+
+ unsigned int total_mode_chosen_counts[MAX_MODES];
+
+ int count[2];
+ uint64_t total_sq_error[2];
+ uint64_t total_samples[2];
+ ImageStat psnr[2];
+
+ double total_blockiness;
+ double worst_blockiness;
+
+ int total_bytes;
+ double summed_quality;
+ double summed_weights;
+ double summed_quality_hbd;
+ double summed_weights_hbd;
+ unsigned int total_recode_hits;
+ double worst_ssim;
+ double worst_ssim_hbd;
+
+ ImageStat fastssim;
+ ImageStat psnrhvs;
+
+ int b_calculate_blockiness;
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
+ /*!\endcond */
+#endif
+
+#if CONFIG_ENTROPY_STATS
+ /*!
+ * Aggregates frame counts for the sequence.
+ */
+ FRAME_COUNTS aggregate_fc;
+#endif // CONFIG_ENTROPY_STATS
+
+ /*!
+ * For each type of reference frame, this contains the index of a reference
+ * frame buffer for a reference frame of the same type. We use this to
+ * choose our primary reference frame (which is the most recent reference
+ * frame of the same type as the current frame).
+ */
+ int fb_of_context_type[REF_FRAMES];
+
+ /*!
+ * Primary Multi-threading parameters.
+ */
+ PrimaryMultiThreadInfo p_mt_info;
+
+ /*!
+ * Probabilities for pruning of various AV1 tools.
+ */
+ FrameProbInfo frame_probs;
+
+ /*!
+ * Indicates if a valid global motion model has been found in the different
+ * frame update types of a GF group.
+ * valid_gm_model_found[i] indicates if a valid global motion model has been
+ * found for the frame update type with enum value equal to i.
+ */
+ int valid_gm_model_found[FRAME_UPDATE_TYPES];
+
+ /*!
+ * Struct for the reference structure for RTC.
+ */
+ RTC_REF rtc_ref;
+
+ /*!
+ * Struct for all intra mode row multi threading in the preprocess stage
+ * when --deltaq-mode=3.
+ */
+ AV1EncRowMultiThreadSync intra_row_mt_sync;
+} AV1_PRIMARY;
+
+/*!
+ * \brief Top level encoder structure.
+ */
+typedef struct AV1_COMP {
+ /*!
+ * Pointer to top level primary encoder structure
+ */
+ AV1_PRIMARY *ppi;
+
+ /*!
+ * Quantization and dequantization parameters for internal quantizer setup
+ * in the encoder.
+ */
+ EncQuantDequantParams enc_quant_dequant_params;
+
+ /*!
+ * Structure holding thread specific variables.
+ */
+ ThreadData td;
+
+ /*!
+ * Statistics collected at frame level.
+ */
+ FRAME_COUNTS counts;
+
+ /*!
+ * Holds buffer storing mode information at 4x4/8x8 level.
+ */
+ MBMIExtFrameBufferInfo mbmi_ext_info;
+
+ /*!
+ * Buffer holding the transform block related information.
+ * coeff_buffer_base[i] stores the transform block related information of the
+ * ith superblock in raster scan order.
+ */
+ CB_COEFF_BUFFER *coeff_buffer_base;
+
+ /*!
+ * Structure holding pointers to frame level memory allocated for transform
+ * block related information.
+ */
+ CoeffBufferPool coeff_buffer_pool;
+
+ /*!
+ * Structure holding variables common to encoder and decoder.
+ */
+ AV1_COMMON common;
+
+ /*!
+ * Encoder configuration related parameters.
+ */
+ AV1EncoderConfig oxcf;
+
+ /*!
+ * Stores the trellis optimization type at segment level.
+ * optimize_seg_arr[i] stores the trellis opt type for ith segment.
+ */
+ TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS];
+
+ /*!
+ * Pointer to the frame buffer holding the source frame to be used during the
+ * current stage of encoding. It can be the raw input, temporally filtered
+ * input or scaled input.
+ */
+ YV12_BUFFER_CONFIG *source;
+
+ /*!
+ * Pointer to the frame buffer holding the last raw source frame.
+ * last_source is NULL for the following cases:
+ * 1) First frame
+ * 2) Alt-ref frames
+ * 3) All frames for all-intra frame encoding.
+ */
+ YV12_BUFFER_CONFIG *last_source;
+
+ /*!
+ * Pointer to the frame buffer holding the unscaled source frame.
+ * It can be either the raw input or temporally filtered input.
+ */
+ YV12_BUFFER_CONFIG *unscaled_source;
+
+ /*!
+ * Frame buffer holding the resized source frame (cropping / superres).
+ */
+ YV12_BUFFER_CONFIG scaled_source;
+
+ /*!
+ * Pointer to the frame buffer holding the unscaled last source frame.
+ */
+ YV12_BUFFER_CONFIG *unscaled_last_source;
+
+ /*!
+ * Frame buffer holding the resized last source frame.
+ */
+ YV12_BUFFER_CONFIG scaled_last_source;
+
+ /*!
+ * Pointer to the original source frame. This is used to determine if the
+ * content is screen.
+ */
+ YV12_BUFFER_CONFIG *unfiltered_source;
+
+ /*!
+ * Frame buffer holding the original source frame for PSNR calculation in the
+ * rtc temporal filtering case.
+ */
+ YV12_BUFFER_CONFIG orig_source;
+
+ /*!
+ * Skip tpl setup when tpl data from gop length decision can be reused.
+ */
+ int skip_tpl_setup_stats;
+
+ /*!
+ * Scaling factors used in the RD multiplier modulation.
+ * TODO(sdeng): consider merging the following arrays.
+ * tpl_rdmult_scaling_factors is a temporary buffer used to store the
+ * intermediate scaling factors which are used in the calculation of
+ * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the
+ * intermediate scaling factor of the ith 16 x 16 block in raster scan order.
+ */
+ double *tpl_rdmult_scaling_factors;
+
+ /*!
+ * Temporal filter context.
+ */
+ TemporalFilterCtx tf_ctx;
+
+ /*!
+ * Pointer to CDEF search context.
+ */
+ CdefSearchCtx *cdef_search_ctx;
+
+ /*!
+ * Variables related to forcing integer mv decisions for the current frame.
+ */
+ ForceIntegerMVInfo force_intpel_info;
+
+ /*!
+ * Pointer to the buffer holding the scaled reference frames.
+ * scaled_ref_buf[i] holds the scaled reference frame of type i.
+ */
+ RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME];
+
+ /*!
+ * Pointer to the buffer holding the last show frame.
+ */
+ RefCntBuffer *last_show_frame_buf;
+
+ /*!
+ * Refresh frame flags for golden, bwd-ref and alt-ref frames.
+ */
+ RefreshFrameInfo refresh_frame;
+
+ /*!
+ * Flag to reduce the number of reference frame buffers used in rt.
+ */
+ int rt_reduce_num_ref_buffers;
+
+ /*!
+ * Flags signalled by the external interface at frame level.
+ */
+ ExternalFlags ext_flags;
+
+ /*!
+ * Temporary frame buffer used to store the non-loop filtered reconstructed
+ * frame during the search of loop filter level.
+ */
+ YV12_BUFFER_CONFIG last_frame_uf;
+
+ /*!
+ * Temporary frame buffer used to store the loop restored frame during loop
+ * restoration search.
+ */
+ YV12_BUFFER_CONFIG trial_frame_rst;
+
+ /*!
+ * Ambient reconstruction error target for forced key frames.
+ */
+ int64_t ambient_err;
+
+ /*!
+ * Parameters related to rate distortion optimization.
+ */
+ RD_OPT rd;
+
+ /*!
+ * Temporary coding context used to save and restore when encoding with and
+ * without super-resolution.
+ */
+ CODING_CONTEXT coding_context;
+
+ /*!
+ * Parameters related to global motion search.
+ */
+ GlobalMotionInfo gm_info;
+
+ /*!
+ * Parameters related to winner mode processing.
+ */
+ WinnerModeParams winner_mode_params;
+
+ /*!
+ * Frame time stamps.
+ */
+ TimeStamps time_stamps;
+
+ /*!
+ * Rate control related parameters.
+ */
+ RATE_CONTROL rc;
+
+ /*!
+ * Frame rate of the video.
+ */
+ double framerate;
+
+ /*!
+ * Bitmask indicating which reference buffers may be referenced by this frame.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * speed is passed as a per-frame parameter into the encoder.
+ */
+ int speed;
+
+ /*!
+ * sf contains fine-grained config set internally based on speed.
+ */
+ SPEED_FEATURES sf;
+
+ /*!
+ * Parameters for motion vector search process.
+ */
+ MotionVectorSearchParams mv_search_params;
+
+ /*!
+ * When set, indicates that all reference frames are forward references,
+ * i.e., all the reference frames are output before the current frame.
+ */
+ int all_one_sided_refs;
+
+ /*!
+ * Segmentation related information for current frame.
+ */
+ EncSegmentationInfo enc_seg;
+
+ /*!
+ * Parameters related to cyclic refresh aq-mode.
+ */
+ CYCLIC_REFRESH *cyclic_refresh;
+ /*!
+ * Parameters related to active map. Active maps indicate
+ * if there is any activity on a 4x4 block basis.
+ */
+ ActiveMap active_map;
+
+ /*!
+ * The frame processing order within a GOP.
+ */
+ unsigned char gf_frame_index;
+
+#if CONFIG_INTERNAL_STATS
+ /*!\cond */
+ uint64_t time_compress_data;
+
+ unsigned int mode_chosen_counts[MAX_MODES];
+ int bytes;
+ unsigned int frame_recode_hits;
+ /*!\endcond */
+#endif
+
+#if CONFIG_SPEED_STATS
+ /*!
+ * For debugging: number of transform searches we have performed.
+ */
+ unsigned int tx_search_count;
+#endif // CONFIG_SPEED_STATS
+
+ /*!
+ * When set, indicates that the frame is droppable, i.e., this frame
+ * does not update any reference buffers.
+ */
+ int droppable;
+
+ /*!
+ * Stores the frame parameters during encoder initialization.
+ */
+ FRAME_INFO frame_info;
+
+ /*!
+ * Stores different types of frame indices.
+ */
+ FRAME_INDEX_SET frame_index_set;
+
+ /*!
+ * Stores cm->width from the last call of alloc_compressor_data(). Helps
+ * determine whether compressor data should be reallocated when cm->width
+ * changes.
+ */
+ int data_alloc_width;
+
+ /*!
+ * Stores cm->height from the last call of alloc_compressor_data(). Helps
+ * determine whether compressor data should be reallocated when cm->height
+ * changes.
+ */
+ int data_alloc_height;
+
+ /*!
+ * Number of MBs in the full-size frame; to be used to
+ * normalize the firstpass stats. This will differ from the
+ * number of MBs in the current frame when the frame is
+ * scaled.
+ */
+ int initial_mbs;
+
+ /*!
+ * Flag to indicate whether the frame size information has been set up and
+ * propagated to associated allocations.
+ */
+ bool frame_size_related_setup_done;
+
+ /*!
+ * The width of the most recently encoded frame.
+ * It is updated in the function "encoder_encode()".
+ */
+ int last_coded_width;
+
+ /*!
+ * The height of the most recently encoded frame.
+ * It is updated in the function "encoder_encode()".
+ */
+ int last_coded_height;
+
+ /*!
+ * Resize related parameters.
+ */
+ ResizePendingParams resize_pending_params;
+
+ /*!
+ * Pointer to struct holding adaptive data/contexts/models for the tile during
+ * encoding.
+ */
+ TileDataEnc *tile_data;
+ /*!
+ * Number of tiles for which memory has been allocated for tile_data.
+ */
+ int allocated_tiles;
+
+ /*!
+ * Structure to store the palette token related information.
+ */
+ TokenInfo token_info;
+
+ /*!
+ * VARIANCE_AQ segment map refresh.
+ */
+ int vaq_refresh;
+
+ /*!
+ * Thresholds for variance based partitioning.
+ */
+ VarBasedPartitionInfo vbp_info;
+
+ /*!
+ * Number of recodes in the frame.
+ */
+ int num_frame_recode;
+
+ /*!
+ * Current frame probability of parallel frames, across recodes.
+ */
+ FrameProbInfo frame_new_probs[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for transform type frame_probability calculation
+ */
+ int do_update_frame_probs_txtype[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for obmc frame_probability calculation
+ */
+ int do_update_frame_probs_obmc[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for warped motion frame_probability calculation
+ */
+ int do_update_frame_probs_warp[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for interpolation filter frame_probability calculation
+ */
+ int do_update_frame_probs_interpfilter[NUM_RECODES_PER_FRAME];
+
+#if CONFIG_FPMT_TEST
+ /*!
+ * Temporary variable for simulation.
+ * Previous frame's framerate.
+ */
+ double temp_framerate;
+#endif
+ /*!
+ * Updated framerate for the current parallel frame.
+ * cpi->framerate is updated with new_framerate during
+ * post encode updates for parallel frames.
+ */
+ double new_framerate;
+
+ /*!
+ * Retain condition for fast_extra_bits calculation.
+ */
+ int do_update_vbr_bits_off_target_fast;
+
+ /*!
+ * Multi-threading parameters.
+ */
+ MultiThreadInfo mt_info;
+
+ /*!
+ * Specifies the frame to be output. It is valid only if show_existing_frame
+ * is 1. When show_existing_frame is 0, existing_fb_idx_to_show is set to
+ * INVALID_IDX.
+ */
+ int existing_fb_idx_to_show;
+
+ /*!
+ * A flag to indicate if intrabc is ever used in current frame.
+ */
+ int intrabc_used;
+
+ /*!
+ * Marks which ref frames can be skipped when encoding the current frame
+ * during RDO.
+ */
+ int prune_ref_frame_mask;
+
+ /*!
+ * Loop Restoration context.
+ */
+ AV1LrStruct lr_ctxt;
+
+ /*!
+ * Loop Restoration context used during pick stage.
+ */
+ AV1LrPickStruct pick_lr_ctxt;
+
+ /*!
+ * Pointer to list of tables with film grain parameters.
+ */
+ aom_film_grain_table_t *film_grain_table;
+
+#if CONFIG_DENOISE
+ /*!
+ * Pointer to structure holding the denoised image buffers and the helper
+ * noise models.
+ */
+ struct aom_denoise_and_model_t *denoise_and_model;
+#endif
+
+ /*!
+ * Flags related to interpolation filter search.
+ */
+ InterpSearchFlags interp_search_flags;
+
+ /*!
+ * Turn on screen content tools flag.
+ * Note that screen content tools can also improve coding efficiency for
+ * some videos that are not screen content, e.g., videos with large flat
+ * regions or gaming videos that look like natural videos.
+ */
+ int use_screen_content_tools;
+
+ /*!
+ * A flag to indicate "real" screen content videos.
+ * For example, screen shares, screen editing.
+ * This type is true indicates |use_screen_content_tools| must be true.
+ * In addition, rate control strategy is adjusted when this flag is true.
+ */
+ int is_screen_content_type;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ /*!
+ * Accumulates the partition timing stat over the whole frame.
+ */
+ FramePartitionTimingStats partition_stats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ /*!
+ * component_time[] entries are initialized to zero when the encoder starts.
+ */
+ uint64_t component_time[kTimingComponents];
+ /*!
+ * Stores timing for individual components between calls of start_timing()
+ * and end_timing().
+ */
+ struct aom_usec_timer component_timer[kTimingComponents];
+ /*!
+ * frame_component_time[] are initialized to zero at beginning of each frame.
+ */
+ uint64_t frame_component_time[kTimingComponents];
+#endif
+
+ /*!
+ * Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+ */
+ int frame_header_count;
+
+ /*!
+ * Whether any non-zero delta_q was actually used.
+ */
+ int deltaq_used;
+
+ /*!
+ * Reference frame distance related variables.
+ */
+ RefFrameDistanceInfo ref_frame_dist_info;
+
+ /*!
+ * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+ * the ith 16 x 16 block in raster scan order. This scaling factor is used for
+ * RD multiplier modulation when SSIM tuning is enabled.
+ */
+ double *ssim_rdmult_scaling_factors;
+
+#if CONFIG_TUNE_VMAF
+ /*!
+ * Parameters for VMAF tuning.
+ */
+ TuneVMAFInfo vmaf_info;
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ /*!
+ * Parameters for Butteraugli tuning.
+ */
+ TuneButteraugliInfo butteraugli_info;
+#endif
+
+ /*!
+ * Parameters for scalable video coding.
+ */
+ SVC svc;
+
+ /*!
+ * Indicates whether current processing stage is encode stage or LAP stage.
+ */
+ COMPRESSOR_STAGE compressor_stage;
+
+ /*!
+ * Frame type of the last frame. May be used in some heuristics for speeding
+ * up the encoding.
+ */
+ FRAME_TYPE last_frame_type;
+
+ /*!
+ * Number of tile-groups.
+ */
+ int num_tg;
+
+ /*!
+ * Super-resolution mode currently being used by the encoder.
+ * This may / may not be same as user-supplied mode in oxcf->superres_mode
+ * (when we are recoding to try multiple options for example).
+ */
+ aom_superres_mode superres_mode;
+
+ /*!
+ * First pass related data.
+ */
+ FirstPassData firstpass_data;
+
+ /*!
+ * Temporal Noise Estimate
+ */
+ NOISE_ESTIMATE noise_estimate;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ /*!
+ * Temporal Denoiser
+ */
+ AV1_DENOISER denoiser;
+#endif
+
+ /*!
+ * Counts how many consecutive times a block uses small/zero mv for encoding,
+ * at 8x8 block granularity.
+ */
+ uint8_t *consec_zero_mv;
+
+ /*!
+ * Allocated memory size for |consec_zero_mv|.
+ */
+ int consec_zero_mv_alloc_size;
+
+ /*!
+ * Block size of first pass encoding
+ */
+ BLOCK_SIZE fp_block_size;
+
+ /*!
+ * Counter of encoded superblocks, used to differentiate block names.
+ * This number starts from 0 and increases whenever a superblock is encoded.
+ */
+ int sb_counter;
+
+ /*!
+ * Available bitstream buffer size in bytes
+ */
+ size_t available_bs_size;
+
+ /*!
+ * The controller of the external partition model.
+ * It is used to do partition type selection based on external models.
+ */
+ ExtPartController ext_part_controller;
+
+ /*!
+ * Motion vector stats of the current encoded frame, used to update the
+ * ppi->mv_stats during postencode.
+ */
+ MV_STATS mv_stats;
+ /*!
+ * Stores the reference refresh index for the current frame.
+ */
+ int ref_refresh_index;
+
+ /*!
+ * A flag to indicate if the reference refresh index is available for the
+ * current frame.
+ */
+ bool refresh_idx_available;
+
+ /*!
+ * Reference frame index corresponding to the frame to be excluded from being
+ * used as a reference by frame_parallel_level 2 frame in a parallel
+ * encode set of lower layer frames.
+ */
+ int ref_idx_to_skip;
+#if CONFIG_FPMT_TEST
+ /*!
+ * Stores the wanted frame buffer index for choosing primary ref frame by a
+ * frame_parallel_level 2 frame in a parallel encode set of lower layer
+ * frames.
+ */
+ int wanted_fb;
+#endif // CONFIG_FPMT_TEST
+
+ /*!
+ * A flag to indicate frames that will update their data to the primary
+ * context at the end of the encode. It is set for non-parallel frames and the
+ * last frame in encode order in a given parallel encode set.
+ */
+ bool do_frame_data_update;
+
+#if CONFIG_RD_COMMAND
+ /*!
+ * A structure for assigning external q_index / rdmult for experiments
+ */
+ RD_COMMAND rd_command;
+#endif // CONFIG_RD_COMMAND
+
+ /*!
+ * Buffer to store MB variance after Wiener filter.
+ */
+ WeberStats *mb_weber_stats;
+
+ /*!
+ * Buffer to store rate cost estimates for each macro block (8x8) in the
+ * preprocessing stage used in allintra mode.
+ */
+ int *prep_rate_estimates;
+
+ /*!
+ * Buffer to store rate cost estimates for each 16x16 block read
+ * from an external file, used in allintra mode.
+ */
+ double *ext_rate_distribution;
+
+ /*!
+ * The scale that equals sum_rate_uniform_quantizer / sum_ext_rate.
+ */
+ double ext_rate_scale;
+
+ /*!
+ * Block size used when computing MB variance after the Wiener filter.
+ */
+ BLOCK_SIZE weber_bsize;
+
+ /*!
+ * Frame level Wiener filter normalization.
+ */
+ int64_t norm_wiener_variance;
+
+ /*!
+ * Buffer to store delta-q values for delta-q mode 4.
+ */
+ int *mb_delta_q;
+
+ /*!
+ * Flag to indicate that current frame is dropped.
+ */
+ bool is_dropped_frame;
+
+#if CONFIG_BITRATE_ACCURACY
+ /*!
+ * Structure stores information needed for bitrate accuracy experiment.
+ */
+ VBR_RATECTRL_INFO vbr_rc_info;
+#endif
+
+#if CONFIG_RATECTRL_LOG
+ /*!
+ * Structure stores information of rate control decisions.
+ */
+ RATECTRL_LOG rc_log;
+#endif // CONFIG_RATECTRL_LOG
+
+ /*!
+ * Frame level twopass status and control data
+ */
+ TWO_PASS_FRAME twopass_frame;
+
+ /*!
+ * Context needed for third pass encoding.
+ */
+ THIRD_PASS_DEC_CTX *third_pass_ctx;
+
+ /*!
+ * File pointer to second pass log
+ */
+ FILE *second_pass_log_stream;
+
+ /*!
+ * Buffer to store 64x64 SAD
+ */
+ uint64_t *src_sad_blk_64x64;
+
+ /*!
+ * SSE between the current frame and the reconstructed last frame.
+ * It is only used for CBR mode, and is not used if the reference frame has
+ * a different frame size.
+ */
+ uint64_t rec_sse;
+
+ /*!
+ * A flag to indicate whether the encoder is controlled by DuckyEncode or not.
+ * 1: yes, 0: no.
+ */
+ int use_ducky_encode;
+
+#if !CONFIG_REALTIME_ONLY
+ /*! A structure that facilitates the communication between DuckyEncode and AV1
+ * encoder.
+ */
+ DuckyEncodeInfo ducky_encode_info;
+#endif // !CONFIG_REALTIME_ONLY
+ /*!
+ * Frames since last frame with cdf update.
+ */
+ int frames_since_last_update;
+
+ /*!
+ * Block level thresholds to force zeromv-skip at partition level.
+ */
+ unsigned int zeromv_skip_thresh_exit_part[BLOCK_SIZES_ALL];
+
+ /*!
+ * Number of downsampling pyramid levels to allocate for each frame.
+ * This is currently only used for global motion.
+ */
+ int image_pyramid_levels;
+
+#if CONFIG_SALIENCY_MAP
+ /*!
+ * Pixel level saliency map for each frame.
+ */
+ uint8_t *saliency_map;
+
+ /*!
+ * Superblock level rdmult scaling factor driven by saliency map.
+ */
+ double *sm_scaling_factor;
+#endif
+
+ /*!
+ * Number of pixels that choose palette mode for luma in the
+ * fast encoding pass in av1_determine_sc_tools_with_encoding().
+ */
+ int palette_pixel_num;
+
+ /*!
+ * Flag to indicate scaled_last_source is available,
+ * so scaling is not needed for last_source.
+ */
+ int scaled_last_source_available;
+} AV1_COMP;
+
+/*!
+ * \brief Input frames and last input frame
+ */
+typedef struct EncodeFrameInput {
+ /*!\cond */
+ YV12_BUFFER_CONFIG *source;
+ YV12_BUFFER_CONFIG *last_source;
+ int64_t ts_duration;
+ /*!\endcond */
+} EncodeFrameInput;
+
+/*!
+ * \brief Contains per-frame encoding parameters decided upon by
+ * av1_encode_strategy() and passed down to av1_encode().
+ */
+typedef struct EncodeFrameParams {
+ /*!
+ * Is error resilient mode enabled
+ */
+ int error_resilient_mode;
+ /*!
+ * Frame type (e.g., KF vs inter frame).
+ */
+ FRAME_TYPE frame_type;
+
+ /*!\cond */
+ int primary_ref_frame;
+ int order_offset;
+
+ /*!\endcond */
+ /*!
+ * Whether the current frame should be displayed after being decoded.
+ */
+ int show_frame;
+
+ /*!\cond */
+ int refresh_frame_flags;
+
+ int show_existing_frame;
+ int existing_fb_idx_to_show;
+
+ /*!\endcond */
+ /*!
+ * Bitmask of which reference buffers may be referenced by this frame.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * Reference buffer assignment for this frame.
+ */
+ int remapped_ref_idx[REF_FRAMES];
+
+ /*!
+ * Flags which determine which reference buffers are refreshed by this
+ * frame.
+ */
+ RefreshFrameInfo refresh_frame;
+
+ /*!
+ * Speed level to use for this frame: Bigger number means faster.
+ */
+ int speed;
+} EncodeFrameParams;
+
+/*!\cond */
+
+// EncodeFrameResults contains information about the result of encoding a
+// single frame
+typedef struct {
+ size_t size; // Size of resulting bitstream
+} EncodeFrameResults;
+
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage);
+
+struct AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf,
+ BufferPool *const pool,
+ COMPRESSOR_STAGE stage,
+ int lap_lag_in_frames);
+
+struct AV1_PRIMARY *av1_create_primary_compressor(
+ struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+ const AV1EncoderConfig *oxcf);
+
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_remove_primary_compressor(AV1_PRIMARY *ppi);
+
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi);
+#endif
+#if CONFIG_INTERNAL_STATS
+void print_internal_stats(AV1_PRIMARY *ppi);
+#endif
+
+void av1_change_config_seq(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+ bool *sb_size_changed);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+ bool sb_size_changed);
+
+aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+ int subsampling_x, int subsampling_y);
+
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
+ const AV1EncoderConfig *oxcf, int use_svc);
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+ const AV1_COMP_DATA *const cpi_data);
+
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map);
+
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map);
+
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi);
+
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map);
+
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi);
+
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data);
+
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+ AV1_PRIMARY *const ppi,
+ int *ref_buffers_used_map);
+/*!\endcond */
+
+/*!\brief Obtain the raw frame data
+ *
+ * \ingroup high_level_algo
+ * This function receives the raw frame data from input.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] frame_flags Flags to decide how to encode the frame
+ * \param[in,out] sd Contains raw frame data
+ * \param[in] time_stamp Time stamp of the frame
+ * \param[in] end_time_stamp End time stamp
+ *
+ * \return Returns a value to indicate if the frame data is received
+ * successfully.
+ * \note The caller can assume that a copy of this frame is made and not just a
+ * copy of the pointer.
+ */
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time_stamp);
+
+/*!\brief Encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function encodes the raw frame data, and outputs the frame bit stream
+ * to the designated buffer. The caller should use the output parameters
+ * cpi_data->ts_frame_start and cpi_data->ts_frame_end only when this function
+ * returns AOM_CODEC_OK.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in,out] cpi_data Data corresponding to a frame encode
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * No frame encoded; more input is required.
+ * \retval "A nonzero (positive) aom_codec_err_t code"
+ * The encoding failed with the error. Sets the error code and error message
+ * in \c cpi->common.error.
+ */
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data);
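+
+/*
+ * Illustrative calling sequence (a sketch; the authoritative usage is the
+ * aom_codec_* wrapper in av1/av1_cx_iface.c). 'buf' and 'buf_sz' are
+ * hypothetical.
+ *
+ *   if (av1_receive_raw_frame(cpi, flags, &sd, ts_start, ts_end) == 0) {
+ *     AV1_COMP_DATA cpi_data = { 0 };
+ *     cpi_data.cx_data = buf;
+ *     cpi_data.cx_data_sz = buf_sz;
+ *     cpi_data.timestamp_ratio = &timestamp_ratio;
+ *     if (av1_get_compressed_data(cpi, &cpi_data) == AOM_CODEC_OK &&
+ *         cpi_data.frame_size > 0) {
+ *       // cpi_data.frame_size bytes of bitstream are now in 'buf'.
+ *     }
+ *   }
+ */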
+
+/*!\brief Run 1-pass/2-pass encoding
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ */
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+ const EncodeFrameInput *const frame_input,
+ const EncodeFrameParams *const frame_params,
+ EncodeFrameResults *const frame_results);
+
+/*!\cond */
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd);
+
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags);
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
+
+void av1_set_mv_search_params(AV1_COMP *cpi);
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+ ResizePendingParams *resize_pending_params,
+ AOM_SCALING_MODE horiz_mode,
+ AOM_SCALING_MODE vert_mode);
+
+int av1_get_quantizer(struct AV1_COMP *cpi);
+
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
+
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td);
+
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td);
+
+// Set screen content options.
+// This function estimates whether to use screen content tools, by counting
+// the portion of blocks that have few luma colors.
+// Modifies:
+// cpi->common.features.allow_screen_content_tools
+// cpi->common.features.allow_intrabc
+// cpi->use_screen_content_tools
+// cpi->is_screen_content_type
+// However, the estimation is not accurate and may misclassify videos.
+// A slower but more accurate approach that determines whether to use screen
+// content tools is employed later. See av1_determine_sc_tools_with_encoding().
+void av1_set_screen_content_options(struct AV1_COMP *cpi,
+ FeatureFlags *features);
+
+void av1_update_frame_size(AV1_COMP *cpi);
+
+typedef struct {
+ int pyr_level;
+ int disp_order;
+} RefFrameMapPair;
+
+static INLINE void init_ref_map_pair(
+ AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) {
+ if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) {
+ memset(ref_frame_map_pairs, -1, sizeof(*ref_frame_map_pairs) * REF_FRAMES);
+ return;
+ }
+ memset(ref_frame_map_pairs, 0, sizeof(*ref_frame_map_pairs) * REF_FRAMES);
+ for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+ // Get reference frame buffer.
+ const RefCntBuffer *const buf = cpi->common.ref_frame_map[map_idx];
+ if (ref_frame_map_pairs[map_idx].disp_order == -1) continue;
+ if (buf == NULL) {
+ ref_frame_map_pairs[map_idx].disp_order = -1;
+ ref_frame_map_pairs[map_idx].pyr_level = -1;
+ continue;
+ } else if (buf->ref_count > 1) {
+ // Once the keyframe is coded, the slots in ref_frame_map will all
+ // point to the same frame. In that case, all subsequent pointers
+ // matching the current are considered "free" slots. This will find
+ // the next occurrence of the current pointer if ref_count indicates
+ // there are multiple instances of it and mark it as free.
+ for (int idx2 = map_idx + 1; idx2 < REF_FRAMES; ++idx2) {
+ const RefCntBuffer *const buf2 = cpi->common.ref_frame_map[idx2];
+ if (buf2 == buf) {
+ ref_frame_map_pairs[idx2].disp_order = -1;
+ ref_frame_map_pairs[idx2].pyr_level = -1;
+ }
+ }
+ }
+ ref_frame_map_pairs[map_idx].disp_order = (int)buf->display_order_hint;
+ ref_frame_map_pairs[map_idx].pyr_level = buf->pyramid_level;
+ }
+}
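+
+/*
+ * Typical usage (a sketch): build the pairs once before deciding which
+ * reference slot to refresh; slots whose disp_order is -1 are free.
+ *
+ *   RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+ *   init_ref_map_pair(cpi, ref_frame_map_pairs);
+ */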
+
+#if CONFIG_FPMT_TEST
+static AOM_INLINE void calc_frame_data_update_flag(
+ GF_GROUP *const gf_group, int gf_frame_index,
+ bool *const do_frame_data_update) {
+ *do_frame_data_update = true;
+ // Set the flag to false for all frames in a given parallel encode set except
+ // the last frame in the set with frame_parallel_level = 2.
+ if (gf_group->frame_parallel_level[gf_frame_index] == 1) {
+ *do_frame_data_update = false;
+ } else if (gf_group->frame_parallel_level[gf_frame_index] == 2) {
+ // Check if this is the last frame in the set with frame_parallel_level = 2.
+ for (int i = gf_frame_index + 1; i < gf_group->size; i++) {
+ if ((gf_group->frame_parallel_level[i] == 0 &&
+ (gf_group->update_type[i] == ARF_UPDATE ||
+ gf_group->update_type[i] == INTNL_ARF_UPDATE)) ||
+ gf_group->frame_parallel_level[i] == 1) {
+ break;
+ } else if (gf_group->frame_parallel_level[i] == 2) {
+ *do_frame_data_update = false;
+ break;
+ }
+ }
+ }
+}
+#endif
+
+// AV1 uses 10,000,000 ticks per second as its timestamp unit.
+#define TICKS_PER_SEC 10000000LL
+
+static INLINE int64_t
+timebase_units_to_ticks(const aom_rational64_t *timestamp_ratio, int64_t n) {
+ return n * timestamp_ratio->num / timestamp_ratio->den;
+}
+
+static INLINE int64_t
+ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) {
+ int64_t round = timestamp_ratio->num / 2;
+ if (round > 0) --round;
+ return (n * timestamp_ratio->den + round) / timestamp_ratio->num;
+}
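+
+// Illustrative example (not part of the upstream source): assuming the
+// ratio is initialized as (timebase.num * TICKS_PER_SEC) / timebase.den,
+// a 1/30-second timebase gives num = 10000000 and den = 30. Then
+// timebase_units_to_ticks(r, 3) = 3 * 10000000 / 30 = 1000000 ticks, and
+// ticks_to_timebase_units(r, 1000000) = (1000000 * 30 + 4999999) / 10000000
+// = 3, so the round trip is stable.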
+
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const FRAME_UPDATE_TYPE update_type =
+ gf_group->update_type[cpi->gf_frame_index];
+
+ return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE ||
+ update_type == GF_UPDATE;
+}
+
+// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
+static INLINE int av1_use_hash_me(const AV1_COMP *const cpi) {
+ return (cpi->common.features.allow_screen_content_tools &&
+ cpi->common.features.allow_intrabc &&
+ frame_is_intra_only(&cpi->common));
+}
+
+static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf(
+ const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ return buf != NULL ? &buf->buf : NULL;
+}
+
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
+ assert(buf != NULL);
+ ensure_mv_buffer(buf, cm);
+ buf->width = cm->width;
+ buf->height = cm->height;
+}
+
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
+static INLINE unsigned int allocated_tokens(const TileInfo *tile,
+ int sb_size_log2, int num_planes) {
+ int tile_mb_rows =
+ ROUND_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start, 2);
+ int tile_mb_cols =
+ ROUND_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start, 2);
+
+ return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
+}
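+
+// Worked example (illustrative): a tile spanning 38 mi units vertically has
+// tile_mb_rows = ROUND_POWER_OF_TWO(38, 2) = (38 + 2) >> 2 = 10 rows of
+// 16x16 macroblocks, i.e. 4x4 mi units are converted to MB units by
+// rounding to the nearest multiple of 4.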
+
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+ int mi_row, TokenExtra **tok, int sb_size_log2,
+ int num_planes) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+
+ const int tile_mb_cols =
+ (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+ const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
+
+ *tok = cpi->token_info.tile_tok[tile_row][tile_col] +
+ get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
+}
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
+
+#define ALT_MIN_LAG 3
+static INLINE int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) {
+ return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf;
+}
+
+static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) {
+ return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) &&
+ (gf_cfg->gf_min_pyr_height == 0);
+}
+
+// Helper function to compute the number of blocks (rounded up) along one
+// dimension of the frame.
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+ return (frame_length + mb_length - 1) / mb_length;
+}
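+
+// Example (illustrative): get_num_blocks(1924, 16) = (1924 + 15) / 16 = 121;
+// the ceiling division counts the partially covered block at the frame edge.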
+
+// Check if the encoder is in the statistics generation stage.
+static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) {
+ assert(IMPLIES(cpi->compressor_stage == LAP_STAGE,
+ cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->ppi->lap_enabled));
+ return (cpi->oxcf.pass == AOM_RC_FIRST_PASS ||
+ (cpi->compressor_stage == LAP_STAGE));
+}
+// Check if the encoder is in the two-pass statistics consumption stage.
+static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) {
+ return (cpi->oxcf.pass >= AOM_RC_SECOND_PASS);
+}
+
+// Check if the encoder is in any statistics consumption stage.
+static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) {
+ return (is_stat_consumption_stage_twopass(cpi) ||
+ (cpi->oxcf.pass == AOM_RC_ONE_PASS &&
+ (cpi->compressor_stage == ENCODE_STAGE) && cpi->ppi->lap_enabled));
+}
+
+// Decide whether 'dv_costs' needs to be allocated/stored during encoding.
+static AOM_INLINE bool av1_need_dv_costs(const AV1_COMP *const cpi) {
+ return !cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ av1_allow_intrabc(&cpi->common) && !is_stat_generation_stage(cpi);
+}
+
+/*!\endcond */
+/*!\brief Check if the current stage has no statistics
+ *
+ *\ingroup two_pass_algo
+ *
+ * \param[in]    cpi     Top-level encoder instance structure
+ *
+ * \return 1 if there are no stats for the current stage, else 0
+ */
+static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) {
+ assert(
+ IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
+ return (cpi->oxcf.pass == AOM_RC_ONE_PASS && !cpi->ppi->lap_enabled);
+}
+
+/*!\cond */
+
+static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) {
+ return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+ cpi->oxcf.gf_cfg.lag_in_frames == 0;
+}
+
+// Use default/internal reference structure for single-layer RTC.
+static INLINE int use_rtc_reference_structure_one_layer(const AV1_COMP *cpi) {
+ return is_one_pass_rt_params(cpi) && cpi->ppi->number_spatial_layers == 1 &&
+ cpi->ppi->number_temporal_layers == 1 &&
+ !cpi->ppi->rtc_ref.set_ref_frame_config;
+}
+
+// Returns the size of the frame stats buffer.
+static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
+  /* If lookahead is enabled, return num_lap_buffer + 1; else num_lag_buffer. */
+ return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer);
+}
+
+// TODO(zoeliu): To set up cpi->oxcf.gf_cfg.enable_auto_brf
+
+static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ MV_REFERENCE_FRAME ref0,
+ MV_REFERENCE_FRAME ref1) {
+ xd->block_ref_scale_factors[0] =
+ get_ref_scale_factors_const(cm, ref0 >= LAST_FRAME ? ref0 : 1);
+ xd->block_ref_scale_factors[1] =
+ get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1);
+}
+
+static INLINE int get_chessboard_index(int frame_index) {
+ return frame_index & 0x1;
+}
+
+static INLINE const int *cond_cost_list_const(const struct AV1_COMP *cpi,
+ const int *cost_list) {
+ const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+ cpi->sf.mv_sf.use_fullpel_costlist;
+ return use_cost_list ? cost_list : NULL;
+}
+
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+ const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+ cpi->sf.mv_sf.use_fullpel_costlist;
+ return use_cost_list ? cost_list : NULL;
+}
+
+// Returns the compression ratio of the current frame.
+double av1_get_compression_ratio(const AV1_COMMON *const cm,
+ size_t encoded_frame_size);
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate);
+
+void av1_setup_frame_size(AV1_COMP *cpi);
+
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
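+// Example (illustrative): with 3 temporal layers, LAYER_IDS_TO_IDX(1, 2, 3) =
+// 1 * 3 + 2 = 5, i.e. spatial layer 1 / temporal layer 2 maps to index 5 of a
+// flattened per-layer array.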
+
+// Returns 1 if a frame is scaled and 0 otherwise.
+static INLINE int av1_resize_scaled(const AV1_COMMON *cm) {
+ return cm->superres_upscaled_width != cm->render_width ||
+ cm->superres_upscaled_height != cm->render_height;
+}
+
+static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
+ return av1_superres_scaled(cm) || av1_resize_scaled(cm);
+}
+
+// Don't allow a show_existing_frame to coincide with an error resilient
+// frame. An exception can be made for a forward keyframe since it has no
+// previous dependencies.
+static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
+ return cm->show_existing_frame && (!cm->features.error_resilient_mode ||
+ cm->current_frame.frame_type == KEY_FRAME);
+}
+
+// Get index into the 'cpi->mbmi_ext_info.frame_base' array for the given
+// 'mi_row' and 'mi_col'.
+static INLINE int get_mi_ext_idx(const int mi_row, const int mi_col,
+ const BLOCK_SIZE mi_alloc_bsize,
+ const int mbmi_ext_stride) {
+ const int mi_ext_size_1d = mi_size_wide[mi_alloc_bsize];
+ const int mi_ext_row = mi_row / mi_ext_size_1d;
+ const int mi_ext_col = mi_col / mi_ext_size_1d;
+ return mi_ext_row * mbmi_ext_stride + mi_ext_col;
+}
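+
+// Example (illustrative): with mi_alloc_bsize == BLOCK_16X16 we have
+// mi_ext_size_1d == 4, so (mi_row, mi_col) == (8, 12) maps to ext row 2 and
+// ext col 3, i.e. index 2 * mbmi_ext_stride + 3.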
+
+// Lighter version of set_offsets that only sets the mode info
+// pointers.
+static INLINE void set_mode_info_offsets(
+ const CommonModeInfoParams *const mi_params,
+ const MBMIExtFrameBufferInfo *const mbmi_ext_info, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, int mi_row, int mi_col) {
+ set_mi_offsets(mi_params, xd, mi_row, mi_col);
+ const int ext_idx = get_mi_ext_idx(mi_row, mi_col, mi_params->mi_alloc_bsize,
+ mbmi_ext_info->stride);
+ x->mbmi_ext_frame = mbmi_ext_info->frame_base + ext_idx;
+}
+
+// Check to see if the given partition size is allowed for a specified number
+// of mi block rows and columns remaining in the image.
+// If not, then return the largest allowed partition size.
+static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+ int cols_left, int *bh, int *bw) {
+ int int_size = (int)bsize;
+ if (rows_left <= 0 || cols_left <= 0) {
+ return AOMMIN(bsize, BLOCK_8X8);
+ } else {
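+    // Stepping int_size down by 3 moves between square BLOCK_SIZEs (e.g.
+    // BLOCK_128X128 -> BLOCK_64X64), since square sizes sit three apart in
+    // the BLOCK_SIZE enum, separated by their two rectangular variants.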
+ for (; int_size > 0; int_size -= 3) {
+ *bh = mi_size_high[int_size];
+ *bw = mi_size_wide[int_size];
+ if ((*bh <= rows_left) && (*bw <= cols_left)) {
+ break;
+ }
+ }
+ }
+ return (BLOCK_SIZE)int_size;
+}
+
+static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = { 0,
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };
+
+// When more than 'max_allowed_refs' are available, we reduce the number of
+// reference frames one at a time based on this order.
+static const MV_REFERENCE_FRAME disable_order[] = {
+ LAST3_FRAME,
+ LAST2_FRAME,
+ ALTREF2_FRAME,
+ BWDREF_FRAME,
+};
+
+static const MV_REFERENCE_FRAME
+ ref_frame_priority_order[INTER_REFS_PER_FRAME] = {
+ LAST_FRAME, ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME,
+ ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME,
+ };
+
+static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf,
+ const int use_one_pass_rt_params,
+ const YV12_BUFFER_CONFIG **ref_frames,
+ const int ext_ref_frame_flags) {
+ // cpi->ext_flags.ref_frame_flags allows certain reference types to be
+ // disabled by the external interface. These are set by
+ // av1_apply_encoding_flags(). Start with what the external interface allows,
+ // then suppress any reference types which we have found to be duplicates.
+ int flags = ext_ref_frame_flags;
+
+ for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) {
+ const YV12_BUFFER_CONFIG *const this_ref = ref_frames[i];
+ // If this_ref has appeared before, mark the corresponding ref frame as
+ // invalid. For one_pass_rt mode, only disable GOLDEN_FRAME if it's the
+ // same as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd).
+ int index =
+ (use_one_pass_rt_params && ref_frame_priority_order[i] == GOLDEN_FRAME)
+ ? (1 + sf->rt_sf.use_nonrd_altref_frame)
+ : i;
+ for (int j = 0; j < index; ++j) {
+ // If this_ref has appeared before (same as the reference corresponding
+ // to lower index j), remove it as a reference only if that reference
+ // (for index j) is actually used as a reference.
+ if (this_ref == ref_frames[j] &&
+ (flags & (1 << (ref_frame_priority_order[j] - 1)))) {
+ flags &= ~(1 << (ref_frame_priority_order[i] - 1));
+ break;
+ }
+ }
+ }
+ return flags;
+}
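+
+// Example (illustrative): flag bits are indexed by (reference - 1) since
+// LAST_FRAME == 1. If LAST2_FRAME's buffer is identical to LAST_FRAME's and
+// the LAST_FRAME bit is still set, the loop above clears
+// (1 << (LAST2_FRAME - 1)) so the duplicate reference is dropped.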
+
+// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
+// failure. When a non-NULL aom_fixed_buf_t pointer is returned, the caller
+// must free both the buf member of the aom_fixed_buf_t and the
+// aom_fixed_buf_t pointer itself, each via a call to free().
+//
+// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically,
+// the obu_has_size_field bit is set, and the buffer contains the obu_size
+// field.
+aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi);
+
+#define MAX_GFUBOOST_FACTOR 10.0
+#define MIN_GFUBOOST_FACTOR 4.0
+
+static INLINE int is_frame_tpl_eligible(const GF_GROUP *const gf_group,
+ uint8_t index) {
+ const FRAME_UPDATE_TYPE update_type = gf_group->update_type[index];
+ return update_type == ARF_UPDATE || update_type == GF_UPDATE ||
+ update_type == KF_UPDATE;
+}
+
+static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group,
+ int selective_ref_frame,
+ int prune_ref_frames,
+ int gf_index) {
+ return (selective_ref_frame > 0) && (prune_ref_frames > 0) &&
+ !is_frame_tpl_eligible(gf_group, gf_index);
+}
+
+// Get update type of the current frame.
+static INLINE FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group,
+ int gf_frame_index) {
+ return gf_group->update_type[gf_frame_index];
+}
+
+static INLINE int av1_pixels_to_mi(int pixels) {
+ return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2;
+}
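+
+// Example (illustrative): av1_pixels_to_mi(1921) aligns to a multiple of 8
+// first (ALIGN_POWER_OF_TWO(1921, 3) = 1928) and then converts to 4x4 mi
+// units: 1928 >> MI_SIZE_LOG2 = 482.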
+
+static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) &&
+ cm->show_frame;
+}
+
+static INLINE int is_frame_resize_pending(const AV1_COMP *const cpi) {
+ const ResizePendingParams *const resize_pending_params =
+ &cpi->resize_pending_params;
+ return (resize_pending_params->width && resize_pending_params->height &&
+ (cpi->common.width != resize_pending_params->width ||
+ cpi->common.height != resize_pending_params->height));
+}
+
+// Check if loop filter is used.
+static INLINE int is_loopfilter_used(const AV1_COMMON *const cm) {
+ return !cm->features.coded_lossless && !cm->tiles.large_scale;
+}
+
+// Check if CDEF is used.
+static INLINE int is_cdef_used(const AV1_COMMON *const cm) {
+ return cm->seq_params->enable_cdef && !cm->features.coded_lossless &&
+ !cm->tiles.large_scale;
+}
+
+// Check if loop restoration filter is used.
+static INLINE int is_restoration_used(const AV1_COMMON *const cm) {
+ return cm->seq_params->enable_restoration && !cm->features.all_lossless &&
+ !cm->tiles.large_scale;
+}
+
+// Checks if post-processing filters need to be applied.
+// NOTE: This function decides if the application of different post-processing
+// filters on the reconstructed frame can be skipped at the encoder side.
+// However the computation of different filter parameters that are signaled in
+// the bitstream is still required.
+static INLINE unsigned int derive_skip_apply_postproc_filters(
+ const AV1_COMP *cpi, int use_loopfilter, int use_cdef, int use_superres,
+ int use_restoration) {
+  // Though CDEF parameter selection should depend on deblocked/loop-filtered
+  // pixels for cdef_pick_method <= CDEF_FAST_SEARCH_LVL5, in SVC real-time
+  // encoding mode the CDEF strength values are calculated from pixel values
+  // that have not been loop-filtered. Hence this case is handled separately
+  // using the condition below.
+ if (cpi->ppi->rtc_ref.non_reference_frame)
+ return (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF);
+
+ if (!cpi->oxcf.algo_cfg.skip_postproc_filtering || cpi->ppi->b_calculate_psnr)
+ return 0;
+ assert(cpi->oxcf.mode == ALLINTRA);
+
+ // The post-processing filters are applied one after the other in the
+ // following order: deblocking->cdef->superres->restoration. In case of
+ // ALLINTRA encoding, the reconstructed frame is not used as a reference
+ // frame. Hence, the application of these filters can be skipped when
+ // 1. filter parameters of the subsequent stages are not dependent on the
+ // filtered output of the current stage or
+ // 2. subsequent filtering stages are disabled
+ if (use_restoration) return SKIP_APPLY_RESTORATION;
+ if (use_superres) return SKIP_APPLY_SUPERRES;
+ if (use_cdef) {
+ // CDEF parameter selection is not dependent on the deblocked frame if
+ // cdef_pick_method is CDEF_PICK_FROM_Q. Hence the application of deblocking
+ // filters and cdef filters can be skipped in this case.
+ return (cpi->sf.lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q &&
+ use_loopfilter)
+ ? (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF)
+ : SKIP_APPLY_CDEF;
+ }
+ if (use_loopfilter) return SKIP_APPLY_LOOPFILTER;
+
+ // If we reach here, all post-processing stages are disabled, so none need to
+ // be skipped.
+ return 0;
+}
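+
+// Example (illustrative): in ALLINTRA mode with loop restoration in use, only
+// SKIP_APPLY_RESTORATION is returned; deblocking, CDEF and superres must
+// still be applied because the restoration parameter search runs on their
+// filtered output, while applying restoration itself can be skipped since the
+// frame is never used as a reference.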
+
+static INLINE void set_postproc_filter_default_params(AV1_COMMON *cm) {
+ struct loopfilter *const lf = &cm->lf;
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ RestorationInfo *const rst_info = cm->rst_info;
+
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ cdef_info->cdef_bits = 0;
+ cdef_info->cdef_strengths[0] = 0;
+ cdef_info->nb_cdef_strengths = 1;
+ cdef_info->cdef_uv_strengths[0] = 0;
+ rst_info[0].frame_restoration_type = RESTORE_NONE;
+ rst_info[1].frame_restoration_type = RESTORE_NONE;
+ rst_info[2].frame_restoration_type = RESTORE_NONE;
+}
+
+static INLINE int is_inter_tx_size_search_level_one(
+ const TX_SPEED_FEATURES *tx_sf) {
+ return (tx_sf->inter_tx_size_search_init_depth_rect >= 1 &&
+ tx_sf->inter_tx_size_search_init_depth_sqr >= 1);
+}
+
+static INLINE int get_lpf_opt_level(const SPEED_FEATURES *sf) {
+ int lpf_opt_level = 0;
+ if (is_inter_tx_size_search_level_one(&sf->tx_sf))
+ lpf_opt_level = (sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q) ? 2 : 1;
+ return lpf_opt_level;
+}
+
+// Enable switchable motion mode only if the warp or OBMC tool is allowed.
+static INLINE bool is_switchable_motion_mode_allowed(bool allow_warped_motion,
+ bool enable_obmc) {
+ return (allow_warped_motion || enable_obmc);
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static INLINE int denoise_svc(const struct AV1_COMP *const cpi) {
+ return (!cpi->ppi->use_svc ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise));
+}
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+static INLINE void av1_print_fr_partition_timing_stats(
+ const FramePartitionTimingStats *part_stats, const char *filename) {
+ FILE *f = fopen(filename, "w");
+ if (!f) {
+ return;
+ }
+
+ fprintf(f, "bsize,redo,");
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "decision_%d,", part);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "attempt_%d,", part);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "time_%d,", part);
+ }
+ fprintf(f, "\n");
+
+ static const int bsizes[6] = { 128, 64, 32, 16, 8, 4 };
+
+ for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) {
+ fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo);
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "%ld,", part_stats->partition_times[bsize_idx][part]);
+ }
+ fprintf(f, "\n");
+ }
+ fclose(f);
+}
+#endif // CONFIG_COLLECT_PARTITION_STATS == 2
+
+#if CONFIG_COLLECT_PARTITION_STATS
+static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) {
+ assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
+ bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 ||
+ bsize == BLOCK_4X4);
+ switch (bsize) {
+ case BLOCK_128X128: return 0;
+ case BLOCK_64X64: return 1;
+ case BLOCK_32X32: return 2;
+ case BLOCK_16X16: return 3;
+ case BLOCK_8X8: return 4;
+ case BLOCK_4X4: return 5;
+ default: assert(0 && "Invalid bsize for partition_stats."); return -1;
+ }
+}
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+static INLINE void start_timing(AV1_COMP *cpi, int component) {
+ aom_usec_timer_start(&cpi->component_timer[component]);
+}
+static INLINE void end_timing(AV1_COMP *cpi, int component) {
+ aom_usec_timer_mark(&cpi->component_timer[component]);
+ cpi->frame_component_time[component] +=
+ aom_usec_timer_elapsed(&cpi->component_timer[component]);
+}
+static INLINE char const *get_frame_type_enum(int type) {
+ switch (type) {
+ case 0: return "KEY_FRAME";
+ case 1: return "INTER_FRAME";
+ case 2: return "INTRA_ONLY_FRAME";
+ case 3: return "S_FRAME";
+ default: assert(0);
+ }
+ return "error";
+}
+#endif
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_H_
diff --git a/third_party/aom/av1/encoder/encoder_alloc.h b/third_party/aom/av1/encoder/encoder_alloc.h
new file mode 100644
index 0000000000..ce48496d48
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder_alloc.h
@@ -0,0 +1,531 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_ALLOC_H_
+#define AOM_AV1_ENCODER_ENCODER_ALLOC_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/pickcdef.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static AOM_INLINE void dealloc_context_buffers_ext(
+ MBMIExtFrameBufferInfo *mbmi_ext_info) {
+ aom_free(mbmi_ext_info->frame_base);
+ mbmi_ext_info->frame_base = NULL;
+ mbmi_ext_info->alloc_size = 0;
+}
+
+static AOM_INLINE void alloc_context_buffers_ext(
+ AV1_COMMON *cm, MBMIExtFrameBufferInfo *mbmi_ext_info) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+ const int mi_alloc_rows =
+ (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+ const int mi_alloc_cols =
+ (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+ const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols;
+
+ if (new_ext_mi_size > mbmi_ext_info->alloc_size) {
+ dealloc_context_buffers_ext(mbmi_ext_info);
+ CHECK_MEM_ERROR(
+ cm, mbmi_ext_info->frame_base,
+ aom_malloc(new_ext_mi_size * sizeof(*mbmi_ext_info->frame_base)));
+ mbmi_ext_info->alloc_size = new_ext_mi_size;
+ }
+ // The stride needs to be updated regardless of whether new allocation
+ // happened or not.
+ mbmi_ext_info->stride = mi_alloc_cols;
+}
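+
+// Sizing example (illustrative, assuming mi_alloc_bsize == BLOCK_16X16 so
+// mi_alloc_size_1d == 4): a 1920x1080 frame has 480x270 4x4 mi units, giving
+// mi_alloc_cols = (480 + 3) / 4 = 120 and mi_alloc_rows = (270 + 3) / 4 = 68,
+// i.e. 8160 frame_base entries.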
+
+static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ // Setup mi_params
+ mi_params->set_mb_mi(mi_params, cm->width, cm->height,
+ cpi->sf.part_sf.default_min_partition_size);
+
+ if (!is_stat_generation_stage(cpi)) av1_alloc_txb_buf(cpi);
+
+ aom_free(cpi->td.mv_costs_alloc);
+ cpi->td.mv_costs_alloc = NULL;
+ // Avoid the memory allocation of 'mv_costs_alloc' for allintra encoding
+ // mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
+ CHECK_MEM_ERROR(cm, cpi->td.mv_costs_alloc,
+ (MvCosts *)aom_calloc(1, sizeof(*cpi->td.mv_costs_alloc)));
+ cpi->td.mb.mv_costs = cpi->td.mv_costs_alloc;
+ }
+
+ av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf,
+ cm->error);
+ if (av1_setup_sms_tree(cpi, &cpi->td)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate SMS tree");
+ }
+ cpi->td.firstpass_ctx =
+ av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf);
+ if (!cpi->td.firstpass_ctx)
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+}
+
+// Allocate mbmi buffers which are used to store mode information at block
+// level.
+static AOM_INLINE void alloc_mb_mode_info_buffers(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (av1_alloc_context_buffers(cm, cm->width, cm->height,
+ cpi->sf.part_sf.default_min_partition_size)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+
+ if (!is_stat_generation_stage(cpi))
+ alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
+}
+
+static AOM_INLINE void realloc_segmentation_maps(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ // Create the encoder segmentation map and set all entries to 0
+ aom_free(cpi->enc_seg.map);
+ CHECK_MEM_ERROR(cm, cpi->enc_seg.map,
+ aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+
+ // Create a map used for cyclic background refresh.
+ if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
+ CHECK_MEM_ERROR(
+ cm, cpi->cyclic_refresh,
+ av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols));
+
+ // Create a map used to mark inactive areas.
+ aom_free(cpi->active_map.map);
+ CHECK_MEM_ERROR(cm, cpi->active_map.map,
+ aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+}
+
+static AOM_INLINE void alloc_obmc_buffers(
+ OBMCBuffer *obmc_buffer, struct aom_internal_error_info *error) {
+ AOM_CHECK_MEM_ERROR(
+ error, obmc_buffer->wsrc,
+ (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->wsrc)));
+ AOM_CHECK_MEM_ERROR(
+ error, obmc_buffer->mask,
+ (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->mask)));
+ AOM_CHECK_MEM_ERROR(
+ error, obmc_buffer->above_pred,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->above_pred)));
+ AOM_CHECK_MEM_ERROR(
+ error, obmc_buffer->left_pred,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->left_pred)));
+}
+
+static AOM_INLINE void release_obmc_buffers(OBMCBuffer *obmc_buffer) {
+ aom_free(obmc_buffer->mask);
+ aom_free(obmc_buffer->above_pred);
+ aom_free(obmc_buffer->left_pred);
+ aom_free(obmc_buffer->wsrc);
+
+ obmc_buffer->mask = NULL;
+ obmc_buffer->above_pred = NULL;
+ obmc_buffer->left_pred = NULL;
+ obmc_buffer->wsrc = NULL;
+}
+
+static AOM_INLINE void alloc_compound_type_rd_buffers(
+ struct aom_internal_error_info *error, CompoundTypeRdBuffers *const bufs) {
+ AOM_CHECK_MEM_ERROR(
+ error, bufs->pred0,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+ AOM_CHECK_MEM_ERROR(
+ error, bufs->pred1,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+ AOM_CHECK_MEM_ERROR(
+ error, bufs->residual1,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+ AOM_CHECK_MEM_ERROR(
+ error, bufs->diff10,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+ AOM_CHECK_MEM_ERROR(error, bufs->tmp_best_mask_buf,
+ (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+ sizeof(*bufs->tmp_best_mask_buf)));
+}
+
+static AOM_INLINE void release_compound_type_rd_buffers(
+ CompoundTypeRdBuffers *const bufs) {
+ aom_free(bufs->pred0);
+ aom_free(bufs->pred1);
+ aom_free(bufs->residual1);
+ aom_free(bufs->diff10);
+ aom_free(bufs->tmp_best_mask_buf);
+ av1_zero(*bufs); // Set all pointers to NULL for safety.
+}
+
+static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ TokenInfo *token_info = &cpi->token_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int num_planes = av1_num_planes(cm);
+ dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
+
+ aom_free(cpi->tile_data);
+ cpi->tile_data = NULL;
+ cpi->allocated_tiles = 0;
+ enc_row_mt->allocated_tile_cols = 0;
+ enc_row_mt->allocated_tile_rows = 0;
+
+  // Delete segmentation map.
+ aom_free(cpi->enc_seg.map);
+ cpi->enc_seg.map = NULL;
+
+ av1_cyclic_refresh_free(cpi->cyclic_refresh);
+ cpi->cyclic_refresh = NULL;
+
+ aom_free(cpi->active_map.map);
+ cpi->active_map.map = NULL;
+
+ aom_free(cpi->ssim_rdmult_scaling_factors);
+ cpi->ssim_rdmult_scaling_factors = NULL;
+
+ aom_free(cpi->tpl_rdmult_scaling_factors);
+ cpi->tpl_rdmult_scaling_factors = NULL;
+
+#if CONFIG_TUNE_VMAF
+ aom_free(cpi->vmaf_info.rdmult_scaling_factors);
+ cpi->vmaf_info.rdmult_scaling_factors = NULL;
+ aom_close_vmaf_model(cpi->vmaf_info.vmaf_model);
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ aom_free(cpi->butteraugli_info.rdmult_scaling_factors);
+ cpi->butteraugli_info.rdmult_scaling_factors = NULL;
+ aom_free_frame_buffer(&cpi->butteraugli_info.source);
+ aom_free_frame_buffer(&cpi->butteraugli_info.resized_source);
+#endif
+
+#if CONFIG_SALIENCY_MAP
+ aom_free(cpi->saliency_map);
+ aom_free(cpi->sm_scaling_factor);
+#endif
+
+ release_obmc_buffers(&cpi->td.mb.obmc_buffer);
+
+ aom_free(cpi->td.mv_costs_alloc);
+ cpi->td.mv_costs_alloc = NULL;
+ aom_free(cpi->td.dv_costs_alloc);
+ cpi->td.dv_costs_alloc = NULL;
+
+ aom_free(cpi->td.mb.sb_stats_cache);
+ cpi->td.mb.sb_stats_cache = NULL;
+
+ aom_free(cpi->td.mb.sb_fp_stats);
+ cpi->td.mb.sb_fp_stats = NULL;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+ aom_free(cpi->td.mb.rdcost);
+ cpi->td.mb.rdcost = NULL;
+#endif
+
+ av1_free_pc_tree_recursive(cpi->td.pc_root, num_planes, 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ cpi->td.pc_root = NULL;
+
+ for (int i = 0; i < 2; i++)
+ for (int j = 0; j < 2; j++) {
+ aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
+ cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
+ }
+
+ av1_hash_table_destroy(&cpi->td.mb.intrabc_hash_info.intrabc_hash_table);
+
+ aom_free(cm->tpl_mvs);
+ cm->tpl_mvs = NULL;
+
+ aom_free(cpi->td.pixel_gradient_info);
+ cpi->td.pixel_gradient_info = NULL;
+
+ aom_free(cpi->td.src_var_info_of_4x4_sub_blocks);
+ cpi->td.src_var_info_of_4x4_sub_blocks = NULL;
+
+ aom_free(cpi->td.vt64x64);
+ cpi->td.vt64x64 = NULL;
+
+ av1_free_pmc(cpi->td.firstpass_ctx, num_planes);
+ cpi->td.firstpass_ctx = NULL;
+
+ const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+ // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+ // in av1_temporal_filter() for single-threaded encode are freed in case an
+  // error is encountered during temporal filtering (due to early termination,
+  // tf_dealloc_data() in av1_temporal_filter() would not be invoked).
+ tf_dealloc_data(&cpi->td.tf_data, is_highbitdepth);
+
+ // This call ensures that tpl_tmp_buffers for single-threaded encode are freed
+ // in case of an error during tpl.
+ tpl_dealloc_temp_buffers(&cpi->td.tpl_tmp_buffers);
+
+ // This call ensures that the global motion (gm) data buffers for
+ // single-threaded encode are freed in case of an error during gm.
+ gm_dealloc_data(&cpi->td.gm_data);
+
+ // This call ensures that CDEF search context buffers are deallocated in case
+ // of an error during cdef search.
+ av1_cdef_dealloc_data(cpi->cdef_search_ctx);
+ aom_free(cpi->cdef_search_ctx);
+ cpi->cdef_search_ctx = NULL;
+
+ av1_dealloc_mb_data(&cpi->td.mb, num_planes);
+
+ av1_dealloc_mb_wiener_var_pred_buf(&cpi->td);
+
+ av1_free_txb_buf(cpi);
+ av1_free_context_buffers(cm);
+
+ aom_free_frame_buffer(&cpi->last_frame_uf);
+#if !CONFIG_REALTIME_ONLY
+ av1_free_restoration_buffers(cm);
+ av1_free_firstpass_data(&cpi->firstpass_data);
+#endif
+
+ if (!is_stat_generation_stage(cpi)) {
+ av1_free_cdef_buffers(cm, &cpi->ppi->p_mt_info.cdef_worker,
+ &cpi->mt_info.cdef_sync);
+ }
+
+ for (int plane = 0; plane < num_planes; plane++) {
+ aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+ cpi->pick_lr_ctxt.rusi[plane] = NULL;
+ }
+ aom_free(cpi->pick_lr_ctxt.dgd_avg);
+ cpi->pick_lr_ctxt.dgd_avg = NULL;
+
+ aom_free_frame_buffer(&cpi->trial_frame_rst);
+ aom_free_frame_buffer(&cpi->scaled_source);
+ aom_free_frame_buffer(&cpi->scaled_last_source);
+ aom_free_frame_buffer(&cpi->orig_source);
+ aom_free_frame_buffer(&cpi->svc.source_last_TL0);
+
+ free_token_info(token_info);
+
+ av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+ av1_free_sms_tree(&cpi->td);
+
+ aom_free(cpi->td.mb.palette_buffer);
+ release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
+ aom_free(cpi->td.mb.tmp_conv_dst);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(cpi->td.mb.tmp_pred_bufs[j]);
+ }
+
+#if CONFIG_DENOISE
+ if (cpi->denoise_and_model) {
+ aom_denoise_and_model_free(cpi->denoise_and_model);
+ cpi->denoise_and_model = NULL;
+ }
+#endif
+ if (cpi->film_grain_table) {
+ aom_film_grain_table_free(cpi->film_grain_table);
+ aom_free(cpi->film_grain_table);
+ cpi->film_grain_table = NULL;
+ }
+
+ if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi);
+ aom_free(cpi->svc.layer_context);
+ cpi->svc.layer_context = NULL;
+
+ aom_free(cpi->consec_zero_mv);
+ cpi->consec_zero_mv = NULL;
+ cpi->consec_zero_mv_alloc_size = 0;
+
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+
+ aom_free(cpi->mb_weber_stats);
+ cpi->mb_weber_stats = NULL;
+
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ aom_free(cpi->prep_rate_estimates);
+ cpi->prep_rate_estimates = NULL;
+
+ aom_free(cpi->ext_rate_distribution);
+ cpi->ext_rate_distribution = NULL;
+ }
+
+ aom_free(cpi->mb_delta_q);
+ cpi->mb_delta_q = NULL;
+}
+
+static AOM_INLINE void allocate_gradient_info_for_hog(AV1_COMP *cpi) {
+ if (!is_gradient_caching_for_hog_enabled(cpi)) return;
+
+ PixelLevelGradientInfo *pixel_gradient_info = cpi->td.pixel_gradient_info;
+ if (!pixel_gradient_info) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome;
+ CHECK_MEM_ERROR(
+ cm, pixel_gradient_info,
+ aom_malloc(sizeof(*pixel_gradient_info) * plane_types * MAX_SB_SQUARE));
+ cpi->td.pixel_gradient_info = pixel_gradient_info;
+ }
+
+ cpi->td.mb.pixel_gradient_info = pixel_gradient_info;
+}
+
+static AOM_INLINE void allocate_src_var_of_4x4_sub_block_buf(AV1_COMP *cpi) {
+ if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return;
+
+ Block4x4VarInfo *source_variance_info =
+ cpi->td.src_var_info_of_4x4_sub_blocks;
+ if (!source_variance_info) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size];
+ CHECK_MEM_ERROR(cm, source_variance_info,
+ aom_malloc(sizeof(*source_variance_info) * mi_count_in_sb));
+ cpi->td.src_var_info_of_4x4_sub_blocks = source_variance_info;
+ }
+
+ cpi->td.mb.src_var_info_of_4x4_sub_blocks = source_variance_info;
+}
+
+static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_64x64_blocks = (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4;
+ if (cpi->td.vt64x64) {
+ if (num_64x64_blocks != cpi->td.num_64x64_blocks) {
+ aom_free(cpi->td.vt64x64);
+ cpi->td.vt64x64 = NULL;
+ }
+ }
+ if (!cpi->td.vt64x64) {
+ CHECK_MEM_ERROR(cm, cpi->td.vt64x64,
+ aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks));
+ cpi->td.num_64x64_blocks = num_64x64_blocks;
+ }
+}
+
+static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source(
+ AV1_COMP *cpi, int scaled_width, int scaled_height) {
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ if (scaled_width == cpi->unscaled_source->y_crop_width &&
+ scaled_height == cpi->unscaled_source->y_crop_height) {
+ return cpi->unscaled_source;
+ }
+
+ if (aom_realloc_frame_buffer(
+ &cpi->scaled_source, scaled_width, scaled_height,
+ cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+ cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, NULL, NULL, NULL,
+ cpi->image_pyramid_levels, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to reallocate scaled source buffer");
+ assert(cpi->scaled_source.y_crop_width == scaled_width);
+ assert(cpi->scaled_source.y_crop_height == scaled_height);
+ if (!av1_resize_and_extend_frame_nonnormative(
+ cpi->unscaled_source, &cpi->scaled_source,
+ (int)cm->seq_params->bit_depth, num_planes))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to reallocate buffers during resize");
+ return &cpi->scaled_source;
+}
+
+// Deallocate thread_data allocated for the auxiliary worker threads.
+static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) {
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ const int num_tf_workers =
+ AOMMIN(p_mt_info->num_mod_workers[MOD_TF], p_mt_info->num_workers);
+ const int num_tpl_workers =
+ AOMMIN(p_mt_info->num_mod_workers[MOD_TPL], p_mt_info->num_workers);
+ const int is_highbitdepth = ppi->seq_params.use_highbitdepth;
+ const int num_planes = ppi->seq_params.monochrome ? 1 : MAX_MB_PLANE;
+ for (int t = 1; t < p_mt_info->num_workers; ++t) {
+ EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t];
+ thread_data->td = thread_data->original_td;
+ ThreadData *const td = thread_data->td;
+ if (!td) continue;
+ aom_free(td->tctx);
+ aom_free(td->palette_buffer);
+ aom_free(td->tmp_conv_dst);
+ release_compound_type_rd_buffers(&td->comp_rd_buffer);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(td->tmp_pred_bufs[j]);
+ }
+ aom_free(td->pixel_gradient_info);
+ aom_free(td->src_var_info_of_4x4_sub_blocks);
+ release_obmc_buffers(&td->obmc_buffer);
+ aom_free(td->vt64x64);
+
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ aom_free(td->hash_value_buffer[x][y]);
+ td->hash_value_buffer[x][y] = NULL;
+ }
+ }
+ aom_free(td->mv_costs_alloc);
+ td->mv_costs_alloc = NULL;
+ aom_free(td->dv_costs_alloc);
+ td->dv_costs_alloc = NULL;
+ aom_free(td->counts);
+ av1_free_pmc(td->firstpass_ctx, num_planes);
+ td->firstpass_ctx = NULL;
+ av1_free_shared_coeff_buffer(&td->shared_coeff_buf);
+ av1_free_sms_tree(td);
+ // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+ // in prepare_tf_workers() for MT encode are freed in case an error is
+    // encountered during temporal filtering (due to early termination,
+    // tf_dealloc_thread_data() in av1_tf_do_filtering_mt() would not be
+    // invoked).
+ if (t < num_tf_workers) tf_dealloc_data(&td->tf_data, is_highbitdepth);
+ // This call ensures that tpl_tmp_buffers for MT encode are freed in case of
+ // an error during tpl.
+ if (t < num_tpl_workers) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers);
+ // This call ensures that the buffers in gm_data for MT encode are freed in
+ // case of an error during gm.
+ gm_dealloc_data(&td->gm_data);
+ av1_dealloc_mb_data(&td->mb, num_planes);
+ aom_free(td->mb.sb_stats_cache);
+ td->mb.sb_stats_cache = NULL;
+ aom_free(td->mb.sb_fp_stats);
+ td->mb.sb_fp_stats = NULL;
+#if CONFIG_PARTITION_SEARCH_ORDER
+ aom_free(td->mb.rdcost);
+ td->mb.rdcost = NULL;
+#endif
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, SEARCH_PARTITION);
+ td->pc_root = NULL;
+ av1_dealloc_mb_wiener_var_pred_buf(td);
+ aom_free(td);
+ thread_data->td = NULL;
+ thread_data->original_td = NULL;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_ALLOC_H_
diff --git a/third_party/aom/av1/encoder/encoder_utils.c b/third_party/aom/av1/encoder/encoder_utils.c
new file mode 100644
index 0000000000..c35873d207
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder_utils.c
@@ -0,0 +1,1503 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aomcx.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/grain_test_vectors.h"
+#include "av1/encoder/mv_prec.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define MIN_BOOST_COMBINE_FACTOR 4.0
+#define MAX_BOOST_COMBINE_FACTOR 12.0
+
+const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = {
+ { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 },
+ { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 },
+ { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 },
+ { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 },
+ { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 },
+ { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 },
+ { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 },
+ { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 },
+ { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 },
+ { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 },
+ { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 },
+ { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 },
+ { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 },
+ { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 },
+ { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 },
+ { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 },
+ { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 },
+ { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+ { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 },
+ { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 },
+ { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 },
+ { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 },
+ { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 },
+ { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 },
+ { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 },
+ { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+ { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 },
+ { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 },
+ { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 },
+ { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 },
+ { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 },
+ { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 },
+ { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 },
+ { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 },
+ { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 },
+ { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 },
+ { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+ { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 },
+ { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 },
+ { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 },
+ { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+ { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 },
+ { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 },
+ { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 },
+ { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 },
+ { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 },
+ { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 },
+ { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 },
+ { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 },
+ { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 },
+ { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 },
+ { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 },
+ { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }
+};
+
+const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 106, 90, 90, 97, 67, 59, 70, 28,
+ 30, 38, 16, 16, 16, 0, 0, 44, 50, 26, 25 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 98, 93, 97, 68, 82, 85, 33, 30,
+ 33, 16, 16, 16, 16, 0, 0, 43, 37, 26, 16 },
+ { 0, 0, 0, 91, 80, 76, 78, 55, 49, 24, 16,
+ 16, 16, 16, 16, 16, 0, 0, 29, 45, 16, 38 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 103, 89, 89, 89, 62, 63, 76, 34,
+ 35, 32, 19, 16, 16, 0, 0, 49, 55, 29, 19 }
+};
+
+const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64,
+ 64, 64, 64 };
+
+// TODO(yunqing): the default probs can be retrained later for better
+// performance.
+const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+ [SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS] = {
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } }
+ };
+
+static void configure_static_seg_features(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ struct segmentation *const seg = &cm->seg;
+
+ double avg_q;
+#if CONFIG_FPMT_TEST
+ avg_q = ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) &&
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE))
+ ? cpi->ppi->p_rc.temp_avg_q
+ : cpi->ppi->p_rc.avg_q;
+#else
+ avg_q = cpi->ppi->p_rc.avg_q;
+#endif
+
+ int high_q = (int)(avg_q > 48.0);
+ int qi_delta;
+
+ // Disable and clear down for KF
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ // Clear down the global segmentation map
+ memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ // Disable segmentation
+ av1_disable_segmentation(seg);
+
+ // Clear down the segment features.
+ av1_clearall_segfeatures(seg);
+ } else if (cpi->refresh_frame.alt_ref_frame) {
+ // If this is an alt ref frame
+ // Clear down the global segmentation map
+ memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ // Disable segmentation and individual segment features by default
+ av1_disable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+    // If segmentation was enabled, set those features needed for the
+    // arf itself.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+
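+      // Compute a qindex delta that moves the effective quantizer from avg_q
+      // down to roughly 0.875 * avg_q for the ARF.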
+ qi_delta = av1_compute_qdelta(rc, avg_q, avg_q * 0.875,
+ cm->seq_params->bit_depth);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+ }
+ } else if (seg->enabled) {
+ // All other frames if segmentation has been enabled
+
+ // First normal frame in a valid gf or alt ref group
+ if (rc->frames_since_golden == 0) {
+ // Set up segment features for normal frames in an arf group
+ // Disable segmentation and clear down features if alt ref
+ // is not active for this group
+
+ av1_disable_segmentation(seg);
+
+ memset(cpi->enc_seg.map, 0,
+ cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ av1_clearall_segfeatures(seg);
+ } else if (rc->is_src_frame_alt_ref) {
+ // Special case where we are coding over the top of a previous
+ // alt ref frame.
+      // Segment coding disabled for compound prediction ("compred") testing.
+
+ // Enable ref frame features for segment 0 as well
+ av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+ // All mbs should use ALTREF_FRAME
+ av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+ // Skip all MBs if high Q (0,0 mv and skip coeffs)
+ if (high_q) {
+ av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ // Enable data update
+ seg->update_data = 1;
+ } else {
+ // All other frames.
+
+        // No updates; leave things as they are.
+ seg->update_map = 0;
+ seg->update_data = 0;
+ }
+ }
+}
+
+void av1_apply_active_map(AV1_COMP *cpi) {
+ struct segmentation *const seg = &cpi->common.seg;
+ unsigned char *const seg_map = cpi->enc_seg.map;
+ const unsigned char *const active_map = cpi->active_map.map;
+ int i;
+
+ assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+ if (frame_is_intra_only(&cpi->common)) {
+ cpi->active_map.enabled = 0;
+ cpi->active_map.update = 1;
+ }
+
+ if (cpi->active_map.update) {
+ if (cpi->active_map.enabled) {
+ const int num_mis =
+ cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+ for (i = 0; i < num_mis; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+ av1_enable_segmentation(seg);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H,
+ -MAX_LOOP_FILTER);
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V,
+ -MAX_LOOP_FILTER);
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U,
+ -MAX_LOOP_FILTER);
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V,
+ -MAX_LOOP_FILTER);
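+      // Note: a delta of -MAX_LOOP_FILTER clamps the effective loop filter
+      // level of the inactive segment to 0, so inactive regions are left
+      // unfiltered.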
+ } else {
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+ if (seg->enabled) {
+ seg->update_data = 1;
+ seg->update_map = 1;
+ }
+ }
+ cpi->active_map.update = 0;
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void process_tpl_stats_frame(AV1_COMP *cpi) {
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ AV1_COMMON *const cm = &cpi->common;
+
+ assert(IMPLIES(gf_group->size > 0, cpi->gf_frame_index < gf_group->size));
+
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+ if (tpl_frame->is_valid) {
+ int tpl_stride = tpl_frame->stride;
+ double intra_cost_base = 0;
+ double mc_dep_cost_base = 0;
+ double cbcmp_base = 1;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+ for (int row = 0; row < cm->mi_params.mi_rows; row += row_step) {
+ for (int col = 0; col < mi_cols_sr; col += col_step_sr) {
+ TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+ double cbcmp = (double)(this_stats->srcrf_dist);
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ intra_cost_base += log(dist_scaled) * cbcmp;
+ mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp;
+ cbcmp_base += cbcmp;
+ }
+ }
+
+ if (mc_dep_cost_base == 0) {
+ tpl_frame->is_valid = 0;
+ } else {
+ cpi->rd.r0 = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base);
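+      // Since intra_cost_base and mc_dep_cost_base are cbcmp-weighted sums
+      // of log(dist) and log(dist + mc_dep_delta), r0 is effectively the
+      // weighted geometric mean of dist / (dist + mc_dep_delta): values
+      // near 0 indicate strong temporal dependency, values near 1 almost
+      // none.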
+ if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+ if (cpi->ppi->lap_enabled) {
+ double min_boost_factor = sqrt(cpi->ppi->p_rc.baseline_gf_interval);
+ const int gfu_boost = get_gfu_boost_from_r0_lap(
+ min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0,
+ cpi->ppi->p_rc.num_stats_required_for_gfu_boost);
+ // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost,
+ // gfu_boost);
+ cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+ min_boost_factor, MAX_BOOST_COMBINE_FACTOR,
+ cpi->ppi->p_rc.gfu_boost, gfu_boost,
+ cpi->ppi->p_rc.num_stats_used_for_gfu_boost);
+ } else {
+          // TPL may only look at a subset of frames in the gf group when the
+          // speed feature 'reduce_num_frames' is on, which affects the r0
+          // calculation. Thus, to compensate for TPL not using all frames, a
+          // factor to adjust r0 is used.
+ const int gfu_boost =
+ (int)(200.0 * cpi->ppi->tpl_data.r0_adjust_factor / cpi->rd.r0);
+ cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+ MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
+ cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
+ }
+ }
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+ int *top_index) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ // Setup variables that depend on the dimensions of the frame.
+ av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
+
+#if !CONFIG_REALTIME_ONLY
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (cpi->oxcf.algo_cfg.enable_tpl_model &&
+ av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) {
+ process_tpl_stats_frame(cpi);
+ av1_tpl_rdmult_setup(cpi);
+ }
+#endif
+
+ // Decide q and q bounds.
+ *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, cpi->gf_frame_index,
+ bottom_index, top_index);
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q &&
+ cpi->ppi->tpl_data.tpl_frame[cpi->gf_frame_index].is_valid &&
+ !is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ const int tpl_q = av1_tpl_get_q_index(
+ &cpi->ppi->tpl_data, cpi->gf_frame_index, cpi->rc.active_worst_quality,
+ cm->seq_params->bit_depth);
+ *q = clamp(tpl_q, rc_cfg->best_allowed_q, rc_cfg->worst_allowed_q);
+ *top_index = *bottom_index = *q;
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE)
+ cpi->ppi->p_rc.arf_q = *q;
+ }
+
+ if (cpi->oxcf.q_cfg.use_fixed_qp_offsets && cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+ const double qratio_grad =
+ cpi->ppi->p_rc.baseline_gf_interval > 20 ? 0.2 : 0.3;
+ const double qstep_ratio =
+ 0.2 +
+ (1.0 - (double)cpi->rc.active_worst_quality / MAXQ) * qratio_grad;
+ *q = av1_get_q_index_from_qstep_ratio(
+ cpi->rc.active_worst_quality, qstep_ratio, cm->seq_params->bit_depth);
+ *top_index = *bottom_index = *q;
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == GF_UPDATE)
+ cpi->ppi->p_rc.arf_q = *q;
+ } else if (gf_group->layer_depth[cpi->gf_frame_index] <
+ gf_group->max_layer_depth) {
+ int this_height = gf_group->layer_depth[cpi->gf_frame_index];
+ int arf_q = cpi->ppi->p_rc.arf_q;
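+    // e.g. (illustration): with arf_q = 128, cq_level = 200 and
+    // layer_depth = 3, the two halving steps below give (128 + 201) / 2 =
+    // 164 and then (164 + 201) / 2 = 182, moving q geometrically toward
+    // cq_level as layer depth increases.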
+ while (this_height > 1) {
+ arf_q = (arf_q + cpi->oxcf.rc_cfg.cq_level + 1) / 2;
+ --this_height;
+ }
+ *top_index = *bottom_index = *q = arf_q;
+ }
+ }
+#endif
+
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in the second pass of a two pass encode, as it requires
+  // lagged coding, and only if the relevant speed feature flag is set.
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->sf.hl_sf.static_segmentation)
+ configure_static_seg_features(cpi);
+}
+
+static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
+ pars->num_cr_points = 0;
+ pars->cr_mult = 0;
+ pars->cr_luma_mult = 0;
+ memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
+ memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
+ pars->num_cb_points = 0;
+ pars->cb_mult = 0;
+ pars->cb_luma_mult = 0;
+ pars->chroma_scaling_from_luma = 0;
+ memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
+ memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
+}
+
+void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ const TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+ if (tune_cfg->film_grain_test_vector || tune_cfg->film_grain_table_filename ||
+ tune_cfg->content == AOM_CONTENT_FILM) {
+ seq_params->film_grain_params_present = 1;
+ } else {
+#if CONFIG_DENOISE
+ seq_params->film_grain_params_present = (oxcf->noise_level > 0);
+#else
+ seq_params->film_grain_params_present = 0;
+#endif
+ }
+}
+
+void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
+ const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ const TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+ if (cpi->film_grain_table) {
+ aom_film_grain_table_free(cpi->film_grain_table);
+ aom_free(cpi->film_grain_table);
+ cpi->film_grain_table = NULL;
+ }
+
+ if (tune_cfg->film_grain_test_vector) {
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ memcpy(&cm->film_grain_params,
+ film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1,
+ sizeof(cm->film_grain_params));
+ if (oxcf->tool_cfg.enable_monochrome)
+ reset_film_grain_chroma_params(&cm->film_grain_params);
+ cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
+ if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) {
+ cm->film_grain_params.clip_to_restricted_range = 0;
+ }
+ }
+ } else if (tune_cfg->film_grain_table_filename) {
+ CHECK_MEM_ERROR(cm, cpi->film_grain_table,
+ aom_calloc(1, sizeof(*cpi->film_grain_table)));
+
+ aom_film_grain_table_read(cpi->film_grain_table,
+ tune_cfg->film_grain_table_filename, cm->error);
+ } else if (tune_cfg->content == AOM_CONTENT_FILM) {
+ cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
+ if (oxcf->tool_cfg.enable_monochrome)
+ reset_film_grain_chroma_params(&cm->film_grain_params);
+ if (cm->seq_params->color_range == AOM_CR_FULL_RANGE)
+ cm->film_grain_params.clip_to_restricted_range = 0;
+ } else {
+ memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+ }
+}
+
+void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
+ const int phase, const int use_optimized_scaler) {
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ BufferPool *const pool = cm->buffer_pool;
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+ continue;
+ }
+
+ // For RTC-SVC: if force_zero_mode_spatial_ref is enabled, check if the
+ // motion search can be skipped for the references: last, golden, altref.
+ // If so, we can skip scaling that reference.
+ if (cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref &&
+ cpi->ppi->rtc_ref.set_ref_frame_config) {
+ if (ref_frame == LAST_FRAME && cpi->svc.skip_mvsearch_last) continue;
+ if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_mvsearch_gf) continue;
+ if (ref_frame == ALTREF_FRAME && cpi->svc.skip_mvsearch_altref)
+ continue;
+ }
+ // For RTC with superres on: golden reference only needs to be scaled
+      // if it was refreshed in the previous frame.
+ if (is_one_pass_rt_params(cpi) &&
+ cpi->oxcf.superres_cfg.enable_superres && ref_frame == GOLDEN_FRAME &&
+ cpi->rc.frame_num_last_gf_refresh <
+ (int)cm->current_frame.frame_number - 1) {
+ continue;
+ }
+
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ // Replace the reference buffer with a copy having a thicker border,
+ // if the reference buffer is higher resolution than the current
+ // frame, and the border is thin.
+ if ((ref->y_crop_width > cm->width ||
+ ref->y_crop_height > cm->height) &&
+ ref->border < AOM_BORDER_IN_PIXELS) {
+ RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame);
+ if (aom_yv12_realloc_with_new_border(
+ &ref_fb->buf, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, cpi->image_pyramid_levels,
+ num_planes) != 0) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ }
+ int force_scaling = 0;
+ RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1];
+ if (new_fb == NULL) {
+ const int new_fb_idx = get_free_fb(cm);
+ if (new_fb_idx == INVALID_IDX) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Unable to find free frame buffer");
+ }
+ force_scaling = 1;
+ new_fb = &pool->frame_bufs[new_fb_idx];
+ }
+
+ if (force_scaling || new_fb->buf.y_crop_width != cm->width ||
+ new_fb->buf.y_crop_height != cm->height) {
+ if (aom_realloc_frame_buffer(
+ &new_fb->buf, cm->width, cm->height,
+ cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+ cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) {
+ if (force_scaling) {
+ // Release the reference acquired in the get_free_fb() call above.
+ --new_fb->ref_count;
+ }
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ bool has_optimized_scaler = av1_has_optimized_scaler(
+ ref->y_crop_width, ref->y_crop_height, new_fb->buf.y_crop_width,
+ new_fb->buf.y_crop_height);
+ if (num_planes > 1) {
+ has_optimized_scaler =
+ has_optimized_scaler &&
+ av1_has_optimized_scaler(
+ ref->uv_crop_width, ref->uv_crop_height,
+ new_fb->buf.uv_crop_width, new_fb->buf.uv_crop_height);
+ }
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_optimized_scaler && has_optimized_scaler &&
+ cm->seq_params->bit_depth == AOM_BITS_8) {
+ av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
+ num_planes);
+ } else if (!av1_resize_and_extend_frame_nonnormative(
+ ref, &new_fb->buf, (int)cm->seq_params->bit_depth,
+ num_planes)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer during resize");
+ }
+#else
+ if (use_optimized_scaler && has_optimized_scaler) {
+ av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
+ num_planes);
+ } else if (!av1_resize_and_extend_frame_nonnormative(
+ ref, &new_fb->buf, (int)cm->seq_params->bit_depth,
+ num_planes)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer during resize");
+ }
+#endif
+ cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+ } else {
+ RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+ buf->buf.y_crop_width = ref->y_crop_width;
+ buf->buf.y_crop_height = ref->y_crop_height;
+ cpi->scaled_ref_buf[ref_frame - 1] = buf;
+ ++buf->ref_count;
+ }
+ } else {
+ if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+ }
+ }
+}
+
+BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
+ int height, int number_spatial_layers) {
+ if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) {
+ return BLOCK_64X64;
+ }
+ if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) {
+ return BLOCK_128X128;
+ }
+#if CONFIG_TFLITE
+ if (oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) return BLOCK_64X64;
+#endif
+ // Force 64x64 superblock size to increase resolution in perceptual
+ // AQ mode.
+ if (oxcf->mode == ALLINTRA &&
+ (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI ||
+ oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED)) {
+ return BLOCK_64X64;
+ }
+ assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
+
+ if (number_spatial_layers > 1 ||
+ oxcf->resize_cfg.resize_mode != RESIZE_NONE) {
+ // Use the configured size (top resolution) for spatial layers or
+ // on resize.
+ return AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) > 720
+ ? BLOCK_128X128
+ : BLOCK_64X64;
+ } else if (oxcf->mode == REALTIME) {
+ if (oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) {
+ const TileConfig *const tile_cfg = &oxcf->tile_cfg;
+ const int num_tiles =
+ (1 << tile_cfg->tile_columns) * (1 << tile_cfg->tile_rows);
+      // For multi-thread encode: if the number of (128x128) superblocks
+      // per tile is low, use the 64X64 superblock size.
+ if (oxcf->row_mt == 1 && oxcf->max_threads >= 4 &&
+ oxcf->max_threads >= num_tiles && AOMMIN(width, height) > 720 &&
+ (width * height) / (128 * 128 * num_tiles) <= 38)
+ return BLOCK_64X64;
+ else
+ return AOMMIN(width, height) >= 720 ? BLOCK_128X128 : BLOCK_64X64;
+ } else {
+ return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64;
+ }
+ }
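+  // e.g. (illustration): 1920x1080 screen content with 2x2 tiles gives
+  // (1920 * 1080) / (128 * 128 * 4) = 31 <= 38 superblocks per tile, so
+  // with row_mt == 1 and max_threads >= 4 the branch above picks
+  // BLOCK_64X64.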
+
+ // TODO(any): Possibly could improve this with a heuristic.
+ // When superres / resize is on, 'cm->width / height' can change between
+ // calls, so we don't apply this heuristic there.
+ // Things break if superblock size changes between the first pass and second
+ // pass encoding, which is why this heuristic is not configured as a
+ // speed-feature.
+ if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE &&
+ oxcf->resize_cfg.resize_mode == RESIZE_NONE) {
+ int is_480p_or_lesser = AOMMIN(width, height) <= 480;
+ if (oxcf->speed >= 1 && is_480p_or_lesser) return BLOCK_64X64;
+
+ // For 1080p and lower resolutions, choose SB size adaptively based on
+ // resolution and speed level for multi-thread encode.
+ int is_1080p_or_lesser = AOMMIN(width, height) <= 1080;
+ if (!is_480p_or_lesser && is_1080p_or_lesser && oxcf->mode == GOOD &&
+ oxcf->row_mt == 1 && oxcf->max_threads > 1 && oxcf->speed >= 5)
+ return BLOCK_64X64;
+
+ // For allintra encode, since the maximum partition size is set to 32X32 for
+  // speed>=6, the superblock size is set to 64X64 instead of 128X128. This
+ // improves the multithread performance due to reduction in top right delay
+ // and thread sync wastage. Currently, this setting is selectively enabled
+ // only for speed>=9 and resolutions less than 4k since cost update
+ // frequency is set to INTERNAL_COST_UPD_OFF in these cases.
+ const int is_4k_or_larger = AOMMIN(width, height) >= 2160;
+ if (oxcf->mode == ALLINTRA && oxcf->speed >= 9 && !is_4k_or_larger)
+ return BLOCK_64X64;
+ }
+ return BLOCK_128X128;
+}
+
+void av1_setup_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the encoder currently uses only two contexts;
+ // context 1 for ALTREF frames and context 0 for the others.
+
+ if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ cpi->ext_flags.use_primary_ref_none) {
+ av1_setup_past_independence(cm);
+ }
+
+ if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
+ frame_is_sframe(cm)) {
+ if (!cpi->ppi->seq_params_locked) {
+ set_sb_size(cm->seq_params,
+ av1_select_sb_size(&cpi->oxcf, cm->width, cm->height,
+ cpi->ppi->number_spatial_layers));
+ }
+ } else {
+ const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
+ if (primary_ref_buf == NULL) {
+ av1_setup_past_independence(cm);
+ cm->seg.update_map = 1;
+ cm->seg.update_data = 1;
+ } else {
+ *cm->fc = primary_ref_buf->frame_context;
+ }
+ }
+
+ av1_zero(cm->cur_frame->interp_filter_selected);
+ cm->prev_frame = get_primary_ref_frame_buf(cm);
+ cpi->vaq_refresh = 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static int get_interp_filter_selected(const AV1_COMMON *const cm,
+ MV_REFERENCE_FRAME ref,
+ InterpFilter ifilter) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+ if (buf == NULL) return 0;
+ return buf->interp_filter_selected[ifilter];
+}
+
+uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int ref_total[REF_FRAMES] = { 0 };
+ uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK;
+
+ if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_frame.alt_ref_frame)
+ return mask;
+
+ for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+ for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+ ++ifilter) {
+ ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
+ }
+ }
+ int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
+ ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
+ ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
+
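+  // A filter is considered for pruning only when LAST picked it for fewer
+  // than 1 block in 30 (count * 30 <= total); it is then pruned if its
+  // usage across the other references, weighted 20/20/20 for
+  // LAST2/LAST3/GOLDEN and 10/10/10 for BWDREF/ALTREF2/ALTREF, stays below
+  // their combined unweighted total.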
+ for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+ ++ifilter) {
+ int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
+ if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
+ int filter_score =
+ get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
+ get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
+ get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
+ if (filter_score < ref_total_total) {
+ DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter;
+ reset_interp_filter_allowed_mask(&mask, filt_type);
+ }
+ }
+ }
+ return mask;
+}
+
+#define STRICT_PSNR_DIFF_THRESH 0.9
+// Encode key frame with/without screen content tools to determine whether
+// screen content tools should be enabled for this key frame group or not.
+// The first encoding is without screen content tools.
+// The second encoding is with screen content tools.
+// We compare the psnr and frame size to make the decision.
+static void screen_content_tools_determination(
+ AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision,
+ const int allow_intrabc_orig_decision,
+ const int use_screen_content_tools_orig_decision,
+ const int is_screen_content_type_orig_decision, const int pass,
+ int *projected_size_pass, PSNR_STATS *psnr) {
+ AV1_COMMON *const cm = &cpi->common;
+ FeatureFlags *const features = &cm->features;
+
+#if CONFIG_FPMT_TEST
+ projected_size_pass[pass] =
+ ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) &&
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE))
+ ? cpi->ppi->p_rc.temp_projected_frame_size
+ : cpi->rc.projected_frame_size;
+#else
+ projected_size_pass[pass] = cpi->rc.projected_frame_size;
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass],
+ bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]);
+#endif
+ if (pass != 1) return;
+
+ const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0];
+  // Calculate the fraction of pixels that mode decision coded with palette
+  // mode in this frame.
+ const double palette_ratio =
+ (double)cpi->palette_pixel_num / (double)(cm->height * cm->width);
+ const int psnr_diff_is_large = (psnr_diff > STRICT_PSNR_DIFF_THRESH);
+ const int ratio_is_large =
+ ((palette_ratio >= 0.0001) && ((psnr_diff / palette_ratio) > 4));
+ const int is_sc_encoding_much_better = (psnr_diff_is_large || ratio_is_large);
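+  // e.g. (illustration): psnr_diff = 0.5 dB with palette_ratio = 0.1 fails
+  // the strict threshold (0.5 <= 0.9) but passes the ratio test
+  // (0.5 / 0.1 = 5 > 4), so screen content tools are still enabled.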
+ if (is_sc_encoding_much_better) {
+ // Use screen content tools, if we get coding gain.
+ features->allow_screen_content_tools = 1;
+ features->allow_intrabc = cpi->intrabc_used;
+ cpi->use_screen_content_tools = 1;
+ cpi->is_screen_content_type = 1;
+ } else {
+ // Use original screen content decision.
+ features->allow_screen_content_tools =
+ allow_screen_content_tools_orig_decision;
+ features->allow_intrabc = allow_intrabc_orig_decision;
+ cpi->use_screen_content_tools = use_screen_content_tools_orig_decision;
+ cpi->is_screen_content_type = is_screen_content_type_orig_decision;
+ }
+}
+
+// Set some encoding parameters to make the encoding process fast.
+// A fixed block partition size and a large q are used.
+static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
+ const int pass) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (pass == 0) {
+ // In the first pass, encode without screen content tools.
+ // Use a high q, and a fixed block size for fast encoding.
+ cm->features.allow_screen_content_tools = 0;
+ cm->features.allow_intrabc = 0;
+ cpi->use_screen_content_tools = 0;
+ cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+ cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+ return;
+ }
+ assert(pass == 1);
+ // In the second pass, encode with screen content tools.
+ // Use a high q, and a fixed block size for fast encoding.
+ cm->features.allow_screen_content_tools = 1;
+  // TODO(chengchen): turning intrabc on could lead to a data race issue.
+ // cm->allow_intrabc = 1;
+ cpi->use_screen_content_tools = 1;
+ cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+ cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+}
+
+// Determines whether to use screen content tools for the key frame group.
+// This function modifies "cm->features.allow_screen_content_tools",
+// "cm->features.allow_intrabc" and "cpi->use_screen_content_tools".
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+ // Variables to help determine if we should allow screen content tools.
+ int projected_size_pass[3] = { 0 };
+ PSNR_STATS psnr[3];
+ const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME;
+ const int allow_screen_content_tools_orig_decision =
+ cm->features.allow_screen_content_tools;
+ const int allow_intrabc_orig_decision = cm->features.allow_intrabc;
+ const int use_screen_content_tools_orig_decision =
+ cpi->use_screen_content_tools;
+ const int is_screen_content_type_orig_decision = cpi->is_screen_content_type;
+ // Turn off the encoding trial for forward key frame and superres.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode || oxcf->kf_cfg.fwd_kf_enabled ||
+ cpi->superres_mode != AOM_SUPERRES_NONE || oxcf->mode == REALTIME ||
+ use_screen_content_tools_orig_decision || !is_key_frame) {
+ return;
+ }
+
+ // TODO(chengchen): multiple encoding for the lossless mode is time consuming.
+ // Find a better way to determine whether screen content tools should be used
+ // for lossless coding.
+ // Use a high q and a fixed partition to do quick encoding.
+ const int q_for_screen_content_quick_run =
+ is_lossless_requested(&oxcf->rc_cfg) ? q_orig : AOMMAX(q_orig, 244);
+ const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type;
+ const BLOCK_SIZE fixed_partition_block_size_orig =
+ cpi->sf.part_sf.fixed_partition_size;
+
+ // Setup necessary params for encoding, including frame source, etc.
+
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter,
+ 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ av1_setup_frame(cpi);
+
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+ // The two encoding passes aim to help determine whether to use screen
+ // content tools, with a high q and fixed partition.
+ for (int pass = 0; pass < 2; ++pass) {
+ set_encoding_params_for_screen_content(cpi, pass);
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel,
+ q_for_screen_content_quick_run,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+
+ av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run,
+ 0);
+    // Transform / motion compensation: build the reconstruction frame.
+ av1_encode_frame(cpi);
+ // Screen content decision
+ screen_content_tools_determination(
+ cpi, allow_screen_content_tools_orig_decision,
+ allow_intrabc_orig_decision, use_screen_content_tools_orig_decision,
+ is_screen_content_type_orig_decision, pass, projected_size_pass, psnr);
+ }
+
+ // Set partition speed feature back.
+ cpi->sf.part_sf.partition_search_type = partition_search_type_orig;
+ cpi->sf.part_sf.fixed_partition_size = fixed_partition_block_size_orig;
+
+ // Free token related info if screen content coding tools are not enabled.
+ if (!cm->features.allow_screen_content_tools)
+ free_token_info(&cpi->token_info);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static void fix_interp_filter(InterpFilter *const interp_filter,
+ const FRAME_COUNTS *const counts) {
+ if (*interp_filter == SWITCHABLE) {
+ // Check to see if only one of the filters is actually used
+ int count[SWITCHABLE_FILTERS] = { 0 };
+ int num_filters_used = 0;
+ for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ count[i] += counts->switchable_interp[j][i];
+ num_filters_used += (count[i] > 0);
+ }
+ if (num_filters_used == 1) {
+      // Only one filter is used, so set it at the frame level.
+ for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ if (count[i]) {
+ *interp_filter = i;
+ break;
+ }
+ }
+ }
+ }
+}
+
+void av1_finalize_encoded_frame(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+
+ if (!cm->seq_params->reduced_still_picture_hdr &&
+ encode_show_existing_frame(cm)) {
+ RefCntBuffer *const frame_to_show =
+ cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+ if (frame_to_show == NULL) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer does not contain a reconstructed frame");
+ }
+ assert(frame_to_show->ref_count > 0);
+ assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+ }
+
+ if (!encode_show_existing_frame(cm) &&
+ cm->seq_params->film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame)) {
+    // Copy the current frame's film grain params to its corresponding
+ // RefCntBuffer slot.
+ cm->cur_frame->film_grain_params = cm->film_grain_params;
+
+ // We must update the parameters if this is not an INTER_FRAME
+ if (current_frame->frame_type != INTER_FRAME)
+ cm->cur_frame->film_grain_params.update_parameters = 1;
+
+ // Iterate the random seed for the next frame.
+ cm->film_grain_params.random_seed += 3381;
+ if (cm->film_grain_params.random_seed == 0)
+ cm->film_grain_params.random_seed = 7391;
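+    // The odd increment 3381 cycles the 16-bit seed through its full range
+    // before repeating; a seed of 0 is remapped to 7391.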
+ }
+
+ // Initialise all tiles' contexts from the global frame context
+ for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
+ for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
+ const int tile_idx = tile_row * cm->tiles.cols + tile_col;
+ cpi->tile_data[tile_idx].tctx = *cm->fc;
+ }
+ }
+
+ if (!frame_is_intra_only(cm))
+ fix_interp_filter(&cm->features.interp_filter, cpi->td.counts);
+}
+
+int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
+ const YV12_BUFFER_CONFIG *last_picture,
+ ForceIntegerMVInfo *const force_intpel_info) {
+  // Check, via exact and hash-based block matching, whether integer MV
+  // should be forced for this frame.
+ int k;
+
+ const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE;
+ const double threshold_current = 0.8;
+ const double threshold_average = 0.95;
+ const int max_history_size = 32;
+  int T = 0;  // total number of blocks
+  int C = 0;  // blocks that exactly match the collocated block
+  int S = 0;  // smooth blocks that do not match the collocated block
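+  // In effect: return 0 when this frame's (C + S) / T falls below
+  // threshold_current; return 1 when every block matches its collocated
+  // block (C == T); otherwise fall back to the history-averaged checks
+  // below.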
+
+ const int pic_width = cur_picture->y_width;
+ const int pic_height = cur_picture->y_height;
+ for (int i = 0; i + block_size <= pic_height; i += block_size) {
+ for (int j = 0; j + block_size <= pic_width; j += block_size) {
+ const int x_pos = j;
+ const int y_pos = i;
+ int match = 1;
+ T++;
+
+      // Check whether the collocated block matches the current block.
+ uint8_t *p_cur = cur_picture->y_buffer;
+ uint8_t *p_ref = last_picture->y_buffer;
+ int stride_cur = cur_picture->y_stride;
+ int stride_ref = last_picture->y_stride;
+ p_cur += (y_pos * stride_cur + x_pos);
+ p_ref += (y_pos * stride_ref + x_pos);
+
+ if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+ uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p16_cur[tmpX] != p16_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p16_cur += stride_cur;
+ p16_ref += stride_ref;
+ }
+ } else {
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p_cur[tmpX] != p_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p_cur += stride_cur;
+ p_ref += stride_ref;
+ }
+ }
+
+ if (match) {
+ C++;
+ continue;
+ }
+
+ if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
+ y_pos) ||
+ av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
+ S++;
+ continue;
+ }
+ }
+ }
+
+ assert(T > 0);
+ double cs_rate = ((double)(C + S)) / ((double)(T));
+
+ force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate;
+
+ force_intpel_info->rate_index =
+ (force_intpel_info->rate_index + 1) % max_history_size;
+ force_intpel_info->rate_size++;
+ force_intpel_info->rate_size =
+ AOMMIN(force_intpel_info->rate_size, max_history_size);
+
+ if (cs_rate < threshold_current) {
+ return 0;
+ }
+
+ if (C == T) {
+ return 1;
+ }
+
+ double cs_average = 0.0;
+
+ for (k = 0; k < force_intpel_info->rate_size; k++) {
+ cs_average += force_intpel_info->cs_rate_array[k];
+ }
+ cs_average /= force_intpel_info->rate_size;
+
+ if (cs_average < threshold_average) {
+ return 0;
+ }
+
+ if ((T - C - S) < 0) {
+ return 1;
+ }
+
+ if (cs_average > 1.01) {
+ return 1;
+ }
+
+ return 0;
+}
+
+void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ uint8_t *y_buffer = cpi->source->y_buffer;
+ const int y_stride = cpi->source->y_stride;
+ const int block_size = BLOCK_16X16;
+
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+ double log_sum = 0.0;
+
+ // Loop through each 16x16 block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ double var = 0.0, num_of_var = 0.0;
+ const int index = row * num_cols + col;
+
+ // Loop through each 8x8 block.
+ for (int mi_row = row * num_mi_h;
+ mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+ mi_row += 2) {
+ for (int mi_col = col * num_mi_w;
+ mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+ mi_col += 2) {
+ struct buf_2d buf;
+ const int row_offset_y = mi_row << 2;
+ const int col_offset_y = mi_col << 2;
+
+ buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+ buf.stride = y_stride;
+
+ var += av1_get_perpixel_variance_facade(cpi, xd, &buf, BLOCK_8X8,
+ AOM_PLANE_Y);
+ num_of_var += 1.0;
+ }
+ }
+ var = var / num_of_var;
+
+ // Curve fitting with an exponential model on all 16x16 blocks from the
+ // midres dataset.
+ var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
+
+ // As per the above computation, var will be in the range of
+ // [17.492222, 84.527656], assuming the data type is of infinite
+ // precision. The following assert conservatively checks if var is in the
+ // range of [17.0, 85.0] to avoid any issues due to the precision of the
+ // relevant data type.
+ assert(var > 17.0 && var < 85.0);
+ cpi->ssim_rdmult_scaling_factors[index] = var;
+ log_sum += log(var);
+ }
+ }
+
+ // As log_sum holds the geometric mean, it will be in the range
+ // [17.492222, 84.527656]. Hence, in the below loop, the value of
+ // cpi->ssim_rdmult_scaling_factors[index] would be in the range
+ // [0.2069, 4.8323].
+ log_sum = exp(log_sum / (double)(num_rows * num_cols));
+
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
+ }
+ }
+}
+
+// Coding context that only needs to be saved when the recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoration).
+static void save_extra_coding_context(AV1_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ AV1_COMMON *cm = &cpi->common;
+
+ cc->lf = cm->lf;
+ cc->cdef_info = cm->cdef_info;
+ cc->rc = cpi->rc;
+ cc->mv_stats = cpi->ppi->mv_stats;
+}
+
+void av1_save_all_coding_context(AV1_COMP *cpi) {
+ save_extra_coding_context(cpi);
+ if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+#if DUMP_RECON_FRAMES == 1
+
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf;
+
+ if (recon_buf == NULL) {
+ printf("Frame %d is not ready.\n", current_frame->frame_number);
+ return;
+ }
+
+ static const int flag_list[REF_FRAMES] = { 0,
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };
+ printf(
+ "\n***Frame=%d (frame_offset=%d, show_frame=%d, "
+ "show_existing_frame=%d) "
+ "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[",
+ current_frame->frame_number, current_frame->order_hint, cm->show_frame,
+ cm->show_existing_frame);
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ const int ref_offset = buf != NULL ? (int)buf->order_hint : -1;
+ printf(" %d(%c)", ref_offset,
+ (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N');
+ }
+ printf(" ]\n");
+
+ if (!cm->show_frame) {
+ printf("Frame %d is a no show frame, so no image dump.\n",
+ current_frame->frame_number);
+ return;
+ }
+
+ int h;
+ char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+ FILE *f_recon = NULL;
+
+ if (current_frame->frame_number == 0) {
+ if ((f_recon = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return;
+ }
+ } else {
+ if ((f_recon = fopen(file_name, "ab")) == NULL) {
+ printf("Unable to open file %s to append.\n", file_name);
+ return;
+ }
+ }
+ printf(
+ "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
+ "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
+ "refresh_alt_ref_frame=%d, "
+ "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
+ current_frame->frame_number, cpi->gf_frame_index,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+ current_frame->order_hint, cm->show_frame, cm->show_existing_frame,
+ cpi->rc.source_alt_ref_active, cpi->refresh_frame.alt_ref_frame,
+ recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
+#if 0
+ int ref_frame;
+ printf("get_ref_frame_map_idx: [");
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+ printf(" %d", get_ref_frame_map_idx(cm, ref_frame));
+ printf(" ]\n");
+#endif // 0
+
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
+ f_recon);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+ f_recon);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+ f_recon);
+ }
+
+ fclose(f_recon);
+}
+#endif // DUMP_RECON_FRAMES
diff --git a/third_party/aom/av1/encoder/encoder_utils.h b/third_party/aom/av1/encoder/encoder_utils.h
new file mode 100644
index 0000000000..113f62aa59
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder_utils.h
@@ -0,0 +1,1141 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODER_UTILS_H_
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+#define DUMP_RECON_FRAMES 0
+
+extern const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL]
+ [TX_TYPES];
+
+extern const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+
+extern const int default_warped_probs[FRAME_UPDATE_TYPES];
+
+extern const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+ [SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+
+// Mark all inactive blocks as active. Other segmentation features may be set,
+// so memset cannot be used; instead, only inactive blocks should be reset.
+static AOM_INLINE void suppress_active_map(AV1_COMP *cpi) {
+ unsigned char *const seg_map = cpi->enc_seg.map;
+ int i;
+ const int num_mis =
+ cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+ if (cpi->active_map.enabled || cpi->active_map.update)
+ for (i = 0; i < num_mis; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+ seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+// Returns 'size' in the number of Mode Info (MI) units. 'size' is either the
+// width or height.
+static AOM_INLINE int size_in_mi(int size) {
+ // Ensure that the decoded width and height are both multiples of
+ // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
+ // subsampling is used).
+ // This simplifies the implementation of various experiments,
+// e.g. cdef, which operates on units of 8x8 luma pixels.
+ const int aligned_size = ALIGN_POWER_OF_TWO(size, 3);
+ return aligned_size >> MI_SIZE_LOG2;
+}
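+// e.g. (illustration): size_in_mi(1920) = 1920 >> MI_SIZE_LOG2 = 480 MI
+// units, since 1920 is already a multiple of 8; size_in_mi(1916) first
+// aligns up to 1920 and yields the same 480.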
+
+static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width,
+ int height) {
+ mi_params->mi_cols = size_in_mi(width);
+ mi_params->mi_rows = size_in_mi(height);
+ mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
+
+ mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2);
+ mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2);
+ mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
+
+ const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+ mi_params->mi_alloc_stride =
+ (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+
+ assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
+ mi_size_high[mi_params->mi_alloc_bsize]);
+}
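+// e.g. (illustration): a 1920x1080 frame gives mi_cols = 480 and
+// mi_rows = 270, so mb_cols = ROUND_POWER_OF_TWO(480, 2) = 120,
+// mb_rows = 68 and MBs = 120 * 68 = 8160 16x16 macroblocks.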
+
+static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) {
+ aom_free(mi_params->mi_alloc);
+ mi_params->mi_alloc = NULL;
+ mi_params->mi_alloc_size = 0;
+ aom_free(mi_params->mi_grid_base);
+ mi_params->mi_grid_base = NULL;
+ mi_params->mi_grid_size = 0;
+ aom_free(mi_params->tx_type_map);
+ mi_params->tx_type_map = NULL;
+}
+
+static AOM_INLINE void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width,
+ int height,
+ BLOCK_SIZE min_partition_size) {
+ mi_params->mi_alloc_bsize = min_partition_size;
+
+ set_mb_mi(mi_params, width, height);
+}
+
+static AOM_INLINE void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params,
+ int width, int height,
+ BLOCK_SIZE min_partition_size) {
+ (void)min_partition_size;
+ mi_params->mi_alloc_bsize = BLOCK_16X16;
+
+ set_mb_mi(mi_params, width, height);
+}
+
+static AOM_INLINE void enc_setup_mi(CommonModeInfoParams *mi_params) {
+ const int mi_grid_size =
+ mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
+ memset(mi_params->mi_alloc, 0,
+ mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc));
+ memset(mi_params->mi_grid_base, 0,
+ mi_grid_size * sizeof(*mi_params->mi_grid_base));
+ memset(mi_params->tx_type_map, 0,
+ mi_grid_size * sizeof(*mi_params->tx_type_map));
+}
+
+static AOM_INLINE void init_buffer_indices(
+ ForceIntegerMVInfo *const force_intpel_info, int *const remapped_ref_idx) {
+ int fb_idx;
+ for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
+ remapped_ref_idx[fb_idx] = fb_idx;
+ force_intpel_info->rate_index = 0;
+ force_intpel_info->rate_size = 0;
+}
+
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+ ppi->fn_ptr[BT].sdf = SDF; \
+ ppi->fn_ptr[BT].sdaf = SDAF; \
+ ppi->fn_ptr[BT].vf = VF; \
+ ppi->fn_ptr[BT].svf = SVF; \
+ ppi->fn_ptr[BT].svaf = SVAF; \
+ ppi->fn_ptr[BT].sdx4df = SDX4DF; \
+ ppi->fn_ptr[BT].sdx3df = SDX3DF; \
+ ppi->fn_ptr[BT].jsdaf = JSDAF; \
+ ppi->fn_ptr[BT].jsvaf = JSVAF;
+
+#define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_BFP( \
+ BLOCK_##WIDTH##X##HEIGHT, aom_highbd_sad##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \
+ aom_highbd_##BD##_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##x3d_bits##BD, \
+ aom_highbd_dist_wtd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \
+ aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##WIDTH##x##HEIGHT)
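+
+// e.g. (illustration): HIGHBD_BFP_WRAPPER(64, 64, 10) expands, among other
+// assignments, to
+//   ppi->fn_ptr[BLOCK_64X64].sdf = aom_highbd_sad64x64_bits10;
+// wiring the 10-bit wrappers defined below into the function pointer table.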
+
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+ }
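+
+// The _bits10 and _bits12 wrappers above (and in the variants below) shift
+// the SAD right by 2 and 4, i.e. by bit_depth - 8, normalizing distortion
+// from 10- and 12-bit sources back to the 8-bit scale used by the RD code.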
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 4; \
+ }
+
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ }
+
+#define MAKE_BFP_JSADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param) >> \
+ 4; \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x3d)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x3d)
+#endif
+
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg)
+#if !CONFIG_REALTIME_ONLY
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg)
+#endif
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
+ ppi->fn_ptr[BT].msdf = MCSDF; \
+ ppi->fn_ptr[BT].msvf = MCSVF;
+
+#define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_masked_sad##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_##BD##_masked_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask) >> \
+ 4; \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4)
+#if !CONFIG_REALTIME_ONLY
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \
+ ppi->fn_ptr[BT].sdsf = SDSF; \
+ ppi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+#define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_sad_skip_##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d##_bits##BD)
+
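+// Wrappers for the "skip" SAD functions, which compute SAD on every other
+// row for speed; as above, the 10- and 12-bit variants normalize the result
+// to an 8-bit scale.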
+#define MAKE_SDSF_SKIP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return fnname(src, src_stride, ref, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return fnname(src, src_stride, ref, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return fnname(src, src_stride, ref, ref_stride) >> 4; \
+ }
+
+#define MAKE_SDSF_SKIP_SAD_4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x8)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x32)
+#endif
+
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x8x4d)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d)
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define HIGHBD_OBFP_WRAPPER_8(WIDTH, HEIGHT) \
+ HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits8, \
+ aom_highbd_8_obmc_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_8_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
+ ppi->fn_ptr[BT].osdf = OSDF; \
+ ppi->fn_ptr[BT].ovf = OVF; \
+ ppi->fn_ptr[BT].osvf = OSVF;
+
+#define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_##BD##_obmc_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_##BD##_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define MAKE_OBFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk); \
+ } \
+ static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 2; \
+ } \
+ static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 4; \
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ if (seq_params->use_highbitdepth) {
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8:
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(64, 16, 8)
+ HIGHBD_BFP_WRAPPER(16, 64, 8)
+ HIGHBD_BFP_WRAPPER(32, 8, 8)
+ HIGHBD_BFP_WRAPPER(8, 32, 8)
+ HIGHBD_BFP_WRAPPER(16, 4, 8)
+ HIGHBD_BFP_WRAPPER(4, 16, 8)
+#endif
+ HIGHBD_BFP_WRAPPER(32, 16, 8)
+ HIGHBD_BFP_WRAPPER(16, 32, 8)
+ HIGHBD_BFP_WRAPPER(64, 32, 8)
+ HIGHBD_BFP_WRAPPER(32, 64, 8)
+ HIGHBD_BFP_WRAPPER(32, 32, 8)
+ HIGHBD_BFP_WRAPPER(64, 64, 8)
+ HIGHBD_BFP_WRAPPER(16, 16, 8)
+ HIGHBD_BFP_WRAPPER(16, 8, 8)
+ HIGHBD_BFP_WRAPPER(8, 16, 8)
+ HIGHBD_BFP_WRAPPER(8, 8, 8)
+ HIGHBD_BFP_WRAPPER(8, 4, 8)
+ HIGHBD_BFP_WRAPPER(4, 8, 8)
+ HIGHBD_BFP_WRAPPER(4, 4, 8)
+ HIGHBD_BFP_WRAPPER(128, 128, 8)
+ HIGHBD_BFP_WRAPPER(128, 64, 8)
+ HIGHBD_BFP_WRAPPER(64, 128, 8)
+
+ HIGHBD_MBFP_WRAPPER(128, 128, 8)
+ HIGHBD_MBFP_WRAPPER(128, 64, 8)
+ HIGHBD_MBFP_WRAPPER(64, 128, 8)
+ HIGHBD_MBFP_WRAPPER(64, 64, 8)
+ HIGHBD_MBFP_WRAPPER(64, 32, 8)
+ HIGHBD_MBFP_WRAPPER(32, 64, 8)
+ HIGHBD_MBFP_WRAPPER(32, 32, 8)
+ HIGHBD_MBFP_WRAPPER(32, 16, 8)
+ HIGHBD_MBFP_WRAPPER(16, 32, 8)
+ HIGHBD_MBFP_WRAPPER(16, 16, 8)
+ HIGHBD_MBFP_WRAPPER(8, 16, 8)
+ HIGHBD_MBFP_WRAPPER(16, 8, 8)
+ HIGHBD_MBFP_WRAPPER(8, 8, 8)
+ HIGHBD_MBFP_WRAPPER(4, 8, 8)
+ HIGHBD_MBFP_WRAPPER(8, 4, 8)
+ HIGHBD_MBFP_WRAPPER(4, 4, 8)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_MBFP_WRAPPER(64, 16, 8)
+ HIGHBD_MBFP_WRAPPER(16, 64, 8)
+ HIGHBD_MBFP_WRAPPER(32, 8, 8)
+ HIGHBD_MBFP_WRAPPER(8, 32, 8)
+ HIGHBD_MBFP_WRAPPER(16, 4, 8)
+ HIGHBD_MBFP_WRAPPER(4, 16, 8)
+#endif
+
+// OBMC is excluded from the realtime-only build.
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_OBFP_WRAPPER_8(128, 128)
+ HIGHBD_OBFP_WRAPPER_8(128, 64)
+ HIGHBD_OBFP_WRAPPER_8(64, 128)
+ HIGHBD_OBFP_WRAPPER_8(64, 64)
+ HIGHBD_OBFP_WRAPPER_8(64, 32)
+ HIGHBD_OBFP_WRAPPER_8(32, 64)
+ HIGHBD_OBFP_WRAPPER_8(32, 32)
+ HIGHBD_OBFP_WRAPPER_8(32, 16)
+ HIGHBD_OBFP_WRAPPER_8(16, 32)
+ HIGHBD_OBFP_WRAPPER_8(16, 16)
+ HIGHBD_OBFP_WRAPPER_8(8, 16)
+ HIGHBD_OBFP_WRAPPER_8(16, 8)
+ HIGHBD_OBFP_WRAPPER_8(8, 8)
+ HIGHBD_OBFP_WRAPPER_8(4, 8)
+ HIGHBD_OBFP_WRAPPER_8(8, 4)
+ HIGHBD_OBFP_WRAPPER_8(4, 4)
+ HIGHBD_OBFP_WRAPPER_8(64, 16)
+ HIGHBD_OBFP_WRAPPER_8(16, 64)
+ HIGHBD_OBFP_WRAPPER_8(32, 8)
+ HIGHBD_OBFP_WRAPPER_8(8, 32)
+ HIGHBD_OBFP_WRAPPER_8(16, 4)
+ HIGHBD_OBFP_WRAPPER_8(4, 16)
+#endif
+
+ HIGHBD_SDSFP_WRAPPER(128, 128, 8)
+ HIGHBD_SDSFP_WRAPPER(128, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(64, 128, 8)
+ HIGHBD_SDSFP_WRAPPER(64, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(64, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 8, 8)
+ HIGHBD_SDSFP_WRAPPER(8, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(8, 8, 8)
+ HIGHBD_SDSFP_WRAPPER(4, 8, 8)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_SDSFP_WRAPPER(64, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 8, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(8, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(4, 16, 8)
+#endif
+ break;
+
+ case AOM_BITS_10:
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(64, 16, 10)
+ HIGHBD_BFP_WRAPPER(16, 64, 10)
+ HIGHBD_BFP_WRAPPER(32, 8, 10)
+ HIGHBD_BFP_WRAPPER(8, 32, 10)
+ HIGHBD_BFP_WRAPPER(16, 4, 10)
+ HIGHBD_BFP_WRAPPER(4, 16, 10)
+#endif
+ HIGHBD_BFP_WRAPPER(32, 16, 10)
+ HIGHBD_BFP_WRAPPER(16, 32, 10)
+ HIGHBD_BFP_WRAPPER(64, 32, 10)
+ HIGHBD_BFP_WRAPPER(32, 64, 10)
+ HIGHBD_BFP_WRAPPER(32, 32, 10)
+ HIGHBD_BFP_WRAPPER(64, 64, 10)
+ HIGHBD_BFP_WRAPPER(16, 16, 10)
+ HIGHBD_BFP_WRAPPER(16, 8, 10)
+ HIGHBD_BFP_WRAPPER(8, 16, 10)
+ HIGHBD_BFP_WRAPPER(8, 8, 10)
+ HIGHBD_BFP_WRAPPER(8, 4, 10)
+ HIGHBD_BFP_WRAPPER(4, 8, 10)
+ HIGHBD_BFP_WRAPPER(4, 4, 10)
+ HIGHBD_BFP_WRAPPER(128, 128, 10)
+ HIGHBD_BFP_WRAPPER(128, 64, 10)
+ HIGHBD_BFP_WRAPPER(64, 128, 10)
+
+ HIGHBD_MBFP_WRAPPER(128, 128, 10)
+ HIGHBD_MBFP_WRAPPER(128, 64, 10)
+ HIGHBD_MBFP_WRAPPER(64, 128, 10)
+ HIGHBD_MBFP_WRAPPER(64, 64, 10)
+ HIGHBD_MBFP_WRAPPER(64, 32, 10)
+ HIGHBD_MBFP_WRAPPER(32, 64, 10)
+ HIGHBD_MBFP_WRAPPER(32, 32, 10)
+ HIGHBD_MBFP_WRAPPER(32, 16, 10)
+ HIGHBD_MBFP_WRAPPER(16, 32, 10)
+ HIGHBD_MBFP_WRAPPER(16, 16, 10)
+ HIGHBD_MBFP_WRAPPER(8, 16, 10)
+ HIGHBD_MBFP_WRAPPER(16, 8, 10)
+ HIGHBD_MBFP_WRAPPER(8, 8, 10)
+ HIGHBD_MBFP_WRAPPER(4, 8, 10)
+ HIGHBD_MBFP_WRAPPER(8, 4, 10)
+ HIGHBD_MBFP_WRAPPER(4, 4, 10)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_MBFP_WRAPPER(64, 16, 10)
+ HIGHBD_MBFP_WRAPPER(16, 64, 10)
+ HIGHBD_MBFP_WRAPPER(32, 8, 10)
+ HIGHBD_MBFP_WRAPPER(8, 32, 10)
+ HIGHBD_MBFP_WRAPPER(16, 4, 10)
+ HIGHBD_MBFP_WRAPPER(4, 16, 10)
+#endif
+
+// OBMC is excluded from the realtime-only build.
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_OBFP_WRAPPER(128, 128, 10)
+ HIGHBD_OBFP_WRAPPER(128, 64, 10)
+ HIGHBD_OBFP_WRAPPER(64, 128, 10)
+ HIGHBD_OBFP_WRAPPER(64, 64, 10)
+ HIGHBD_OBFP_WRAPPER(64, 32, 10)
+ HIGHBD_OBFP_WRAPPER(32, 64, 10)
+ HIGHBD_OBFP_WRAPPER(32, 32, 10)
+ HIGHBD_OBFP_WRAPPER(32, 16, 10)
+ HIGHBD_OBFP_WRAPPER(16, 32, 10)
+ HIGHBD_OBFP_WRAPPER(16, 16, 10)
+ HIGHBD_OBFP_WRAPPER(8, 16, 10)
+ HIGHBD_OBFP_WRAPPER(16, 8, 10)
+ HIGHBD_OBFP_WRAPPER(8, 8, 10)
+ HIGHBD_OBFP_WRAPPER(4, 8, 10)
+ HIGHBD_OBFP_WRAPPER(8, 4, 10)
+ HIGHBD_OBFP_WRAPPER(4, 4, 10)
+ HIGHBD_OBFP_WRAPPER(64, 16, 10)
+ HIGHBD_OBFP_WRAPPER(16, 64, 10)
+ HIGHBD_OBFP_WRAPPER(32, 8, 10)
+ HIGHBD_OBFP_WRAPPER(8, 32, 10)
+ HIGHBD_OBFP_WRAPPER(16, 4, 10)
+ HIGHBD_OBFP_WRAPPER(4, 16, 10)
+#endif
+
+ HIGHBD_SDSFP_WRAPPER(128, 128, 10)
+ HIGHBD_SDSFP_WRAPPER(128, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(64, 128, 10)
+ HIGHBD_SDSFP_WRAPPER(64, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(64, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 8, 10)
+ HIGHBD_SDSFP_WRAPPER(8, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(8, 8, 10)
+ HIGHBD_SDSFP_WRAPPER(4, 8, 10)
+
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_SDSFP_WRAPPER(64, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 8, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(8, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(4, 16, 10)
+#endif
+ break;
+
+ case AOM_BITS_12:
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(64, 16, 12)
+ HIGHBD_BFP_WRAPPER(16, 64, 12)
+ HIGHBD_BFP_WRAPPER(32, 8, 12)
+ HIGHBD_BFP_WRAPPER(8, 32, 12)
+ HIGHBD_BFP_WRAPPER(16, 4, 12)
+ HIGHBD_BFP_WRAPPER(4, 16, 12)
+#endif
+ HIGHBD_BFP_WRAPPER(32, 16, 12)
+ HIGHBD_BFP_WRAPPER(16, 32, 12)
+ HIGHBD_BFP_WRAPPER(64, 32, 12)
+ HIGHBD_BFP_WRAPPER(32, 64, 12)
+ HIGHBD_BFP_WRAPPER(32, 32, 12)
+ HIGHBD_BFP_WRAPPER(64, 64, 12)
+ HIGHBD_BFP_WRAPPER(16, 16, 12)
+ HIGHBD_BFP_WRAPPER(16, 8, 12)
+ HIGHBD_BFP_WRAPPER(8, 16, 12)
+ HIGHBD_BFP_WRAPPER(8, 8, 12)
+ HIGHBD_BFP_WRAPPER(8, 4, 12)
+ HIGHBD_BFP_WRAPPER(4, 8, 12)
+ HIGHBD_BFP_WRAPPER(4, 4, 12)
+ HIGHBD_BFP_WRAPPER(128, 128, 12)
+ HIGHBD_BFP_WRAPPER(128, 64, 12)
+ HIGHBD_BFP_WRAPPER(64, 128, 12)
+
+ HIGHBD_MBFP_WRAPPER(128, 128, 12)
+ HIGHBD_MBFP_WRAPPER(128, 64, 12)
+ HIGHBD_MBFP_WRAPPER(64, 128, 12)
+ HIGHBD_MBFP_WRAPPER(64, 64, 12)
+ HIGHBD_MBFP_WRAPPER(64, 32, 12)
+ HIGHBD_MBFP_WRAPPER(32, 64, 12)
+ HIGHBD_MBFP_WRAPPER(32, 32, 12)
+ HIGHBD_MBFP_WRAPPER(32, 16, 12)
+ HIGHBD_MBFP_WRAPPER(16, 32, 12)
+ HIGHBD_MBFP_WRAPPER(16, 16, 12)
+ HIGHBD_MBFP_WRAPPER(8, 16, 12)
+ HIGHBD_MBFP_WRAPPER(16, 8, 12)
+ HIGHBD_MBFP_WRAPPER(8, 8, 12)
+ HIGHBD_MBFP_WRAPPER(4, 8, 12)
+ HIGHBD_MBFP_WRAPPER(8, 4, 12)
+ HIGHBD_MBFP_WRAPPER(4, 4, 12)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_MBFP_WRAPPER(64, 16, 12)
+ HIGHBD_MBFP_WRAPPER(16, 64, 12)
+ HIGHBD_MBFP_WRAPPER(32, 8, 12)
+ HIGHBD_MBFP_WRAPPER(8, 32, 12)
+ HIGHBD_MBFP_WRAPPER(16, 4, 12)
+ HIGHBD_MBFP_WRAPPER(4, 16, 12)
+#endif
+
+// OBMC is excluded from the realtime-only build.
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_OBFP_WRAPPER(128, 128, 12)
+ HIGHBD_OBFP_WRAPPER(128, 64, 12)
+ HIGHBD_OBFP_WRAPPER(64, 128, 12)
+ HIGHBD_OBFP_WRAPPER(64, 64, 12)
+ HIGHBD_OBFP_WRAPPER(64, 32, 12)
+ HIGHBD_OBFP_WRAPPER(32, 64, 12)
+ HIGHBD_OBFP_WRAPPER(32, 32, 12)
+ HIGHBD_OBFP_WRAPPER(32, 16, 12)
+ HIGHBD_OBFP_WRAPPER(16, 32, 12)
+ HIGHBD_OBFP_WRAPPER(16, 16, 12)
+ HIGHBD_OBFP_WRAPPER(8, 16, 12)
+ HIGHBD_OBFP_WRAPPER(16, 8, 12)
+ HIGHBD_OBFP_WRAPPER(8, 8, 12)
+ HIGHBD_OBFP_WRAPPER(4, 8, 12)
+ HIGHBD_OBFP_WRAPPER(8, 4, 12)
+ HIGHBD_OBFP_WRAPPER(4, 4, 12)
+ HIGHBD_OBFP_WRAPPER(64, 16, 12)
+ HIGHBD_OBFP_WRAPPER(16, 64, 12)
+ HIGHBD_OBFP_WRAPPER(32, 8, 12)
+ HIGHBD_OBFP_WRAPPER(8, 32, 12)
+ HIGHBD_OBFP_WRAPPER(16, 4, 12)
+ HIGHBD_OBFP_WRAPPER(4, 16, 12)
+#endif
+
+ HIGHBD_SDSFP_WRAPPER(128, 128, 12)
+ HIGHBD_SDSFP_WRAPPER(128, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(64, 128, 12)
+ HIGHBD_SDSFP_WRAPPER(64, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(64, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 8, 12)
+ HIGHBD_SDSFP_WRAPPER(8, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(8, 8, 12)
+ HIGHBD_SDSFP_WRAPPER(4, 8, 12)
+
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_SDSFP_WRAPPER(64, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 8, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(8, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(4, 16, 12)
+#endif
+ break;
+
+ default:
+ assert(0 &&
+               "seq_params->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ }
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) {
+ FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+ av1_copy(frame_probs->tx_type_probs, default_tx_type_probs);
+ }
+ if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+ av1_copy(frame_probs->obmc_probs, default_obmc_probs);
+ }
+ if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ av1_copy(frame_probs->warped_probs, default_warped_probs);
+ }
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+ av1_copy(frame_probs->switchable_interp_probs,
+ default_switchable_interp_probs);
+ }
+
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs;
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+ av1_copy(temp_frame_probs->tx_type_probs, default_tx_type_probs);
+ }
+ if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+ av1_copy(temp_frame_probs->obmc_probs, default_obmc_probs);
+ }
+ if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ av1_copy(temp_frame_probs->warped_probs, default_warped_probs);
+ }
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+ av1_copy(temp_frame_probs->switchable_interp_probs,
+ default_switchable_interp_probs);
+ }
+
+ FrameProbInfo *const temp_frame_probs_simulation =
+ &cpi->ppi->temp_frame_probs_simulation;
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+ av1_copy(temp_frame_probs_simulation->tx_type_probs,
+ default_tx_type_probs);
+ }
+ if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+ av1_copy(temp_frame_probs_simulation->obmc_probs, default_obmc_probs);
+ }
+ if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ av1_copy(temp_frame_probs_simulation->warped_probs, default_warped_probs);
+ }
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+ av1_copy(temp_frame_probs_simulation->switchable_interp_probs,
+ default_switchable_interp_probs);
+ }
+ }
+#endif  // CONFIG_FPMT_TEST
+}
+
+static AOM_INLINE void restore_cdef_coding_context(CdefInfo *const dst,
+ const CdefInfo *const src) {
+ dst->cdef_bits = src->cdef_bits;
+ dst->cdef_damping = src->cdef_damping;
+ av1_copy(dst->cdef_strengths, src->cdef_strengths);
+ av1_copy(dst->cdef_uv_strengths, src->cdef_uv_strengths);
+ dst->nb_cdef_strengths = src->nb_cdef_strengths;
+}
+
+// Coding context that only needs to be restored when the recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoration).
+static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ AV1_COMMON *cm = &cpi->common;
+ cm->lf = cc->lf;
+ restore_cdef_coding_context(&cm->cdef_info, &cc->cdef_info);
+ cpi->rc = cc->rc;
+ cpi->ppi->mv_stats = cc->mv_stats;
+}
+
+static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ return a->y_height == b->y_height && a->y_width == b->y_width &&
+ a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
+ a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
+ a->border == b->border &&
+ (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+ (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+static AOM_INLINE int update_entropy(bool *ext_refresh_frame_context,
+ bool *ext_refresh_frame_context_pending,
+ bool update) {
+ *ext_refresh_frame_context = update;
+ *ext_refresh_frame_context_pending = 1;
+ return 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
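+// Blends a prior boost with a TPL boost. The blend weight grows with
+// sqrt(frames_to_key) and is clamped to [min_factor, max_factor], so longer
+// keyframe intervals weight the prior boost more heavily while shorter ones
+// favor the TPL boost.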
+static AOM_INLINE int combine_prior_with_tpl_boost(double min_factor,
+ double max_factor,
+ int prior_boost,
+ int tpl_boost,
+ int frames_to_key) {
+ double factor = sqrt((double)frames_to_key);
+ double range = max_factor - min_factor;
+ factor = AOMMIN(factor, max_factor);
+ factor = AOMMAX(factor, min_factor);
+ factor -= min_factor;
+ int boost =
+ (int)((factor * prior_boost + (range - factor) * tpl_boost) / range);
+ return boost;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) {
+ int i;
+ AV1_COMMON *const cm = &cpi->common;
+ FeatureFlags *const features = &cm->features;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ cm->global_motion[i] = default_warp_params;
+ }
+ cpi->gm_info.search_done = 0;
+
+ av1_set_speed_features_framesize_independent(cpi, cpi->speed);
+ av1_set_rd_speed_thresholds(cpi);
+ features->interp_filter = SWITCHABLE;
+ features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+ features->allow_warped_motion, cpi->oxcf.motion_mode_cfg.enable_obmc);
+}
+
+static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) {
+  // A scaled reference only needs to be released under certain conditions:
+  // when the reference will be refreshed, or when the scaled copy has the
+  // same resolution as the reference (i.e. no scaling was actually needed).
+  // For now this is applied only to the golden frame in non-SVC RTC mode.
+ AV1_COMMON *const cm = &cpi->common;
+ const bool refresh_golden = (cpi->refresh_frame.golden_frame) ? 1 : 0;
+ bool release_golden = true;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+ const int golden_ref = (i == GOLDEN_FRAME - 1);
+ if (golden_ref && is_one_pass_rt_params(cpi) && !cpi->ppi->use_svc &&
+ buf != NULL) {
+ const RefCntBuffer *const ref = get_ref_frame_buf(cm, GOLDEN_FRAME);
+ const bool same_resoln = buf->buf.y_crop_width == ref->buf.y_crop_width &&
+ buf->buf.y_crop_height == ref->buf.y_crop_height;
+ release_golden = refresh_golden || same_resoln;
+ }
+ if (buf != NULL && (!golden_ref || (golden_ref && release_golden))) {
+ --buf->ref_count;
+ cpi->scaled_ref_buf[i] = NULL;
+ }
+ }
+}
+
+static AOM_INLINE void restore_all_coding_context(AV1_COMP *cpi) {
+ restore_extra_coding_context(cpi);
+ if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
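+// Returns 1 when the RTC one-layer reference structure allows the encoder to
+// work with one fewer reference buffer (the last slot is never refreshed).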
+static AOM_INLINE int reduce_num_ref_buffers(const AV1_COMP *cpi) {
+ const SequenceHeader *const seq_params = cpi->common.seq_params;
+ return is_one_pass_rt_params(cpi) &&
+ use_rtc_reference_structure_one_layer(cpi) &&
+ (seq_params->order_hint_info.enable_order_hint == 0) &&
+ cpi->rt_reduce_num_ref_buffers;
+}
+
+// Refresh reference frame buffers according to refresh_frame_flags.
+static AOM_INLINE void refresh_reference_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+  // All buffers are refreshed for shown keyframes and S-frames.
+  // In the RT case, the golden frame refreshes slot 6 and the other reference
+  // frames refresh slots 0 to 5. Slot 7 is never refreshed by any reference
+  // frame, so only 7 buffers are refreshed for keyframes and S-frames
+  // instead of 8.
+ int num_ref_buffers = REF_FRAMES;
+ if (reduce_num_ref_buffers(cpi)) {
+ const int refresh_all_bufs =
+ (cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET ||
+ frame_is_sframe(cm));
+ assert(IMPLIES(((cm->current_frame.refresh_frame_flags >> 7) & 1) == 1,
+ refresh_all_bufs));
+ (void)refresh_all_bufs;
+ num_ref_buffers--;
+ }
+
+ for (int ref_frame = 0; ref_frame < num_ref_buffers; ref_frame++) {
+ if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) {
+ assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame);
+ }
+ }
+}
+
+void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf);
+void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
+ const AV1EncoderConfig *oxcf);
+
+void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
+ const int phase, const int use_optimized_scaler);
+
+void av1_setup_frame(AV1_COMP *cpi);
+
+BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
+ int height, int number_spatial_layers);
+
+void av1_apply_active_map(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi);
+
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig);
+#endif
+
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+ int *top_index);
+
+void av1_finalize_encoded_frame(AV1_COMP *const cpi);
+
+int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
+ const YV12_BUFFER_CONFIG *last_picture,
+ ForceIntegerMVInfo *const force_intpel_info);
+
+void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi);
+
+void av1_save_all_coding_context(AV1_COMP *cpi);
+
+#if DUMP_RECON_FRAMES == 1
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi);
+#endif
+
+static AOM_INLINE int av1_get_enc_border_size(bool resize, bool all_intra,
+ BLOCK_SIZE sb_size) {
+  // In all-intra encoding mode, inter-frame motion search is not applicable
+  // and intraBC motion vectors are restricted to the tile boundaries, so a
+  // smaller frame border (AOM_ENC_ALLINTRA_BORDER) suffices.
+ if (resize) {
+ return AOM_BORDER_IN_PIXELS;
+ }
+ if (all_intra) {
+ return AOM_ENC_ALLINTRA_BORDER;
+ }
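+  // Otherwise scale the border with the superblock size so that motion
+  // search around the frame edges has enough padding.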
+ return block_size_wide[sb_size] + 32;
+}
+
+static AOM_INLINE bool av1_is_resize_needed(const AV1EncoderConfig *oxcf) {
+ const ResizeCfg *resize_cfg = &oxcf->resize_cfg;
+ const SuperResCfg *superres_cfg = &oxcf->superres_cfg;
+ return resize_cfg->resize_mode || superres_cfg->superres_mode;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_UTILS_H_
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
new file mode 100644
index 0000000000..5fe2a497c7
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -0,0 +1,886 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encodetxb.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/idct.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+void av1_alloc_txb_buf(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
+ const int num_sb_rows =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+ const int num_sb_cols =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+ const int size = num_sb_rows * num_sb_cols;
+ const int num_planes = av1_num_planes(cm);
+ const int subsampling_x = cm->seq_params->subsampling_x;
+ const int subsampling_y = cm->seq_params->subsampling_y;
+ const int luma_max_sb_square =
+ 1 << num_pels_log2_lookup[cm->seq_params->sb_size];
+ const int chroma_max_sb_square =
+ luma_max_sb_square >> (subsampling_x + subsampling_y);
+ const int num_tcoeffs =
+ size * (luma_max_sb_square + (num_planes - 1) * chroma_max_sb_square);
+ const int txb_unit_size = TX_SIZE_W_MIN * TX_SIZE_H_MIN;
+
+ av1_free_txb_buf(cpi);
+ // TODO(jingning): This should be further reduced.
+ CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
+ aom_malloc(sizeof(*cpi->coeff_buffer_base) * size));
+ CHECK_MEM_ERROR(
+ cm, coeff_buf_pool->tcoeff,
+ aom_memalign(32, sizeof(*coeff_buf_pool->tcoeff) * num_tcoeffs));
+ CHECK_MEM_ERROR(
+ cm, coeff_buf_pool->eobs,
+ aom_malloc(sizeof(*coeff_buf_pool->eobs) * num_tcoeffs / txb_unit_size));
+ CHECK_MEM_ERROR(cm, coeff_buf_pool->entropy_ctx,
+ aom_malloc(sizeof(*coeff_buf_pool->entropy_ctx) *
+ num_tcoeffs / txb_unit_size));
+
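+  // Carve the pooled coefficient, eob, and entropy-context allocations into
+  // per-superblock, per-plane slices.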
+ tran_low_t *tcoeff_ptr = coeff_buf_pool->tcoeff;
+ uint16_t *eob_ptr = coeff_buf_pool->eobs;
+ uint8_t *entropy_ctx_ptr = coeff_buf_pool->entropy_ctx;
+ for (int i = 0; i < size; i++) {
+ for (int plane = 0; plane < num_planes; plane++) {
+ const int max_sb_square =
+ (plane == AOM_PLANE_Y) ? luma_max_sb_square : chroma_max_sb_square;
+ cpi->coeff_buffer_base[i].tcoeff[plane] = tcoeff_ptr;
+ cpi->coeff_buffer_base[i].eobs[plane] = eob_ptr;
+ cpi->coeff_buffer_base[i].entropy_ctx[plane] = entropy_ctx_ptr;
+ tcoeff_ptr += max_sb_square;
+ eob_ptr += max_sb_square / txb_unit_size;
+ entropy_ctx_ptr += max_sb_square / txb_unit_size;
+ }
+ }
+}
+
+void av1_free_txb_buf(AV1_COMP *cpi) {
+ CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
+ aom_free(cpi->coeff_buffer_base);
+ cpi->coeff_buffer_base = NULL;
+ aom_free(coeff_buf_pool->tcoeff);
+ coeff_buf_pool->tcoeff = NULL;
+ aom_free(coeff_buf_pool->eobs);
+ coeff_buf_pool->eobs = NULL;
+ aom_free(coeff_buf_pool->entropy_ctx);
+ coeff_buf_pool->entropy_ctx = NULL;
+}
+
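+// Writes (level + 1) as an Exp-Golomb code: (length - 1) zero bits followed
+// by the binary representation of (level + 1), MSB first. For example,
+// level = 4 gives x = 5 (binary 101), so the emitted bits are 0 0 1 0 1.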
+static void write_golomb(aom_writer *w, int level) {
+ int x = level + 1;
+ int i = x;
+ int length = 0;
+
+ while (i) {
+ i >>= 1;
+ ++length;
+ }
+ assert(length > 0);
+
+ for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0);
+
+ for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01);
+}
+
+static const int8_t eob_to_pos_small[33] = {
+ 0, 1, 2, // 0-2
+ 3, 3, // 3-4
+ 4, 4, 4, 4, // 5-8
+ 5, 5, 5, 5, 5, 5, 5, 5, // 9-16
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32
+};
+
+static const int8_t eob_to_pos_large[17] = {
+  6,                               // placeholder
+ 7, // 33-64
+ 8, 8, // 65-128
+ 9, 9, 9, 9, // 129-256
+ 10, 10, 10, 10, 10, 10, 10, 10, // 257-512
+ 11 // 513-
+};
+
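+// Maps an eob value to its position token; *extra receives the offset of eob
+// within the token's group. For example, eob = 10 lies in the 9-16 group
+// (token 5), so *extra = 10 - 9 = 1.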
+int av1_get_eob_pos_token(const int eob, int *const extra) {
+ int t;
+
+ if (eob < 33) {
+ t = eob_to_pos_small[eob];
+ } else {
+ const int e = AOMMIN((eob - 1) >> 5, 16);
+ t = eob_to_pos_large[e];
+ }
+
+ *extra = eob - av1_eob_group_start[t];
+
+ return t;
+}
+
+#if CONFIG_ENTROPY_STATS
+void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
+ TX_CLASS tx_class, PLANE_TYPE plane,
+ FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts,
+ uint8_t allow_update_cdf) {
+#else
+void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+ PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx,
+ uint8_t allow_update_cdf) {
+#endif
+ int eob_extra;
+ const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
+ TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+
+ switch (eob_multi_size) {
+ case 0:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5);
+ break;
+ case 1:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6);
+ break;
+ case 2:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7);
+ break;
+ case 3:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1,
+ 8);
+ }
+ break;
+ case 4:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1,
+ 9);
+ }
+ break;
+ case 5:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1,
+ 10);
+ }
+ break;
+ case 6:
+ default:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1,
+ 11);
+ }
+ break;
+ }
+
+ if (av1_eob_offset_bits[eob_pt] > 0) {
+ int eob_ctx = eob_pt - 3;
+ int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
+ int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+#if CONFIG_ENTROPY_STATS
+ counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2);
+ }
+}
+
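+// Derives the non-zero-map context for one coefficient. The EOB coefficient
+// uses a context determined only by its scan-position bucket (0-3); all other
+// coefficients derive their context from the magnitudes of already-coded
+// neighbors in the padded levels buffer.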
+static INLINE int get_nz_map_ctx(const uint8_t *const levels,
+ const int coeff_idx, const int bhl,
+ const int width, const int scan_idx,
+ const int is_eob, const TX_SIZE tx_size,
+ const TX_CLASS tx_class) {
+ if (is_eob) {
+ if (scan_idx == 0) return 0;
+ if (scan_idx <= (width << bhl) / 8) return 1;
+ if (scan_idx <= (width << bhl) / 4) return 2;
+ return 3;
+ }
+ const int stats =
+ get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class);
+ return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class);
+}
+
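+// Fills the padded `levels` buffer with clamped absolute coefficient values.
+// Each run of `height` entries is followed by TX_PAD_HOR zeros, and the tail
+// of the buffer is zeroed up front, so context derivation can read
+// neighboring levels without bounds checks.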
+void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = height + TX_PAD_HOR;
+ uint8_t *ls = levels;
+
+ memset(levels + stride * width, 0,
+ sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+ for (int i = 0; i < width; i++) {
+ for (int j = 0; j < height; j++) {
+ *ls++ = (uint8_t)clamp(abs(coeff[i * height + j]), 0, INT8_MAX);
+ }
+ for (int j = 0; j < TX_PAD_HOR; j++) {
+ *ls++ = 0;
+ }
+ }
+}
+
+void av1_get_nz_map_contexts_c(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size, const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ for (int i = 0; i < eob; ++i) {
+ const int pos = scan[i];
+ coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bhl, width, i,
+ i == eob - 1, tx_size, tx_class);
+ }
+}
+
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
+ aom_writer *w, int blk_row, int blk_col, int plane,
+ int block, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+ (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+ const uint16_t eob = eob_txb[block];
+ const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+ const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2);
+ if (eob == 0) return;
+
+ const TX_TYPE tx_type =
+ av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+  // Only the y plane's tx_type is transmitted.
+ if (plane == 0) {
+ av1_write_tx_type(cm, xd, tx_type, tx_size, w);
+ }
+
+ int eob_extra;
+ const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ switch (eob_multi_size) {
+ case 0:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5);
+ break;
+ case 1:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6);
+ break;
+ case 2:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7);
+ break;
+ case 3:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8);
+ break;
+ case 4:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9);
+ break;
+ case 5:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10);
+ break;
+ default:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11);
+ break;
+ }
+
+ const int eob_offset_bits = av1_eob_offset_bits[eob_pt];
+ if (eob_offset_bits > 0) {
+ const int eob_ctx = eob_pt - 3;
+ int eob_shift = eob_offset_bits - 1;
+ int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ aom_write_symbol(w, bit,
+ ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2);
+ for (int i = 1; i < eob_offset_bits; i++) {
+ eob_shift = eob_offset_bits - 1 - i;
+ bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ aom_write_bit(w, bit);
+ }
+ }
+
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ const tran_low_t *tcoeff_txb =
+ cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
+ const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ const int bhl = get_txb_bhl(tx_size);
+ for (int c = eob - 1; c >= 0; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = tcoeff[pos];
+ const tran_low_t level = abs(v);
+
+ if (c == eob - 1) {
+ aom_write_symbol(
+ w, AOMMIN(level, 3) - 1,
+ ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3);
+ } else {
+ aom_write_symbol(w, AOMMIN(level, 3),
+ ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx],
+ 4);
+ }
+ if (level > NUM_BASE_LEVELS) {
+      // The level exceeds the base levels; code the base-range symbols.
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ aom_cdf_prob *cdf =
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ aom_write_symbol(w, k, cdf, BR_CDF_SIZE);
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ }
+
+ // Loop to code all signs in the transform block,
+ // starting with the sign of DC (if applicable)
+ for (int c = 0; c < eob; ++c) {
+ const tran_low_t v = tcoeff[scan[c]];
+ const tran_low_t level = abs(v);
+ const int sign = (v < 0) ? 1 : 0;
+ if (level) {
+ if (c == 0) {
+ const int dc_sign_ctx =
+ (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK;
+ aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
+ 2);
+ } else {
+ aom_write_bit(w, sign);
+ }
+ if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);
+ }
+ }
+}
+
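+// Writes the coefficients of every transform block in an intra macroblock,
+// iterating over 64x64 processing units, then planes, then transform blocks.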
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+ aom_writer *w, BLOCK_SIZE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int num_planes = av1_num_planes(cm);
+ int block[MAX_MB_PLANE] = { 0 };
+ int row, col;
+ assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+ for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ const int step = stepr * stepc;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int unit_height = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
+ const int unit_width = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
+ for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += stepr) {
+ for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += stepc) {
+ av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane,
+ block[plane], tx_size);
+ block[plane] += step;
+ }
+ }
+ }
+ }
+ }
+}
+
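+// Sums the absolute quantized levels (capped at COEFF_CONTEXT_MASK) and folds
+// the DC sign into the upper bits; the result seeds the above/left entropy
+// contexts consumed by neighboring transform blocks.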
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob) {
+ const int16_t *const scan = scan_order->scan;
+ int cul_level = 0;
+ int c;
+
+ if (eob == 0) return 0;
+ for (c = 0; c < eob; ++c) {
+ cul_level += abs(qcoeff[scan[c]]);
+ if (cul_level > COEFF_CONTEXT_MASK) break;
+ }
+
+ cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+ set_dc_sign(&cul_level, qcoeff[0]);
+
+ return (uint8_t)cul_level;
+}
+
+static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int blk_row, int blk_col,
+ int plane, TX_SIZE tx_size,
+ FRAME_COUNTS *counts,
+ uint8_t allow_update_cdf) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int is_inter = is_inter_block(mbmi);
+ const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+#if !CONFIG_ENTROPY_STATS
+ (void)counts;
+#endif // !CONFIG_ENTROPY_STATS
+
+  // Only the y plane's tx_type is updated.
+ if (plane > 0) return;
+ const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col,
+ tx_size, reduced_tx_set_used);
+ if (is_inter) {
+ if (cpi->oxcf.txfm_cfg.use_inter_dct_only) {
+ assert(tx_type == DCT_DCT);
+ }
+ } else {
+ if (cpi->oxcf.txfm_cfg.use_intra_dct_only) {
+ assert(tx_type == DCT_DCT);
+ } else if (cpi->oxcf.txfm_cfg.use_intra_default_tx_only) {
+ const TX_TYPE default_type = get_default_tx_type(
+ PLANE_TYPE_Y, xd, tx_size, cpi->use_screen_content_tools);
+ (void)default_type;
+      // TODO(kyslov): We don't always respect the use_intra_default_tx_only
+      // flag in the NonRD and REALTIME cases. Specifically, we ignore it in
+      // hybrid intra mode search, when picking an intra mode in nonRD inter
+      // mode search, and in RD REALTIME mode when we limit TX type usage.
+      // We need to fix the txfm cfg for these cases; until then, relax the
+      // assert.
+ assert(tx_type == default_type || cpi->sf.rt_sf.use_nonrd_pick_mode ||
+ cpi->oxcf.mode == REALTIME);
+ }
+ }
+
+ if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
+ cm->quant_params.base_qindex > 0 && !mbmi->skip_txfm &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
+ if (eset > 0) {
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used);
+ if (is_inter) {
+ if (allow_update_cdf) {
+ update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
+ av1_ext_tx_ind[tx_set_type][tx_type],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+#if CONFIG_ENTROPY_STATS
+ ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]]
+ [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+#if CONFIG_ENTROPY_STATS
+ ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir]
+ [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(
+ fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir],
+ av1_ext_tx_ind[tx_set_type][tx_type],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+ }
+ }
+ }
+}
+
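+// Updates entropy-coding CDFs/counts for one transform block and records its
+// coefficients, eob, and contexts into the coded-block buffers consumed later
+// by the pack-bitstream stage.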
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int eob = p->eobs[block];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + block_offset;
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type =
+ av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ tran_low_t *tcoeff;
+ assert(args->dry_run != DRY_RUN_COSTCOEFFS);
+ if (args->dry_run == OUTPUT_ENABLED) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane,
+ pd->above_entropy_context + blk_col,
+ pd->left_entropy_context + blk_row, &txb_ctx);
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const uint8_t allow_update_cdf = args->allow_update_cdf;
+ const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#if CONFIG_ENTROPY_STATS
+ int cdf_idx = cm->coef_cdf_category;
+ ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx],
+ eob == 0, 2);
+ }
+
+ CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+ const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+ (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+ uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+ entropy_ctx[block] = txb_ctx.txb_skip_ctx;
+ eob_txb[block] = eob;
+
+ if (eob == 0) {
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col,
+ blk_row);
+ return;
+ }
+ const int segment_id = mbmi->segment_id;
+ const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ tran_low_t *tcoeff_txb =
+ cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
+ tcoeff = tcoeff_txb + block_offset;
+ memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size,
+ td->counts, allow_update_cdf);
+
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const int16_t *const scan = scan_order->scan;
+
+ // record tx type usage
+ td->rd_counts.tx_type_used[tx_size][tx_type]++;
+
+#if CONFIG_ENTROPY_STATS
+ av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+ td->counts, allow_update_cdf);
+#else
+ av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
+ allow_update_cdf);
+#endif
+
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
+ coeff_contexts);
+
+ for (int c = eob - 1; c >= 0; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = qcoeff[pos];
+ const tran_low_t level = abs(v);
+      /* abs_sum_level is needed to decide the job scheduling order for
+       * pack-bitstream multi-threading. It is not needed when
+       * multi-threading is disabled. */
+ if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level;
+
+ if (allow_update_cdf) {
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+ update_cdf(
+ ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
+ AOMMIN(level, 3) - 1, 3);
+ } else {
+ update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
+ AOMMIN(level, 3), 4);
+ }
+ }
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3) - 1];
+ } else {
+ ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3)];
+#endif
+ }
+ if (level > NUM_BASE_LEVELS) {
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx],
+ k, BR_CDF_SIZE);
+ }
+ for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type]
+ [lps][br_ctx][lps == k];
+#endif // CONFIG_ENTROPY_STATS
+ if (lps == k) break;
+ }
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx][k];
+#endif
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ }
+ // Update the context needed to code the DC sign (if applicable)
+ if (tcoeff[0] != 0) {
+ const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+ const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
+ entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT;
+ }
+ } else {
+ tcoeff = qcoeff;
+ }
+ const uint8_t cul_level =
+ av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
+ blk_col, blk_row);
+}
+
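+// Record-only variant of av1_update_and_record_txb_context: stores the
+// coefficients, eobs, and entropy contexts without updating any CDFs (counts
+// are still gathered under CONFIG_ENTROPY_STATS).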
+void av1_record_txb_context(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int eob = p->eobs[block];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + block_offset;
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type =
+ av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ tran_low_t *tcoeff;
+ assert(args->dry_run != DRY_RUN_COSTCOEFFS);
+ if (args->dry_run == OUTPUT_ENABLED) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane,
+ pd->above_entropy_context + blk_col,
+ pd->left_entropy_context + blk_row, &txb_ctx);
+#if CONFIG_ENTROPY_STATS
+ const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ int cdf_idx = cm->coef_cdf_category;
+ ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif // CONFIG_ENTROPY_STATS
+
+ CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+ const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+ (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+ uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+ entropy_ctx[block] = txb_ctx.txb_skip_ctx;
+ eob_txb[block] = eob;
+
+ if (eob == 0) {
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col,
+ blk_row);
+ return;
+ }
+ const int segment_id = mbmi->segment_id;
+ const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ tran_low_t *tcoeff_txb =
+ cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
+ tcoeff = tcoeff_txb + block_offset;
+ memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+#if CONFIG_ENTROPY_STATS
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size,
+ td->counts, 0 /*allow_update_cdf*/);
+
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const bool do_coeff_scan = true;
+#else
+ const bool do_coeff_scan = cpi->mt_info.pack_bs_mt_enabled;
+#endif
+ const int16_t *const scan = scan_order->scan;
+
+ // record tx type usage
+ td->rd_counts.tx_type_used[tx_size][tx_type]++;
+
+#if CONFIG_ENTROPY_STATS
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+ td->counts, 0 /*allow_update_cdf*/);
+
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
+ coeff_contexts);
+#endif
+
+ for (int c = eob - 1; (c >= 0) && do_coeff_scan; --c) {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const tran_low_t level = abs(v);
+      /* abs_sum_level is needed to decide the job scheduling order for
+       * pack-bitstream multi-threading. It is not needed when
+       * multi-threading is disabled. */
+ if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level;
+
+#if CONFIG_ENTROPY_STATS
+ const int coeff_ctx = coeff_contexts[pos];
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+ ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3) - 1];
+ } else {
+ ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3)];
+ }
+ if (level > NUM_BASE_LEVELS) {
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+ ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type]
+ [lps][br_ctx][lps == k];
+ if (lps == k) break;
+ }
+ ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx][k];
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+#endif
+ }
+ // Update the context needed to code the DC sign (if applicable)
+ if (tcoeff[0] != 0) {
+ const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+#if CONFIG_ENTROPY_STATS
+ const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+#endif // CONFIG_ENTROPY_STATS
+ entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT;
+ }
+ } else {
+ tcoeff = qcoeff;
+ }
+ const uint8_t cul_level =
+ av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
+ blk_col, blk_row);
+}
+
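+// Walks every transform block of an intra macroblock and dispatches to the
+// CDF-updating or record-only visitor depending on allow_update_cdf.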
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ uint8_t allow_update_cdf) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
+ if (mbmi->skip_txfm) {
+ av1_reset_entropy_context(xd, bsize, num_planes);
+ return;
+ }
+ const foreach_transformed_block_visitor visit =
+ allow_update_cdf ? av1_update_and_record_txb_context
+ : av1_record_txb_context;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, visit, &arg);
+ }
+}
+
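+// Returns the coefficient buffer of the superblock containing (mi_row,
+// mi_col).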
+CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
+ int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int stride =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+ const int offset =
+ (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
+ return cpi->coeff_buffer_base + offset;
+}
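+
+/* A worked example of the offset arithmetic above, assuming a hypothetical
+ * 1920x1080 frame coded with 128x128 superblocks (mib_size_log2 == 5, i.e.
+ * 32 mi units per superblock side):
+ *
+ *   mi_cols = 1920 / 4 = 480
+ *   stride  = CEIL_POWER_OF_TWO(480, 5) = (480 + 31) >> 5 = 15
+ *   For mi_row = 64, mi_col = 96:
+ *   offset  = (64 >> 5) * 15 + (96 >> 5) = 2 * 15 + 3 = 33
+ *
+ * i.e. the returned pointer is the 34th CB_COEFF_BUFFER in raster order. */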
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
new file mode 100644
index 0000000000..67b94046b4
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODETXB_H_
+#define AOM_AV1_ENCODER_ENCODETXB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/bitwriter.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define TXB_SKIP_CTX_MASK 15
+#define DC_SIGN_CTX_SHIFT 4
+#define DC_SIGN_CTX_MASK 3
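+
+/* A minimal sketch of how these masks unpack the per-block entropy context
+ * byte; dc_sign_ctx is packed with DC_SIGN_CTX_SHIFT in
+ * av1_update_and_record_txb_context(). The variable names below are
+ * illustrative only:
+ *
+ *   const uint8_t ctx = entropy_ctx[block];
+ *   const int txb_skip_ctx = ctx & TXB_SKIP_CTX_MASK;
+ *   const int dc_sign_ctx = (ctx >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK;
+ */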
+
+int av1_get_eob_pos_token(const int eob, int *const extra);
+
+/*!\endcond */
+/*!\brief Allocate the memory resources for all the macroblocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * Each macroblock will need a \ref CB_COEFF_BUFFER to store information for
+ * rate-distortion optimization and entropy coding of transform coefficients.
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+void av1_alloc_txb_buf(AV1_COMP *cpi);
+/*!\brief Free the memory resources for all the macroblocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * See \ref av1_alloc_txb_buf and \ref CB_COEFF_BUFFER for more details.
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+void av1_free_txb_buf(AV1_COMP *cpi);
+
+/*!\brief Write quantized coefficients in a transform block into bitstream using
+ * entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will write the quantized coefficients in a transform block into
+ * the bitstream using entropy coding.
+ *
+ * The coding steps are as follows.
+ *
+ * 1) Code the end of block position "eob", which is the scan index of the
+ * last non-zero coefficient plus one.
+ *
+ * 2) Code the lower magnitude level (<= COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ * for each coefficient in reversed scan order.
+ *
+ * 3) Code the sign and higher magnitude level
+ * (> COEFF_BASE_RANGE + NUM_BASE_LEVELS) in forward scan order.
+ *
+ * \param[in] cm Top-level structure shared by encoder and
+ * decoder
+ * \param[in] x Pointer to structure holding the data for the
+ * current encoding macroblock
+ * \param[in] w Entropy coding write pointer
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block
+ * \param[in] tx_size The given transform size
+ */
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
+ aom_writer *w, int blk_row, int blk_col, int plane,
+ int block, TX_SIZE tx_size);
+
+/*!\brief Write quantized coefficients of all transform blocks in an intra
+ * macroblock into the bitstream using entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * All transform blocks in the intra macroblock share the same transform size.
+ *
+ * This function uses \ref av1_write_coeffs_txb() to code each transform block in
+ * raster order.
+ *
+ * \param[in] cm Top-level structure shared by encoder and
+ * decoder
+ * \param[in] x Pointer to structure holding the data for the
+ * current encoding macroblock
+ * \param[in] w Entropy coding write pointer
+ * \param[in] bsize Block size of the current macroblock
+ */
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+ aom_writer *w, BLOCK_SIZE bsize);
+
+/*!\brief Pack the context info of the current transform block into a uint8_t.
+ * \ingroup coefficient_coding
+ *
+ * This context info will be collected and consolidated by its neighbor
+ * transform blocks for coding transform block skip flag (tx_skip) and
+ * the sign of DC coefficient (dc_sign).
+ *
+ * \param[in] qcoeff Buffer of quantized coefficients
+ * \param[in] scan_order Coding order of coefficients in the transform
+ * block
+ * \param[in] eob The scan index of last non-zero coefficient plus
+ * one
+ */
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for all transform blocks in the intra macroblock.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will go through each transform block in the intra macroblock
+ * and call \ref av1_update_and_record_txb_context to update the probability
+ * model and entropy context properly.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] td Top-level multithreading structure
+ * \param[in] dry_run Whether this is a dry run.
+ * \param[in] bsize Block size of the current macroblock
+ * \param[in] allow_update_cdf Allowed to update probability model (cdf) or
+ * not.
+ */
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ uint8_t allow_update_cdf);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function has two modes: a regular mode and a dry run.
+ *
+ * Regular mode:
+ *
+ * The probability model (cdf) for each coding symbol in the
+ * transform block will be updated.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] plane_bsize Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of UV planes will be smaller than the
+ * block size of Y plane.
+ * \param[in] tx_size The given transform size.
+ * \param[in] arg This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
+
+/*!\brief Update the entropy context related to coefficient coding for a
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function has two modes: a regular mode and a dry run.
+ *
+ * Regular mode:
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] plane_bsize Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of UV planes will be smaller than the
+ * block size of Y plane.
+ * \param[in] tx_size The given transform size.
+ * \param[in] arg This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
+void av1_record_txb_context(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+/*!\brief Get the corresponding \ref CB_COEFF_BUFFER of the current macroblock.
+ *
+ * \ingroup coefficient_coding
+ *
+ * The macroblock's location is described by mi_row and mi_col, row and column
+ * mi indexes in the coding frame.
+ *
+ * Each mi unit is a 4x4 pixel block.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] mi_row Row mi index of the current transform block
+ * in the frame.
+ * \param[in] mi_col Column mi index of the current transform
+ * block in the frame.
+ * \return CB_COEFF_BUFFER* Pointer of \ref CB_COEFF_BUFFER associated
+ * to this macroblock.
+ */
+CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
+ int mi_col);
+
+/*!\brief Returns the entropy cost associated with skipping the current
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in] coeff_costs Table of entropy cost for coefficient coding.
+ * \param[in] txb_ctx Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in] plane The index of the current plane
+ * \param[in] tx_size The transform size
+ */
+static INLINE int av1_cost_skip_txb(const CoeffCosts *coeff_costs,
+ const TXB_CTX *const txb_ctx, int plane,
+ TX_SIZE tx_size) {
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const LV_MAP_COEFF_COST *const coeff_costs_ =
+ &coeff_costs->coeff_costs[txs_ctx][plane_type];
+ return coeff_costs_->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+}
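+
+/* The [1] index above selects the cost of signaling txb_skip = 1. A sketch of
+ * how a caller might weigh skipping against coding the block (hypothetical
+ * usage, not an upstream helper):
+ *
+ *   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ *   const PLANE_TYPE plane_type = get_plane_type(plane);
+ *   const int skip_cost =
+ *       av1_cost_skip_txb(coeff_costs, &txb_ctx, plane, tx_size);
+ *   const int no_skip_cost =
+ *       coeff_costs->coeff_costs[txs_ctx][plane_type]
+ *           .txb_skip_cost[txb_ctx.txb_skip_ctx][0];
+ *   // Compare skip_cost against no_skip_cost plus the coefficient rate.
+ */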
+
+/*!\cond */
+// These numbers are empirically obtained.
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+ { 17, 13 },
+ { 16, 10 },
+};
+/*!\endcond */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODETXB_H_
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
new file mode 100644
index 0000000000..d6a806d504
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -0,0 +1,3469 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "av1/common/warped_motion.h"
+#include "av1/common/thread_common.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/ethread.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/firstpass.h"
+#endif
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/rdopt.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/tpl_model.h"
+
+static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+ td->rd_counts.compound_ref_used_flag |=
+ td_t->rd_counts.compound_ref_used_flag;
+ td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
+
+ for (int i = 0; i < TX_SIZES_ALL; i++) {
+ for (int j = 0; j < TX_TYPES; j++)
+ td->rd_counts.tx_type_used[i][j] += td_t->rd_counts.tx_type_used[i][j];
+ }
+
+ for (int i = 0; i < BLOCK_SIZES_ALL; i++) {
+ for (int j = 0; j < 2; j++) {
+ td->rd_counts.obmc_used[i][j] += td_t->rd_counts.obmc_used[i][j];
+ }
+ }
+
+ for (int i = 0; i < 2; i++) {
+ td->rd_counts.warped_used[i] += td_t->rd_counts.warped_used[i];
+ }
+
+ td->rd_counts.seg_tmp_pred_cost[0] += td_t->rd_counts.seg_tmp_pred_cost[0];
+ td->rd_counts.seg_tmp_pred_cost[1] += td_t->rd_counts.seg_tmp_pred_cost[1];
+
+ td->rd_counts.newmv_or_intra_blocks += td_t->rd_counts.newmv_or_intra_blocks;
+}
+
+static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+ const int mib_size = cm->seq_params->mib_size;
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int row = 0; row < cm->tiles.rows; row++) {
+ for (int col = 0; col < cm->tiles.cols; col++) {
+ TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col];
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+ mi_row += mib_size) {
+ if (mi_row == tile_info->mi_row_start)
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ for (int mi_col = tile_info->mi_col_start;
+ mi_col < tile_info->mi_col_end; mi_col += mib_size) {
+ const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col;
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str;
+ MB_MODE_INFO *mbmi = mi[0];
+ if (mbmi->skip_txfm == 1 &&
+ (mbmi->bsize == cm->seq_params->sb_size)) {
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+ mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
+ mbmi->delta_lf_from_base = xd->delta_lf_from_base;
+ } else {
+ if (cm->delta_q_info.delta_lf_multi) {
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+ xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+ } else {
+ xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+ int c) {
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+}
+
+void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+ int c, int cols) {
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+}
+
+void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+ const int nsync = row_mt_sync->sync_range;
+
+ if (r) {
+ pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > row_mt_sync->num_finished_cols[r - 1] - nsync -
+ row_mt_sync->intrabc_extra_top_right_sb_delay) {
+ pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
+ int cols) {
+#if CONFIG_MULTITHREAD
+ const int nsync = row_mt_sync->sync_range;
+ int cur;
+  // Only signal when there are enough encoded blocks for the next row to run.
+ int sig = 1;
+
+ if (c < cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = cols + nsync + row_mt_sync->intrabc_extra_top_right_sb_delay;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&row_mt_sync->mutex_[r]);
+
+ // When a thread encounters an error, num_finished_cols[r] is set to maximum
+ // column number. In this case, the AOMMAX operation here ensures that
+ // num_finished_cols[r] is not overwritten with a smaller value thus
+ // preventing the infinite waiting of threads in the relevant sync_read()
+ // function.
+ row_mt_sync->num_finished_cols[r] =
+ AOMMAX(row_mt_sync->num_finished_cols[r], cur);
+
+ pthread_cond_signal(&row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
+ }
+#else
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+#endif // CONFIG_MULTITHREAD
+}
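+
+// A minimal sketch of the read/write pairing these two functions implement
+// for the top-right dependency. The real calls are made through the
+// sync_read_ptr/sync_write_ptr function pointers inside av1_encode_sb_row();
+// encode_one_sb() is a hypothetical stand-in:
+//
+//   for (int c = 0; c < sb_cols; ++c) {
+//     av1_row_mt_sync_read(row_mt_sync, r, c);            // wait on row r - 1
+//     encode_one_sb(r, c);
+//     av1_row_mt_sync_write(row_mt_sync, r, c, sb_cols);  // publish progress
+//   }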
+
+// Allocate memory for row synchronization
+static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync,
+ AV1_COMMON *cm, int rows) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
+ aom_malloc(sizeof(*row_mt_sync->mutex_) * rows));
+ if (row_mt_sync->mutex_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
+ aom_malloc(sizeof(*row_mt_sync->cond_) * rows));
+ if (row_mt_sync->cond_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&row_mt_sync->cond_[i], NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols,
+ aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows));
+
+ row_mt_sync->rows = rows;
+ // Set up nsync.
+ row_mt_sync->sync_range = 1;
+}
+
+// Deallocate row based multi-threading synchronization related mutex and data
+void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
+ if (row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ if (row_mt_sync->mutex_ != NULL) {
+ for (i = 0; i < row_mt_sync->rows; ++i) {
+ pthread_mutex_destroy(&row_mt_sync->mutex_[i]);
+ }
+ aom_free(row_mt_sync->mutex_);
+ }
+ if (row_mt_sync->cond_ != NULL) {
+ for (i = 0; i < row_mt_sync->rows; ++i) {
+ pthread_cond_destroy(&row_mt_sync->cond_[i]);
+ }
+ aom_free(row_mt_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+ aom_free(row_mt_sync->num_finished_cols);
+
+    // Clear the structure, as this call may result from a dynamic change in
+    // the tile configuration, in which case it will be followed by an
+    // _alloc() call that may fail.
+ av1_zero(*row_mt_sync);
+ }
+}
+
+static AOM_INLINE int get_sb_rows_in_frame(AV1_COMMON *cm) {
+ return CEIL_POWER_OF_TWO(cm->mi_params.mi_rows,
+ cm->seq_params->mib_size_log2);
+}
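+
+// For example, for a hypothetical 1080p frame (mi_rows = 1080 / 4 = 270) with
+// 128x128 superblocks (mib_size_log2 = 5), this evaluates to
+// CEIL_POWER_OF_TWO(270, 5) = (270 + 31) >> 5 = 9 superblock rows.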
+
+static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
+ int alloc_row_ctx) {
+ struct AV1Common *cm = &cpi->common;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int tile_col, tile_row;
+
+ av1_row_mt_mem_dealloc(cpi);
+
+ // Allocate memory for row based multi-threading
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+
+ row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows);
+
+ if (alloc_row_ctx) {
+ assert(max_cols > 0);
+ const int num_row_ctx = AOMMAX(1, (max_cols - 1));
+ CHECK_MEM_ERROR(cm, this_tile->row_ctx,
+ (FRAME_CONTEXT *)aom_memalign(
+ 16, num_row_ctx * sizeof(*this_tile->row_ctx)));
+ }
+ }
+ }
+ const int sb_rows = get_sb_rows_in_frame(cm);
+ CHECK_MEM_ERROR(
+ cm, enc_row_mt->num_tile_cols_done,
+ aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows));
+
+ enc_row_mt->allocated_rows = max_rows;
+ enc_row_mt->allocated_cols = max_cols - 1;
+ enc_row_mt->allocated_sb_rows = sb_rows;
+}
+
+void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int tile_cols = enc_row_mt->allocated_tile_cols;
+ const int tile_rows = enc_row_mt->allocated_tile_rows;
+ int tile_col, tile_row;
+
+ // Free row based multi-threading sync memory
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+
+ av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+
+ if (cpi->oxcf.algo_cfg.cdf_update_mode) {
+ aom_free(this_tile->row_ctx);
+ this_tile->row_ctx = NULL;
+ }
+ }
+ }
+ aom_free(enc_row_mt->num_tile_cols_done);
+ enc_row_mt->num_tile_cols_done = NULL;
+ enc_row_mt->allocated_rows = 0;
+ enc_row_mt->allocated_cols = 0;
+ enc_row_mt->allocated_sb_rows = 0;
+}
+
+static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id,
+ int num_tiles, int num_workers) {
+ int tile_id = 0;
+ int i;
+
+ for (i = 0; i < num_workers; i++) {
+ thread_id_to_tile_id[i] = tile_id++;
+ if (tile_id == num_tiles) tile_id = 0;
+ }
+}
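+
+// The assignment above is a plain round-robin. For example, with num_tiles = 3
+// and num_workers = 5, thread_id_to_tile_id becomes { 0, 1, 2, 0, 1 }.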
+
+static AOM_INLINE int get_next_job(TileDataEnc *const tile_data,
+ int *current_mi_row, int mib_size) {
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+ const int mi_row_end = tile_data->tile_info.mi_row_end;
+
+ if (row_mt_sync->next_mi_row < mi_row_end) {
+ *current_mi_row = row_mt_sync->next_mi_row;
+ row_mt_sync->num_threads_working++;
+ row_mt_sync->next_mi_row += mib_size;
+ return 1;
+ }
+ return 0;
+}
+
+static AOM_INLINE void switch_tile_and_get_next_job(
+ AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id,
+ int *current_mi_row, int *end_of_frame, int is_firstpass,
+ const BLOCK_SIZE fp_block_size) {
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+
+  int tile_id = -1;  // Stores the tile ID with the minimum processing done
+ int max_mis_to_encode = 0;
+ int min_num_threads_working = INT_MAX;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &tile_data[tile_index];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+#if CONFIG_REALTIME_ONLY
+ int num_b_rows_in_tile =
+ av1_get_sb_rows_in_tile(cm, &this_tile->tile_info);
+ int num_b_cols_in_tile =
+ av1_get_sb_cols_in_tile(cm, &this_tile->tile_info);
+#else
+ int num_b_rows_in_tile =
+ is_firstpass
+ ? av1_get_unit_rows_in_tile(&this_tile->tile_info, fp_block_size)
+ : av1_get_sb_rows_in_tile(cm, &this_tile->tile_info);
+ int num_b_cols_in_tile =
+ is_firstpass
+ ? av1_get_unit_cols_in_tile(&this_tile->tile_info, fp_block_size)
+ : av1_get_sb_cols_in_tile(cm, &this_tile->tile_info);
+#endif
+ int theoretical_limit_on_threads =
+ AOMMIN((num_b_cols_in_tile + 1) >> 1, num_b_rows_in_tile);
+ int num_threads_working = row_mt_sync->num_threads_working;
+
+ if (num_threads_working < theoretical_limit_on_threads) {
+ int num_mis_to_encode =
+ this_tile->tile_info.mi_row_end - row_mt_sync->next_mi_row;
+
+ // Tile to be processed by this thread is selected on the basis of
+ // availability of jobs:
+ // 1) If jobs are available, tile to be processed is chosen on the
+ // basis of minimum number of threads working for that tile. If two or
+ // more tiles have same number of threads working for them, then the
+ // tile with maximum number of jobs available will be chosen.
+ // 2) If no jobs are available, then end_of_frame is reached.
+ if (num_mis_to_encode > 0) {
+ if (num_threads_working < min_num_threads_working) {
+ min_num_threads_working = num_threads_working;
+ max_mis_to_encode = 0;
+ }
+ if (num_threads_working == min_num_threads_working &&
+ num_mis_to_encode > max_mis_to_encode) {
+ tile_id = tile_index;
+ max_mis_to_encode = num_mis_to_encode;
+ }
+ }
+ }
+ }
+ }
+ if (tile_id == -1) {
+ *end_of_frame = 1;
+ } else {
+ // Update the current tile id to the tile id that will be processed next,
+ // which will be the least processed tile.
+ *cur_tile_id = tile_id;
+ const int unit_height = mi_size_high[fp_block_size];
+ get_next_job(&tile_data[tile_id], current_mi_row,
+ is_firstpass ? unit_height : cm->seq_params->mib_size);
+ }
+}
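+
+// A worked example of the theoretical thread limit used above: because of the
+// top-right dependency, active rows stay staggered by roughly two columns, so
+// a tile with num_b_cols_in_tile = 16 and num_b_rows_in_tile = 10 can keep at
+// most AOMMIN((16 + 1) >> 1, 10) = 8 threads busy.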
+
+#if !CONFIG_REALTIME_ONLY
+static void set_firstpass_encode_done(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+ const int unit_height = mi_size_high[fp_block_size];
+
+ // In case of multithreading of firstpass encode, due to top-right
+ // dependency, the worker on a firstpass row waits for the completion of the
+ // firstpass processing of the top and top-right fp_blocks. Hence, in case a
+ // thread (main/worker) encounters an error, update the firstpass processing
+ // of every row in the frame to indicate that it is complete in order to avoid
+ // dependent workers waiting indefinitely.
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ TileInfo *tile = &tile_data->tile_info;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+ const int unit_cols_in_tile =
+ av1_get_unit_cols_in_tile(tile, fp_block_size);
+ for (int mi_row = tile->mi_row_start, unit_row_in_tile = 0;
+ mi_row < tile->mi_row_end;
+ mi_row += unit_height, unit_row_in_tile++) {
+ enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile,
+ unit_cols_in_tile - 1, unit_cols_in_tile);
+ }
+ }
+ }
+}
+
+static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ int thread_id = thread_data->thread_id;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+ (void)unused;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ enc_row_mt->firstpass_mt_exit = true;
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ set_firstpass_encode_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ AV1_COMMON *const cm = &cpi->common;
+ int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+ assert(cur_tile_id != -1);
+
+ const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+ const int unit_height = mi_size_high[fp_block_size];
+ int end_of_frame = 0;
+ while (1) {
+ int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+ if (!firstpass_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id],
+ &current_mi_row, unit_height)) {
+ // No jobs are available for the current tile. Query for the status of
+ // other tiles and get the next job if available
+ switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+ &current_mi_row, &end_of_frame, 1,
+ fp_block_size);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ // When firstpass_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (firstpass_mt_exit || end_of_frame) break;
+
+ TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+ ThreadData *td = thread_data->td;
+
+ assert(current_mi_row != -1 &&
+ current_mi_row < this_tile->tile_info.mi_row_end);
+
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ av1_first_pass_row(cpi, td, this_tile, current_mi_row >> unit_height_log2,
+ fp_block_size);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+#endif
+
+static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data,
+ AV1EncRowMultiThreadInfo *enc_row_mt,
+ int mib_size_log2) {
+ AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync;
+ const int sb_rows = get_sb_rows_in_frame(cm);
+ AV1LfMTInfo *cur_job_info;
+ bool row_mt_exit = false;
+ (void)enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+
+ while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
+ LFWorkerData *const lf_data = (LFWorkerData *)thread_data->lf_data;
+ const int lpf_opt_level = cur_job_info->lpf_opt_level;
+ (void)sb_rows;
+#if CONFIG_MULTITHREAD
+ const int cur_sb_row = cur_job_info->mi_row >> mib_size_log2;
+ const int next_sb_row = AOMMIN(sb_rows - 1, cur_sb_row + 1);
+ // Wait for current and next superblock row to finish encoding.
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ while (!enc_row_mt->row_mt_exit &&
+ (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
+ enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols)) {
+ pthread_cond_wait(enc_row_mt->cond_, enc_row_mt_mutex_);
+ }
+ row_mt_exit = enc_row_mt->row_mt_exit;
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ if (row_mt_exit) return;
+
+ av1_thread_loop_filter_rows(
+ lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
+ cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
+ lpf_opt_level, lf_sync, &thread_data->error_info, lf_data->params_buf,
+ lf_data->tx_buf, mib_size_log2);
+ }
+}
+
+static void set_encoding_done(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int mib_size = cm->seq_params->mib_size;
+
+ // In case of row-multithreading, due to top-right dependency, the worker on
+ // an SB row waits for the completion of the encode of the top and top-right
+  // SBs. Hence, in case a thread (main/worker) encounters an error, mark the
+  // encoding of every SB row in the frame as complete in order to keep the
+  // dependent workers of every tile from waiting indefinitely.
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+ for (int mi_row = tile_info->mi_row_start, sb_row_in_tile = 0;
+ mi_row < tile_info->mi_row_end;
+ mi_row += mib_size, sb_row_in_tile++) {
+ enc_row_mt->sync_write_ptr(row_mt_sync, sb_row_in_tile,
+ sb_cols_in_tile - 1, sb_cols_in_tile);
+ }
+ }
+ }
+}
+
+static bool lpf_mt_with_enc_enabled(int pipeline_lpf_mt_with_enc,
+ const int filter_level[2]) {
+ return pipeline_lpf_mt_with_enc && (filter_level[0] || filter_level[1]);
+}
+
+static int enc_row_mt_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ int thread_id = thread_data->thread_id;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+ (void)unused;
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ AV1LfSync *const lf_sync = thread_data->lf_sync;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ xd->error_info = error_info;
+ AV1_COMMON *volatile const cm = &cpi->common;
+ volatile const bool do_pipelined_lpf_mt_with_enc = lpf_mt_with_enc_enabled(
+ cpi->mt_info.pipeline_lpf_mt_with_enc, cm->lf.filter_level);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ enc_row_mt->row_mt_exit = true;
+ // Wake up all the workers waiting in launch_loop_filter_rows() to exit in
+ // case of an error.
+ pthread_cond_broadcast(enc_row_mt->cond_);
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ set_encoding_done(cpi);
+
+ if (do_pipelined_lpf_mt_with_enc) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(lf_sync->job_mutex);
+ lf_sync->lf_mt_exit = true;
+ pthread_mutex_unlock(lf_sync->job_mutex);
+#endif
+ av1_set_vert_loop_filter_done(&cpi->common, lf_sync,
+ cpi->common.seq_params->mib_size_log2);
+ }
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+
+ // Preallocate the pc_tree for realtime coding to reduce the cost of memory
+ // allocation.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!thread_data->td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ thread_data->td->pc_root = NULL;
+ }
+
+ assert(cur_tile_id != -1);
+
+ const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+ int end_of_frame = 0;
+ bool row_mt_exit = false;
+
+  // When the master thread does not have a valid job to process, xd->tile_ctx
+  // is not set and contains a NULL pointer. This can result in a NULL pointer
+  // access violation if it is accessed beyond the encode stage. Hence,
+  // thread_data->td->mb.e_mbd.tile_ctx is initialized with the common frame
+  // context to avoid NULL pointer access in subsequent stages.
+ thread_data->td->mb.e_mbd.tile_ctx = cm->fc;
+ while (1) {
+ int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ row_mt_exit = enc_row_mt->row_mt_exit;
+    // The row_mt_exit check here can be avoided, as it is checked again after
+    // sync_read_ptr() in encode_sb_row(). However, checking it here allows
+    // the worker to return early, before calling get_next_job().
+ if (!row_mt_exit &&
+ !get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
+ cm->seq_params->mib_size)) {
+ // No jobs are available for the current tile. Query for the status of
+ // other tiles and get the next job if available
+ switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+ &current_mi_row, &end_of_frame, 0,
+ fp_block_size);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ // When row_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (row_mt_exit) {
+ error_info->setjmp = 0;
+ return 1;
+ }
+
+ if (end_of_frame) break;
+
+ TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ const int tile_row = tile_info->tile_row;
+ const int tile_col = tile_info->tile_col;
+ ThreadData *td = thread_data->td;
+ const int sb_row = current_mi_row >> mib_size_log2;
+
+ assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end);
+
+ td->mb.e_mbd.tile_ctx = td->tctx;
+ td->mb.tile_pb_ctx = &this_tile->tctx;
+ td->abs_sum_level = 0;
+
+ if (this_tile->allow_update_cdf) {
+ td->mb.row_ctx = this_tile->row_ctx;
+ if (current_mi_row == tile_info->mi_row_start)
+ memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+ } else {
+ memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+ }
+
+ av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
+ &td->mb.e_mbd);
+
+ cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
+ if (td->mb.txfm_search_info.mb_rd_record != NULL) {
+ av1_crc32c_calculator_init(
+ &td->mb.txfm_search_info.mb_rd_record->crc_calculator);
+ }
+
+ av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ this_tile->abs_sum_level += td->abs_sum_level;
+ row_mt_sync->num_threads_working--;
+ enc_row_mt->num_tile_cols_done[sb_row]++;
+#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(enc_row_mt->cond_);
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ }
+ if (do_pipelined_lpf_mt_with_enc) {
+ // Loop-filter a superblock row if encoding of the current and next
+ // superblock row is complete.
+ // TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving
+ // encoding and loop filter stage.
+ launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2);
+ }
+ av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ thread_data->td->pc_root = NULL;
+ error_info->setjmp = 0;
+ return 1;
+}
+
+static int enc_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int t;
+
+ (void)unused;
+
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ // Preallocate the pc_tree for realtime coding to reduce the cost of memory
+ // allocation.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!thread_data->td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ thread_data->td->pc_root = NULL;
+ }
+
+ for (t = thread_data->start; t < tile_rows * tile_cols;
+ t += cpi->mt_info.num_workers) {
+ int tile_row = t / tile_cols;
+ int tile_col = t % tile_cols;
+
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
+ thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+ thread_data->td->mb.tile_pb_ctx = &this_tile->tctx;
+ av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
+ }
+
+ av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ thread_data->td->pc_root = NULL;
+ error_info->setjmp = 0;
+ return 1;
+}
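+
+// In the tile-based path above, worker t starts at tile index t and strides by
+// num_workers. For example, with 6 tiles and 4 workers, worker 0 encodes tiles
+// {0, 4}, worker 1 encodes {1, 5}, and workers 2 and 3 encode {2} and {3}.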
+
+void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi) {
+ cpi->mt_info.workers = ppi->p_mt_info.workers;
+ cpi->mt_info.num_workers = ppi->p_mt_info.num_workers;
+ cpi->mt_info.tile_thr_data = ppi->p_mt_info.tile_thr_data;
+ int i;
+ for (i = MOD_FP; i < NUM_MT_MODULES; i++) {
+ cpi->mt_info.num_mod_workers[i] =
+ AOMMIN(cpi->mt_info.num_workers, ppi->p_mt_info.num_mod_workers[i]);
+ }
+}
+
+void av1_init_cdef_worker(AV1_COMP *cpi) {
+ // The allocation is done only for level 0 parallel frames. No change
+ // in config is supported in the middle of a parallel encode set, since the
+ // rest of the MT modules also do not support dynamic change of config.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) return;
+ PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
+ int num_cdef_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_CDEF);
+
+ av1_alloc_cdef_buffers(&cpi->common, &p_mt_info->cdef_worker,
+ &cpi->mt_info.cdef_sync, num_cdef_workers, 1);
+ cpi->mt_info.cdef_worker = p_mt_info->cdef_worker;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_init_lr_mt_buffers(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync;
+ if (lr_sync->sync_range) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ return;
+ int num_lr_workers =
+ av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
+ assert(num_lr_workers <= lr_sync->num_workers);
+ lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf;
+ lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs;
+ }
+}
+#endif
+
+#if CONFIG_MULTITHREAD
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+ if (setjmp(cm->error->jmp)) {
+ cm->error->setjmp = 0;
+ aom_internal_error_copy(&cpi->ppi->error, cm->error);
+ }
+ cm->error->setjmp = 1;
+ // Initialize enc row MT object.
+ if (is_first_pass || cpi->oxcf.row_mt == 1) {
+ AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
+ if (enc_row_mt->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, enc_row_mt->mutex_,
+ aom_malloc(sizeof(*(enc_row_mt->mutex_))));
+ if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
+ }
+ if (enc_row_mt->cond_ == NULL) {
+ CHECK_MEM_ERROR(cm, enc_row_mt->cond_,
+ aom_malloc(sizeof(*(enc_row_mt->cond_))));
+ if (enc_row_mt->cond_) pthread_cond_init(enc_row_mt->cond_, NULL);
+ }
+ }
+
+ if (!is_first_pass) {
+ // Initialize global motion MT object.
+ AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync;
+ if (gm_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, gm_sync->mutex_,
+ aom_malloc(sizeof(*(gm_sync->mutex_))));
+ if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL);
+ }
+#if !CONFIG_REALTIME_ONLY
+ // Initialize temporal filtering MT object.
+ AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync;
+ if (tf_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, tf_sync->mutex_,
+ aom_malloc(sizeof(*tf_sync->mutex_)));
+ if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+ // Initialize CDEF MT object.
+ AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+ if (cdef_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
+ aom_malloc(sizeof(*(cdef_sync->mutex_))));
+ if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+ }
+
+ // Initialize loop filter MT object.
+ AV1LfSync *lf_sync = &mt_info->lf_row_sync;
+ // Number of superblock rows
+ const int sb_rows =
+ CEIL_POWER_OF_TWO(cm->height >> MI_SIZE_LOG2, MAX_MIB_SIZE_LOG2);
+ PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
+ int num_lf_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LPF);
+
+ if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+ num_lf_workers > lf_sync->num_workers) {
+ av1_loop_filter_dealloc(lf_sync);
+ av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers);
+ }
+
+ // Initialize tpl MT object.
+ AV1TplRowMultiThreadInfo *tpl_row_mt = &mt_info->tpl_row_mt;
+ if (tpl_row_mt->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, tpl_row_mt->mutex_,
+ aom_malloc(sizeof(*(tpl_row_mt->mutex_))));
+ if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ // Initialize loop restoration MT object.
+ AV1LrSync *lr_sync = &mt_info->lr_row_sync;
+ int rst_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
+ int num_rows_lr = av1_lr_count_units(rst_unit_size, cm->height);
+ int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR);
+ if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
+ num_lr_workers > lr_sync->num_workers ||
+ MAX_MB_PLANE > lr_sync->num_planes) {
+ av1_loop_restoration_dealloc(lr_sync);
+ av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr,
+ MAX_MB_PLANE, cm->width);
+ }
+ }
+#endif
+
+ // Initialization of pack bitstream MT object.
+ AV1EncPackBSSync *pack_bs_sync = &mt_info->pack_bs_sync;
+ if (pack_bs_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_,
+ aom_malloc(sizeof(*pack_bs_sync->mutex_)));
+ if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL);
+ }
+ }
+ cm->error->setjmp = 0;
+}
+#endif // CONFIG_MULTITHREAD
+
+// Computes the number of workers to be considered while allocating memory for a
+// multi-threaded module under FPMT.
+int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
+ MULTI_THREADED_MODULES mod_name) {
+ int num_mod_workers = p_mt_info->num_mod_workers[mod_name];
+ if (p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1) {
+ // TODO(anyone): Change num_mod_workers to num_mod_workers[MOD_FRAME_ENC].
+ // As frame parallel jobs will only perform multi-threading for the encode
+ // stage, we can limit the allocations according to num_enc_workers per
+    // frame parallel encode (a.k.a. num_mod_workers[MOD_FRAME_ENC]).
+ num_mod_workers = p_mt_info->num_workers;
+ }
+ return num_mod_workers;
+}
+
+void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+
+ assert(p_mt_info->workers != NULL);
+ assert(p_mt_info->tile_thr_data != NULL);
+
+ int num_workers = p_mt_info->num_workers;
+ int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC);
+ assert(num_enc_workers <= num_workers);
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];
+
+ if (i > 0) {
+ // Allocate thread data.
+ ThreadData *td;
+ AOM_CHECK_MEM_ERROR(&ppi->error, td, aom_memalign(32, sizeof(*td)));
+ av1_zero(*td);
+ thread_data->original_td = thread_data->td = td;
+
+ // Set up shared coeff buffers.
+ av1_setup_shared_coeff_buffer(&ppi->seq_params, &td->shared_coeff_buf,
+ &ppi->error);
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+ sizeof(*td->tmp_conv_dst)));
+
+ if (i < p_mt_info->num_mod_workers[MOD_FP]) {
+ // Set up firstpass PICK_MODE_CONTEXT.
+ td->firstpass_ctx =
+ av1_alloc_pmc(ppi->cpi, BLOCK_16X16, &td->shared_coeff_buf);
+ if (!td->firstpass_ctx)
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+
+ if (!is_first_pass && i < num_enc_workers) {
+ // Set up sms_tree.
+ if (av1_setup_sms_tree(ppi->cpi, td)) {
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate SMS tree");
+ }
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*td->hash_value_buffer[0][0])));
+
+ // Allocate frame counters in thread data.
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->counts,
+ aom_calloc(1, sizeof(*td->counts)));
+
+ // Allocate buffers used by palette coding mode.
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->palette_buffer,
+ aom_memalign(16, sizeof(*td->palette_buffer)));
+
+ // The buffers 'tmp_pred_bufs[]', 'comp_rd_buffer' and 'obmc_buffer' are
+ // used in inter frames to store intermediate inter mode prediction
+ // results and are not required for allintra encoding mode. Hence, the
+ // memory allocations for these buffers are avoided for allintra
+ // encoding mode.
+ if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) {
+ alloc_obmc_buffers(&td->obmc_buffer, &ppi->error);
+
+ alloc_compound_type_rd_buffers(&ppi->error, &td->comp_rd_buffer);
+
+ for (int j = 0; j < 2; ++j) {
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->tmp_pred_bufs[j],
+ aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*td->tmp_pred_bufs[j])));
+ }
+ }
+
+ if (is_gradient_caching_for_hog_enabled(ppi->cpi)) {
+ const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome;
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->pixel_gradient_info,
+ aom_malloc(sizeof(*td->pixel_gradient_info) *
+ plane_types * MAX_SB_SQUARE));
+ }
+
+ if (is_src_var_for_4x4_sub_blocks_caching_enabled(ppi->cpi)) {
+ const BLOCK_SIZE sb_size = ppi->cpi->common.seq_params->sb_size;
+ const int mi_count_in_sb =
+ mi_size_wide[sb_size] * mi_size_high[sb_size];
+
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->src_var_info_of_4x4_sub_blocks,
+ aom_malloc(sizeof(*td->src_var_info_of_4x4_sub_blocks) *
+ mi_count_in_sb));
+ }
+
+ if (ppi->cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ const int num_64x64_blocks =
+ (ppi->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->vt64x64,
+ aom_malloc(sizeof(*td->vt64x64) * num_64x64_blocks));
+ }
+ }
+ }
+
+ if (!is_first_pass && ppi->cpi->oxcf.row_mt == 1 && i < num_enc_workers) {
+ if (i == 0) {
+ for (int j = 0; j < ppi->num_fp_contexts; j++) {
+ AOM_CHECK_MEM_ERROR(&ppi->error, ppi->parallel_cpi[j]->td.tctx,
+ (FRAME_CONTEXT *)aom_memalign(
+ 16, sizeof(*ppi->parallel_cpi[j]->td.tctx)));
+ }
+ } else {
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, thread_data->td->tctx,
+ (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
+ }
+ }
+ }
+
+ // Record the number of workers in encode stage multi-threading for which
+ // allocation is done.
+ p_mt_info->prev_num_enc_workers = num_enc_workers;
+}
+
+void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) {
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ assert(p_mt_info->num_workers == 0);
+
+ AOM_CHECK_MEM_ERROR(&ppi->error, p_mt_info->workers,
+ aom_malloc(num_workers * sizeof(*p_mt_info->workers)));
+
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, p_mt_info->tile_thr_data,
+ aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data)));
+
+ for (int i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &p_mt_info->workers[i];
+ EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];
+
+ winterface->init(worker);
+ worker->thread_name = "aom enc worker";
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ if (i > 0) {
+ // Create threads
+ if (!winterface->reset(worker))
+ aom_internal_error(&ppi->error, AOM_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ }
+ winterface->sync(worker);
+
+ ++p_mt_info->num_workers;
+ }
+}
+
+// This function will change the state and free the mutex of corresponding
+// workers and terminate the object. The object cannot be re-used unless a call
+// to reset() is made.
+void av1_terminate_workers(AV1_PRIMARY *ppi) {
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ for (int t = 0; t < p_mt_info->num_workers; ++t) {
+ AVxWorker *const worker = &p_mt_info->workers[t];
+ aom_get_worker_interface()->end(worker);
+ }
+}
+
+// This function returns 1 if frame parallel encode is supported for
+// the current configuration. Returns 0 otherwise.
+static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+ // FPMT is enabled for AOM_Q and AOM_VBR.
+ // TODO(Tarun): Test and enable resize config.
+ if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) {
+ return 0;
+ }
+ if (ppi->use_svc) {
+ return 0;
+ }
+ if (oxcf->tile_cfg.enable_large_scale_tile) {
+ return 0;
+ }
+ if (oxcf->dec_model_cfg.timing_info_present) {
+ return 0;
+ }
+ if (oxcf->mode != GOOD) {
+ return 0;
+ }
+ if (oxcf->tool_cfg.error_resilient_mode) {
+ return 0;
+ }
+ if (oxcf->resize_cfg.resize_mode) {
+ return 0;
+ }
+ if (oxcf->pass != AOM_RC_SECOND_PASS) {
+ return 0;
+ }
+ if (oxcf->max_threads < 2) {
+ return 0;
+ }
+ if (!oxcf->fp_mt) {
+ return 0;
+ }
+
+ return 1;
+}
+
+int av1_check_fpmt_config(AV1_PRIMARY *const ppi,
+ AV1EncoderConfig *const oxcf) {
+ if (is_fpmt_config(ppi, oxcf)) return 1;
+ // Reset frame parallel configuration for unsupported config
+ if (ppi->num_fp_contexts > 1) {
+ for (int i = 1; i < ppi->num_fp_contexts; i++) {
+ // Release the previously-used frame-buffer
+ if (ppi->parallel_cpi[i]->common.cur_frame != NULL) {
+ --ppi->parallel_cpi[i]->common.cur_frame->ref_count;
+ ppi->parallel_cpi[i]->common.cur_frame = NULL;
+ }
+ }
+
+ int cur_gf_index = ppi->cpi->gf_frame_index;
+ int reset_size = AOMMAX(0, ppi->gf_group.size - cur_gf_index);
+ av1_zero_array(&ppi->gf_group.frame_parallel_level[cur_gf_index],
+ reset_size);
+ av1_zero_array(&ppi->gf_group.is_frame_non_ref[cur_gf_index], reset_size);
+ av1_zero_array(&ppi->gf_group.src_offset[cur_gf_index], reset_size);
+ memset(&ppi->gf_group.skip_frame_refresh[cur_gf_index][0], INVALID_IDX,
+ sizeof(ppi->gf_group.skip_frame_refresh[cur_gf_index][0]) *
+ reset_size * REF_FRAMES);
+ memset(&ppi->gf_group.skip_frame_as_ref[cur_gf_index], INVALID_IDX,
+ sizeof(ppi->gf_group.skip_frame_as_ref[cur_gf_index]) * reset_size);
+ ppi->num_fp_contexts = 1;
+ }
+ return 0;
+}
+
+// A large thread count used as an upper bound when computing the maximum
+// number of enc workers possible for each resolution.
+#define MAX_THREADS 100
+
+// Computes the max number of enc workers possible for each resolution.
+static AOM_INLINE int compute_max_num_enc_workers(
+ CommonModeInfoParams *const mi_params, int mib_size_log2) {
+ int num_sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, mib_size_log2);
+ int num_sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, mib_size_log2);
+
+ return AOMMIN((num_sb_cols + 1) >> 1, num_sb_rows);
+}
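+
+// For example, for a hypothetical 1920x1080 frame with 128x128 superblocks
+// (mib_size_log2 = 5): num_sb_rows = CEIL_POWER_OF_TWO(270, 5) = 9 and
+// num_sb_cols = CEIL_POWER_OF_TWO(480, 5) = 15, so the function returns
+// AOMMIN((15 + 1) >> 1, 9) = 8.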
+
+// Computes the number of frame parallel(fp) contexts to be created
+// based on the number of max_enc_workers.
+int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+ ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = 0;
+ if (!av1_check_fpmt_config(ppi, oxcf)) {
+ return 1;
+ }
+ int max_num_enc_workers = compute_max_num_enc_workers(
+ &ppi->cpi->common.mi_params, ppi->cpi->common.seq_params->mib_size_log2);
+ // Scaling factors and rounding factors used to tune worker_per_frame
+ // computation.
+ int rounding_factor[2] = { 2, 4 };
+ int scaling_factor[2] = { 4, 8 };
+ int is_480p_or_lesser =
+ AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) <= 480;
+ int is_sb_64 = 0;
+ if (ppi->cpi != NULL)
+ is_sb_64 = ppi->cpi->common.seq_params->sb_size == BLOCK_64X64;
+  // In the default case, a parallel frame encode is given at least 1/4th of
+  // the theoretical limit of max enc workers. For resolutions larger than
+  // 480p with a 64x64 SB size, optimal performance is obtained with a limit
+  // of 1/8.
+ int index = (!is_480p_or_lesser && is_sb_64) ? 1 : 0;
+ int workers_per_frame =
+ AOMMAX(1, (max_num_enc_workers + rounding_factor[index]) /
+ scaling_factor[index]);
+ int max_threads = oxcf->max_threads;
+ int num_fp_contexts = max_threads / workers_per_frame;
+ // Based on empirical results, FPMT gains with multi-tile are significant when
+ // more parallel frames are available. Use FPMT with multi-tile encode only
+ // when sufficient threads are available for parallel encode of
+ // MAX_PARALLEL_FRAMES frames.
+ if (oxcf->tile_cfg.tile_columns > 0 || oxcf->tile_cfg.tile_rows > 0) {
+ if (num_fp_contexts < MAX_PARALLEL_FRAMES) num_fp_contexts = 1;
+ }
+
+ num_fp_contexts = AOMMAX(1, AOMMIN(num_fp_contexts, MAX_PARALLEL_FRAMES));
+ // Limit recalculated num_fp_contexts to ppi->num_fp_contexts.
+ num_fp_contexts = (ppi->num_fp_contexts == 1)
+ ? num_fp_contexts
+ : AOMMIN(num_fp_contexts, ppi->num_fp_contexts);
+ if (num_fp_contexts > 1) {
+ ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] =
+ AOMMIN(max_num_enc_workers * num_fp_contexts, oxcf->max_threads);
+ }
+ return num_fp_contexts;
+}
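+
+// Continuing the hypothetical 1080p, 128x128-SB example above
+// (max_num_enc_workers = 8, resolution above 480p, SB size not 64x64, so
+// index = 0): workers_per_frame = AOMMAX(1, (8 + 2) / 4) = 2, and with
+// oxcf->max_threads = 8 this gives num_fp_contexts = 8 / 2 = 4 before the
+// clamps against MAX_PARALLEL_FRAMES and ppi->num_fp_contexts.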
+
+// Computes the number of workers to process each of the parallel frames.
+static AOM_INLINE int compute_num_workers_per_frame(
+ const int num_workers, const int parallel_frame_count) {
+ // Number of level 2 workers per frame context (floor division).
+ int workers_per_frame = (num_workers / parallel_frame_count);
+ return workers_per_frame;
+}
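+
+// Note that prepare_fpmt_workers() below calls this with the *remaining*
+// worker and frame counts, so the floor division self-corrects: for example,
+// 10 workers over 3 frames are assigned as compute_num_workers_per_frame(10,
+// 3) = 3, then (7, 2) = 3, then (4, 1) = 4.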
+
+static AOM_INLINE void restore_workers_after_fpmt(
+ AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared);
+
+// Prepare level 1 workers. This function is only called for
+// parallel_frame_count > 1. This function populates the mt_info structure of
+// frame level contexts appropriately by dividing the total number of available
+// workers amongst the frames as level 2 workers. It also populates the hook and
+// data members of level 1 workers.
+static AOM_INLINE void prepare_fpmt_workers(AV1_PRIMARY *ppi,
+ AV1_COMP_DATA *first_cpi_data,
+ AVxWorkerHook hook,
+ int parallel_frame_count) {
+ assert(parallel_frame_count <= ppi->num_fp_contexts &&
+ parallel_frame_count > 1);
+
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ int num_workers = p_mt_info->num_workers;
+
+ volatile int frame_idx = 0;
+ volatile int i = 0;
+ while (i < num_workers) {
+ // Assign level 1 worker
+ AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] =
+ &p_mt_info->workers[i];
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
+ MultiThreadInfo *mt_info = &cur_cpi->mt_info;
+ // This 'aom_internal_error_info' pointer is not derived from the local
+ // pointer ('AV1_COMMON *const cm') to silence the compiler warning
+ // "variable 'cm' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered]".
+ struct aom_internal_error_info *const error = cur_cpi->common.error;
+
+ // The jmp_buf is valid only within the scope of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error->jmp)) {
+ error->setjmp = 0;
+ restore_workers_after_fpmt(ppi, parallel_frame_count, i);
+ aom_internal_error_copy(&ppi->error, error);
+ }
+ error->setjmp = 1;
+
+ AV1_COMMON *const cm = &cur_cpi->common;
+ // Assign start of level 2 worker pool
+ mt_info->workers = &p_mt_info->workers[i];
+ mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i];
+ // Assign number of workers for each frame in the parallel encode set.
+ mt_info->num_workers = compute_num_workers_per_frame(
+ num_workers - i, parallel_frame_count - frame_idx);
+ for (int j = MOD_FP; j < NUM_MT_MODULES; j++) {
+ mt_info->num_mod_workers[j] =
+ AOMMIN(mt_info->num_workers, p_mt_info->num_mod_workers[j]);
+ }
+ if (p_mt_info->cdef_worker != NULL) {
+ mt_info->cdef_worker = &p_mt_info->cdef_worker[i];
+
+ // Back up the original cdef_worker pointers.
+ mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf;
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; plane++)
+ mt_info->restore_state_buf.cdef_colbuf[plane] =
+ mt_info->cdef_worker->colbuf[plane];
+ }
+#if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ // Back up the original LR buffers before update.
+ int idx = i + mt_info->num_workers - 1;
+ assert(idx < mt_info->lr_row_sync.num_workers);
+ mt_info->restore_state_buf.rst_tmpbuf =
+ mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf;
+ mt_info->restore_state_buf.rlbs =
+ mt_info->lr_row_sync.lrworkerdata[idx].rlbs;
+
+ // Update LR buffers.
+ mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = cm->rst_tmpbuf;
+ mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cm->rlbs;
+ }
+#endif
+
+ i += mt_info->num_workers;
+
+ // At this stage, only the thread-specific CDEF buffers for the current
+ // frame's 'common' and 'cdef_sync' need to be allocated. 'cdef_worker' has
+ // already been allocated across parallel frames.
+ av1_alloc_cdef_buffers(cm, &p_mt_info->cdef_worker, &mt_info->cdef_sync,
+ p_mt_info->num_workers, 0);
+
+ frame_worker->hook = hook;
+ frame_worker->data1 = cur_cpi;
+ frame_worker->data2 = (frame_idx == 0)
+ ? first_cpi_data
+ : &ppi->parallel_frames_data[frame_idx - 1];
+ frame_idx++;
+ error->setjmp = 0;
+ }
+ p_mt_info->p_num_workers = parallel_frame_count;
+}
+
+// Launch level 1 workers to perform frame parallel encode.
+static AOM_INLINE void launch_fpmt_workers(AV1_PRIMARY *ppi) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int num_workers = ppi->p_mt_info.p_num_workers;
+
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
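+ // Worker 0 runs its hook synchronously on the calling thread via
+ // execute(); the remaining workers run on their own threads via launch().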
+ if (i == 0)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+}
+
+// Restore worker states after parallel encode.
+static AOM_INLINE void restore_workers_after_fpmt(
+ AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared) {
+ assert(parallel_frame_count <= ppi->num_fp_contexts &&
+ parallel_frame_count > 1);
+ (void)parallel_frame_count;
+
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+
+ int frame_idx = 0;
+ int i = 0;
+ while (i < num_fpmt_workers_prepared) {
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
+ MultiThreadInfo *mt_info = &cur_cpi->mt_info;
+ const AV1_COMMON *const cm = &cur_cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ // Restore the original cdef_worker pointers.
+ if (p_mt_info->cdef_worker != NULL) {
+ mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf;
+ for (int plane = 0; plane < num_planes; plane++)
+ mt_info->cdef_worker->colbuf[plane] =
+ mt_info->restore_state_buf.cdef_colbuf[plane];
+ }
+#if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ // Restore the original LR buffers.
+ int idx = i + mt_info->num_workers - 1;
+ assert(idx < mt_info->lr_row_sync.num_workers);
+ mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
+ mt_info->restore_state_buf.rst_tmpbuf;
+ mt_info->lr_row_sync.lrworkerdata[idx].rlbs =
+ mt_info->restore_state_buf.rlbs;
+ }
+#endif
+
+ frame_idx++;
+ i += mt_info->num_workers;
+ }
+}
+
+// Synchronize level 1 workers.
+static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi,
+ int frames_in_parallel_set) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int num_workers = ppi->p_mt_info.p_num_workers;
+ int had_error = 0;
+ // Points to the error info of the earliest display order frame in the
+ // parallel set.
+ const struct aom_internal_error_info *error;
+
+ // Encoding ends.
+ for (int i = num_workers - 1; i >= 0; --i) {
+ AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error = ppi->parallel_cpi[i]->common.error;
+ }
+ }
+
+ restore_workers_after_fpmt(ppi, frames_in_parallel_set,
+ ppi->p_mt_info.num_workers);
+
+ if (had_error) aom_internal_error_copy(&ppi->error, error);
+}
+
+static int get_compressed_data_hook(void *arg1, void *arg2) {
+ AV1_COMP *cpi = (AV1_COMP *)arg1;
+ AV1_COMP_DATA *cpi_data = (AV1_COMP_DATA *)arg2;
+ int status = av1_get_compressed_data(cpi, cpi_data);
+
+ // av1_get_compressed_data() returns AOM_CODEC_OK (0) on success, while a
+ // worker hook returns nonzero on success, hence the negation.
+ return !status;
+}
+
+ // This function encodes the raw frame data for each frame in the parallel
+ // encode set, and outputs the frame bitstream to the designated buffers.
+void av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data) {
+ // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf
+ // corresponding to frames in the current parallel encode set.
+ int ref_buffers_used_map = 0;
+ int frames_in_parallel_set = av1_init_parallel_frame_context(
+ first_cpi_data, ppi, &ref_buffers_used_map);
+ prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook,
+ frames_in_parallel_set);
+ launch_fpmt_workers(ppi);
+ sync_fpmt_workers(ppi, frames_in_parallel_set);
+
+ // Release cpi->scaled_ref_buf corresponding to frames in the current parallel
+ // encode set.
+ for (int i = 0; i < frames_in_parallel_set; ++i) {
+ av1_release_scaled_references_fpmt(ppi->parallel_cpi[i]);
+ }
+ av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool,
+ ref_buffers_used_map);
+}
+
+static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info,
+ int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ worker->had_error = 0;
+ if (i == 0)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+}
+
+static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info,
+ AV1_COMMON *const cm, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ const AVxWorker *const worker_main = &mt_info->workers[0];
+ int had_error = worker_main->had_error;
+ struct aom_internal_error_info error_info;
+
+ // Read the error_info of main thread.
+ if (had_error) {
+ error_info = ((EncWorkerData *)worker_main->data1)->error_info;
+ }
+
+ // Encoding ends.
+ for (int i = num_workers - 1; i > 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error_info = ((EncWorkerData *)worker->data1)->error_info;
+ }
+ }
+
+ if (had_error) aom_internal_error_copy(cm->error, &error_info);
+
+ // Restore xd->error_info of the main thread back to cm->error so that the
+ // multithreaded code, when executed using a single thread, has a valid
+ // xd->error_info.
+ MACROBLOCKD *const xd = &((EncWorkerData *)worker_main->data1)->td->mb.e_mbd;
+ xd->error_info = cm->error;
+}
+
+static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
+ int num_workers) {
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &cpi->mt_info.workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ cpi->intrabc_used |= thread_data->td->intrabc_used;
+ cpi->deltaq_used |= thread_data->td->deltaq_used;
+ // Accumulate rtc counters.
+ if (!frame_is_intra_only(&cpi->common))
+ av1_accumulate_rtc_counters(cpi, &thread_data->td->mb);
+ cpi->palette_pixel_num += thread_data->td->mb.palette_pixels;
+ if (thread_data->td != &cpi->td) {
+ // Keep these conditional expressions in sync with the corresponding ones
+ // in prepare_enc_workers().
+ if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ aom_free(thread_data->td->mv_costs_alloc);
+ thread_data->td->mv_costs_alloc = NULL;
+ }
+ if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ aom_free(thread_data->td->dv_costs_alloc);
+ thread_data->td->dv_costs_alloc = NULL;
+ }
+ }
+ av1_dealloc_mb_data(&thread_data->td->mb, av1_num_planes(&cpi->common));
+
+ // Accumulate counters.
+ if (i > 0) {
+ av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ cpi->td.mb.txfm_search_info.txb_split_count +=
+ thread_data->td->mb.txfm_search_info.txb_split_count;
+#if CONFIG_SPEED_STATS
+ cpi->td.mb.txfm_search_info.tx_search_count +=
+ thread_data->td->mb.txfm_search_info.tx_search_count;
+#endif // CONFIG_SPEED_STATS
+ }
+ }
+}
+
+static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1_COMMON *const cm = &cpi->common;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ thread_data->td->intrabc_used = 0;
+ thread_data->td->deltaq_used = 0;
+ thread_data->td->abs_sum_level = 0;
+ thread_data->td->rd_counts.seg_tmp_pred_cost[0] = 0;
+ thread_data->td->rd_counts.seg_tmp_pred_cost[1] = 0;
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
+
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ memcpy(thread_data->td->hash_value_buffer[x][y],
+ cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0]));
+ thread_data->td->mb.intrabc_hash_info.hash_value_buffer[x][y] =
+ thread_data->td->hash_value_buffer[x][y];
+ }
+ }
+ // Keep these conditional expressions in sync with the corresponding ones
+ // in accumulate_counters_enc_workers().
+ if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->mv_costs_alloc,
+ (MvCosts *)aom_malloc(sizeof(*thread_data->td->mv_costs_alloc)));
+ thread_data->td->mb.mv_costs = thread_data->td->mv_costs_alloc;
+ memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
+ sizeof(MvCosts));
+ }
+ if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ // Reset dv_costs to NULL for worker threads when dv cost update is
+ // enabled so that only dv_cost_upd_level needs to be checked before the
+ // aom_free() call for the same.
+ thread_data->td->mb.dv_costs = NULL;
+ if (av1_need_dv_costs(cpi)) {
+ CHECK_MEM_ERROR(cm, thread_data->td->dv_costs_alloc,
+ (IntraBCMVCosts *)aom_malloc(
+ sizeof(*thread_data->td->dv_costs_alloc)));
+ thread_data->td->mb.dv_costs = thread_data->td->dv_costs_alloc;
+ memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs,
+ sizeof(IntraBCMVCosts));
+ }
+ }
+ }
+ av1_alloc_mb_data(cpi, &thread_data->td->mb);
+
+ // Reset rtc counters.
+ av1_init_rtc_counters(&thread_data->td->mb);
+
+ thread_data->td->mb.palette_pixels = 0;
+
+ if (thread_data->td->counts != &cpi->counts) {
+ memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
+ }
+
+ if (i > 0) {
+ thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
+ thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
+ thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.tmp_pred_bufs[j] =
+ thread_data->td->tmp_pred_bufs[j];
+ }
+ thread_data->td->mb.pixel_gradient_info =
+ thread_data->td->pixel_gradient_info;
+
+ thread_data->td->mb.src_var_info_of_4x4_sub_blocks =
+ thread_data->td->src_var_info_of_4x4_sub_blocks;
+
+ thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
+ thread_data->td->mb.tmp_pred_bufs[j];
+ }
+ }
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ // Before encoding a frame, copy the thread data from cpi.
+ thread_data->td->mb = cpi->td.mb;
+ }
+ av1_alloc_src_diff_buf(cm, &thread_data->td->mb);
+ }
+}
+#endif
+
+ // Computes the number of workers for row multi-threading of the encoding
+ // stage.
+static AOM_INLINE int compute_num_enc_row_mt_workers(const AV1_COMMON *cm,
+ int max_threads) {
+ TileInfo tile_info;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int total_num_threads_row_mt = 0;
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ av1_tile_init(&tile_info, cm, row, col);
+ const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, &tile_info);
+ const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, &tile_info);
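+ // Due to the top-right dependency, a thread on an sb row must stay about
+ // two sb columns behind the row above, so a tile keeps at most
+ // ceil(num_sb_cols_in_tile / 2) threads busy. e.g. a tile of 15x9 sbs
+ // supports AOMMIN((15 + 1) >> 1, 9) = 8 threads.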
+ total_num_threads_row_mt +=
+ AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
+ }
+ }
+ return AOMMIN(max_threads, total_num_threads_row_mt);
+}
+
+ // Computes the number of workers for tile multi-threading of the encoding
+ // stage.
+static AOM_INLINE int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm,
+ int max_threads) {
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ return AOMMIN(max_threads, tile_cols * tile_rows);
+}
+
+ // Finds the maximum number of workers across all MT stages.
+int av1_get_max_num_workers(const AV1_COMP *cpi) {
+ int max_num_workers = 0;
+ for (int i = MOD_FP; i < NUM_MT_MODULES; i++)
+ max_num_workers =
+ AOMMAX(cpi->ppi->p_mt_info.num_mod_workers[i], max_num_workers);
+ assert(max_num_workers >= 1);
+ return AOMMIN(max_num_workers, cpi->oxcf.max_threads);
+}
+
+ // Computes the number of workers for the encoding stage (row/tile
+ // multi-threading).
+int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) {
+ if (max_workers <= 1) return 1;
+ if (cpi->oxcf.row_mt)
+ return compute_num_enc_row_mt_workers(&cpi->common, max_workers);
+ else
+ return compute_num_enc_tile_mt_workers(&cpi->common, max_workers);
+}
+
+void av1_encode_tiles_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int num_workers = mt_info->num_mod_workers[MOD_ENC];
+
+ assert(IMPLIES(cpi->tile_data == NULL,
+ cpi->allocated_tiles < tile_cols * tile_rows));
+ if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi);
+
+ av1_init_tile_data(cpi);
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
+
+ prepare_enc_workers(cpi, enc_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+ accumulate_counters_enc_workers(cpi, num_workers);
+}
+
+ // Accumulate frame counts. FRAME_COUNTS consists solely of 'unsigned int'
+ // members, so we treat it as an array, and sum over the whole length.
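+ // This assumes FRAME_COUNTS contains no padding between its members.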
+void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
+ const FRAME_COUNTS *counts) {
+ unsigned int *const acc = (unsigned int *)acc_counts;
+ const unsigned int *const cnt = (const unsigned int *)counts;
+
+ const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int);
+
+ for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i];
+}
+
+ // Computes the maximum number of sb rows and sb cols across all tiles, which
+ // are used to allocate memory for multi-threaded encoding with row-mt=1.
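+ // e.g. a tile spanning 68 mi rows with mib_size_log2 = 5 (128x128
+ // superblocks) has CEIL_POWER_OF_TWO(68, 5) = (68 + 31) >> 5 = 3 sb rows.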
+static AOM_INLINE void compute_max_sb_rows_cols(const AV1_COMMON *cm,
+ int *max_sb_rows_in_tile,
+ int *max_sb_cols_in_tile) {
+ const int tile_rows = cm->tiles.rows;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const int *const row_start_sb = cm->tiles.row_start_sb;
+ for (int row = 0; row < tile_rows; row++) {
+ const int mi_row_start = row_start_sb[row] << mib_size_log2;
+ const int mi_row_end =
+ AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows);
+ const int num_sb_rows_in_tile =
+ CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, mib_size_log2);
+ *max_sb_rows_in_tile = AOMMAX(*max_sb_rows_in_tile, num_sb_rows_in_tile);
+ }
+
+ const int tile_cols = cm->tiles.cols;
+ const int num_mi_cols = cm->mi_params.mi_cols;
+ const int *const col_start_sb = cm->tiles.col_start_sb;
+ for (int col = 0; col < tile_cols; col++) {
+ const int mi_col_start = col_start_sb[col] << mib_size_log2;
+ const int mi_col_end =
+ AOMMIN(col_start_sb[col + 1] << mib_size_log2, num_mi_cols);
+ const int num_sb_cols_in_tile =
+ CEIL_POWER_OF_TWO(mi_col_end - mi_col_start, mib_size_log2);
+ *max_sb_cols_in_tile = AOMMAX(*max_sb_cols_in_tile, num_sb_cols_in_tile);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+ // Computes the number of workers for the firstpass stage (row/tile
+ // multi-threading).
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int total_num_threads_row_mt = 0;
+ TileInfo tile_info;
+
+ if (cpi->oxcf.max_threads <= 1) return 1;
+
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ av1_tile_init(&tile_info, cm, row, col);
+ const int num_mb_rows_in_tile =
+ av1_get_unit_rows_in_tile(&tile_info, cpi->fp_block_size);
+ const int num_mb_cols_in_tile =
+ av1_get_unit_cols_in_tile(&tile_info, cpi->fp_block_size);
+ total_num_threads_row_mt +=
+ AOMMIN((num_mb_cols_in_tile + 1) >> 1, num_mb_rows_in_tile);
+ }
+ }
+ return AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);
+}
+
+ // Computes the maximum number of mb_rows for row multi-threading of the
+ // firstpass stage.
+static AOM_INLINE int fp_compute_max_mb_rows(const AV1_COMMON *cm,
+ BLOCK_SIZE fp_block_size) {
+ const int tile_rows = cm->tiles.rows;
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const int *const row_start_sb = cm->tiles.row_start_sb;
+ int max_mb_rows = 0;
+
+ for (int row = 0; row < tile_rows; row++) {
+ const int mi_row_start = row_start_sb[row] << mib_size_log2;
+ const int mi_row_end =
+ AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows);
+ const int num_mb_rows_in_tile =
+ CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, unit_height_log2);
+ max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile);
+ }
+ return max_mb_rows;
+}
+#endif
+
+static void lpf_pipeline_mt_init(AV1_COMP *cpi, int num_workers) {
+ // Pipelining of loop-filtering after encoding is enabled when the
+ // loop-filter level is chosen based on quantizer and frame type. It is
+ // disabled in the case of 'LOOPFILTER_SELECTIVELY', as the stats collected
+ // during the encoding stage decide the filter level. Pipelining is also
+ // disabled for non-reference frames and for frames with the intra block
+ // copy tool enabled.
+ AV1_COMMON *cm = &cpi->common;
+ const int use_loopfilter = is_loopfilter_used(cm);
+ const int use_superres = av1_superres_scaled(cm);
+ const int use_cdef = is_cdef_used(cm);
+ const int use_restoration = is_restoration_used(cm);
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+ const unsigned int skip_apply_postproc_filters =
+ derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef,
+ use_superres, use_restoration);
+ mt_info->pipeline_lpf_mt_with_enc =
+ (cpi->oxcf.mode == REALTIME) && (cpi->oxcf.speed >= 5) &&
+ (cpi->sf.lpf_sf.lpf_pick == LPF_PICK_FROM_Q) &&
+ (cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY) &&
+ !cpi->ppi->rtc_ref.non_reference_frame && !cm->features.allow_intrabc &&
+ ((skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0);
+
+ if (!mt_info->pipeline_lpf_mt_with_enc) return;
+
+ set_postproc_filter_default_params(cm);
+
+ if (!use_loopfilter) return;
+
+ const LPF_PICK_METHOD method = cpi->sf.lpf_sf.lpf_pick;
+ assert(method == LPF_PICK_FROM_Q);
+ assert(cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY);
+
+ av1_pick_filter_level(cpi->source, cpi, method);
+
+ struct loopfilter *lf = &cm->lf;
+ const int plane_start = 0;
+ const int plane_end = av1_num_planes(cm);
+ int planes_to_lf[MAX_MB_PLANE];
+ if (lpf_mt_with_enc_enabled(cpi->mt_info.pipeline_lpf_mt_with_enc,
+ lf->filter_level)) {
+ set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end);
+ int lpf_opt_level = get_lpf_opt_level(&cpi->sf);
+ assert(lpf_opt_level == 2);
+
+ const int start_mi_row = 0;
+ const int end_mi_row = start_mi_row + cm->mi_params.mi_rows;
+
+ av1_loop_filter_frame_init(cm, plane_start, plane_end);
+
+ assert(mt_info->num_mod_workers[MOD_ENC] ==
+ mt_info->num_mod_workers[MOD_LPF]);
+ loop_filter_frame_mt_init(cm, start_mi_row, end_mi_row, planes_to_lf,
+ mt_info->num_mod_workers[MOD_LPF],
+ &mt_info->lf_row_sync, lpf_opt_level,
+ cm->seq_params->mib_size_log2);
+
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+ // Initialize loopfilter data
+ thread_data->lf_sync = &mt_info->lf_row_sync;
+ thread_data->lf_data = &thread_data->lf_sync->lfdata[i];
+ loop_filter_data_reset(thread_data->lf_data, &cm->cur_frame->buf, cm, xd);
+ }
+ }
+}
+
+void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ const int sb_rows_in_frame = get_sb_rows_in_frame(cm);
+ int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+ int max_sb_rows_in_tile = 0, max_sb_cols_in_tile = 0;
+ int num_workers = mt_info->num_mod_workers[MOD_ENC];
+
+ compute_max_sb_rows_cols(cm, &max_sb_rows_in_tile, &max_sb_cols_in_tile);
+ const bool alloc_row_mt_mem =
+ (enc_row_mt->allocated_tile_cols != tile_cols ||
+ enc_row_mt->allocated_tile_rows != tile_rows ||
+ enc_row_mt->allocated_rows != max_sb_rows_in_tile ||
+ enc_row_mt->allocated_cols != (max_sb_cols_in_tile - 1) ||
+ enc_row_mt->allocated_sb_rows != sb_rows_in_frame);
+ const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows;
+
+ assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data));
+ if (alloc_tile_data) {
+ av1_alloc_tile_data(cpi);
+ }
+
+ assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem));
+ if (alloc_row_mt_mem) {
+ row_mt_mem_alloc(cpi, max_sb_rows_in_tile, max_sb_cols_in_tile,
+ cpi->oxcf.algo_cfg.cdf_update_mode);
+ }
+
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
+ lpf_pipeline_mt_init(cpi, num_workers);
+
+ av1_init_tile_data(cpi);
+
+ memset(thread_id_to_tile_id, -1,
+ sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+ memset(enc_row_mt->num_tile_cols_done, 0,
+ sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows_in_frame);
+ enc_row_mt->row_mt_exit = false;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+ // Initialize num_finished_cols to -1 for all rows.
+ memset(row_mt_sync->num_finished_cols, -1,
+ sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows_in_tile);
+ row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+ row_mt_sync->num_threads_working = 0;
+ row_mt_sync->intrabc_extra_top_right_sb_delay =
+ av1_get_intrabc_extra_top_right_sb_delay(cm);
+
+ av1_inter_mode_data_init(this_tile);
+ av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
+ this_tile->tile_info.mi_col_start,
+ this_tile->tile_info.mi_col_end, tile_row);
+ }
+ }
+
+ assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+ num_workers);
+ prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+ if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi);
+ accumulate_counters_enc_workers(cpi, num_workers);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void dealloc_thread_data_src_diff_buf(AV1_COMP *cpi, int num_workers) {
+ for (int i = num_workers - 1; i >= 0; --i) {
+ EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
+ if (thread_data->td != &cpi->td)
+ av1_dealloc_src_diff_buf(&thread_data->td->mb,
+ av1_num_planes(&cpi->common));
+ }
+}
+
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+ int num_workers = 0;
+ int max_mb_rows = 0;
+
+ max_mb_rows = fp_compute_max_mb_rows(cm, cpi->fp_block_size);
+ const bool alloc_row_mt_mem = enc_row_mt->allocated_tile_cols != tile_cols ||
+ enc_row_mt->allocated_tile_rows != tile_rows ||
+ enc_row_mt->allocated_rows != max_mb_rows;
+ const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows;
+
+ assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data));
+ if (alloc_tile_data) {
+ av1_alloc_tile_data(cpi);
+ }
+
+ assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem));
+ if (alloc_row_mt_mem) {
+ row_mt_mem_alloc(cpi, max_mb_rows, -1, 0);
+ }
+
+ av1_init_tile_data(cpi);
+
+ // For pass = 1, compute the number of workers needed. For single-pass
+ // encode (pass = 0), the number of workers is already computed.
+ if (mt_info->num_mod_workers[MOD_FP] == 0)
+ num_workers = av1_fp_compute_num_enc_workers(cpi);
+ else
+ num_workers = mt_info->num_mod_workers[MOD_FP];
+
+ memset(thread_id_to_tile_id, -1,
+ sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+ enc_row_mt->firstpass_mt_exit = false;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+ // Initialize num_finished_cols to -1 for all rows.
+ memset(row_mt_sync->num_finished_cols, -1,
+ sizeof(*row_mt_sync->num_finished_cols) * max_mb_rows);
+ row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+ row_mt_sync->num_threads_working = 0;
+
+ // intraBC mode is not evaluated during first-pass encoding. Hence, no
+ // additional top-right delay is required.
+ row_mt_sync->intrabc_extra_top_right_sb_delay = 0;
+ }
+ }
+
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
+ assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+ num_workers);
+ fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+ dealloc_thread_data_src_diff_buf(cpi, num_workers);
+}
+
+void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+ int r, int c) {
+ (void)tpl_mt_sync;
+ (void)r;
+ (void)c;
+}
+
+void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+ int r, int c, int cols) {
+ (void)tpl_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+}
+
+void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
+ int c) {
+#if CONFIG_MULTITHREAD
+ int nsync = tpl_row_mt_sync->sync_range;
+
+ if (r) {
+ pthread_mutex_t *const mutex = &tpl_row_mt_sync->mutex_[r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > tpl_row_mt_sync->num_finished_cols[r - 1] - nsync)
+ pthread_cond_wait(&tpl_row_mt_sync->cond_[r - 1], mutex);
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)tpl_row_mt_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
+ int c, int cols) {
+#if CONFIG_MULTITHREAD
+ int nsync = tpl_row_mt_sync->sync_range;
+ int cur;
+ // Only signal when there are enough encoded blocks for the next row to run.
+ int sig = 1;
+
+ if (c < cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = cols + nsync;
+ }
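+ // The last column signals with cur = cols + nsync so that sync_read() of
+ // the next row can never block on this row again.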
+
+ if (sig) {
+ pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]);
+
+ // When a thread encounters an error, num_finished_cols[r] is set to the
+ // maximum column number. In this case, the AOMMAX operation here ensures
+ // that num_finished_cols[r] is not overwritten with a smaller value, thus
+ // preventing threads from waiting indefinitely in the corresponding
+ // sync_read() function.
+ tpl_row_mt_sync->num_finished_cols[r] =
+ AOMMAX(tpl_row_mt_sync->num_finished_cols[r], cur);
+
+ pthread_cond_signal(&tpl_row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]);
+ }
+#else
+ (void)tpl_row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+#endif // CONFIG_MULTITHREAD
+}
+
+static AOM_INLINE void set_mode_estimation_done(AV1_COMP *cpi) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const BLOCK_SIZE bsize =
+ convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ const int mi_height = mi_size_high[bsize];
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+ const int tplb_cols_in_tile =
+ ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+ // In the case of tpl row-multithreading, due to the top-right dependency,
+ // the worker on an mb_row waits for the completion of the tpl processing of
+ // the top and top-right blocks. Hence, if a thread (main/worker) encounters
+ // an error, mark the tpl processing of every mb_row in the frame as
+ // complete in order to avoid dependent workers waiting indefinitely.
+ for (int mi_row = 0, tplb_row = 0; mi_row < mi_params->mi_rows;
+ mi_row += mi_height, tplb_row++) {
+ (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+ tplb_cols_in_tile - 1, tplb_cols_in_tile);
+ }
+}
+
+// Each worker calls tpl_worker_hook() and computes the tpl data.
+static int tpl_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCK *x = &thread_data->td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats;
+ TplBuffers *tpl_tmp_buffers = &thread_data->td->tpl_tmp_buffers;
+ CommonModeInfoParams *mi_params = &cm->mi_params;
+ int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+ (void)tpl_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tpl_error_mutex_ = tpl_row_mt->mutex_;
+#endif
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(tpl_error_mutex_);
+ tpl_row_mt->tpl_mt_exit = true;
+ pthread_mutex_unlock(tpl_error_mutex_);
+#endif
+ set_mode_estimation_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ TX_SIZE tx_size = max_txsize_lookup[bsize];
+ int mi_height = mi_size_high[bsize];
+
+ av1_init_tpl_txfm_stats(tpl_txfm_stats);
+
+ for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows;
+ mi_row += num_active_workers * mi_height) {
+ // Motion estimation row boundary
+ av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
+ cpi->oxcf.border_in_pixels);
+ xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+ xd->mb_to_bottom_edge =
+ GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+ av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row,
+ bsize, tx_size);
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Deallocate tpl synchronization related mutex and data.
+void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) {
+ assert(tpl_sync != NULL);
+
+#if CONFIG_MULTITHREAD
+ if (tpl_sync->mutex_ != NULL) {
+ for (int i = 0; i < tpl_sync->rows; ++i)
+ pthread_mutex_destroy(&tpl_sync->mutex_[i]);
+ aom_free(tpl_sync->mutex_);
+ }
+ if (tpl_sync->cond_ != NULL) {
+ for (int i = 0; i < tpl_sync->rows; ++i)
+ pthread_cond_destroy(&tpl_sync->cond_[i]);
+ aom_free(tpl_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+
+ aom_free(tpl_sync->num_finished_cols);
+ // Clear the structure, as this call may originate from a resize, in which
+ // case it will be followed by an _alloc() that may fail.
+ av1_zero(*tpl_sync);
+}
+
+// Allocate memory for tpl row synchronization.
+void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm,
+ int mb_rows) {
+ tpl_sync->rows = mb_rows;
+#if CONFIG_MULTITHREAD
+ {
+ CHECK_MEM_ERROR(cm, tpl_sync->mutex_,
+ aom_malloc(sizeof(*tpl_sync->mutex_) * mb_rows));
+ if (tpl_sync->mutex_) {
+ for (int i = 0; i < mb_rows; ++i)
+ pthread_mutex_init(&tpl_sync->mutex_[i], NULL);
+ }
+
+ CHECK_MEM_ERROR(cm, tpl_sync->cond_,
+ aom_malloc(sizeof(*tpl_sync->cond_) * mb_rows));
+ if (tpl_sync->cond_) {
+ for (int i = 0; i < mb_rows; ++i)
+ pthread_cond_init(&tpl_sync->cond_[i], NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+ CHECK_MEM_ERROR(cm, tpl_sync->num_finished_cols,
+ aom_malloc(sizeof(*tpl_sync->num_finished_cols) * mb_rows));
+
+ // Set up nsync.
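+ // A sync range of 1 lets row r start column c as soon as row r - 1 has
+ // finished column c + 1 (the top-right dependency).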
+ tpl_sync->sync_range = 1;
+}
+
+// Each worker is prepared by assigning the hook function and individual thread
+// data.
+static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ // OBMC buffers are used only to init MS params and remain unused when
+ // called from tpl, hence set the buffers to defaults.
+ av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+ if (!tpl_alloc_temp_buffers(&thread_data->td->tpl_tmp_buffers,
+ cpi->ppi->tpl_data.tpl_bsize_1d)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating tpl data");
+ }
+ thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ }
+ }
+}
+
+#if CONFIG_BITRATE_ACCURACY
+// Accumulate transform stats after tpl.
+static void tpl_accumulate_txfm_stats(ThreadData *main_td,
+ const MultiThreadInfo *mt_info,
+ int num_workers) {
+ TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ ThreadData *td = thread_data->td;
+ if (td != main_td) {
+ const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
+ av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats);
+ }
+ }
+}
+#endif // CONFIG_BITRATE_ACCURACY
+
+// Implements multi-threading for tpl.
+void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ CommonModeInfoParams *mi_params = &cm->mi_params;
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ TplParams *tpl_data = &cpi->ppi->tpl_data;
+ AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync;
+ int mb_rows = mi_params->mb_rows;
+ int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers);
+
+ if (mb_rows != tpl_sync->rows) {
+ av1_tpl_dealloc(tpl_sync);
+ av1_tpl_alloc(tpl_sync, cm, mb_rows);
+ }
+ tpl_sync->num_threads_working = num_workers;
+ mt_info->tpl_row_mt.tpl_mt_exit = false;
+
+ // Initialize cur_mb_col to -1 for all MB rows.
+ memset(tpl_sync->num_finished_cols, -1,
+ sizeof(*tpl_sync->num_finished_cols) * mb_rows);
+
+ prepare_tpl_workers(cpi, tpl_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+#if CONFIG_BITRATE_ACCURACY
+ tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers);
+#endif // CONFIG_BITRATE_ACCURACY
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers);
+ }
+}
+
+// Deallocate memory for temporal filter multi-thread synchronization.
+void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) {
+ assert(tf_sync != NULL);
+#if CONFIG_MULTITHREAD
+ if (tf_sync->mutex_ != NULL) {
+ pthread_mutex_destroy(tf_sync->mutex_);
+ aom_free(tf_sync->mutex_);
+ }
+#endif // CONFIG_MULTITHREAD
+ tf_sync->next_tf_row = 0;
+}
+
+ // Checks if a job is available. If a job is available, populates
+ // current_mb_row, advances next_tf_row, and returns 1; else returns 0.
+static AOM_INLINE int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync,
+ int *current_mb_row, int mb_rows) {
+ int do_next_row = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_;
+ pthread_mutex_lock(tf_mutex_);
+#endif
+ if (!tf_mt_sync->tf_mt_exit && tf_mt_sync->next_tf_row < mb_rows) {
+ *current_mb_row = tf_mt_sync->next_tf_row;
+ tf_mt_sync->next_tf_row++;
+ do_next_row = 1;
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(tf_mutex_);
+#endif
+ return do_next_row;
+}
+
+// Hook function for each thread in temporal filter multi-threading.
+static int tf_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ ThreadData *td = thread_data->td;
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync;
+ const struct scale_factors *scale = &cpi->tf_ctx.sf;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tf_mutex_ = tf_sync->mutex_;
+#endif
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(tf_mutex_);
+ tf_sync->tf_mt_exit = true;
+ pthread_mutex_unlock(tf_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ const int num_planes = av1_num_planes(&cpi->common);
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+ MACROBLOCKD *mbd = &td->mb.e_mbd;
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ MB_MODE_INFO **input_mb_mode_info;
+ tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+ tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+ int current_mb_row = -1;
+
+ while (tf_get_next_job(tf_sync, &current_mb_row, tf_ctx->mb_rows))
+ av1_tf_do_filtering_row(cpi, td, current_mb_row);
+
+ tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Assigns temporal filter hook function and thread data to each worker.
+static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers, int is_highbitdepth) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ mt_info->tf_sync.next_tf_row = 0;
+ mt_info->tf_sync.tf_mt_exit = false;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ // OBMC buffers are used only to init MS params and remain unused when
+ // called from tf, hence set the buffers to defaults.
+ av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+ if (!tf_alloc_and_reset_data(&thread_data->td->tf_data,
+ cpi->tf_ctx.num_pels, is_highbitdepth)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ }
+ }
+}
+
+// Deallocate thread specific data for temporal filter.
+static void tf_dealloc_thread_data(AV1_COMP *cpi, int num_workers,
+ int is_highbitdepth) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) tf_dealloc_data(&td->tf_data, is_highbitdepth);
+ }
+}
+
+// Accumulate sse and sum after temporal filtering.
+static void tf_accumulate_frame_diff(AV1_COMP *cpi, int num_workers) {
+ FRAME_DIFF *total_diff = &cpi->td.tf_data.diff;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &cpi->mt_info.workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ ThreadData *td = thread_data->td;
+ FRAME_DIFF *diff = &td->tf_data.diff;
+ if (td != &cpi->td) {
+ total_diff->sse += diff->sse;
+ total_diff->sum += diff->sum;
+ }
+ }
+}
+
+// Implements multi-threading for temporal filter.
+void av1_tf_do_filtering_mt(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+
+ int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_TF], mt_info->num_workers);
+
+ prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, cm, num_workers);
+ tf_accumulate_frame_diff(cpi, num_workers);
+ tf_dealloc_thread_data(cpi, num_workers, is_highbitdepth);
+}
+
+ // Checks if a job is available in the current direction. If a job is
+ // available, populates frame_idx and returns 1; else returns 0.
+static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx,
+ int cur_dir) {
+ GlobalMotionInfo *gm_info = &cpi->gm_info;
+ JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
+
+ int total_refs = gm_info->num_ref_frames[cur_dir];
+ int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir];
+
+ if (cur_frame_to_process < total_refs && !job_info->early_exit[cur_dir]) {
+ *frame_idx = gm_info->reference_frames[cur_dir][cur_frame_to_process].frame;
+ job_info->next_frame_to_process[cur_dir] += 1;
+ return 1;
+ }
+ return 0;
+}
+
+// Switches the current direction and calls the function get_next_gm_job() if
+// the speed feature 'prune_ref_frame_for_gm_search' is not set.
+static AOM_INLINE void switch_direction(AV1_COMP *cpi, int *frame_idx,
+ int *cur_dir) {
+ if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) return;
+ // Switch the direction and get next job
+ *cur_dir = !(*cur_dir);
+ get_next_gm_job(cpi, frame_idx, *(cur_dir));
+}
+
+// Hook function for each thread in global motion multi-threading.
+static int gm_mt_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ GlobalMotionInfo *gm_info = &cpi->gm_info;
+ AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
+ JobInfo *job_info = &gm_sync->job_info;
+ int thread_id = thread_data->thread_id;
+ GlobalMotionData *gm_thread_data = &thread_data->td->gm_data;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *gm_mt_mutex_ = gm_sync->mutex_;
+#endif
+
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(gm_mt_mutex_);
+ gm_sync->gm_mt_exit = true;
+ pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ int cur_dir = job_info->thread_id_to_dir[thread_id];
+ bool gm_mt_exit = false;
+ while (1) {
+ int ref_buf_idx = -1;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(gm_mt_mutex_);
+#endif
+
+ gm_mt_exit = gm_sync->gm_mt_exit;
+ // Populates ref_buf_idx (the reference frame type) for which global motion
+ // estimation will be done.
+ if (!gm_mt_exit && !get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) {
+ // No jobs are available for the current direction. Switch
+ // to other direction and get the next job, if available.
+ switch_direction(cpi, &ref_buf_idx, &cur_dir);
+ }
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+
+ // When gm_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (gm_mt_exit || ref_buf_idx == -1) break;
+
+ // Compute global motion for the given ref_buf_idx.
+ av1_compute_gm_for_valid_ref_frames(
+ cpi, error_info, gm_info->ref_buf, ref_buf_idx,
+ gm_thread_data->motion_models, gm_thread_data->segment_map,
+ gm_info->segment_map_w, gm_info->segment_map_h);
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(gm_mt_mutex_);
+#endif
+ // If the global motion w.r.t. the current ref frame is
+ // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t.
+ // the remaining ref frames in that direction.
+ if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
+ cpi->common.global_motion[ref_buf_idx].wmtype <= TRANSLATION)
+ job_info->early_exit[cur_dir] = 1;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Assigns global motion hook function and thread data to each worker.
+static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ mt_info->gm_sync.gm_mt_exit = false;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ if (thread_data->td != &cpi->td)
+ gm_alloc_data(cpi, &thread_data->td->gm_data);
+ }
+}
+
+// Assigns available threads to past/future direction.
+static AOM_INLINE void assign_thread_to_dir(int8_t *thread_id_to_dir,
+ int num_workers) {
+ int8_t frame_dir_idx = 0;
+
+ for (int i = 0; i < num_workers; i++) {
+ thread_id_to_dir[i] = frame_dir_idx++;
+ if (frame_dir_idx == MAX_DIRECTIONS) frame_dir_idx = 0;
+ }
+}
+
+ // Computes the number of workers for global motion multi-threading.
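+ // With prune_ref_frame_for_gm_search enabled, refs within a direction are
+ // evaluated in order with a possible early exit, so presumably only one
+ // worker per direction is useful.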
+static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) {
+ int total_refs =
+ cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1];
+ int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search
+ ? AOMMIN(MAX_DIRECTIONS, total_refs)
+ : total_refs;
+ num_gm_workers = AOMMIN(num_gm_workers, cpi->mt_info.num_workers);
+ return (num_gm_workers);
+}
+
+// Frees the memory allocated for each worker in global motion multi-threading.
+static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int j = 0; j < num_workers; j++) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) gm_dealloc_data(&td->gm_data);
+ }
+}
+
+// Implements multi-threading for global motion.
+void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
+ JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
+
+ av1_zero(*job_info);
+
+ int num_workers = compute_gm_workers(cpi);
+
+ assign_thread_to_dir(job_info->thread_id_to_dir, num_workers);
+ prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers);
+ gm_dealloc_thread_data(cpi, num_workers);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE int get_next_job_allintra(
+ AV1EncRowMultiThreadSync *const row_mt_sync, const int mi_row_end,
+ int *current_mi_row, int mib_size) {
+ if (row_mt_sync->next_mi_row < mi_row_end) {
+ *current_mi_row = row_mt_sync->next_mi_row;
+ row_mt_sync->num_threads_working++;
+ row_mt_sync->next_mi_row += mib_size;
+ return 1;
+ }
+ return 0;
+}
+
+static AOM_INLINE void prepare_wiener_var_workers(AV1_COMP *const cpi,
+ AVxWorkerHook hook,
+ const int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // The preprocessing stage does not use tiles, so the starting tile for
+ // each thread is set to 0.
+ thread_data->start = 0;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ av1_alloc_mb_wiener_var_pred_buf(&cpi->common, thread_data->td);
+ }
+ }
+}
+
+static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ assert(MB_WIENER_MT_UNIT_SIZE < BLOCK_SIZES_ALL);
+ const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE];
+ const int mt_unit_cols =
+ (mi_params->mi_cols + (mt_unit_step >> 1)) / mt_unit_step;
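+ // i.e. mi_cols / mt_unit_step rounded to the nearest integer.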
+ const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt;
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+
+ // Update the wiener variance computation of every row in the frame to
+ // indicate that it is complete in order to avoid dependent workers waiting
+ // indefinitely.
+ for (int mi_row = 0, mt_thread_id = 0; mi_row < mi_params->mi_rows;
+ mi_row += mb_step, ++mt_thread_id) {
+ intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id,
+ mt_unit_cols - 1, mt_unit_cols);
+ }
+}
+
+static int cal_mb_wiener_var_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ MACROBLOCK *x = &thread_data->td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ (void)enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex = enc_row_mt->mutex_;
+#endif
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+ enc_row_mt->mb_wiener_mt_exit = true;
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ set_mb_wiener_var_calc_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+ DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+ double sum_rec_distortion = 0;
+ double sum_est_rate = 0;
+ while (1) {
+ int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+#endif
+ int has_jobs = enc_row_mt->mb_wiener_mt_exit
+ ? 0
+ : get_next_job_allintra(intra_row_mt_sync,
+ cpi->common.mi_params.mi_rows,
+ &current_mi_row, mb_step);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ if (!has_jobs) break;
+ // TODO(chengchen): properly accumulate the distortion and rate.
+ av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff,
+ qcoeff, dqcoeff, &sum_rec_distortion,
+ &sum_est_rate,
+ thread_data->td->wiener_tmp_pred_buf);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+#endif
+ intra_row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+static void dealloc_mb_wiener_var_mt_data(AV1_COMP *cpi, int num_workers) {
+ av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
+
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int j = 0; j < num_workers; ++j) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) av1_dealloc_mb_wiener_var_pred_buf(td);
+ }
+}
+
+ // This function is the multi-threaded version of the wiener variance
+ // computation.
+ // Note that the wiener variance is used only in allintra mode (1 pass), and
+ // it is computed before the frame is encoded, so there is no need to
+ // consider the number of tiles; instead, all available threads are
+ // allocated to the computation.
+void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
+ double *sum_rec_distortion,
+ double *sum_est_rate) {
+ (void)sum_rec_distortion;
+ (void)sum_est_rate;
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+
+ // TODO(chengchen): the memory usage could be improved.
+ const int mi_rows = cm->mi_params.mi_rows;
+ row_mt_sync_mem_alloc(intra_row_mt_sync, cm, mi_rows);
+
+ intra_row_mt_sync->intrabc_extra_top_right_sb_delay = 0;
+ intra_row_mt_sync->num_threads_working = num_workers;
+ intra_row_mt_sync->next_mi_row = 0;
+ memset(intra_row_mt_sync->num_finished_cols, -1,
+ sizeof(*intra_row_mt_sync->num_finished_cols) * mi_rows);
+ mt_info->enc_row_mt.mb_wiener_mt_exit = false;
+
+ prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, cm, num_workers);
+ dealloc_mb_wiener_var_mt_data(cpi, num_workers);
+}
+
+// Compare and order tiles based on absolute sum of tx coeffs.
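+ // Tiles are ordered in descending order of abs_sum_level; ties break toward
+ // the smaller tile index so the ordering is deterministic.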
+static int compare_tile_order(const void *a, const void *b) {
+ const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a;
+ const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b;
+
+ if (tile_a->abs_sum_level > tile_b->abs_sum_level)
+ return -1;
+ else if (tile_a->abs_sum_level == tile_b->abs_sum_level)
+ return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1);
+ else
+ return 1;
+}
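+
+// Illustrative example (hypothetical values): tiles with abs_sum_level
+// { 5, 9, 5 } and tile_idx { 0, 1, 2 } sort to the order 1, 0, 2, i.e. the
+// largest coefficient sum first, with ties broken by ascending tile index so
+// that the resulting order is deterministic.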
+
+// Get next tile index to be processed for pack bitstream
+static AOM_INLINE int get_next_pack_bs_tile_idx(
+ AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) {
+ assert(pack_bs_sync->next_job_idx <= num_tiles);
+ if (pack_bs_sync->next_job_idx == num_tiles) return -1;
+
+ return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++]
+ .tile_idx;
+}
+
+// Calculates bitstream chunk size based on total buffer size and tile or tile
+// group size.
+static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size,
+ const int frame_or_tg_size,
+ size_t *remain_buf_size,
+ size_t max_buf_size,
+ int is_last_chunk) {
+ size_t this_chunk_size;
+ assert(*remain_buf_size > 0);
+ if (is_last_chunk) {
+ this_chunk_size = *remain_buf_size;
+ *remain_buf_size = 0;
+ } else {
+ const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size;
+ this_chunk_size = (size_t)(size_scale / frame_or_tg_size);
+ *remain_buf_size -= this_chunk_size;
+ assert(*remain_buf_size > 0);
+ }
+ assert(this_chunk_size > 0);
+ return this_chunk_size;
+}
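+
+// Worked example for the proration above (illustrative numbers): with
+// max_buf_size = 1000 bytes, frame_or_tg_size = 100 mi units and
+// tg_or_tile_size = 25 mi units, a non-last chunk gets
+// (1000 * 25) / 100 = 250 bytes and *remain_buf_size drops by 250. The last
+// chunk simply takes whatever remains, so any rounding loss from the integer
+// division is absorbed by the final chunk.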
+
+// Initializes params required for pack bitstream tile.
+static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb,
+ PackBSParams *const pack_bs_params_arr,
+ uint8_t obu_extn_header) {
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int num_tiles = tiles->cols * tiles->rows;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cpi->num_tg;
+ // Tile group size in terms of number of tiles.
+ const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs;
+ uint8_t *tile_dst = dst;
+ uint8_t *tile_data_curr = dst;
+  // The maximum tile group count cannot exceed MAX_TILES.
+ int tg_size_mi[MAX_TILES] = { 0 }; // Size of tile group in mi units
+ int tile_idx;
+ int tg_idx = 0;
+ int tile_count_in_tg = 0;
+ int new_tg = 1;
+
+ // Populate pack bitstream params of all tiles.
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info;
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ // Calculate tile size in mi units.
+ const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) *
+ (tile_info->mi_row_end - tile_info->mi_row_start);
+ int is_last_tile_in_tg = 0;
+ tile_count_in_tg++;
+ if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1))
+ is_last_tile_in_tg = 1;
+
+ // Populate pack bitstream params of this tile.
+ pack_bs_params->curr_tg_hdr_size = 0;
+ pack_bs_params->obu_extn_header = obu_extn_header;
+ pack_bs_params->saved_wb = saved_wb;
+ pack_bs_params->obu_header_size = 0;
+ pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg;
+ pack_bs_params->new_tg = new_tg;
+ pack_bs_params->tile_col = tile_info->tile_col;
+ pack_bs_params->tile_row = tile_info->tile_row;
+ pack_bs_params->tile_size_mi = tile_size_mi;
+ tg_size_mi[tg_idx] += tile_size_mi;
+
+ if (new_tg) new_tg = 0;
+ if (is_last_tile_in_tg) {
+ tile_count_in_tg = 0;
+ new_tg = 1;
+ tg_idx++;
+ }
+ }
+
+ assert(cpi->available_bs_size > 0);
+ size_t tg_buf_size[MAX_TILES] = { 0 };
+ size_t max_buf_size = cpi->available_bs_size;
+ size_t remain_buf_size = max_buf_size;
+ const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols;
+
+ tile_idx = 0;
+ // Prepare obu, tile group and frame header of each tile group.
+ for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) {
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ int is_last_tg = tg_idx == cpi->num_tg - 1;
+ // Prorate bitstream buffer size based on tile group size and available
+ // buffer size. This buffer will be used to store headers and tile data.
+ tg_buf_size[tg_idx] =
+ get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size,
+ max_buf_size, is_last_tg);
+
+ pack_bs_params->dst = tile_dst;
+ pack_bs_params->tile_data_curr = tile_dst;
+
+    // Write the obu, tile group and frame headers at the first tile in the
+    // tile group.
+ av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx);
+ tile_dst += tg_buf_size[tg_idx];
+
+ // Exclude headers from tile group buffer size.
+ tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size;
+ tile_idx += tg_size_in_tiles;
+ }
+
+ tg_idx = 0;
+ // Calculate bitstream buffer size of each tile in the tile group.
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+
+ if (pack_bs_params->new_tg) {
+ max_buf_size = tg_buf_size[tg_idx];
+ remain_buf_size = max_buf_size;
+ }
+
+    // Prorate the bitstream buffer size of this tile based on tile size and
+    // available buffer size. The header size is not accounted for in this
+    // proration.
+ const size_t tile_buf_size = get_bs_chunk_size(
+ pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size,
+ max_buf_size, pack_bs_params->is_last_tile_in_tg);
+ pack_bs_params->tile_buf_size = tile_buf_size;
+
+ // Update base address of bitstream buffer for tile and tile group.
+ if (pack_bs_params->new_tg) {
+ tile_dst = pack_bs_params->dst;
+ tile_data_curr = pack_bs_params->tile_data_curr;
+      // Account for the header size in the first tile of a tile group.
+ pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size;
+ } else {
+ pack_bs_params->dst = tile_dst;
+ pack_bs_params->tile_data_curr = tile_data_curr;
+ }
+
+ if (pack_bs_params->is_last_tile_in_tg) tg_idx++;
+ tile_dst += pack_bs_params->tile_buf_size;
+ }
+}
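+
+// Summary of the two-level proration above (illustrative): with
+// num_tiles = 4 and cpi->num_tg = 2, tg_size_in_tiles = 2. The first loop
+// over tile groups splits the total bitstream buffer between the two groups
+// in proportion to their size in mi units; the second loop over tiles splits
+// each group's buffer between its two tiles the same way, charging the tile
+// group header to the first tile of each group.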
+
+// Worker hook function for pack bitstream multi-threading.
+static int pack_bs_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ PackBSParams *const pack_bs_params = (PackBSParams *)arg2;
+ AV1_COMP *const cpi = thread_data->cpi;
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int num_tiles = tiles->cols * tiles->rows;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *const pack_bs_mutex = pack_bs_sync->mutex_;
+#endif
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pack_bs_mutex);
+ pack_bs_sync->pack_bs_mt_exit = true;
+ pthread_mutex_unlock(pack_bs_mutex);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ while (1) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pack_bs_mutex);
+#endif
+ const int tile_idx =
+ pack_bs_sync->pack_bs_mt_exit
+ ? -1
+ : get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pack_bs_mutex);
+#endif
+ // When pack_bs_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (tile_idx == -1) break;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+
+ av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]);
+ }
+
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Prepares thread data and workers for pack bitstream multi-threading.
+static void prepare_pack_bs_workers(AV1_COMP *const cpi,
+ PackBSParams *const pack_bs_params,
+ AVxWorkerHook hook, const int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb;
+
+ thread_data->cpi = cpi;
+ thread_data->start = i;
+ thread_data->thread_id = i;
+ av1_reset_pack_bs_thread_data(thread_data->td);
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = pack_bs_params;
+ }
+
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync;
+ const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols;
+ pack_bs_sync->next_job_idx = 0;
+ pack_bs_sync->pack_bs_mt_exit = false;
+
+ PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order;
+ // Reset tile order data of pack bitstream
+ av1_zero_array(pack_bs_tile_order, num_tiles);
+
+ // Populate pack bitstream tile order structure
+ for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ pack_bs_tile_order[tile_idx].abs_sum_level =
+ cpi->tile_data[tile_idx].abs_sum_level;
+ pack_bs_tile_order[tile_idx].tile_idx = tile_idx;
+ }
+
+  // Sort tiles in descending order of the absolute sum of their transform
+  // coefficient levels.
+ qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order),
+ compare_tile_order);
+}
+
+// Accumulates data after pack bitstream processing.
+static void accumulate_pack_bs_data(
+ AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr,
+ uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info,
+ int *const largest_tile_id, unsigned int *max_tile_size,
+ uint32_t *const obu_header_size, uint8_t **tile_data_start,
+ const int num_workers) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int tile_count = tiles->cols * tiles->rows;
+ // Fixed size tile groups for the moment
+ size_t curr_tg_data_size = 0;
+ int is_first_tg = 1;
+ uint8_t *curr_tg_start = dst;
+ size_t src_offset = 0;
+ size_t dst_offset = 0;
+
+ for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+ // PackBSParams stores all parameters required to pack tile and header
+ // info.
+ const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ uint32_t tile_size = 0;
+
+ if (pack_bs_params->new_tg) {
+ curr_tg_start = dst + *total_size;
+ curr_tg_data_size = pack_bs_params->curr_tg_hdr_size;
+ *tile_data_start += pack_bs_params->curr_tg_hdr_size;
+ *obu_header_size = pack_bs_params->obu_header_size;
+ }
+ curr_tg_data_size +=
+ pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 0 : 4);
+
+ if (pack_bs_params->buf.size > *max_tile_size) {
+ *largest_tile_id = tile_idx;
+ *max_tile_size = (unsigned int)pack_bs_params->buf.size;
+ }
+ tile_size +=
+ (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size;
+
+ // Pack all the chunks of tile bitstreams together
+ if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size);
+
+ if (pack_bs_params->is_last_tile_in_tg)
+ av1_write_last_tile_info(
+ cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size,
+ curr_tg_start, &tile_size, tile_data_start, largest_tile_id,
+ &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header);
+ src_offset += pack_bs_params->tile_buf_size;
+ dst_offset += tile_size;
+ *total_size += tile_size;
+ }
+
+ // Accumulate thread data
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int idx = num_workers - 1; idx >= 0; idx--) {
+ ThreadData const *td = mt_info->tile_thr_data[idx].td;
+ av1_accumulate_pack_bs_thread_data(cpi, td);
+ }
+}
+
+void av1_write_tile_obu_mt(
+ AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+ struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+ const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+ unsigned int *max_tile_size, uint32_t *const obu_header_size,
+ uint8_t **tile_data_start, const int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+ PackBSParams pack_bs_params[MAX_TILES];
+ uint32_t tile_size[MAX_TILES] = { 0 };
+
+ for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++)
+ pack_bs_params[tile_idx].total_size = &tile_size[tile_idx];
+
+ init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header);
+ prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook,
+ num_workers);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, &cpi->common, num_workers);
+ accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info,
+ largest_tile_id, max_tile_size, obu_header_size,
+ tile_data_start, num_workers);
+}
+
+// Deallocate memory for CDEF search multi-thread synchronization.
+void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) {
+ (void)cdef_sync;
+ assert(cdef_sync != NULL);
+#if CONFIG_MULTITHREAD
+ if (cdef_sync->mutex_ != NULL) {
+ pthread_mutex_destroy(cdef_sync->mutex_);
+ aom_free(cdef_sync->mutex_);
+ }
+#endif // CONFIG_MULTITHREAD
+}
+
+// Updates the row and column indices of the next job to be processed.
+// Also sets the end_of_frame flag when all blocks have been processed.
+static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) {
+ cdef_sync->fbc++;
+ if (cdef_sync->fbc == nhfb) {
+ cdef_sync->fbr++;
+ if (cdef_sync->fbr == nvfb) {
+ cdef_sync->end_of_frame = 1;
+ } else {
+ cdef_sync->fbc = 0;
+ }
+ }
+}
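+
+// Illustrative traversal: with nhfb = 2 and nvfb = 2, starting from
+// (fbr, fbc) = (0, 0), successive calls advance through (0, 1), (1, 0) and
+// (1, 1) in raster order; the call after the last block sets end_of_frame.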
+
+// Initializes cdef_sync parameters.
+static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) {
+#if CONFIG_MULTITHREAD
+ if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+#endif // CONFIG_MULTITHREAD
+ cdef_sync->end_of_frame = 0;
+ cdef_sync->fbr = 0;
+ cdef_sync->fbc = 0;
+ cdef_sync->cdef_mt_exit = false;
+}
+
+// Checks if a job is available. If a job is available, populates the next
+// job information and returns 1; otherwise returns 0.
+static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync,
+ CdefSearchCtx *cdef_search_ctx,
+ volatile int *cur_fbr,
+ volatile int *cur_fbc,
+ volatile int *sb_count) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(cdef_sync->mutex_);
+#endif // CONFIG_MULTITHREAD
+ int do_next_block = 0;
+ const int nvfb = cdef_search_ctx->nvfb;
+ const int nhfb = cdef_search_ctx->nhfb;
+
+  // If a block is skipped, do not process it; instead, check the skip
+  // condition for the next block.
+ while (!cdef_sync->cdef_mt_exit && !cdef_sync->end_of_frame &&
+ cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
+ cdef_sync->fbc)) {
+ update_next_job_info(cdef_sync, nvfb, nhfb);
+ }
+
+  // Populate the information needed for the current job and update the row
+  // and column indices of the next block to be processed.
+ if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) {
+ do_next_block = 1;
+ *cur_fbr = cdef_sync->fbr;
+ *cur_fbc = cdef_sync->fbc;
+ *sb_count = cdef_search_ctx->sb_count;
+ cdef_search_ctx->sb_count++;
+ update_next_job_info(cdef_sync, nvfb, nhfb);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(cdef_sync->mutex_);
+#endif // CONFIG_MULTITHREAD
+ return do_next_block;
+}
+
+// Hook function for each thread in CDEF search multi-threading.
+static int cdef_filter_block_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg2;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *cdef_mutex_ = cdef_sync->mutex_;
+#endif
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ CdefSearchCtx *cdef_search_ctx = thread_data->cpi->cdef_search_ctx;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(cdef_mutex_);
+ cdef_sync->cdef_mt_exit = true;
+ pthread_mutex_unlock(cdef_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ volatile int cur_fbr, cur_fbc, sb_count;
+ while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc,
+ &sb_count)) {
+ av1_cdef_mse_calc_block(cdef_search_ctx, error_info, cur_fbr, cur_fbc,
+ sb_count);
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Assigns CDEF search hook function and thread data to each worker.
+static void prepare_cdef_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ thread_data->cpi = cpi;
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = &mt_info->cdef_sync;
+ }
+}
+
+// Implements multi-threading for CDEF search.
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+ const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH];
+
+ cdef_reset_job_info(cdef_sync);
+ prepare_cdef_workers(cpi, cdef_filter_block_worker_hook, num_workers);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, &cpi->common, num_workers);
+}
+
+// Computes num_workers for temporal filter multi-threading.
+static AOM_INLINE int compute_num_tf_workers(const AV1_COMP *cpi) {
+  // For single-pass encoding, choosing the number of workers based on the
+  // temporal filter block size was not found to improve speed. Hence the
+  // thread assignment for single-pass encoding is based on
+  // compute_num_enc_workers().
+ if (cpi->oxcf.pass < AOM_RC_SECOND_PASS)
+ return (av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads));
+
+ if (cpi->oxcf.max_threads <= 1) return 1;
+
+ const int frame_height = cpi->common.height;
+ const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+ const int mb_height = block_size_high[block_size];
+ const int mb_rows = get_num_blocks(frame_height, mb_height);
+ return AOMMIN(cpi->oxcf.max_threads, mb_rows);
+}
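+
+// Worked example (illustrative, assuming TF_BLOCK_SIZE corresponds to a
+// 32x32 block): for a 1080p frame in the second pass,
+// mb_rows = (1080 + 31) / 32 = 34, so at most 34 workers are used even if
+// max_threads is larger.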
+
+// Computes num_workers for tpl multi-threading.
+static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) {
+ return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for loop filter multi-threading.
+static AOM_INLINE int compute_num_lf_workers(AV1_COMP *cpi) {
+ return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for cdef multi-threading.
+static AOM_INLINE int compute_num_cdef_workers(AV1_COMP *cpi) {
+ return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for loop-restoration multi-threading.
+static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) {
+ return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for pack bitstream multi-threading.
+static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) {
+ if (cpi->oxcf.max_threads <= 1) return 1;
+ return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for all intra multi-threading.
+static AOM_INLINE int compute_num_ai_workers(AV1_COMP *cpi) {
+ if (cpi->oxcf.max_threads <= 1) return 1;
+  // The multi-threading implementation of deltaq-mode = 3 in allintra mode
+  // is based on row multi-threading.
+ cpi->weber_bsize = BLOCK_8X8;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ const int num_mb_rows = cpi->common.mi_params.mi_rows / mb_step;
+ return AOMMIN(num_mb_rows, cpi->oxcf.max_threads);
+}
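+
+// Worked example (illustrative numbers): a 1080p frame has
+// mi_rows = 1080 / 4 = 270 mi units, and BLOCK_8X8 gives mb_step = 2, so
+// num_mb_rows = 135 and the worker count is min(135, max_threads).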
+
+static int compute_num_mod_workers(AV1_COMP *cpi,
+ MULTI_THREADED_MODULES mod_name) {
+ int num_mod_workers = 0;
+ switch (mod_name) {
+ case MOD_FP:
+ if (cpi->oxcf.pass >= AOM_RC_SECOND_PASS)
+ num_mod_workers = 0;
+ else
+ num_mod_workers =
+ av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+ break;
+ case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break;
+ case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break;
+ case MOD_GME: num_mod_workers = 1; break;
+ case MOD_ENC:
+ num_mod_workers = av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+ break;
+ case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break;
+ case MOD_CDEF_SEARCH:
+ num_mod_workers = compute_num_cdef_workers(cpi);
+ break;
+ case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break;
+ case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break;
+ case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break;
+ case MOD_FRAME_ENC:
+ num_mod_workers = cpi->ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC];
+ break;
+ case MOD_AI:
+ if (cpi->oxcf.pass == AOM_RC_ONE_PASS) {
+ num_mod_workers = compute_num_ai_workers(cpi);
+ } else {
+ num_mod_workers = 0;
+ }
+ break;
+ default: assert(0); break;
+ }
+ return (num_mod_workers);
+}
+// Computes the number of workers for each MT module in the encoder.
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi) {
+ for (int i = MOD_FP; i < NUM_MT_MODULES; i++) {
+ cpi->ppi->p_mt_info.num_mod_workers[i] =
+ compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)i);
+ }
+}
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
new file mode 100644
index 0000000000..468e120776
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ETHREAD_H_
+#define AOM_AV1_ENCODER_ETHREAD_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct ThreadData;
+
+typedef struct EncWorkerData {
+ struct AV1_COMP *cpi;
+ struct ThreadData *td;
+ struct ThreadData *original_td;
+ struct aom_internal_error_info error_info;
+ AV1LfSync *lf_sync;
+ LFWorkerData *lf_data;
+ int start;
+ int thread_id;
+} EncWorkerData;
+
+void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c);
+void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
+ int cols);
+
+void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+ int c);
+void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+ int c, int cols);
+
+void av1_encode_tiles_mt(struct AV1_COMP *cpi);
+void av1_encode_tiles_row_mt(struct AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi);
+
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi);
+#endif
+
+void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
+ const struct FRAME_COUNTS *counts);
+
+void av1_row_mt_mem_dealloc(AV1_COMP *cpi);
+
+void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync);
+
+void av1_global_motion_estimation_mt(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+ int r, int c);
+void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+ int r, int c, int cols);
+
+void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_mt_sync, int r,
+ int c);
+void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_mt_sync, int r,
+ int c, int cols);
+
+void av1_mc_flow_dispenser_mt(AV1_COMP *cpi);
+
+void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync);
+
+#endif // !CONFIG_REALTIME_ONLY
+
+void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
+ double *sum_rec_distortion,
+ double *sum_est_rate);
+
+void av1_tf_do_filtering_mt(AV1_COMP *cpi);
+
+void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync);
+
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi);
+
+int av1_get_max_num_workers(const AV1_COMP *cpi);
+
+void av1_create_workers(AV1_PRIMARY *ppi, int num_workers);
+
+void av1_terminate_workers(AV1_PRIMARY *ppi);
+
+void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi);
+
+void av1_init_cdef_worker(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_init_lr_mt_buffers(AV1_COMP *cpi);
+#endif
+
+#if CONFIG_MULTITHREAD
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass);
+#endif // CONFIG_MULTITHREAD
+
+int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
+ MULTI_THREADED_MODULES mod_name);
+
+void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass);
+
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi);
+
+void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync);
+
+void av1_write_tile_obu_mt(
+ AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+ struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+ const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+ unsigned int *max_tile_size, uint32_t *const obu_header_size,
+ uint8_t **tile_data_start, const int num_workers);
+
+int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers);
+
+int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf);
+
+int av1_check_fpmt_config(AV1_PRIMARY *const ppi, AV1EncoderConfig *const oxcf);
+
+void av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ETHREAD_H_
diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c
new file mode 100644
index 0000000000..e1b1e69ca7
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/encoder/extend.h"
+
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+ uint8_t *dst, int dst_pitch, int w, int h,
+ int extend_top, int extend_left,
+ int extend_bottom, int extend_right,
+ int chroma_step) {
+ int i, linesize;
+  // Copy the left-most and right-most columns out.
+ const uint8_t *src_ptr1 = src;
+ const uint8_t *src_ptr2 = src + (w - 1) * chroma_step;
+ uint8_t *dst_ptr1 = dst - extend_left;
+ uint8_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ if (chroma_step == 1) {
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+ } else {
+ for (int j = 0; j < w; j++) {
+ dst_ptr1[extend_left + j] = src_ptr1[chroma_step * j];
+ }
+ }
+ memset(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * h - extend_left;
+ linesize = extend_left + extend_right + w;
+ assert(linesize <= dst_pitch);
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize);
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize);
+ dst_ptr2 += dst_pitch;
+ }
+}
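+
+// In the routine above, every border pixel replicates its nearest edge
+// pixel: the left/right borders repeat the first/last sample of each row,
+// then the completed top and bottom rows (including their extended corners)
+// are copied into the top and bottom borders. A chroma_step of 2 handles
+// semi-planar (NV12) input, where U and V samples are interleaved in the
+// source plane.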
+
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+ uint8_t *dst8, int dst_pitch, int w,
+ int h, int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, linesize;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+  // Copy the left-most and right-most columns out.
+ const uint16_t *src_ptr1 = src;
+ const uint16_t *src_ptr2 = src + w - 1;
+ uint16_t *dst_ptr1 = dst - extend_left;
+ uint16_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0]));
+ aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * h - extend_left;
+ linesize = extend_left + extend_right + w;
+ assert(linesize <= dst_pitch);
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0]));
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0]));
+ dst_ptr2 += dst_pitch;
+ }
+}
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+ // Extend src frame in buffer
+ const int et_y = dst->border;
+ const int el_y = dst->border;
+ const int er_y =
+ AOMMAX(src->y_width + dst->border, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
+ src->y_crop_width;
+ const int eb_y = AOMMAX(src->y_height + dst->border,
+ ALIGN_POWER_OF_TWO(src->y_height, 6)) -
+ src->y_crop_height;
+ const int uv_width_subsampling = src->subsampling_x;
+ const int uv_height_subsampling = src->subsampling_y;
+ const int et_uv = et_y >> uv_height_subsampling;
+ const int el_uv = el_y >> uv_width_subsampling;
+ const int eb_uv = eb_y >> uv_height_subsampling;
+ const int er_uv = er_y >> uv_width_subsampling;
+
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width,
+ src->y_crop_height, et_y, el_y, eb_y, er_y);
+ if (!src->monochrome) {
+ highbd_copy_and_extend_plane(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+ highbd_copy_and_extend_plane(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+ }
+ return;
+ }
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width, src->y_crop_height,
+ et_y, el_y, eb_y, er_y, 1);
+ if (!src->monochrome) {
+    // Detect semi-planar (NV12) format.
+ const int chroma_step = src->v_buffer ? 1 : 2;
+ const uint8_t *src_v_buffer =
+ src->v_buffer ? src->v_buffer : src->u_buffer + 1;
+ copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, src->uv_crop_width,
+ src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv,
+ chroma_step);
+ copy_and_extend_plane(src_v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, src->uv_crop_width,
+ src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv,
+ chroma_step);
+ }
+}
diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h
new file mode 100644
index 0000000000..b8cc5b9d28
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTEND_H_
+#define AOM_AV1_ENCODER_EXTEND_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_EXTEND_H_
diff --git a/third_party/aom/av1/encoder/external_partition.c b/third_party/aom/av1/encoder/external_partition.c
new file mode 100644
index 0000000000..79f8b4c8a4
--- /dev/null
+++ b/third_party/aom/av1/encoder/external_partition.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/encoder/external_partition.h"
+
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+ aom_ext_part_config_t config,
+ ExtPartController *ext_part_controller) {
+ if (ext_part_controller == NULL) {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+ ext_part_controller->funcs = funcs;
+ ext_part_controller->config = config;
+ const aom_ext_part_status_t status = ext_part_controller->funcs.create_model(
+ ext_part_controller->funcs.priv, &ext_part_controller->config,
+ &ext_part_controller->model);
+ if (status == AOM_EXT_PART_ERROR) {
+ return AOM_CODEC_ERROR;
+ } else if (status == AOM_EXT_PART_TEST) {
+ ext_part_controller->test_mode = 1;
+ ext_part_controller->ready = 0;
+ return AOM_CODEC_OK;
+ }
+ assert(status == AOM_EXT_PART_OK);
+ ext_part_controller->ready = 1;
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller) {
+ if (ext_part_controller == NULL) {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+ av1_zero(ext_part_controller);
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) {
+ if (ext_part_controller == NULL) {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+ if (ext_part_controller->ready) {
+ const aom_ext_part_status_t status =
+ ext_part_controller->funcs.delete_model(ext_part_controller->model);
+ if (status != AOM_EXT_PART_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ }
+ return av1_ext_part_init(ext_part_controller);
+}
+
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+ aom_partition_decision_t *decision) {
+ assert(ext_part_controller != NULL);
+ assert(ext_part_controller->ready);
+ assert(decision != NULL);
+ const aom_ext_part_status_t status =
+ ext_part_controller->funcs.get_partition_decision(
+ ext_part_controller->model, decision);
+ if (status != AOM_EXT_PART_OK) return false;
+ return true;
+}
+
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+ const aom_partition_stats_t *stats) {
+ assert(ext_part_controller != NULL);
+ assert(ext_part_controller->ready);
+ assert(stats != NULL);
+ const aom_ext_part_status_t status =
+ ext_part_controller->funcs.send_partition_stats(
+ ext_part_controller->model, stats);
+ if (status != AOM_EXT_PART_OK) return false;
+ return true;
+}
+
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+ const aom_partition_features_t *features) {
+ assert(ext_part_controller != NULL);
+ assert(ext_part_controller->ready);
+ assert(features != NULL);
+ const aom_ext_part_status_t status = ext_part_controller->funcs.send_features(
+ ext_part_controller->model, features);
+ if (status != AOM_EXT_PART_OK) return false;
+ return true;
+}
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+ const ExtPartController *ext_part_controller) {
+ return ext_part_controller->funcs.decision_mode;
+}
diff --git a/third_party/aom/av1/encoder/external_partition.h b/third_party/aom/av1/encoder/external_partition.h
new file mode 100644
index 0000000000..f74973e9eb
--- /dev/null
+++ b/third_party/aom/av1/encoder/external_partition.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+#define AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+
+#include <stdbool.h>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+
+typedef struct ExtPartController {
+ int ready;
+ int test_mode;
+ aom_ext_part_config_t config;
+ aom_ext_part_model_t model;
+ aom_ext_part_funcs_t funcs;
+} ExtPartController;
+
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+ aom_ext_part_config_t config,
+ ExtPartController *ext_part_controller);
+
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller);
+
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller);
+
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+ aom_partition_decision_t *decision);
+
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+ const aom_partition_stats_t *stats);
+
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+ const aom_partition_features_t *features);
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+ const ExtPartController *ext_part_controller);
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
new file mode 100644
index 0000000000..e20b6c177e
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -0,0 +1,1600 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/variance.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/entropymv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h" // av1_setup_dst_planes()
+#include "av1/common/reconintra.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/dwt.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#define OUTPUT_FPF 0
+
+#define FIRST_PASS_Q 10.0
+#define INTRA_MODE_PENALTY 1024
+#define NEW_MV_MODE_PENALTY 32
+#define DARK_THRESH 64
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+
+#define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1
+
+static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats,
+ struct aom_codec_pkt_list *pktlist) {
+ struct aom_codec_cx_pkt pkt;
+ pkt.kind = AOM_CODEC_STATS_PKT;
+ pkt.data.twopass_stats.buf = stats;
+ pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+ if (pktlist != NULL) aom_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+ {
+ FILE *fpfile;
+ fpfile = fopen("firstpass.stt", "a");
+
+ fprintf(fpfile,
+ "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n",
+ stats->frame, stats->weight, stats->intra_error, stats->coded_error,
+ stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion,
+ stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct,
+ stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr,
+ stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv,
+ stats->MVcv, stats->mv_in_out_count, stats->new_mv_count,
+ stats->count, stats->duration);
+ fclose(fpfile);
+ }
+#endif
+}
+
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section) {
+ section->frame = 0.0;
+ section->weight = 0.0;
+ section->intra_error = 0.0;
+ section->frame_avg_wavelet_energy = 0.0;
+ section->coded_error = 0.0;
+ section->log_intra_error = 0.0;
+ section->log_coded_error = 0.0;
+ section->sr_coded_error = 0.0;
+ section->pcnt_inter = 0.0;
+ section->pcnt_motion = 0.0;
+ section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
+ section->intra_skip_pct = 0.0;
+ section->inactive_zone_rows = 0.0;
+ section->inactive_zone_cols = 0.0;
+ section->MVr = 0.0;
+ section->mvr_abs = 0.0;
+ section->MVc = 0.0;
+ section->mvc_abs = 0.0;
+ section->MVrv = 0.0;
+ section->MVcv = 0.0;
+ section->mv_in_out_count = 0.0;
+ section->new_mv_count = 0.0;
+ section->count = 0.0;
+ section->duration = 1.0;
+ section->is_flash = 0;
+ section->noise_var = 0;
+ section->cor_coeff = 1.0;
+}
+
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame += frame->frame;
+ section->weight += frame->weight;
+ section->intra_error += frame->intra_error;
+ section->log_intra_error += log1p(frame->intra_error);
+ section->log_coded_error += log1p(frame->coded_error);
+ section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy;
+ section->coded_error += frame->coded_error;
+ section->sr_coded_error += frame->sr_coded_error;
+ section->pcnt_inter += frame->pcnt_inter;
+ section->pcnt_motion += frame->pcnt_motion;
+ section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
+ section->intra_skip_pct += frame->intra_skip_pct;
+ section->inactive_zone_rows += frame->inactive_zone_rows;
+ section->inactive_zone_cols += frame->inactive_zone_cols;
+ section->MVr += frame->MVr;
+ section->mvr_abs += frame->mvr_abs;
+ section->MVc += frame->MVc;
+ section->mvc_abs += frame->mvc_abs;
+ section->MVrv += frame->MVrv;
+ section->MVcv += frame->MVcv;
+ section->mv_in_out_count += frame->mv_in_out_count;
+ section->new_mv_count += frame->new_mv_count;
+ section->count += frame->count;
+ section->duration += frame->duration;
+}
+
+static int get_unit_rows(const BLOCK_SIZE fp_block_size, const int mb_rows) {
+ const int height_mi_log2 = mi_size_high_log2[fp_block_size];
+ const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16];
+ if (height_mi_log2 > mb_height_mi_log2) {
+ return mb_rows >> (height_mi_log2 - mb_height_mi_log2);
+ }
+
+ return mb_rows << (mb_height_mi_log2 - height_mi_log2);
+}
+
+static int get_unit_cols(const BLOCK_SIZE fp_block_size, const int mb_cols) {
+ const int width_mi_log2 = mi_size_wide_log2[fp_block_size];
+ const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16];
+ if (width_mi_log2 > mb_width_mi_log2) {
+ return mb_cols >> (width_mi_log2 - mb_width_mi_log2);
+ }
+
+ return mb_cols << (mb_width_mi_log2 - width_mi_log2);
+}
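+
+// Worked example for the two helpers above (illustrative): with
+// fp_block_size = BLOCK_8X8, the block is 2 mi units high/wide (log2 = 1)
+// versus 4 mi units (log2 = 2) for BLOCK_16X16, so each 16x16 macroblock
+// row/column maps to two first-pass units: unit_rows = mb_rows << 1.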
+
+// TODO(chengchen): can we simplify it even if resize has to be considered?
+static int get_num_mbs(const BLOCK_SIZE fp_block_size,
+ const int num_mbs_16X16) {
+ const int width_mi_log2 = mi_size_wide_log2[fp_block_size];
+ const int height_mi_log2 = mi_size_high_log2[fp_block_size];
+ const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16];
+ const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16];
+ // TODO(chengchen): Now this function assumes a square block is used.
+ // It does not support rectangular block sizes.
+ assert(width_mi_log2 == height_mi_log2);
+ if (width_mi_log2 > mb_width_mi_log2) {
+ return num_mbs_16X16 >> ((width_mi_log2 - mb_width_mi_log2) +
+ (height_mi_log2 - mb_height_mi_log2));
+ }
+
+ return num_mbs_16X16 << ((mb_width_mi_log2 - width_mi_log2) +
+ (mb_height_mi_log2 - height_mi_log2));
+}
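+
+// Continuing the example (illustrative): for fp_block_size = BLOCK_8X8 the
+// combined shift is (2 - 1) + (2 - 1) = 2, so num_mbs = num_mbs_16X16 << 2,
+// i.e. four 8x8 units per 16x16 macroblock.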
+
+void av1_end_first_pass(AV1_COMP *cpi) {
+ if (cpi->ppi->twopass.stats_buf_ctx->total_stats && !cpi->ppi->lap_enabled)
+ output_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats,
+ cpi->ppi->output_pkt_list);
+}
+
+static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+ switch (bsize) {
+ case BLOCK_8X8: return aom_mse8x8;
+ case BLOCK_16X8: return aom_mse16x8;
+ case BLOCK_8X16: return aom_mse8x16;
+ default: return aom_mse16x16;
+ }
+}
+
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref) {
+ unsigned int sse;
+ const aom_variance_fn_t fn = get_block_variance_fn(bsize);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+ int bd) {
+ switch (bd) {
+ default:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_8_mse8x8;
+ case BLOCK_16X8: return aom_highbd_8_mse16x8;
+ case BLOCK_8X16: return aom_highbd_8_mse8x16;
+ default: return aom_highbd_8_mse16x16;
+ }
+ case 10:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_10_mse8x8;
+ case BLOCK_16X8: return aom_highbd_10_mse16x8;
+ case BLOCK_8X16: return aom_highbd_10_mse8x16;
+ default: return aom_highbd_10_mse16x16;
+ }
+ case 12:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_12_mse8x8;
+ case BLOCK_16X8: return aom_highbd_12_mse16x8;
+ case BLOCK_8X16: return aom_highbd_12_mse8x16;
+ default: return aom_highbd_12_mse16x16;
+ }
+ }
+}
+
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref,
+ int bd) {
+ unsigned int sse;
+ const aom_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Refine the motion search range according to the frame dimensions for the
+// first pass.
+static int get_search_range(int width, int height) {
+ int sr = 0;
+ const int dim = AOMMIN(width, height);
+
+ while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr;
+ return sr;
+}
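+
+// Worked example (assuming, for illustration, MAX_FULL_PEL_VAL = 2047): a
+// 640x480 frame gives dim = 480, and the loop increments sr until
+// 480 << sr is no longer below 2047, giving sr = 3
+// (480 -> 960 -> 1920 -> 3840).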
+
+static AOM_INLINE const search_site_config *
+av1_get_first_pass_search_site_config(const AV1_COMP *cpi, MACROBLOCK *x,
+ SEARCH_METHODS search_method) {
+ const int ref_stride = x->e_mbd.plane[0].pre[0].stride;
+
+  // For AVIF applications, even the source frames can have changing
+  // resolution, so we need to manually check the strides :(
+ // AV1_COMP::mv_search_params.search_site_config is a compressor level cache
+ // that's shared by multiple threads. In most cases where all frames have the
+ // same resolution, the cache contains the search site config that we need.
+ const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
+ if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_FPF]->stride) {
+ return mv_search_params->search_site_cfg[SS_CFG_FPF];
+ }
+
+ // If the cache does not contain the correct stride, then we will need to rely
+ // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the
+ // thread level config doesn't match, then we need to update it.
+ search_method = search_method_lookup[search_method];
+ assert(search_method_lookup[search_method] == search_method &&
+ "The search_method_lookup table should be idempotent.");
+ if (ref_stride != x->search_site_cfg_buf[search_method].stride) {
+ av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
+ ref_stride);
+ }
+
+ return x->search_site_cfg_buf;
+}
+
+static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+ const MV *ref_mv,
+ FULLPEL_MV *best_mv,
+ int *best_motion_err) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
+ int tmp_err;
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+ const int sr = get_search_range(cm->width, cm->height);
+ const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr;
+
+ const search_site_config *first_pass_search_sites =
+ av1_get_first_pass_search_site_config(cpi, x, NSTEP);
+ const int fine_search_interval =
+ cpi->is_screen_content_type && cm->features.allow_intrabc;
+ FULLPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv,
+ start_mv, first_pass_search_sites, NSTEP,
+ fine_search_interval);
+
+ FULLPEL_MV this_best_mv;
+ FULLPEL_MV_STATS best_mv_stats;
+ tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL,
+ &this_best_mv, &best_mv_stats, NULL);
+
+ if (tmp_err < INT_MAX) {
+ aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize];
+ const MSBuffers *ms_buffers = &ms_params.ms_buffers;
+ tmp_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, this_best_mv,
+ &v_fn_ptr, ms_buffers->src, ms_buffers->ref) +
+ new_mv_mode_penalty;
+ }
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ *best_mv = this_best_mv;
+ }
+}
+
+static BLOCK_SIZE get_bsize(const CommonModeInfoParams *const mi_params,
+ const BLOCK_SIZE fp_block_size, const int unit_row,
+ const int unit_col) {
+ const int unit_width = mi_size_wide[fp_block_size];
+ const int unit_height = mi_size_high[fp_block_size];
+ const int is_half_width =
+ unit_width * unit_col + unit_width / 2 >= mi_params->mi_cols;
+ const int is_half_height =
+ unit_height * unit_row + unit_height / 2 >= mi_params->mi_rows;
+ const int max_dimension =
+ AOMMAX(block_size_wide[fp_block_size], block_size_high[fp_block_size]);
+ int square_block_size = 0;
+ // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
+ switch (max_dimension) {
+ case 4: square_block_size = 0; break;
+ case 8: square_block_size = 1; break;
+ case 16: square_block_size = 2; break;
+ case 32: square_block_size = 3; break;
+ case 64: square_block_size = 4; break;
+ case 128: square_block_size = 5; break;
+ default: assert(0 && "First pass block size is not supported!"); break;
+ }
+ if (is_half_width && is_half_height) {
+ return subsize_lookup[PARTITION_SPLIT][square_block_size];
+ } else if (is_half_width) {
+ return subsize_lookup[PARTITION_VERT][square_block_size];
+ } else if (is_half_height) {
+ return subsize_lookup[PARTITION_HORZ][square_block_size];
+ } else {
+ return fp_block_size;
+ }
+}
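+
+// Boundary example (illustrative): for a 16x16 first-pass unit whose right
+// half falls outside the frame, is_half_width is set and the lookup returns
+// the PARTITION_VERT subsize, an 8x16 block; a corner unit that is
+// half-sized in both dimensions maps to the PARTITION_SPLIT subsize, 8x8.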
+
+static int find_fp_qindex(aom_bit_depth_t bit_depth) {
+ return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1);
+}
+
+static double raw_motion_error_stdev(int *raw_motion_err_list,
+ int raw_motion_err_counts) {
+ int64_t sum_raw_err = 0;
+ double raw_err_avg = 0;
+ double raw_err_stdev = 0;
+ if (raw_motion_err_counts == 0) return 0;
+
+ int i;
+ for (i = 0; i < raw_motion_err_counts; i++) {
+ sum_raw_err += raw_motion_err_list[i];
+ }
+ raw_err_avg = (double)sum_raw_err / raw_motion_err_counts;
+ for (i = 0; i < raw_motion_err_counts; i++) {
+ raw_err_stdev += (raw_motion_err_list[i] - raw_err_avg) *
+ (raw_motion_err_list[i] - raw_err_avg);
+ }
+  // Calculate the standard deviation of the motion error over all the inter
+  // blocks with 0,0 motion, using the last source frame as the reference.
+ raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts);
+ return raw_err_stdev;
+}
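+
+// The helper above computes the population standard deviation,
+// sqrt(sum_i (x_i - mean)^2 / N), of the raw 0,0-motion errors gathered
+// across the frame's inter blocks.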
+
+static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) {
+ return oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL;
+}
+typedef struct intra_pred_block_pass1_args {
+ const SequenceHeader *seq_params;
+ MACROBLOCK *x;
+} intra_pred_block_pass1_args;
+
+static INLINE void copy_rect(uint8_t *dst, int dstride, const uint8_t *src,
+ int sstride, int width, int height, int use_hbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) {
+ aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), sstride,
+ CONVERT_TO_SHORTPTR(dst), dstride, width, height);
+ } else {
+ aom_convolve_copy(src, sstride, dst, dstride, width, height);
+ }
+#else
+ (void)use_hbd;
+ aom_convolve_copy(src, sstride, dst, dstride, width, height);
+#endif
+}
+
+static void first_pass_intra_pred_and_calc_diff(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ (void)block;
+ struct intra_pred_block_pass1_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ MACROBLOCK_PLANE *const p = &x->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const SequenceHeader *seq_params = args->seq_params;
+ const int src_stride = p->src.stride;
+ uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
+
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width,
+ pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src,
+ src_stride, dst, dst_stride, blk_col, blk_row, plane);
+
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+}
+
+static void first_pass_predict_intra_block_for_luma_plane(
+ const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = AOM_PLANE_Y;
+ const MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst = pd->dst.buf;
+ const MACROBLOCK_PLANE *const p = &x->plane[plane];
+ const int src_stride = p->src.stride;
+ const uint8_t *src = p->src.buf;
+
+ intra_pred_block_pass1_args args = { seq_params, x };
+ av1_foreach_transformed_block_in_plane(
+ xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args);
+
+  // Copy the source data to the recon buffer, as the recon buffer will be
+  // used as a reference frame subsequently.
+ copy_rect(dst, dst_stride, src, src_stride, block_size_wide[bsize],
+ block_size_high[bsize], seq_params->use_highbitdepth);
+}
+
+#define UL_INTRA_THRESH 50
+#define INVALID_ROW -1
+// Computes and returns the intra pred error of a block.
+// Intra pred error: the sum of squared errors of the intra-predicted
+// residual.
+// Inputs:
+//   cpi: the encoder settings; only a few parameters in it are used.
+//   this_frame: the current frame buffer.
+//   tile: tile information (not used in first pass; already initialized to
+//     zero).
+// unit_row: row index in the unit of first pass block size.
+// unit_col: column index in the unit of first pass block size.
+// y_offset: the offset of y frame buffer, indicating the starting point of
+// the current block.
+// uv_offset: the offset of u and v frame buffer, indicating the starting
+// point of the current block.
+// fp_block_size: first pass block size.
+// qindex: quantization step size to encode the frame.
+// stats: frame encoding stats.
+// Modifies:
+// stats->intra_skip_count
+// stats->image_data_start_row
+// stats->intra_factor
+// stats->brightness_factor
+// stats->intra_error
+// stats->frame_avg_wavelet_energy
+// Returns:
+// this_intra_error.
+static int firstpass_intra_prediction(
+ AV1_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const this_frame,
+ const TileInfo *const tile, const int unit_row, const int unit_col,
+ const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size,
+ const int qindex, FRAME_STATS *const stats) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int unit_scale = mi_size_wide[fp_block_size];
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE bsize =
+ get_bsize(mi_params, fp_block_size, unit_row, unit_col);
+
+ set_mi_offsets(mi_params, xd, unit_row * unit_scale, unit_col * unit_scale);
+ xd->plane[0].dst.buf = this_frame->y_buffer + y_offset;
+ if (num_planes > 1) {
+ xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset;
+ xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset;
+ }
+ xd->left_available = (unit_col != 0);
+ xd->mi[0]->bsize = bsize;
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+ set_mi_row_col(xd, tile, unit_row * unit_scale, mi_size_high[bsize],
+ unit_col * unit_scale, mi_size_wide[bsize], mi_params->mi_rows,
+ mi_params->mi_cols);
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
+ xd->mi[0]->segment_id = 0;
+ xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
+ xd->mi[0]->mode = DC_PRED;
+ xd->mi[0]->tx_size = TX_4X4;
+
+ if (cpi->sf.fp_sf.disable_recon)
+ first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize);
+ else
+ av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
+ int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff);
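+  // Note: SSE scales with the square of the sample scale. A b-bit sample is
+  // (1 << (b - 8)) times its 8-bit equivalent, so the squared error grows by
+  // (1 << (2 * (b - 8))): 16x for 10-bit (hence >> 4) and 256x for 12-bit
+  // (hence >> 8). The shifts below renormalize to the 8-bit scale.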
+ if (seq_params->use_highbitdepth) {
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: this_intra_error >>= 4; break;
+ case AOM_BITS_12: this_intra_error >>= 8; break;
+ default:
+ assert(0 &&
+ "seq_params->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ }
+
+ if (this_intra_error < UL_INTRA_THRESH) {
+ ++stats->intra_skip_count;
+ } else if ((unit_col > 0) && (stats->image_data_start_row == INVALID_ROW)) {
+ stats->image_data_start_row = unit_row;
+ }
+
+ double log_intra = log1p(this_intra_error);
+ if (log_intra < 10.0) {
+ stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+ } else {
+ stats->intra_factor += 1.0;
+ }
+
+ int level_sample;
+ if (seq_params->use_highbitdepth) {
+ level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+ } else {
+ level_sample = x->plane[0].src.buf[0];
+ }
+
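+  // The sample value itself is linear in bit depth, so renormalizing it to
+  // the 8-bit range only needs a shift by (b - 8) bits: 2 for 10-bit, 4 for
+  // 12-bit (contrast with the squared-error shifts above).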
+ if (seq_params->use_highbitdepth) {
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: level_sample >>= 2; break;
+ case AOM_BITS_12: level_sample >>= 4; break;
+ default:
+ assert(0 &&
+ "seq_params->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ }
+ if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
+ stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+ } else {
+ stats->brightness_factor += 1.0;
+ }
+
+ // Intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (e.g. a plain black frame).
+ // We do not have special cases in first pass for 0,0 and nearest etc so
+ // all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
+ // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+ this_intra_error += INTRA_MODE_PENALTY;
+
+ // Accumulate the intra error.
+ stats->intra_error += (int64_t)this_intra_error;
+
+ // Stats based on wavelet energy is used in the following cases :
+ // 1. ML model which predicts if a flat structure (golden-frame only structure
+ // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in
+ // constant quality mode under certain conditions.
+ // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL.
+ // Thus, wavelet energy calculation is enabled for the above cases.
+ if (calc_wavelet_energy(&cpi->oxcf)) {
+ const int hbd = is_cur_buf_hbd(xd);
+ const int stride = x->plane[0].src.stride;
+ const int num_8x8_rows = block_size_high[fp_block_size] / 8;
+ const int num_8x8_cols = block_size_wide[fp_block_size] / 8;
+ const uint8_t *buf = x->plane[0].src.buf;
+ stats->frame_avg_wavelet_energy += av1_haar_ac_sad_mxn_uint8_input(
+ buf, stride, hbd, num_8x8_rows, num_8x8_cols);
+ } else {
+ stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP;
+ }
+
+ return this_intra_error;
+}
+
+// Returns the sum of squared error between the source and reference blocks.
+static int get_prediction_error_bitdepth(const int is_high_bitdepth,
+ const int bitdepth,
+ const BLOCK_SIZE block_size,
+ const struct buf_2d *src,
+ const struct buf_2d *ref) {
+ (void)is_high_bitdepth;
+ (void)bitdepth;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_high_bitdepth) {
+ return highbd_get_prediction_error(block_size, src, ref, bitdepth);
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ return get_prediction_error(block_size, src, ref);
+}
+
+// Accumulates motion vector stats.
+// Modifies member variables of "stats".
+static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv,
+ const int mb_row, const int mb_col,
+ const int mb_rows, const int mb_cols,
+ MV *last_non_zero_mv, FRAME_STATS *stats) {
+ if (is_zero_mv(&best_mv)) return;
+
+ ++stats->mv_count;
+  // Non-zero vector: was it different from the last non-zero vector?
+ if (!is_equal_mv(&best_mv, last_non_zero_mv)) ++stats->new_mv_count;
+ *last_non_zero_mv = best_mv;
+
+ // Does the row vector point inwards or outwards?
+ if (mb_row < mb_rows / 2) {
+ if (mv.row > 0) {
+ --stats->sum_in_vectors;
+ } else if (mv.row < 0) {
+ ++stats->sum_in_vectors;
+ }
+ } else if (mb_row > mb_rows / 2) {
+ if (mv.row > 0) {
+ ++stats->sum_in_vectors;
+ } else if (mv.row < 0) {
+ --stats->sum_in_vectors;
+ }
+ }
+
+ // Does the col vector point inwards or outwards?
+ if (mb_col < mb_cols / 2) {
+ if (mv.col > 0) {
+ --stats->sum_in_vectors;
+ } else if (mv.col < 0) {
+ ++stats->sum_in_vectors;
+ }
+ } else if (mb_col > mb_cols / 2) {
+ if (mv.col > 0) {
+ ++stats->sum_in_vectors;
+ } else if (mv.col < 0) {
+ --stats->sum_in_vectors;
+ }
+ }
+}
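+
+// Note on the scheme above: each non-zero MV casts one row vote and one
+// column vote of +/-1 into sum_in_vectors (blocks exactly on the center row
+// or column abstain), so a single MV contributes at most +/-2. This is why
+// update_firstpass_stats computes
+// mv_in_out_count = sum_in_vectors / (mv_count * 2).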
+
+// Computes and returns the inter prediction error from the last frame.
+// Computes inter prediction errors from the golden and alt ref frames and
+// updates stats accordingly.
+// Inputs:
+// cpi: the encoder setting. Only a few params in it will be used.
+// last_frame: the frame buffer of the last frame.
+// golden_frame: the frame buffer of the golden frame.
+// unit_row: row index in the unit of first pass block size.
+// unit_col: column index in the unit of first pass block size.
+// recon_yoffset: the y offset of the reconstructed frame buffer,
+// indicating the starting point of the current block.
+//   recon_uvoffset: the u/v offset of the reconstructed frame buffer,
+//                   indicating the starting point of the current block.
+// src_yoffset: the y offset of the source frame buffer.
+// fp_block_size: first pass block size.
+// this_intra_error: the intra prediction error of this block.
+//   raw_motion_err_counts: the number of raw motion errors recorded so far
+//                          (the write index into raw_motion_err_list).
+// raw_motion_err_list: the array that records the raw motion error.
+// ref_mv: the reference used to start the motion search
+// best_mv: the best mv found
+// last_non_zero_mv: the last non zero mv found in this tile row.
+// stats: frame encoding stats.
+// Modifies:
+// raw_motion_err_list
+//   best_mv
+//   last_non_zero_mv
+//   stats: many of its member fields.
+// Returns:
+// this_inter_error
+static int firstpass_inter_prediction(
+ AV1_COMP *cpi, ThreadData *td, const YV12_BUFFER_CONFIG *const last_frame,
+ const YV12_BUFFER_CONFIG *const golden_frame, const int unit_row,
+ const int unit_col, const int recon_yoffset, const int recon_uvoffset,
+ const int src_yoffset, const BLOCK_SIZE fp_block_size,
+ const int this_intra_error, const int raw_motion_err_counts,
+ int *raw_motion_err_list, const MV ref_mv, MV *best_mv,
+ MV *last_non_zero_mv, FRAME_STATS *stats) {
+ int this_inter_error = this_intra_error;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int is_high_bitdepth = is_cur_buf_hbd(xd);
+ const int bitdepth = xd->bd;
+ const int unit_scale = mi_size_wide[fp_block_size];
+ const BLOCK_SIZE bsize =
+ get_bsize(mi_params, fp_block_size, unit_row, unit_col);
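+  // First pass blocks are square, so block_size_wide[] also gives the height.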
+ const int fp_block_size_height = block_size_wide[fp_block_size];
+ const int unit_width = mi_size_wide[fp_block_size];
+ const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows);
+ const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols);
+ // Assume 0,0 motion with no mv overhead.
+ FULLPEL_MV mv = kZeroFullMv;
+ xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ av1_set_mv_col_limits(mi_params, &x->mv_limits, unit_col * unit_width,
+ fp_block_size_height >> MI_SIZE_LOG2,
+ cpi->oxcf.border_in_pixels);
+
+ int motion_error =
+ get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
+ &x->plane[0].src, &xd->plane[0].pre[0]);
+
+ // Compute the motion error of the 0,0 motion using the last source
+ // frame as the reference. Skip the further motion search on
+ // reconstructed frame if this error is small.
+  // TODO(chiyotsai): The unscaled last source might have different dimensions
+  // than the current source. See BUG=aomedia:3413
+ struct buf_2d unscaled_last_source_buf_2d;
+ unscaled_last_source_buf_2d.buf =
+ cpi->unscaled_last_source->y_buffer + src_yoffset;
+ unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride;
+ const int raw_motion_error = get_prediction_error_bitdepth(
+ is_high_bitdepth, bitdepth, bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
+ raw_motion_err_list[raw_motion_err_counts] = raw_motion_error;
+ const FIRST_PASS_SPEED_FEATURES *const fp_sf = &cpi->sf.fp_sf;
+
+ if (raw_motion_error > fp_sf->skip_motion_search_threshold) {
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search.
+ first_pass_motion_search(cpi, x, &ref_mv, &mv, &motion_error);
+
+ // If the current best reference mv is not centered on 0,0 then do a
+ // 0,0 based search as well.
+ if ((fp_sf->skip_zeromv_motion_search == 0) && !is_zero_mv(&ref_mv)) {
+ FULLPEL_MV tmp_mv = kZeroFullMv;
+ int tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err);
+
+ if (tmp_err < motion_error) {
+ motion_error = tmp_err;
+ mv = tmp_mv;
+ }
+ }
+ }
+
+ // Motion search in 2nd reference frame.
+ int gf_motion_error = motion_error;
+ if ((current_frame->frame_number > 1) && golden_frame != NULL) {
+ FULLPEL_MV tmp_mv = kZeroFullMv;
+ // Assume 0,0 motion with no mv overhead.
+ av1_setup_pre_planes(xd, 0, golden_frame, 0, 0, NULL, 1);
+ xd->plane[0].pre[0].buf += recon_yoffset;
+ gf_motion_error =
+ get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
+ &x->plane[0].src, &xd->plane[0].pre[0]);
+ first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error);
+ }
+ if (gf_motion_error < motion_error && gf_motion_error < this_intra_error) {
+ ++stats->second_ref_count;
+ }
+  // When accumulating a score for the 2nd reference frame, take the
+  // best of the motion predicted score and the intra coded error
+  // (just as is done when accumulating "coded_error" for the last
+  // frame).
+ if ((current_frame->frame_number > 1) && golden_frame != NULL) {
+ stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error);
+ } else {
+ // TODO(chengchen): I believe logically this should also be changed to
+ // stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error).
+ stats->sr_coded_error += motion_error;
+ }
+
+ // Reset to last frame as reference buffer.
+ xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
+ if (av1_num_planes(&cpi->common) > 1) {
+ xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset;
+ xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset;
+ }
+
+ // Start by assuming that intra mode is best.
+ *best_mv = kZeroMv;
+
+ if (motion_error <= this_intra_error) {
+ // Keep a count of cases where the inter and intra were very close
+ // and very low. This helps with scene cut detection for example in
+ // cropped clips with black bars at the sides or top and bottom.
+ if (((this_intra_error - INTRA_MODE_PENALTY) * 9 <= motion_error * 10) &&
+ (this_intra_error < (2 * INTRA_MODE_PENALTY))) {
+ stats->neutral_count += 1.0;
+ // Also track cases where the intra is not much worse than the inter
+ // and use this in limiting the GF/arf group length.
+ } else if ((this_intra_error > NCOUNT_INTRA_THRESH) &&
+ (this_intra_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+ stats->neutral_count +=
+ (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error);
+ }
+
+ *best_mv = get_mv_from_fullmv(&mv);
+ this_inter_error = motion_error;
+ xd->mi[0]->mode = NEWMV;
+ xd->mi[0]->mv[0].as_mv = *best_mv;
+ xd->mi[0]->tx_size = TX_4X4;
+ xd->mi[0]->ref_frame[0] = LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = NONE_FRAME;
+
+ if (fp_sf->disable_recon == 0) {
+ av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale,
+ unit_col * unit_scale, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ av1_encode_sby_pass1(cpi, x, bsize);
+ }
+ stats->sum_mvr += best_mv->row;
+ stats->sum_mvr_abs += abs(best_mv->row);
+ stats->sum_mvc += best_mv->col;
+ stats->sum_mvc_abs += abs(best_mv->col);
+ stats->sum_mvrs += best_mv->row * best_mv->row;
+ stats->sum_mvcs += best_mv->col * best_mv->col;
+ ++stats->inter_count;
+
+ accumulate_mv_stats(*best_mv, mv, unit_row, unit_col, unit_rows, unit_cols,
+ last_non_zero_mv, stats);
+ }
+
+ return this_inter_error;
+}
+
+// Normalize the first pass stats.
+// Error / counters are normalized to each MB.
+// MVs are normalized to the width/height of the frame.
+static void normalize_firstpass_stats(FIRSTPASS_STATS *fps,
+ double num_mbs_16x16, double f_w,
+ double f_h) {
+ fps->coded_error /= num_mbs_16x16;
+ fps->sr_coded_error /= num_mbs_16x16;
+ fps->intra_error /= num_mbs_16x16;
+ fps->frame_avg_wavelet_energy /= num_mbs_16x16;
+ fps->log_coded_error = log1p(fps->coded_error);
+ fps->log_intra_error = log1p(fps->intra_error);
+ fps->MVr /= f_h;
+ fps->mvr_abs /= f_h;
+ fps->MVc /= f_w;
+ fps->mvc_abs /= f_w;
+ fps->MVrv /= (f_h * f_h);
+ fps->MVcv /= (f_w * f_w);
+ fps->new_mv_count /= num_mbs_16x16;
+}
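+
+// After normalization, e.g. fps->MVr is expressed as a fraction of the frame
+// height and fps->MVrv as a fraction of the squared frame height, which makes
+// the stats comparable across resolutions.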
+
+// Updates the first pass stats of this frame.
+// Input:
+// cpi: the encoder setting. Only a few params in it will be used.
+// stats: stats accumulated for this frame.
+//   raw_err_stdev: the standard deviation of the (0,0) motion error over
+//                  all the inter blocks, using the last source frame as
+//                  the reference.
+// frame_number: current frame number.
+// ts_duration: Duration of the frame / collection of frames.
+// Updates:
+// twopass->total_stats: the accumulated stats.
+// twopass->stats_buf_ctx->stats_in_end: the pointer to the current stats,
+// update its value and its position
+// in the buffer.
+static void update_firstpass_stats(AV1_COMP *cpi,
+ const FRAME_STATS *const stats,
+ const double raw_err_stdev,
+ const int frame_number,
+ const int64_t ts_duration,
+ const BLOCK_SIZE fp_block_size) {
+ TWO_PASS *twopass = &cpi->ppi->twopass;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
+ FIRSTPASS_STATS fps;
+  // The minimum error here ensures some bit allocation to frames even
+ // in static regions. The allocation per MB declines for larger formats
+ // where the typical "real" energy per MB also falls.
+ // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+ // number of mbs is proportional to the image area.
+ const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : mi_params->MBs;
+  // Number of actual units used in the first pass; the unit can be a square
+  // block size other than 16X16.
+ const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16);
+ const double min_err = 200 * sqrt(num_mbs);
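+  // For scale: at 1080p with 16x16 units, num_mbs is roughly 8100, giving a
+  // floor of about 200 * sqrt(8100) = 18000 in the same (>> 8)-scaled error
+  // domain as the accumulated errors below.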
+
+ fps.weight = stats->intra_factor * stats->brightness_factor;
+ fps.frame = frame_number;
+ fps.coded_error = (double)(stats->coded_error >> 8) + min_err;
+ fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err;
+ fps.intra_error = (double)(stats->intra_error >> 8) + min_err;
+ fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy;
+ fps.count = 1.0;
+ fps.pcnt_inter = (double)stats->inter_count / num_mbs;
+ fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs;
+ fps.pcnt_neutral = (double)stats->neutral_count / num_mbs;
+ fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs;
+ fps.inactive_zone_rows = (double)stats->image_data_start_row;
+ fps.inactive_zone_cols = 0.0; // Placeholder: not currently supported.
+ fps.raw_error_stdev = raw_err_stdev;
+ fps.is_flash = 0;
+ fps.noise_var = 0.0;
+ fps.cor_coeff = 1.0;
+ fps.log_coded_error = 0.0;
+ fps.log_intra_error = 0.0;
+
+ if (stats->mv_count > 0) {
+ fps.MVr = (double)stats->sum_mvr / stats->mv_count;
+ fps.mvr_abs = (double)stats->sum_mvr_abs / stats->mv_count;
+ fps.MVc = (double)stats->sum_mvc / stats->mv_count;
+ fps.mvc_abs = (double)stats->sum_mvc_abs / stats->mv_count;
+ fps.MVrv = ((double)stats->sum_mvrs -
+ ((double)stats->sum_mvr * stats->sum_mvr / stats->mv_count)) /
+ stats->mv_count;
+ fps.MVcv = ((double)stats->sum_mvcs -
+ ((double)stats->sum_mvc * stats->sum_mvc / stats->mv_count)) /
+ stats->mv_count;
+ fps.mv_in_out_count = (double)stats->sum_in_vectors / (stats->mv_count * 2);
+ fps.new_mv_count = stats->new_mv_count;
+ fps.pcnt_motion = (double)stats->mv_count / num_mbs;
+ } else {
+ fps.MVr = 0.0;
+ fps.mvr_abs = 0.0;
+ fps.MVc = 0.0;
+ fps.mvc_abs = 0.0;
+ fps.MVrv = 0.0;
+ fps.MVcv = 0.0;
+ fps.mv_in_out_count = 0.0;
+ fps.new_mv_count = 0.0;
+ fps.pcnt_motion = 0.0;
+ }
+
+ // TODO(paulwilkins): Handle the case when duration is set to 0, or
+ // something less than the full time between subsequent values of
+ // cpi->source_time_stamp.
+ fps.duration = (double)ts_duration;
+
+ normalize_firstpass_stats(&fps, num_mbs_16X16, cm->width, cm->height);
+
+ // We will store the stats inside the persistent twopass struct (and NOT the
+ // local variable 'fps'), and then cpi->output_pkt_list will point to it.
+ *this_frame_stats = fps;
+ if (!cpi->ppi->lap_enabled) {
+ output_stats(this_frame_stats, cpi->ppi->output_pkt_list);
+ } else {
+ av1_firstpass_info_push(&twopass->firstpass_info, this_frame_stats);
+ }
+ if (cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL) {
+ av1_accumulate_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, &fps);
+ }
+ twopass->stats_buf_ctx->stats_in_end++;
+  // When ducky encode is on, we always use a linear buffer for stats_buf_ctx.
+  if (cpi->use_ducky_encode == 0) {
+    // TODO(angiebird): Figure out why the first pass uses a circular buffer.
+    /* In the case of two pass encoding, the first pass uses it as a circular
+     * buffer; when LAP is enabled it is used as a linear buffer. */
+ if ((cpi->oxcf.pass == AOM_RC_FIRST_PASS) &&
+ (twopass->stats_buf_ctx->stats_in_end >=
+ twopass->stats_buf_ctx->stats_in_buf_end)) {
+ twopass->stats_buf_ctx->stats_in_end =
+ twopass->stats_buf_ctx->stats_in_start;
+ }
+ }
+}
+
+static void print_reconstruction_frame(
+ const YV12_BUFFER_CONFIG *const last_frame, int frame_number,
+ int do_print) {
+ if (!do_print) return;
+
+ char filename[512];
+ FILE *recon_file;
+ snprintf(filename, sizeof(filename), "enc%04d.yuv", frame_number);
+
+ if (frame_number == 0) {
+ recon_file = fopen(filename, "wb");
+ } else {
+ recon_file = fopen(filename, "ab");
+ }
+
+  // Guard against fopen() failure before writing.
+  if (recon_file != NULL) {
+    fwrite(last_frame->buffer_alloc, last_frame->frame_size, 1, recon_file);
+    fclose(recon_file);
+  }
+}
+
+static FRAME_STATS accumulate_frame_stats(FRAME_STATS *mb_stats, int mb_rows,
+ int mb_cols) {
+ FRAME_STATS stats = { 0 };
+ int i, j;
+
+ stats.image_data_start_row = INVALID_ROW;
+ for (j = 0; j < mb_rows; j++) {
+ for (i = 0; i < mb_cols; i++) {
+ FRAME_STATS mb_stat = mb_stats[j * mb_cols + i];
+ stats.brightness_factor += mb_stat.brightness_factor;
+ stats.coded_error += mb_stat.coded_error;
+ stats.frame_avg_wavelet_energy += mb_stat.frame_avg_wavelet_energy;
+ if (stats.image_data_start_row == INVALID_ROW &&
+ mb_stat.image_data_start_row != INVALID_ROW) {
+ stats.image_data_start_row = mb_stat.image_data_start_row;
+ }
+ stats.inter_count += mb_stat.inter_count;
+ stats.intra_error += mb_stat.intra_error;
+ stats.intra_factor += mb_stat.intra_factor;
+ stats.intra_skip_count += mb_stat.intra_skip_count;
+ stats.mv_count += mb_stat.mv_count;
+ stats.neutral_count += mb_stat.neutral_count;
+ stats.new_mv_count += mb_stat.new_mv_count;
+ stats.second_ref_count += mb_stat.second_ref_count;
+ stats.sr_coded_error += mb_stat.sr_coded_error;
+ stats.sum_in_vectors += mb_stat.sum_in_vectors;
+ stats.sum_mvc += mb_stat.sum_mvc;
+ stats.sum_mvc_abs += mb_stat.sum_mvc_abs;
+ stats.sum_mvcs += mb_stat.sum_mvcs;
+ stats.sum_mvr += mb_stat.sum_mvr;
+ stats.sum_mvr_abs += mb_stat.sum_mvr_abs;
+ stats.sum_mvrs += mb_stat.sum_mvrs;
+ }
+ }
+ return stats;
+}
+
+static void setup_firstpass_data(AV1_COMMON *const cm,
+ FirstPassData *firstpass_data,
+ const int unit_rows, const int unit_cols) {
+ CHECK_MEM_ERROR(cm, firstpass_data->raw_motion_err_list,
+ aom_calloc(unit_rows * unit_cols,
+ sizeof(*firstpass_data->raw_motion_err_list)));
+ CHECK_MEM_ERROR(
+ cm, firstpass_data->mb_stats,
+ aom_calloc(unit_rows * unit_cols, sizeof(*firstpass_data->mb_stats)));
+ for (int j = 0; j < unit_rows; j++) {
+ for (int i = 0; i < unit_cols; i++) {
+ firstpass_data->mb_stats[j * unit_cols + i].image_data_start_row =
+ INVALID_ROW;
+ }
+ }
+}
+
+void av1_free_firstpass_data(FirstPassData *firstpass_data) {
+ aom_free(firstpass_data->raw_motion_err_list);
+ firstpass_data->raw_motion_err_list = NULL;
+ aom_free(firstpass_data->mb_stats);
+ firstpass_data->mb_stats = NULL;
+}
+
+int av1_get_unit_rows_in_tile(const TileInfo *tile,
+ const BLOCK_SIZE fp_block_size) {
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ const int mi_rows = tile->mi_row_end - tile->mi_row_start;
+ const int unit_rows = CEIL_POWER_OF_TWO(mi_rows, unit_height_log2);
+
+ return unit_rows;
+}
+
+int av1_get_unit_cols_in_tile(const TileInfo *tile,
+ const BLOCK_SIZE fp_block_size) {
+ const int unit_width_log2 = mi_size_wide_log2[fp_block_size];
+ const int mi_cols = tile->mi_col_end - tile->mi_col_start;
+ const int unit_cols = CEIL_POWER_OF_TWO(mi_cols, unit_width_log2);
+
+ return unit_cols;
+}
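+
+// Example: for BLOCK_16X16 first pass units (mi_size_high_log2 ==
+// mi_size_wide_log2 == 2), a tile spanning 17 MI rows yields
+// CEIL_POWER_OF_TWO(17, 2) == ceil(17 / 4) == 5 unit rows; a partial unit at
+// the tile edge counts as a full first-pass unit.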
+
+#define FIRST_PASS_ALT_REF_DISTANCE 16
+static void first_pass_tile(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data,
+ const BLOCK_SIZE fp_block_size) {
+ TileInfo *tile = &tile_data->tile_info;
+ const int unit_height = mi_size_high[fp_block_size];
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ for (int mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+ mi_row += unit_height) {
+ av1_first_pass_row(cpi, td, tile_data, mi_row >> unit_height_log2,
+ fp_block_size);
+ }
+}
+
+static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+
+ av1_alloc_src_diff_buf(cm, &cpi->td.mb);
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ first_pass_tile(cpi, &cpi->td, tile_data, fp_block_size);
+ }
+ }
+}
+
+void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+ const int unit_row, const BLOCK_SIZE fp_block_size) {
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo *tile = &tile_data->tile_info;
+ const int qindex = find_fp_qindex(seq_params->bit_depth);
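+  // First pass blocks are square, so block_size_high[] and block_size_wide[]
+  // are interchangeable in the unit dimensions below.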
+ const int fp_block_size_width = block_size_high[fp_block_size];
+ const int fp_block_size_height = block_size_wide[fp_block_size];
+ const int unit_width = mi_size_wide[fp_block_size];
+ const int unit_width_log2 = mi_size_wide_log2[fp_block_size];
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ const int unit_cols = mi_params->mb_cols * 4 / unit_width;
+ int raw_motion_err_counts = 0;
+ int unit_row_in_tile = unit_row - (tile->mi_row_start >> unit_height_log2);
+ int unit_col_start = tile->mi_col_start >> unit_width_log2;
+ int unit_cols_in_tile = av1_get_unit_cols_in_tile(tile, fp_block_size);
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+
+ const YV12_BUFFER_CONFIG *last_frame =
+ av1_get_scaled_ref_frame(cpi, LAST_FRAME);
+ if (!last_frame) {
+ last_frame = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ }
+ const YV12_BUFFER_CONFIG *golden_frame =
+ av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+ if (!golden_frame) {
+ golden_frame = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ }
+ YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
+
+ PICK_MODE_CONTEXT *ctx = td->firstpass_ctx;
+ FRAME_STATS *mb_stats =
+ cpi->firstpass_data.mb_stats + unit_row * unit_cols + unit_col_start;
+ int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list +
+ unit_row * unit_cols + unit_col_start;
+ MV *first_top_mv = &tile_data->firstpass_top_mv;
+
+ for (int i = 0; i < num_planes; ++i) {
+ x->plane[i].coeff = ctx->coeff[i];
+ x->plane[i].qcoeff = ctx->qcoeff[i];
+ x->plane[i].eobs = ctx->eobs[i];
+ x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ x->plane[i].dqcoeff = ctx->dqcoeff[i];
+ }
+
+ const int src_y_stride = cpi->source->y_stride;
+ const int recon_y_stride = this_frame->y_stride;
+ const int recon_uv_stride = this_frame->uv_stride;
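+  // The chroma unit height is halved when the chroma planes are vertically
+  // subsampled (uv_height < y_height).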
+ const int uv_mb_height =
+ fp_block_size_height >> (this_frame->y_height > this_frame->uv_height);
+
+ MV best_ref_mv = kZeroMv;
+ MV last_mv;
+
+ // Reset above block coeffs.
+ xd->up_available = (unit_row_in_tile != 0);
+ int recon_yoffset = (unit_row * recon_y_stride * fp_block_size_height) +
+ (unit_col_start * fp_block_size_width);
+ int src_yoffset = (unit_row * src_y_stride * fp_block_size_height) +
+ (unit_col_start * fp_block_size_width);
+ int recon_uvoffset = (unit_row * recon_uv_stride * uv_mb_height) +
+ (unit_col_start * uv_mb_height);
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ av1_set_mv_row_limits(
+ mi_params, &x->mv_limits, (unit_row << unit_height_log2),
+ (fp_block_size_height >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels);
+
+ av1_setup_src_planes(x, cpi->source, unit_row << unit_height_log2,
+ tile->mi_col_start, num_planes, fp_block_size);
+
+  // Zero the 16x16 src_diff block first. This ensures a correct
+  // this_intra_error for block sizes smaller than 16x16.
+ av1_zero_array(x->plane[0].src_diff, 256);
+
+ for (int unit_col_in_tile = 0; unit_col_in_tile < unit_cols_in_tile;
+ unit_col_in_tile++) {
+ const int unit_col = unit_col_start + unit_col_in_tile;
+
+ enc_row_mt->sync_read_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->ppi->p_mt_info.num_workers > 1) {
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (firstpass_mt_exit) return;
+ }
+#endif
+
+ if (unit_col_in_tile == 0) {
+ last_mv = *first_top_mv;
+ }
+ int this_intra_error = firstpass_intra_prediction(
+ cpi, td, this_frame, tile, unit_row, unit_col, recon_yoffset,
+ recon_uvoffset, fp_block_size, qindex, mb_stats);
+
+ if (!frame_is_intra_only(cm)) {
+ const int this_inter_error = firstpass_inter_prediction(
+ cpi, td, last_frame, golden_frame, unit_row, unit_col, recon_yoffset,
+ recon_uvoffset, src_yoffset, fp_block_size, this_intra_error,
+ raw_motion_err_counts, raw_motion_err_list, best_ref_mv, &best_ref_mv,
+ &last_mv, mb_stats);
+ if (unit_col_in_tile == 0) {
+ *first_top_mv = last_mv;
+ }
+ mb_stats->coded_error += this_inter_error;
+ ++raw_motion_err_counts;
+ } else {
+ mb_stats->sr_coded_error += this_intra_error;
+ mb_stats->coded_error += this_intra_error;
+ }
+
+ // Adjust to the next column of MBs.
+ x->plane[0].src.buf += fp_block_size_width;
+ if (num_planes > 1) {
+ x->plane[1].src.buf += uv_mb_height;
+ x->plane[2].src.buf += uv_mb_height;
+ }
+
+ recon_yoffset += fp_block_size_width;
+ src_yoffset += fp_block_size_width;
+ recon_uvoffset += uv_mb_height;
+ mb_stats++;
+
+ enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile,
+ unit_cols_in_tile);
+ }
+}
+
+void av1_noop_first_pass_frame(AV1_COMP *cpi, const int64_t ts_duration) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ int max_mb_rows = mi_params->mb_rows;
+ int max_mb_cols = mi_params->mb_cols;
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) {
+ int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width);
+ max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2);
+ }
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) {
+ int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height);
+ max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2);
+ }
+ const int unit_rows = get_unit_rows(BLOCK_16X16, max_mb_rows);
+ const int unit_cols = get_unit_cols(BLOCK_16X16, max_mb_cols);
+ setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
+ FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
+ FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
+ av1_free_firstpass_data(&cpi->firstpass_data);
+ update_firstpass_stats(cpi, &stats, 1.0, current_frame->frame_number,
+ ts_duration, BLOCK_16X16);
+}
+
+void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int qindex = find_fp_qindex(seq_params->bit_depth);
+ const int ref_frame_flags_backup = cpi->ref_frame_flags;
+ cpi->ref_frame_flags = av1_ref_frame_flag_list[LAST_FRAME] |
+ av1_ref_frame_flag_list[GOLDEN_FRAME];
+
+ // Detect if the key frame is screen content type.
+ if (frame_is_intra_only(cm)) {
+ FeatureFlags *const features = &cm->features;
+ assert(cpi->source != NULL);
+ xd->cur_buf = cpi->source;
+ av1_set_screen_content_options(cpi, features);
+ }
+
+ // Prepare the speed features
+ av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+
+ // Unit size for the first pass encoding.
+ const BLOCK_SIZE fp_block_size =
+ get_fp_block_size(cpi->is_screen_content_type);
+
+ int max_mb_rows = mi_params->mb_rows;
+ int max_mb_cols = mi_params->mb_cols;
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) {
+ int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width);
+ max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2);
+ }
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) {
+ int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height);
+ max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2);
+ }
+
+ // Number of rows in the unit size.
+ // Note max_mb_rows and max_mb_cols are in the unit of 16x16.
+ const int unit_rows = get_unit_rows(fp_block_size, max_mb_rows);
+ const int unit_cols = get_unit_cols(fp_block_size, max_mb_cols);
+
+ // Set fp_block_size, for the convenience of multi-thread usage.
+ cpi->fp_block_size = fp_block_size;
+
+ setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
+ int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list;
+ FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
+
+ // multi threading info
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ if (cpi->allocated_tiles < tile_cols * tile_rows) {
+ av1_alloc_tile_data(cpi);
+ }
+
+ av1_init_tile_data(cpi);
+
+ const YV12_BUFFER_CONFIG *last_frame = NULL;
+ const YV12_BUFFER_CONFIG *golden_frame = NULL;
+ if (!frame_is_intra_only(cm)) {
+ av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
+ last_frame = av1_is_scaled(get_ref_scale_factors_const(cm, LAST_FRAME))
+ ? av1_get_scaled_ref_frame(cpi, LAST_FRAME)
+ : get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ golden_frame = av1_is_scaled(get_ref_scale_factors_const(cm, GOLDEN_FRAME))
+ ? av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME)
+ : get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ }
+
+ YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
+ // First pass code requires valid last and new frame buffers.
+ assert(this_frame != NULL);
+ assert(frame_is_intra_only(cm) || (last_frame != NULL));
+
+ av1_setup_frame_size(cpi);
+ av1_set_mv_search_params(cpi);
+
+ set_mi_offsets(mi_params, xd, 0, 0);
+ xd->mi[0]->bsize = fp_block_size;
+
+ // Do not use periodic key frames.
+ cpi->rc.frames_to_key = INT_MAX;
+
+ av1_set_quantizer(
+ cm, cpi->oxcf.q_cfg.qm_minlevel, cpi->oxcf.q_cfg.qm_maxlevel, qindex,
+ cpi->oxcf.q_cfg.enable_chroma_deltaq, cpi->oxcf.q_cfg.enable_hdr_deltaq);
+
+ av1_setup_block_planes(xd, seq_params->subsampling_x,
+ seq_params->subsampling_y, num_planes);
+
+ av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, fp_block_size);
+ av1_setup_dst_planes(xd->plane, seq_params->sb_size, this_frame, 0, 0, 0,
+ num_planes);
+
+ if (!frame_is_intra_only(cm)) {
+ av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes);
+ }
+
+ set_mi_offsets(mi_params, xd, 0, 0);
+
+  // Don't store luma in the first pass since chroma is not computed.
+ xd->cfl.store_y = 0;
+ av1_frame_init_quantizer(cpi);
+
+ av1_default_coef_probs(cm);
+ av1_init_mode_probs(cm->fc);
+ av1_init_mv_probs(cm);
+ av1_initialize_rd_consts(cpi);
+
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
+
+ if (mt_info->num_workers > 1) {
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
+ av1_fp_encode_tiles_row_mt(cpi);
+ } else {
+ first_pass_tiles(cpi, fp_block_size);
+ }
+
+ FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
+ int total_raw_motion_err_count =
+ frame_is_intra_only(cm) ? 0 : unit_rows * unit_cols;
+ const double raw_err_stdev =
+ raw_motion_error_stdev(raw_motion_err_list, total_raw_motion_err_count);
+ av1_free_firstpass_data(&cpi->firstpass_data);
+ av1_dealloc_src_diff_buf(&cpi->td.mb, av1_num_planes(cm));
+
+  // Clamp the image start to rows/2. This number of rows is discarded top
+  // and bottom as dead data, so rows / 2 means the frame is blank.
+ if ((stats.image_data_start_row > unit_rows / 2) ||
+ (stats.image_data_start_row == INVALID_ROW)) {
+ stats.image_data_start_row = unit_rows / 2;
+ }
+ // Exclude any image dead zone
+ if (stats.image_data_start_row > 0) {
+ stats.intra_skip_count =
+ AOMMAX(0, stats.intra_skip_count -
+ (stats.image_data_start_row * unit_cols * 2));
+ }
+
+ TWO_PASS *twopass = &cpi->ppi->twopass;
+ const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : mi_params->MBs;
+  // Number of actual units used in the first pass; the unit can be a square
+  // block size other than 16X16.
+ const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16);
+ stats.intra_factor = stats.intra_factor / (double)num_mbs;
+ stats.brightness_factor = stats.brightness_factor / (double)num_mbs;
+ FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
+ update_firstpass_stats(cpi, &stats, raw_err_stdev,
+ current_frame->frame_number, ts_duration,
+ fp_block_size);
+
+ // Copy the previous Last Frame back into gf buffer if the prediction is good
+ // enough... but also don't allow it to lag too far.
+ if ((twopass->sr_update_lag > 3) ||
+ ((current_frame->frame_number > 0) &&
+ (this_frame_stats->pcnt_inter > 0.20) &&
+ ((this_frame_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame_stats->coded_error)) > 2.0))) {
+ if (golden_frame != NULL) {
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
+ cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
+ }
+ twopass->sr_update_lag = 1;
+ } else {
+ ++twopass->sr_update_lag;
+ }
+
+ aom_extend_frame_borders(this_frame, num_planes);
+
+ // The frame we just compressed now becomes the last frame.
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame);
+
+ // Special case for the first frame. Copy into the GF buffer as a second
+ // reference.
+ if (current_frame->frame_number == 0 &&
+ get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) {
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
+ cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
+ }
+
+ print_reconstruction_frame(last_frame, current_frame->frame_number,
+ /*do_print=*/0);
+
+ ++current_frame->frame_number;
+ cpi->ref_frame_flags = ref_frame_flags_backup;
+ if (!frame_is_intra_only(cm)) {
+ release_scaled_references(cpi);
+ }
+}
+
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+ FIRSTPASS_STATS *ext_stats_buf,
+ int ext_stats_buf_size) {
+ assert(IMPLIES(ext_stats_buf == NULL, ext_stats_buf_size == 0));
+ if (ext_stats_buf == NULL) {
+ firstpass_info->stats_buf = firstpass_info->static_stats_buf;
+ firstpass_info->stats_buf_size =
+ sizeof(firstpass_info->static_stats_buf) /
+ sizeof(firstpass_info->static_stats_buf[0]);
+ firstpass_info->start_index = 0;
+ firstpass_info->cur_index = 0;
+ firstpass_info->stats_count = 0;
+ firstpass_info->future_stats_count = 0;
+ firstpass_info->past_stats_count = 0;
+ av1_zero(firstpass_info->total_stats);
+ if (ext_stats_buf_size == 0) {
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ firstpass_info->stats_buf = ext_stats_buf;
+ firstpass_info->stats_buf_size = ext_stats_buf_size;
+ firstpass_info->start_index = 0;
+ firstpass_info->cur_index = 0;
+ firstpass_info->stats_count = firstpass_info->stats_buf_size;
+ firstpass_info->future_stats_count = firstpass_info->stats_count;
+ firstpass_info->past_stats_count = 0;
+ av1_zero(firstpass_info->total_stats);
+ for (int i = 0; i < firstpass_info->stats_count; ++i) {
+ av1_accumulate_stats(&firstpass_info->total_stats,
+ &firstpass_info->stats_buf[i]);
+ }
+ }
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+ FIRSTPASS_INFO *firstpass_info) {
+ assert(firstpass_info->future_stats_count +
+ firstpass_info->past_stats_count ==
+ firstpass_info->stats_count);
+ if (firstpass_info->future_stats_count > 1) {
+ firstpass_info->cur_index =
+ (firstpass_info->cur_index + 1) % firstpass_info->stats_buf_size;
+ --firstpass_info->future_stats_count;
+ ++firstpass_info->past_stats_count;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+}
+
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info) {
+ if (firstpass_info->stats_count > 0 && firstpass_info->past_stats_count > 0) {
+ const int next_start =
+ (firstpass_info->start_index + 1) % firstpass_info->stats_buf_size;
+ firstpass_info->start_index = next_start;
+ --firstpass_info->stats_count;
+ --firstpass_info->past_stats_count;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+}
+
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+ FIRSTPASS_INFO *firstpass_info) {
+ aom_codec_err_t ret = av1_firstpass_info_move_cur_index(firstpass_info);
+ if (ret != AOM_CODEC_OK) return ret;
+ ret = av1_firstpass_info_pop(firstpass_info);
+ return ret;
+}
+
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+ const FIRSTPASS_STATS *input_stats) {
+ if (firstpass_info->stats_count < firstpass_info->stats_buf_size) {
+ const int next_index =
+ (firstpass_info->start_index + firstpass_info->stats_count) %
+ firstpass_info->stats_buf_size;
+ firstpass_info->stats_buf[next_index] = *input_stats;
+ ++firstpass_info->stats_count;
+ ++firstpass_info->future_stats_count;
+ av1_accumulate_stats(&firstpass_info->total_stats, input_stats);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+}
+
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+ const FIRSTPASS_INFO *firstpass_info, int offset_from_cur) {
+ if (offset_from_cur >= -firstpass_info->past_stats_count &&
+ offset_from_cur < firstpass_info->future_stats_count) {
+ const int index = (firstpass_info->cur_index + offset_from_cur) %
+ firstpass_info->stats_buf_size;
+ return &firstpass_info->stats_buf[index];
+ } else {
+ return NULL;
+ }
+}
+
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+ int offset_from_cur) {
+ if (offset_from_cur < firstpass_info->future_stats_count) {
+ return firstpass_info->future_stats_count - offset_from_cur;
+ }
+ return 0;
+}
+
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+ int offset_from_cur) {
+ if (offset_from_cur >= -firstpass_info->past_stats_count) {
+ return offset_from_cur + firstpass_info->past_stats_count;
+ }
+ return 0;
+}
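+
+// Illustrative usage of the FIRSTPASS_INFO ring buffer API above (a sketch,
+// not code called by the encoder):
+//
+//   FIRSTPASS_INFO info;
+//   av1_firstpass_info_init(&info, NULL, 0);   // use the static buffer
+//   FIRSTPASS_STATS s = { 0 };
+//   av1_firstpass_info_push(&info, &s);        // stats_count = 1
+//   av1_firstpass_info_push(&info, &s);        // stats_count = 2
+//   const FIRSTPASS_STATS *cur = av1_firstpass_info_peek(&info, 0);
+//   (void)cur;                                 // current frame's stats
+//   av1_firstpass_info_move_cur_index(&info);  // 1 past, 1 future
+//   av1_firstpass_info_pop(&info);             // retire the past entry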
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
new file mode 100644
index 0000000000..d01363a80e
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_
+#define AOM_AV1_ENCODER_FIRSTPASS_H_
+
+#include <stdbool.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
+
+#define MIN_ZERO_MOTION 0.95
+#define MAX_SR_CODED_ERROR 40
+#define MAX_RAW_ERR_VAR 2000
+#define MIN_MV_IN_OUT 0.4
+
+#define VLOW_MOTION_THRESHOLD 950
+struct ThreadData;
+
+/*!
+ * \brief The structure of accumulated frame stats in the first pass.
+ *
+ * Errors (coded_error, intra_error, etc.) and counters (new_mv_count) are
+ * normalized to each MB. MV related stats (MVc, MVr, etc.) are normalized to
+ * the frame width and height. See function normalize_firstpass_stats.
+ */
+typedef struct FIRSTPASS_STATS {
+ /*!
+ * Frame number in display order, if stats are for a single frame.
+ * No real meaning for a collection of frames.
+ */
+ double frame;
+ /*!
+ * Weight assigned to this frame (or total weight for the collection of
+ * frames) currently based on intra factor and brightness factor. This is used
+   * to distribute bits between easier and harder frames.
+ */
+ double weight;
+ /*!
+ * Intra prediction error.
+ */
+ double intra_error;
+ /*!
+ * Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+ */
+ double frame_avg_wavelet_energy;
+ /*!
+ * Best of intra pred error and inter pred error using last frame as ref.
+ */
+ double coded_error;
+ /*!
+ * Best of intra pred error and inter pred error using golden frame as ref.
+ */
+ double sr_coded_error;
+ /*!
+ * Percentage of blocks with inter pred error < intra pred error.
+ */
+ double pcnt_inter;
+ /*!
+ * Percentage of blocks using (inter prediction and) non-zero motion vectors.
+ */
+ double pcnt_motion;
+ /*!
+ * Percentage of blocks where golden frame was better than last or intra:
+ * inter pred error using golden frame < inter pred error using last frame and
+ * inter pred error using golden frame < intra pred error
+ */
+ double pcnt_second_ref;
+ /*!
+ * Percentage of blocks where intra and inter prediction errors were very
+   * close. Note that this is a 'weighted count'; that is, blocks may be
+ * weighted by how close the two errors were.
+ */
+ double pcnt_neutral;
+ /*!
+ * Percentage of blocks that have almost no intra error residual
+ * (i.e. are in effect completely flat and untextured in the intra
+ * domain). In natural videos this is uncommon, but it is much more
+ * common in animations, graphics and screen content, so may be used
+ * as a signal to detect these types of content.
+ */
+ double intra_skip_pct;
+ /*!
+ * Image mask rows top and bottom.
+ */
+ double inactive_zone_rows;
+ /*!
+ * Image mask columns at left and right edges.
+ */
+ double inactive_zone_cols;
+ /*!
+ * Average of row motion vectors.
+ */
+ double MVr;
+ /*!
+ * Mean of absolute value of row motion vectors.
+ */
+ double mvr_abs;
+ /*!
+ * Mean of column motion vectors.
+ */
+ double MVc;
+ /*!
+ * Mean of absolute value of column motion vectors.
+ */
+ double mvc_abs;
+ /*!
+ * Variance of row motion vectors.
+ */
+ double MVrv;
+ /*!
+ * Variance of column motion vectors.
+ */
+ double MVcv;
+ /*!
+   * Value in range [-1,1] indicating the fraction of row and column motion
+   * vectors that point inwards (negative MV value) or outwards (positive MV
+   * value); see accumulate_mv_stats() in firstpass.c for the exact sign
+   * convention. A magnitude of 1 indicates that all row/column MVs point the
+   * same way.
+ */
+ double mv_in_out_count;
+ /*!
+ * Count of unique non-zero motion vectors.
+ */
+ double new_mv_count;
+ /*!
+ * Duration of the frame / collection of frames.
+ */
+ double duration;
+ /*!
+ * 1.0 if stats are for a single frame, OR
+ * Number of frames in this collection for which the stats are accumulated.
+ */
+ double count;
+ /*!
+ * standard deviation for (0, 0) motion prediction error
+ */
+ double raw_error_stdev;
+ /*!
+ * Whether the frame contains a flash
+ */
+ int64_t is_flash;
+ /*!
+ * Estimated noise variance
+ */
+ double noise_var;
+ /*!
+ * Correlation coefficient with the previous frame
+ */
+ double cor_coeff;
+ /*!
+ * log of intra_error
+ */
+ double log_intra_error;
+ /*!
+ * log of coded_error
+ */
+ double log_coded_error;
+} FIRSTPASS_STATS;
+
+// We want to keep one past stats entry for key frame detection
+// in test_candidate_kf().
+#define FIRSTPASS_INFO_STATS_PAST_MIN 1
+
+// The size of static buffer used in FIRSTPASS_INFO.
+#define FIRSTPASS_INFO_STATIC_BUF_SIZE \
+ (MAX_LAP_BUFFERS + FIRSTPASS_INFO_STATS_PAST_MIN)
+
+/*!
+ * \brief Data structure used for managing first pass stats
+ */
+typedef struct {
+ /*!
+ * A static buffer that will be used when no ext_stats_buf is assigned. The
+ * ext_stats_buf is assigned through av1_firstpass_info_init() when the user
+   * already has pre-existing firstpass stats stored in an external buffer.
+   * The ext_stats_buf is usually used in two pass mode. When using one pass
+   * mode, we generate "firstpass" stats and encode the video in the same
+   * pass. In this scenario, the stats are pushed to and popped from
+   * static_stats_buf.
+ */
+ FIRSTPASS_STATS static_stats_buf[FIRSTPASS_INFO_STATIC_BUF_SIZE];
+ /*!
+ * A pointer to first pass stats.
+ * Note that this buffer will be used as ring buffer.
+ */
+ FIRSTPASS_STATS *stats_buf;
+ /*!
+ * size of stats_buf
+ */
+ int stats_buf_size;
+ /*!
+   * Start index of the available frame stats.
+   * Note that start_index doesn't always point to the current frame's
+   * stats, because past stats are kept as well. To access the current
+   * frame's stats, use cur_index.
+ */
+ int start_index;
+
+ /*!
+   * Count of the available stats stored in stats_buf.
+   * The following condition always holds:
+   * stats_count == future_stats_count + past_stats_count
+ */
+ int stats_count;
+
+ /*!
+   * Index of the current frame's stats.
+ */
+ int cur_index;
+
+ /*!
+   * Count of the available future stats, including the current stats.
+ */
+ int future_stats_count;
+
+ /*!
+   * Count of the available past stats, EXCLUDING the current stats.
+ */
+ int past_stats_count;
+
+ /*!
+ * Accumulation of the stats being pushed into firstpass_info
+ */
+ FIRSTPASS_STATS total_stats;
+} FIRSTPASS_INFO;
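+
+// Invariants maintained by the push/move/pop operations in firstpass.c (the
+// first is asserted in av1_firstpass_info_move_cur_index(); the second is
+// implied by how the indices are updated):
+//   stats_count == future_stats_count + past_stats_count
+//   cur_index == (start_index + past_stats_count) % stats_buf_size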
+
+/*!\brief Init firstpass_info
+ *
+ * If using ext_stats_buf, the buffer needs to stay available during encoding
+ * process.
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \param[in]    ext_stats_buf      external stats buffer. Pass in NULL to
+ *                                  use the internal static_stats_buf.
+ * \param[in]    ext_stats_buf_size external stats buffer size. Pass in 0 to
+ *                                  use the internal static_stats_buf.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+ FIRSTPASS_STATS *ext_stats_buf,
+ int ext_stats_buf_size);
+
+/*!\brief Move cur_index by 1
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+ FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Pop a stats entry from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Move cur_index by 1 and pop a stats entry from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+ FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Push a stats entry into firstpass_info
+ *
+ * Note that the input stats will be copied into firstpass_info.
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \param[in] input_stats input stats
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+ const FIRSTPASS_STATS *input_stats);
+
+/*!\brief Peek at a stats entry in firstpass_info
+ *
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in] firstpass_info struct of firstpass_info.
+ * \param[in] offset_from_cur index offset from cur_index.
+ * \return pointer to the stats. The pointer will be NULL if
+ *         offset_from_cur is invalid.
+ */
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+ const FIRSTPASS_INFO *firstpass_info, int offset_from_cur);
+
+/*!\brief Count the future stats from the target in firstpass_info
+ * Note that the target stats will be counted as well.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in] firstpass_info struct of firstpass_info.
+ * \param[in]  offset_from_cur   target stats's index offset
+ *                               from cur_index.
+ * \return Number of stats in the future after the target stats
+ * including itself.
+ */
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+ int offset_from_cur);
+
+/*!\brief Count the past stats before the target in firstpass_info
+ * Note that the target stats will NOT be counted.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in] firstpass_info struct of firstpass_info.
+ * \param[in] offset_from_cur target stats's index offset
+ * from cur_index.
+ * \return Number of stats in the past before the target stats
+ * excluding itself.
+ */
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+ int offset_from_cur);
+
+/*!\cond */
+#define FC_ANIMATION_THRESH 0.15
+enum {
+ FC_NORMAL = 0,
+ FC_GRAPHICS_ANIMATION = 1,
+ FRAME_CONTENT_TYPES = 2
+} UENUM1BYTE(FRAME_CONTENT_TYPE);
+/*!\endcond */
+
+/*!
+ * \brief Data related to the current GF/ARF group and the
+ * individual frames within the group
+ */
+typedef struct GF_GROUP {
+ /*!\cond */
+ // Frame update type, e.g. ARF/GF/LF/Overlay
+ FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH];
+ unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+ // The number of frames displayed so far within the GOP at a given coding
+ // frame.
+ unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH];
+ int layer_depth[MAX_STATIC_GF_GROUP_LENGTH];
+ int arf_boost[MAX_STATIC_GF_GROUP_LENGTH];
+ int max_layer_depth;
+ int max_layer_depth_allowed;
+ // This is currently only populated for AOM_Q mode
+ int q_val[MAX_STATIC_GF_GROUP_LENGTH];
+ int rdmult_val[MAX_STATIC_GF_GROUP_LENGTH];
+ int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH];
+ // The frame coding type - inter/intra frame
+ FRAME_TYPE frame_type[MAX_STATIC_GF_GROUP_LENGTH];
+ // The reference frame buffer control - update or reset
+ REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH];
+  int arf_index;  // The index of the ARF in the GF group; -1 if there is no ARF.
+ int size; // The total length of a GOP
+
+ // The offset into lookahead_ctx for choosing
+ // source of frame parallel encodes.
+ int src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+ // Stores the display order hint of each frame in the current GF_GROUP.
+ int display_idx[MAX_STATIC_GF_GROUP_LENGTH];
+
+ // The reference frame list maps the reference frame indexes to its
+ // buffer index in the decoded buffer. A value of -1 means the
+ // corresponding reference frame index doesn't point towards any
+ // previously decoded frame.
+ int8_t ref_frame_list[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+ // Update frame index
+ int update_ref_idx[MAX_STATIC_GF_GROUP_LENGTH];
+ // The map_idx of primary reference
+ int primary_ref_idx[MAX_STATIC_GF_GROUP_LENGTH];
+
+ // Indicates the level of parallelism in frame parallel encodes.
+ // 0 : frame is independently encoded (not part of parallel encodes).
+ // 1 : frame is the first in encode order in a given parallel encode set.
+ // 2 : frame occurs later in encode order in a given parallel encode set.
+ int frame_parallel_level[MAX_STATIC_GF_GROUP_LENGTH];
+ // Indicates whether a frame should act as non-reference frame.
+ bool is_frame_non_ref[MAX_STATIC_GF_GROUP_LENGTH];
+ // Indicates whether a frame is dropped.
+ bool is_frame_dropped[MAX_STATIC_GF_GROUP_LENGTH];
+
+ // Stores the display order hint of the frames not to be
+ // refreshed by the current frame.
+ int skip_frame_refresh[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+ // Stores the display order hint of the frame to be excluded during reference
+ // assignment.
+ int skip_frame_as_ref[MAX_STATIC_GF_GROUP_LENGTH];
+ /*!\endcond */
+} GF_GROUP;
+/*!\cond */
+
+typedef struct {
+ // Track if the last frame in a GOP has higher quality.
+ int arf_gf_boost_lst;
+} GF_STATE;
+
+typedef struct {
+ FIRSTPASS_STATS *stats_in_start;
+ FIRSTPASS_STATS *stats_in_end;
+ FIRSTPASS_STATS *stats_in_buf_end;
+ FIRSTPASS_STATS *total_stats;
+ FIRSTPASS_STATS *total_left_stats;
+} STATS_BUFFER_CTX;
+
+/*!\endcond */
+
+/*!
+ * \brief Two pass status and control data.
+ */
+typedef struct {
+ /*!\cond */
+ unsigned int section_intra_rating;
+ // Circular queue of first pass stats stored for most recent frames.
+ // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored
+ // here.
+ FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1];
+ int frame_stats_next_idx; // Index to next unused element in frame_stats_arr.
+ STATS_BUFFER_CTX *stats_buf_ctx;
+ FIRSTPASS_INFO firstpass_info; // This is the first pass data structure
+ // intended to replace stats_in
+ int first_pass_done;
+ int64_t bits_left;
+ double modified_error_min;
+ double modified_error_max;
+ double modified_error_left;
+
+ // Projected total bits available for a key frame group of frames
+ int64_t kf_group_bits;
+
+ // Error score of frames still to be coded in kf group
+ double kf_group_error_left;
+
+ // Over time correction for bits per macro block estimation
+ double bpm_factor;
+
+ // Record of target and actual bits spent in current ARF group
+ int rolling_arf_group_target_bits;
+ int rolling_arf_group_actual_bits;
+
+ int sr_update_lag;
+
+ int kf_zeromotion_pct;
+ int last_kfgroup_zeromotion_pct;
+ int extend_minq;
+ int extend_maxq;
+ /*!\endcond */
+} TWO_PASS;
+
+/*!
+ * \brief Frame level Two pass status and control data.
+ */
+typedef struct {
+ /*!\cond */
+ const FIRSTPASS_STATS *stats_in;
+ // Pointer to the stats of the current frame.
+ const FIRSTPASS_STATS *this_frame;
+ double mb_av_energy;
+ // An indication of the content type of the current frame
+ FRAME_CONTENT_TYPE fr_content_type;
+ double frame_avg_haar_energy;
+ /*!\endcond */
+} TWO_PASS_FRAME;
+
+/*!\cond */
+
+// This structure contains several key parameters to be accumulated for this
+// frame.
+typedef struct {
+ // Intra prediction error.
+ int64_t intra_error;
+ // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+ int64_t frame_avg_wavelet_energy;
+ // Best of intra pred error and inter pred error using last frame as ref.
+ int64_t coded_error;
+ // Best of intra pred error and inter pred error using golden frame as ref.
+ int64_t sr_coded_error;
+ // Count of motion vector.
+ int mv_count;
+ // Count of blocks that pick inter prediction (inter pred error is smaller
+ // than intra pred error).
+ int inter_count;
+ // Count of blocks that pick second ref (golden frame).
+ int second_ref_count;
+ // Count of blocks where the inter and intra are very close and very low.
+ double neutral_count;
+ // Count of blocks where intra error is very small.
+ int intra_skip_count;
+ // Start row.
+ int image_data_start_row;
+ // Count of unique non-zero motion vectors.
+ int new_mv_count;
+ // Sum of inward motion vectors.
+ int sum_in_vectors;
+ // Sum of motion vector row.
+ int sum_mvr;
+ // Sum of motion vector column.
+ int sum_mvc;
+ // Sum of absolute value of motion vector row.
+ int sum_mvr_abs;
+ // Sum of absolute value of motion vector column.
+ int sum_mvc_abs;
+ // Sum of the square of motion vector row.
+ int64_t sum_mvrs;
+ // Sum of the square of motion vector column.
+ int64_t sum_mvcs;
+ // A factor calculated using intra pred error.
+ double intra_factor;
+ // A factor that measures brightness.
+ double brightness_factor;
+} FRAME_STATS;
+
+// This structure contains first pass data.
+typedef struct {
+ // Buffer holding frame stats for all MACROBLOCKs.
+ // mb_stats[i] stores the FRAME_STATS of the ith
+ // MB in raster scan order.
+ FRAME_STATS *mb_stats;
+ // Buffer to store the prediction error of the (0,0) motion
+ // vector using the last source frame as the reference.
+ // raw_motion_err_list[i] stores the raw_motion_err of
+ // the ith MB in raster scan order.
+ int *raw_motion_err_list;
+} FirstPassData;
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+struct AV1EncoderConfig;
+struct TileDataEnc;
+
+static INLINE int is_fp_wavelet_energy_invalid(
+ const FIRSTPASS_STATS *fp_stats) {
+ assert(fp_stats != NULL);
+ return (fp_stats->frame_avg_wavelet_energy < 0);
+}
+
+static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) {
+ return (is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16);
+}
+
+int av1_get_unit_rows_in_tile(const TileInfo *tile,
+ const BLOCK_SIZE fp_block_size);
+int av1_get_unit_cols_in_tile(const TileInfo *tile,
+ const BLOCK_SIZE fp_block_size);
+
+void av1_first_pass_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ struct TileDataEnc *tile_data, const int mb_row,
+ const BLOCK_SIZE fp_block_size);
+void av1_end_first_pass(struct AV1_COMP *cpi);
+
+void av1_free_firstpass_data(FirstPassData *firstpass_data);
+
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section);
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame);
+/*!\endcond */
+
+/*!\brief AV1 first pass encoding.
+ *
+ * \ingroup rate_control
+ * This function implements the first encoding pass of the two-pass encoding
+ * mode. It encodes the whole video and collects essential information.
+ * Two-pass encoding is an encoding mode in the reference software (libaom)
+ * of AV1 for high-performance encoding. The first pass is a fast encoding
+ * process that collects essential information to help the second pass make
+ * encoding decisions and improve coding quality. The collected stats are used
+ * in rate control, for example, to determine frame cuts and the position of
+ * the alternative reference frame (ARF).
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] ts_duration Duration of the frame / collection of frames
+ *
+ * \remark Nothing is returned. Instead, the "TWO_PASS" structure inside "cpi"
+ * is modified to store information computed in this function.
+ */
+void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
+
+void av1_noop_first_pass_frame(struct AV1_COMP *cpi, const int64_t ts_duration);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_FIRSTPASS_H_
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
new file mode 100644
index 0000000000..73910de121
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/encoder/global_motion.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/segmentation.h"
+
+#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
+
+// Border over which to compute the global motion
+#define ERRORADV_BORDER 0
+
+/* clang-format off */
+// Error metric used for global motion evaluation.
+// For 8-bit input, the pixel error used to index this table will always
+// be between -255 and +255. But for 10- and 12-bit input, we use interpolation
+// which means that we need to support indices of -256 and +256 as well.
+// Therefore, the table is offset so that logical index 0 corresponds to
+// error_measure_lut[256].
+const int error_measure_lut[513] = {
+ // pow 0.7
+ 16384, 16384, 16339, 16294, 16249, 16204, 16158, 16113,
+ 16068, 16022, 15977, 15932, 15886, 15840, 15795, 15749,
+ 15703, 15657, 15612, 15566, 15520, 15474, 15427, 15381,
+ 15335, 15289, 15242, 15196, 15149, 15103, 15056, 15010,
+ 14963, 14916, 14869, 14822, 14775, 14728, 14681, 14634,
+ 14587, 14539, 14492, 14445, 14397, 14350, 14302, 14254,
+ 14206, 14159, 14111, 14063, 14015, 13967, 13918, 13870,
+ 13822, 13773, 13725, 13676, 13628, 13579, 13530, 13481,
+ 13432, 13383, 13334, 13285, 13236, 13187, 13137, 13088,
+ 13038, 12988, 12939, 12889, 12839, 12789, 12739, 12689,
+ 12639, 12588, 12538, 12487, 12437, 12386, 12335, 12285,
+ 12234, 12183, 12132, 12080, 12029, 11978, 11926, 11875,
+ 11823, 11771, 11719, 11667, 11615, 11563, 11511, 11458,
+ 11406, 11353, 11301, 11248, 11195, 11142, 11089, 11036,
+ 10982, 10929, 10875, 10822, 10768, 10714, 10660, 10606,
+ 10552, 10497, 10443, 10388, 10333, 10279, 10224, 10168,
+ 10113, 10058, 10002, 9947, 9891, 9835, 9779, 9723,
+ 9666, 9610, 9553, 9497, 9440, 9383, 9326, 9268,
+ 9211, 9153, 9095, 9037, 8979, 8921, 8862, 8804,
+ 8745, 8686, 8627, 8568, 8508, 8449, 8389, 8329,
+ 8269, 8208, 8148, 8087, 8026, 7965, 7903, 7842,
+ 7780, 7718, 7656, 7593, 7531, 7468, 7405, 7341,
+ 7278, 7214, 7150, 7086, 7021, 6956, 6891, 6826,
+ 6760, 6695, 6628, 6562, 6495, 6428, 6361, 6293,
+ 6225, 6157, 6089, 6020, 5950, 5881, 5811, 5741,
+ 5670, 5599, 5527, 5456, 5383, 5311, 5237, 5164,
+ 5090, 5015, 4941, 4865, 4789, 4713, 4636, 4558,
+ 4480, 4401, 4322, 4242, 4162, 4080, 3998, 3916,
+ 3832, 3748, 3663, 3577, 3490, 3402, 3314, 3224,
+ 3133, 3041, 2948, 2854, 2758, 2661, 2562, 2461,
+ 2359, 2255, 2148, 2040, 1929, 1815, 1698, 1577,
+ 1452, 1323, 1187, 1045, 894, 731, 550, 339,
+ 0, 339, 550, 731, 894, 1045, 1187, 1323,
+ 1452, 1577, 1698, 1815, 1929, 2040, 2148, 2255,
+ 2359, 2461, 2562, 2661, 2758, 2854, 2948, 3041,
+ 3133, 3224, 3314, 3402, 3490, 3577, 3663, 3748,
+ 3832, 3916, 3998, 4080, 4162, 4242, 4322, 4401,
+ 4480, 4558, 4636, 4713, 4789, 4865, 4941, 5015,
+ 5090, 5164, 5237, 5311, 5383, 5456, 5527, 5599,
+ 5670, 5741, 5811, 5881, 5950, 6020, 6089, 6157,
+ 6225, 6293, 6361, 6428, 6495, 6562, 6628, 6695,
+ 6760, 6826, 6891, 6956, 7021, 7086, 7150, 7214,
+ 7278, 7341, 7405, 7468, 7531, 7593, 7656, 7718,
+ 7780, 7842, 7903, 7965, 8026, 8087, 8148, 8208,
+ 8269, 8329, 8389, 8449, 8508, 8568, 8627, 8686,
+ 8745, 8804, 8862, 8921, 8979, 9037, 9095, 9153,
+ 9211, 9268, 9326, 9383, 9440, 9497, 9553, 9610,
+ 9666, 9723, 9779, 9835, 9891, 9947, 10002, 10058,
+ 10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497,
+ 10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929,
+ 10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353,
+ 11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771,
+ 11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183,
+ 12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588,
+ 12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988,
+ 13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383,
+ 13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773,
+ 13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159,
+ 14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539,
+ 14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916,
+ 14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289,
+ 15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657,
+ 15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022,
+ 16068, 16113, 16158, 16204, 16249, 16294, 16339, 16384,
+ 16384,
+};
+/* clang-format on */
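+
+// For example, with this offset convention a signed pixel error err in
+// [-256, 256] is scored as error_measure_lut[256 + err] (cf. the
+// error_measure() helper in global_motion.h): the cost is 0 for err == 0
+// and rises along the |err|^0.7 curve to 16384 at |err| == 255.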
+
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
+ return best_erroradvantage < erroradv_tr &&
+ best_erroradvantage * params_cost < erroradv_prod_tr;
+}
+
+static void convert_to_params(const double *params, int32_t *model) {
+ int i;
+ model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+ model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+
+ for (i = 2; i < 6; ++i) {
+ const int diag_value = ((i == 2 || i == 5) ? (1 << GM_ALPHA_PREC_BITS) : 0);
+ model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5);
+ model[i] =
+ (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX);
+ model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR;
+ }
+}
+
+void av1_convert_model_to_params(const double *params,
+ WarpedMotionParams *model) {
+ convert_to_params(params, model->wmmat);
+ model->wmtype = get_wmtype(model);
+ model->invalid = 0;
+}
+
+// Adds some offset to a global motion parameter and handles
+// all of the necessary precision shifts, clamping, and
+// zero-centering.
+static int32_t add_param_offset(int param_index, int32_t param_value,
+ int32_t offset) {
+ const int scale_vals[2] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF };
+ const int clamp_vals[2] = { GM_TRANS_MAX, GM_ALPHA_MAX };
+ // type of param: 0 - translation, 1 - affine
+ const int param_type = (param_index < 2 ? 0 : 1);
+ const int is_one_centered = (param_index == 2 || param_index == 5);
+
+ // Make parameter zero-centered and offset the shift that was done to make
+ // it compatible with the warped model
+ param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >>
+ scale_vals[param_type];
+ // Add desired offset to the rescaled/zero-centered parameter
+ param_value += offset;
+ // Clamp the parameter so it does not overflow the number of bits allotted
+ // to it in the bitstream
+ param_value = (int32_t)clamp(param_value, -clamp_vals[param_type],
+ clamp_vals[param_type]);
+ // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible
+ // with the warped motion library
+ param_value *= (1 << scale_vals[param_type]);
+
+ // Undo the zero-centering step if necessary
+ return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS);
+}
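+
+// For example, offsetting the diagonal parameter wmmat[2] (param_index 2) by
+// +1 moves the stored value by one coding-precision step,
+// (1 << GM_ALPHA_PREC_DIFF), above its identity value of
+// (1 << WARPEDMODEL_PREC_BITS); the translation parameters step in units of
+// (1 << GM_TRANS_PREC_DIFF) instead.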
+
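+// Forces 'wm' to the given model type by overwriting the parameters that the
+// type does not code. The cases intentionally cascade from most to least
+// constrained: forcing IDENTITY zeroes the translation and then also applies
+// the TRANSLATION and ROTZOOM constraints, while forcing ROTZOOM only
+// enforces the rotation/zoom symmetry wmmat[4] == -wmmat[3] and
+// wmmat[5] == wmmat[2].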
+static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) {
+ switch (wmtype) {
+ case IDENTITY:
+ wm->wmmat[0] = 0;
+ wm->wmmat[1] = 0;
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION:
+ wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS;
+ wm->wmmat[3] = 0;
+ AOM_FALLTHROUGH_INTENDED;
+ case ROTZOOM:
+ wm->wmmat[4] = -wm->wmmat[3];
+ wm->wmmat[5] = wm->wmmat[2];
+ AOM_FALLTHROUGH_INTENDED;
+ case AFFINE: break;
+ default: assert(0);
+ }
+ wm->wmtype = wmtype;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int generic_sad_highbd(const uint16_t *const ref, int ref_stride,
+ const uint16_t *const dst, int dst_stride,
+ int p_width, int p_height) {
+ // This function should only be called for patches smaller than
+ // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+ // small enough that we don't need a 64-bit accumulator
+ assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+ int sad = 0;
+ for (int i = 0; i < p_height; ++i) {
+ for (int j = 0; j < p_width; ++j) {
+ sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
+ }
+ }
+ return sad;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in highbd_segmented_frame_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t highbd_segmented_frame_error(
+ const uint16_t *const ref, int ref_stride, const uint16_t *const dst,
+ int dst_stride, int p_width, int p_height, int bd, uint8_t *segment_map,
+ int segment_map_stride) {
+ (void)bd;
+ int patch_w, patch_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+ // avoid computing error into the frame padding
+ patch_w = AOMMIN(error_bsize_w, p_width - j);
+ patch_h = AOMMIN(error_bsize_h, p_height - i);
+
+ if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+ sum_error += aom_highbd_sad32x32(
+ CONVERT_TO_BYTEPTR(ref + j + i * ref_stride), ref_stride,
+ CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+ } else {
+ sum_error += generic_sad_highbd(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride,
+ patch_w, patch_h);
+ }
+ }
+ }
+ return sum_error;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in highbd_warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t highbd_warp_error(WarpedMotionParams *wm,
+ const uint16_t *const ref, int ref_width,
+ int ref_height, int ref_stride,
+ const uint16_t *const dst, int dst_stride,
+ int p_col, int p_row, int p_width,
+ int p_height, int subsampling_x,
+ int subsampling_y, int bd, int64_t best_error,
+ uint8_t *segment_map, int segment_map_stride) {
+ int64_t gm_sumerr = 0;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ DECLARE_ALIGNED(32, uint16_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]);
+
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ conv_params.use_dist_wtd_comp_avg = 0;
+ for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+ // Avoid warping extra blocks in the padded region of the frame
+ // when p_width and p_height are not multiples of WARP_ERROR_BLOCK.
+ const int warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j);
+ const int warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i);
+ highbd_warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i,
+ warp_w, warp_h, WARP_ERROR_BLOCK, subsampling_x,
+ subsampling_y, bd, &conv_params);
+
+ if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+ gm_sumerr += aom_highbd_sad32x32(
+ CONVERT_TO_BYTEPTR(tmp), WARP_ERROR_BLOCK,
+ CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+ } else {
+ gm_sumerr +=
+ generic_sad_highbd(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+ dst_stride, warp_w, warp_h);
+ }
+
+ if (gm_sumerr > best_error) return INT64_MAX;
+ }
+ }
+ return gm_sumerr;
+}
+#endif
+
+static INLINE int generic_sad(const uint8_t *const ref, int ref_stride,
+ const uint8_t *const dst, int dst_stride,
+ int p_width, int p_height) {
+ // This function should only be called for patches smaller than
+ // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+ // small enough that we don't need a 64-bit accumulator
+ assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+ int sad = 0;
+ for (int i = 0; i < p_height; ++i) {
+ for (int j = 0; j < p_width; ++j) {
+ sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
+ }
+ }
+ return sad;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in segmented_warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t segmented_frame_error(const uint8_t *const ref, int ref_stride,
+ const uint8_t *const dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map,
+ int segment_map_stride) {
+ int patch_w, patch_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+ // avoid computing error into the frame padding
+ patch_w = AOMMIN(error_bsize_w, p_width - j);
+ patch_h = AOMMIN(error_bsize_h, p_height - i);
+
+ if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+ sum_error += aom_sad32x32(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride);
+ } else {
+ sum_error +=
+ generic_sad(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride, patch_w, patch_h);
+ }
+ }
+ }
+ return sum_error;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
+ int ref_width, int ref_height, int ref_stride,
+ const uint8_t *const dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height,
+ int subsampling_x, int subsampling_y,
+ int64_t best_error, uint8_t *segment_map,
+ int segment_map_stride) {
+ int64_t gm_sumerr = 0;
+ int warp_w, warp_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ DECLARE_ALIGNED(16, uint8_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]);
+ ConvolveParams conv_params = get_conv_params(0, 0, 8);
+ conv_params.use_dist_wtd_comp_avg = 0;
+
+ for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+ // Avoid warping extra blocks in the padded region of the frame
+ // when p_width and p_height are not multiples of WARP_ERROR_BLOCK.
+ warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j);
+ warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i);
+ warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, warp_w,
+ warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y,
+ &conv_params);
+
+ if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+ gm_sumerr += aom_sad32x32(tmp, WARP_ERROR_BLOCK,
+ dst + j + i * dst_stride, dst_stride);
+ } else {
+ gm_sumerr +=
+ generic_sad(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+ dst_stride, warp_w, warp_h);
+ }
+
+ if (gm_sumerr > best_error) return INT64_MAX;
+ }
+ }
+ return gm_sumerr;
+}
+
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map,
+ int segment_map_stride) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) {
+ return highbd_segmented_frame_error(
+ CONVERT_TO_SHORTPTR(ref), ref_stride, CONVERT_TO_SHORTPTR(dst),
+ dst_stride, p_width, p_height, bd, segment_map, segment_map_stride);
+ }
+#endif
+ (void)use_hbd;
+ (void)bd;
+ return segmented_frame_error(ref, ref_stride, dst, dst_stride, p_width,
+ p_height, segment_map, segment_map_stride);
+}
+
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int ref_width, int ref_height,
+ int ref_stride, uint8_t *dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height, int subsampling_x,
+ int subsampling_y, int64_t best_error,
+ uint8_t *segment_map, int segment_map_stride) {
+ if (!av1_get_shear_params(wm)) return INT64_MAX;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd)
+ return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), ref_width,
+ ref_height, ref_stride, CONVERT_TO_SHORTPTR(dst),
+ dst_stride, p_col, p_row, p_width, p_height,
+ subsampling_x, subsampling_y, bd, best_error,
+ segment_map, segment_map_stride);
+#endif
+ (void)use_hbd;
+ (void)bd;
+ return warp_error(wm, ref, ref_width, ref_height, ref_stride, dst, dst_stride,
+ p_col, p_row, p_width, p_height, subsampling_x,
+ subsampling_y, best_error, segment_map, segment_map_stride);
+}
+
+int64_t av1_refine_integerized_param(
+ WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
+ uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
+ int d_width, int d_height, int d_stride, int n_refinements,
+ int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride) {
+ static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
+ const int border = ERRORADV_BORDER;
+ int i = 0, p;
+ int n_params = max_trans_model_params[wmtype];
+ int32_t *param_mat = wm->wmmat;
+ int64_t step_error, best_error;
+ int32_t step;
+ int32_t *param;
+ int32_t curr_param;
+ int32_t best_param;
+
+ force_wmtype(wm, wmtype);
+ wm->wmtype = get_wmtype(wm);
+
+ if (n_refinements == 0) {
+ // Compute the maximum error value that will be accepted, so that
+ // av1_warp_error can terminate early if it proves the model will not
+ // be accepted.
+ int64_t selection_threshold = (int64_t)lrint(ref_frame_error * erroradv_tr);
+ return av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border,
+ 0, 0, selection_threshold, segment_map,
+ segment_map_stride);
+ }
+
+ // When refining, use a slightly higher threshold for the initial error
+ // calculation - see comment above erroradv_early_tr for why.
+ int64_t selection_threshold =
+ (int64_t)lrint(ref_frame_error * erroradv_early_tr);
+ best_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border, border,
+ d_width - 2 * border, d_height - 2 * border, 0, 0,
+ selection_threshold, segment_map, segment_map_stride);
+
+ if (best_error > selection_threshold) {
+ return INT64_MAX;
+ }
+
+ step = 1 << (n_refinements - 1);
+ for (i = 0; i < n_refinements; i++, step >>= 1) {
+ for (p = 0; p < n_params; ++p) {
+ int step_dir = 0;
+ param = param_mat + p;
+ curr_param = *param;
+ best_param = curr_param;
+ // look to the left
+ // Note: We have to use force_wmtype() to keep the proper symmetry for
+ // ROTZOOM type models
+ *param = add_param_offset(p, curr_param, -step);
+ force_wmtype(wm, wmtype);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border, 0,
+ 0, best_error, segment_map, segment_map_stride);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = -1;
+ }
+
+ // look to the right
+ *param = add_param_offset(p, curr_param, step);
+ force_wmtype(wm, wmtype);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border, 0,
+ 0, best_error, segment_map, segment_map_stride);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = 1;
+ }
+
+ // look to the direction chosen above repeatedly until error increases
+ // for the biggest step size
+ while (step_dir) {
+ *param = add_param_offset(p, best_param, step * step_dir);
+ force_wmtype(wm, wmtype);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border,
+ 0, 0, best_error, segment_map, segment_map_stride);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ } else {
+ step_dir = 0;
+ }
+ }
+
+ // Restore best parameter value so far
+ *param = best_param;
+ force_wmtype(wm, wmtype);
+ }
+ }
+
+ wm->wmtype = get_wmtype(wm);
+ return best_error;
+}
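+
+// Note on the search above: it is a per-parameter coordinate descent whose
+// step size starts at 2^(n_refinements - 1) coding-precision units and halves
+// each round, so with n_refinements == GM_MAX_REFINEMENT_STEPS (5) each
+// parameter is probed with steps of 16, 8, 4, 2 and 1.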
+
+#define FEAT_COUNT_TR 3
+#define SEG_COUNT_TR 48
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+ int height, int *inliers,
+ int num_inliers) {
+ int seg_count = 0;
+ memset(segment_map, 0, sizeof(*segment_map) * width * height);
+
+ for (int i = 0; i < num_inliers; i++) {
+ int x = inliers[i * 2];
+ int y = inliers[i * 2 + 1];
+ int seg_x = x >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = y >> WARP_ERROR_BLOCK_LOG;
+ segment_map[seg_y * width + seg_x] += 1;
+ }
+
+ for (int i = 0; i < height; i++) {
+ for (int j = 0; j < width; j++) {
+ uint8_t feat_count = segment_map[i * width + j];
+ segment_map[i * width + j] = (feat_count >= FEAT_COUNT_TR);
+ seg_count += (segment_map[i * width + j]);
+ }
+ }
+
+ // If this motion does not make up a large enough portion of the frame,
+ // use the unsegmented version of the error metric
+ if (seg_count < SEG_COUNT_TR)
+ memset(segment_map, 1, width * height * sizeof(*segment_map));
+}
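+
+// For scale: at 1920x1080 the segment map spans 60x34 blocks of
+// WARP_ERROR_BLOCK (32x32) pixels, so the SEG_COUNT_TR (48) inlier blocks
+// required above correspond to roughly 2% of the frame area.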
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
new file mode 100644
index 0000000000..8c9c60f0f5
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RANSAC_NUM_MOTIONS 1
+#define GM_MAX_REFINEMENT_STEPS 5
+#define MAX_DIRECTIONS 2
+
+// The structure holds a valid reference frame type and its temporal distance
+// from the source frame.
+typedef struct {
+ int distance;
+ MV_REFERENCE_FRAME frame;
+} FrameDistPair;
+
+typedef struct {
+ // Array of structure which holds the global motion parameters for a given
+ // motion model. motion_models[i] holds the parameters for a given motion
+ // model for the ith ransac motion.
+ MotionModel motion_models[RANSAC_NUM_MOTIONS];
+
+ // Per-block map marking the regions that contain inliers from the motion
+ // model, as computed by av1_compute_feature_segmentation_map().
+ uint8_t *segment_map;
+} GlobalMotionData;
+
+typedef struct {
+ // Holds the mapping of each thread to past/future direction.
+ // thread_id_to_dir[i] indicates the direction id (past - 0/future - 1)
+ // assigned to the ith thread.
+ int8_t thread_id_to_dir[MAX_NUM_THREADS];
+
+ // A flag which holds the early exit status based on the speed feature
+ // 'prune_ref_frame_for_gm_search'. early_exit[i] will be set if the speed
+ // feature based early exit happens in the direction 'i'.
+ int8_t early_exit[MAX_DIRECTIONS];
+
+ // Counter for the next reference frame to be processed.
+ // next_frame_to_process[i] will hold the count of next reference frame to be
+ // processed in the direction 'i'.
+ int8_t next_frame_to_process[MAX_DIRECTIONS];
+} JobInfo;
+
+typedef struct {
+ // Data related to assigning jobs for global motion multi-threading.
+ JobInfo job_info;
+
+#if CONFIG_MULTITHREAD
+ // Mutex lock used while dispatching jobs.
+ pthread_mutex_t *mutex_;
+#endif
+
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool gm_mt_exit;
+} AV1GlobalMotionSync;
+
+void av1_convert_model_to_params(const double *params,
+ WarpedMotionParams *model);
+
+// Criteria for accepting a global motion model
+static const double erroradv_tr = 0.65;
+static const double erroradv_prod_tr = 20000;
+
+// Early exit threshold for global motion refinement
+// This is set slightly higher than erroradv_tr, as a compromise between
+// two factors:
+//
+// 1) By rejecting unpromising models early, we can reduce the encode time
+// spent trying to refine them
+//
+// 2) When we refine a model, its error may decrease to below the acceptance
+// threshold even if the model is initially above the threshold
+static const double erroradv_early_tr = 0.70;
+
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost);
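+// For example, with the thresholds above a model whose warped error is half
+// the unwarped error (best_erroradvantage == 0.5) passes the first test
+// (0.5 < 0.65) and is then accepted as long as 0.5 * params_cost < 20000,
+// i.e. as long as params_cost < 40000.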
+
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+ int height, int *inliers,
+ int num_inliers);
+
+extern const int error_measure_lut[513];
+
+static INLINE int error_measure(int err) {
+ return error_measure_lut[256 + err];
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int highbd_error_measure(int err, int bd) {
+ const int b = bd - 8;
+ const int bmask = (1 << b) - 1;
+ const int v = (1 << b);
+
+ // Split error into two parts and do an interpolated table lookup
+ // To compute the table index and interpolation value, we want to calculate
+ // the quotient and remainder of err / 2^b. But it is very important that
+ // the division must round down, and the remainder must be positive,
+ // ie. in the range [0, 2^b).
+ //
+ // In C, the >> and & operators do what we want, but the / and % operators
+ // give the wrong results for negative inputs. So we must use >> and & here.
+ //
+ // For example, if bd == 10 and err == -5, compare the results:
+ // (-5) >> 2 = -2, (-5) & 3 = 3
+ // vs. (-5) / 4 = -1, (-5) % 4 = -1
+ const int e1 = err >> b;
+ const int e2 = err & bmask;
+ return error_measure_lut[256 + e1] * (v - e2) +
+ error_measure_lut[257 + e1] * e2;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map, int segment_map_stride);
+
+// Returns the error between the result of applying motion 'wm' to the frame
+// described by 'ref' and the frame described by 'dst'.
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int ref_width, int ref_height,
+ int ref_stride, uint8_t *dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height, int subsampling_x,
+ int subsampling_y, int64_t best_error,
+ uint8_t *segment_map, int segment_map_stride);
+
+// Returns the av1_warp_error between "dst" and the result of applying the
+// motion params that result from fine-tuning "wm" to "ref". Note that "wm" is
+// modified in place.
+int64_t av1_refine_integerized_param(
+ WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
+ uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
+ int d_width, int d_height, int d_stride, int n_refinements,
+ int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/third_party/aom/av1/encoder/global_motion_facade.c b/third_party/aom/av1/encoder/global_motion_facade.c
new file mode 100644
index 0000000000..02a4e70ed3
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion_facade.c
@@ -0,0 +1,450 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/binary_codes_writer.h"
+
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_dsp/pyramid.h"
+#include "av1/common/warped_motion.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/global_motion_facade.h"
+
+// Range of model types to search
+#define FIRST_GLOBAL_TRANS_TYPE ROTZOOM
+#define LAST_GLOBAL_TRANS_TYPE ROTZOOM
+
+// Computes the cost for the warp parameters.
+static int gm_get_params_cost(const WarpedMotionParams *gm,
+ const WarpedMotionParams *ref_gm, int allow_hp) {
+ int params_cost = 0;
+ int trans_bits, trans_prec_diff;
+ switch (gm->wmtype) {
+ case AFFINE:
+ case ROTZOOM:
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ if (gm->wmtype >= AFFINE) {
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION:
+ trans_bits = (gm->wmtype == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ trans_prec_diff = (gm->wmtype == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[0] >> trans_prec_diff),
+ (gm->wmmat[0] >> trans_prec_diff));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[1] >> trans_prec_diff),
+ (gm->wmmat[1] >> trans_prec_diff));
+ AOM_FALLTHROUGH_INTENDED;
+ case IDENTITY: break;
+ default: assert(0);
+ }
+ return (params_cost << AV1_PROB_COST_SHIFT);
+}
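+
+// Note: the bit count computed above is shifted left by AV1_PROB_COST_SHIFT
+// so that it is directly comparable with the other rate costs used in the
+// encoder's mode decisions.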
+
+// For the given reference frame, computes the global motion parameters for
+// different motion models and finds the best.
+static AOM_INLINE void compute_global_motion_for_ref_frame(
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w,
+ const int segment_map_h, const WarpedMotionParams *ref_params) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int src_width = cpi->source->y_crop_width;
+ int src_height = cpi->source->y_crop_height;
+ int src_stride = cpi->source->y_stride;
+ assert(ref_buf[frame] != NULL);
+ int bit_depth = cpi->common.seq_params->bit_depth;
+ GlobalMotionMethod global_motion_method = default_global_motion_method;
+ int num_refinements = cpi->sf.gm_sf.num_refinement_steps;
+ bool mem_alloc_failed = false;
+
+ // Select the best model based on fractional error reduction.
+ // By initializing this to erroradv_tr, the same logic which is used to
+ // select the best model will automatically filter out any model which
+ // doesn't meet the required quality threshold
+ double best_erroradv = erroradv_tr;
+ for (TransformationType model = FIRST_GLOBAL_TRANS_TYPE;
+ model <= LAST_GLOBAL_TRANS_TYPE; ++model) {
+ if (!aom_compute_global_motion(
+ model, cpi->source, ref_buf[frame], bit_depth, global_motion_method,
+ motion_models, RANSAC_NUM_MOTIONS, &mem_alloc_failed)) {
+ if (mem_alloc_failed) {
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate global motion buffers");
+ }
+ continue;
+ }
+
+ for (int i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ if (motion_models[i].num_inliers == 0) continue;
+
+ WarpedMotionParams tmp_wm_params;
+ av1_convert_model_to_params(motion_models[i].params, &tmp_wm_params);
+
+ // Skip models that we won't use (IDENTITY or TRANSLATION)
+ //
+ // For IDENTITY type models, we don't need to evaluate anything because
+ // all the following logic is effectively comparing the estimated model
+ // to an identity model.
+ //
+ // For TRANSLATION type global motion models, gm_get_motion_vector() gives
+ // the wrong motion vector (see comments in that function for details).
+ // As translation-type models do not give much gain, we can avoid this bug
+ // by never choosing a TRANSLATION type model
+ if (tmp_wm_params.wmtype <= TRANSLATION) continue;
+
+ av1_compute_feature_segmentation_map(
+ segment_map, segment_map_w, segment_map_h, motion_models[i].inliers,
+ motion_models[i].num_inliers);
+
+ int64_t ref_frame_error = av1_segmented_frame_error(
+ is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
+ ref_buf[frame]->y_stride, cpi->source->y_buffer, src_stride,
+ src_width, src_height, segment_map, segment_map_w);
+
+ if (ref_frame_error == 0) continue;
+
+ const int64_t warp_error = av1_refine_integerized_param(
+ &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
+ ref_buf[frame]->y_buffer, ref_buf[frame]->y_crop_width,
+ ref_buf[frame]->y_crop_height, ref_buf[frame]->y_stride,
+ cpi->source->y_buffer, src_width, src_height, src_stride,
+ num_refinements, ref_frame_error, segment_map, segment_map_w);
+
+ // av1_refine_integerized_param() can return a simpler model type than
+ // its input, so re-check model type here
+ if (tmp_wm_params.wmtype <= TRANSLATION) continue;
+
+ double erroradvantage = (double)warp_error / ref_frame_error;
+
+ if (erroradvantage < best_erroradv) {
+ best_erroradv = erroradvantage;
+ // Save the wm_params modified by av1_refine_integerized_param()
+ // rather than the motion index, to avoid rerunning the refinement
+ // below.
+ memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
+ sizeof(WarpedMotionParams));
+ }
+ }
+ }
+
+ if (!av1_get_shear_params(&cm->global_motion[frame]))
+ cm->global_motion[frame] = default_warp_params;
+
+#if 0
+ // We never choose translational models, so this code is disabled
+ if (cm->global_motion[frame].wmtype == TRANSLATION) {
+ cm->global_motion[frame].wmmat[0] =
+ convert_to_trans_prec(cm->features.allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[0]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ cm->global_motion[frame].wmmat[1] =
+ convert_to_trans_prec(cm->features.allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[1]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ }
+#endif
+
+ if (cm->global_motion[frame].wmtype == IDENTITY) return;
+
+ // If the best error advantage found doesn't meet the threshold for
+ // this motion type, revert to IDENTITY.
+ if (!av1_is_enough_erroradvantage(
+ best_erroradv,
+ gm_get_params_cost(&cm->global_motion[frame], ref_params,
+ cm->features.allow_high_precision_mv))) {
+ cm->global_motion[frame] = default_warp_params;
+ }
+}
+
+// Computes global motion for the given reference frame.
+void av1_compute_gm_for_valid_ref_frames(
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ MotionModel *motion_models, uint8_t *segment_map, int segment_map_w,
+ int segment_map_h) {
+ AV1_COMMON *const cm = &cpi->common;
+ const WarpedMotionParams *ref_params =
+ cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+ : &default_warp_params;
+
+ compute_global_motion_for_ref_frame(cpi, error_info, ref_buf, frame,
+ motion_models, segment_map, segment_map_w,
+ segment_map_h, ref_params);
+}
+
+// Loops over valid reference frames and computes global motion estimation.
+static AOM_INLINE void compute_global_motion_for_references(
+ AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+ FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames,
+ MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w,
+ const int segment_map_h) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct aom_internal_error_info *const error_info =
+ cpi->td.mb.e_mbd.error_info;
+ // Compute global motion w.r.t. reference frames starting from the nearest ref
+ // frame in a given direction.
+ for (int frame = 0; frame < num_ref_frames; frame++) {
+ int ref_frame = reference_frame[frame].frame;
+ av1_compute_gm_for_valid_ref_frames(cpi, error_info, ref_buf, ref_frame,
+ motion_models, segment_map,
+ segment_map_w, segment_map_h);
+ // If global motion w.r.t. current ref frame is
+ // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
+ // the remaining ref frames in that direction.
+ if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
+ cm->global_motion[ref_frame].wmtype <= TRANSLATION)
+ break;
+ }
+}
+
+// Compares the distance in 'a' and 'b'. Returns 1 if the frame corresponding to
+// 'a' is farther, -1 if the frame corresponding to 'b' is farther, 0 otherwise.
+static int compare_distance(const void *a, const void *b) {
+ const int diff =
+ ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance;
+ if (diff > 0)
+ return 1;
+ else if (diff < 0)
+ return -1;
+ return 0;
+}
+
+static int disable_gm_search_based_on_stats(const AV1_COMP *const cpi) {
+ int is_gm_present = 1;
+
+ // Check the number of GM models only in GF groups with ARF frames. GM param
+ // estimation is always done for GF groups with no ARF frames (flat GOPs).
+ if (cpi->ppi->gf_group.arf_index > -1) {
+ // valid_gm_model_found is initialized to INT32_MAX at the beginning of
+ // every GF group. Therefore, GM param estimation is always done for all
+ // frames until at least 1 frame each of ARF_UPDATE, INTNL_ARF_UPDATE and
+ // LF_UPDATE is encoded in a GF group. For subsequent frames, GM param
+ // estimation is disabled if no valid models have been found in any of the
+ // three update types.
+ is_gm_present = (cpi->ppi->valid_gm_model_found[ARF_UPDATE] != 0) ||
+ (cpi->ppi->valid_gm_model_found[INTNL_ARF_UPDATE] != 0) ||
+ (cpi->ppi->valid_gm_model_found[LF_UPDATE] != 0);
+ }
+ return !is_gm_present;
+}
+
+// Prunes reference frames for global motion estimation based on the speed
+// feature 'gm_search_type'.
+static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) {
+ (void)frame;
+ switch (sf->gm_sf.gm_search_type) {
+ case GM_FULL_SEARCH: return 1;
+ case GM_REDUCED_REF_SEARCH_SKIP_L2_L3:
+ return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
+ case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2:
+ return !(frame == LAST2_FRAME || frame == LAST3_FRAME ||
+ (frame == ALTREF2_FRAME));
+ case GM_SEARCH_CLOSEST_REFS_ONLY: return 1;
+ case GM_DISABLE_SEARCH: return 0;
+ default: assert(0);
+ }
+ return 1;
+}
+
+// Populates valid reference frames in past/future directions in
+// 'reference_frames' and their count in 'num_ref_frames'.
+static AOM_INLINE void update_valid_ref_frames_for_gm(
+ AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+ FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1],
+ int *num_ref_frames) {
+ AV1_COMMON *const cm = &cpi->common;
+ int *num_past_ref_frames = &num_ref_frames[0];
+ int *num_future_ref_frames = &num_ref_frames[1];
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
+ gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, cpi->gf_frame_index);
+ int cur_frame_gm_disabled = 0;
+ int pyr_lvl = cm->cur_frame->pyramid_level;
+
+ if (cpi->sf.gm_sf.disable_gm_search_based_on_stats) {
+ cur_frame_gm_disabled = disable_gm_search_based_on_stats(cpi);
+ }
+
+ for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
+ const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME };
+ RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
+ const int ref_disabled =
+ !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
+ ref_buf[frame] = NULL;
+ cm->global_motion[frame] = default_warp_params;
+ // Skip global motion estimation for invalid ref frames
+ if (buf == NULL ||
+ (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) {
+ continue;
+ } else {
+ ref_buf[frame] = &buf->buf;
+ }
+
+ int prune_ref_frames =
+ ref_pruning_enabled &&
+ prune_ref_by_selective_ref_frame(cpi, NULL, ref_frame,
+ cm->cur_frame->ref_display_order_hint);
+ int ref_pyr_lvl = buf->pyramid_level;
+
+ if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
+ ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
+ do_gm_search_logic(&cpi->sf, frame) && !prune_ref_frames &&
+ ref_pyr_lvl <= pyr_lvl && !cur_frame_gm_disabled) {
+ assert(ref_buf[frame] != NULL);
+ const int relative_frame_dist = av1_encoder_get_relative_dist(
+ buf->display_order_hint, cm->cur_frame->display_order_hint);
+ // Populate past and future ref frames.
+ // reference_frames[0][] indicates past direction and
+ // reference_frames[1][] indicates future direction.
+ if (relative_frame_dist == 0) {
+ // Skip global motion estimation for frames at the same nominal instant.
+ // This will generally be either a "real" frame coded against a
+ // temporal filtered version, or a higher spatial layer coded against
+ // a lower spatial layer. In either case, the optimal motion model will
+ // be IDENTITY, so we don't need to search explicitly.
+ } else if (relative_frame_dist < 0) {
+ reference_frames[0][*num_past_ref_frames].distance =
+ abs(relative_frame_dist);
+ reference_frames[0][*num_past_ref_frames].frame = frame;
+ (*num_past_ref_frames)++;
+ } else {
+ reference_frames[1][*num_future_ref_frames].distance =
+ abs(relative_frame_dist);
+ reference_frames[1][*num_future_ref_frames].frame = frame;
+ (*num_future_ref_frames)++;
+ }
+ }
+ }
+}
+
+// Initializes parameters used for computing global motion.
+static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) {
+ GlobalMotionInfo *const gm_info = &cpi->gm_info;
+ YV12_BUFFER_CONFIG *source = cpi->source;
+
+ gm_info->segment_map_w =
+ (source->y_crop_width + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG;
+ gm_info->segment_map_h =
+ (source->y_crop_height + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG;
+
+ memset(gm_info->reference_frames, -1,
+ sizeof(gm_info->reference_frames[0][0]) * MAX_DIRECTIONS *
+ (REF_FRAMES - 1));
+ av1_zero(gm_info->num_ref_frames);
+
+ // Populate ref_buf for valid ref frames in global motion
+ update_valid_ref_frames_for_gm(cpi, gm_info->ref_buf,
+ gm_info->reference_frames,
+ gm_info->num_ref_frames);
+
+ // Sort the past and future ref frames in the ascending order of their
+ // distance from the current frame. reference_frames[0] => past direction
+ // and reference_frames[1] => future direction.
+ qsort(gm_info->reference_frames[0], gm_info->num_ref_frames[0],
+ sizeof(gm_info->reference_frames[0][0]), compare_distance);
+ qsort(gm_info->reference_frames[1], gm_info->num_ref_frames[1],
+ sizeof(gm_info->reference_frames[1][0]), compare_distance);
+
+ if (cpi->sf.gm_sf.gm_search_type == GM_SEARCH_CLOSEST_REFS_ONLY) {
+ // Filter down to the nearest two ref frames.
+ // Prefer one past and one future ref over two past refs, even if
+ // the second past ref is closer
+ if (gm_info->num_ref_frames[1] > 0) {
+ gm_info->num_ref_frames[0] = AOMMIN(gm_info->num_ref_frames[0], 1);
+ gm_info->num_ref_frames[1] = AOMMIN(gm_info->num_ref_frames[1], 1);
+ } else {
+ gm_info->num_ref_frames[0] = AOMMIN(gm_info->num_ref_frames[0], 2);
+ }
+ }
+}
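+
+// For example, under GM_SEARCH_CLOSEST_REFS_ONLY three past and two future
+// refs are filtered down to one past and one future ref, while three past
+// refs with no future refs are filtered down to the two closest past refs.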
+
+// Computes global motion w.r.t. valid reference frames.
+static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) {
+ GlobalMotionInfo *const gm_info = &cpi->gm_info;
+ GlobalMotionData *gm_data = &cpi->td.gm_data;
+
+ // Compute global motion w.r.t. past reference frames and future reference
+ // frames
+ for (int dir = 0; dir < MAX_DIRECTIONS; dir++) {
+ if (gm_info->num_ref_frames[dir] > 0)
+ compute_global_motion_for_references(
+ cpi, gm_info->ref_buf, gm_info->reference_frames[dir],
+ gm_info->num_ref_frames[dir], gm_data->motion_models,
+ gm_data->segment_map, gm_info->segment_map_w, gm_info->segment_map_h);
+ }
+}
+
+// Computes global motion estimation for the current frame. This computation
+// happens once per frame, and the winning motion model parameters are stored
+// in cm->cur_frame->global_motion.
+void av1_compute_global_motion_facade(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ GlobalMotionInfo *const gm_info = &cpi->gm_info;
+
+ if (cpi->oxcf.tool_cfg.enable_global_motion) {
+ if (cpi->gf_frame_index == 0) {
+ for (int i = 0; i < FRAME_UPDATE_TYPES; i++) {
+ cpi->ppi->valid_gm_model_found[i] = INT32_MAX;
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)
+ cpi->ppi->temp_valid_gm_model_found[i] = INT32_MAX;
+#endif
+ }
+ }
+ }
+
+ if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
+ cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done &&
+ cpi->sf.gm_sf.gm_search_type != GM_DISABLE_SEARCH) {
+ setup_global_motion_info_params(cpi);
+ // Terminate early if the total number of reference frames is zero.
+ if (cpi->gm_info.num_ref_frames[0] || cpi->gm_info.num_ref_frames[1]) {
+ gm_alloc_data(cpi, &cpi->td.gm_data);
+ if (cpi->mt_info.num_workers > 1)
+ av1_global_motion_estimation_mt(cpi);
+ else
+ global_motion_estimation(cpi);
+ gm_dealloc_data(&cpi->td.gm_data);
+ gm_info->search_done = 1;
+ }
+ }
+ memcpy(cm->cur_frame->global_motion, cm->global_motion,
+ sizeof(cm->cur_frame->global_motion));
+}
diff --git a/third_party/aom/av1/encoder/global_motion_facade.h b/third_party/aom/av1/encoder/global_motion_facade.h
new file mode 100644
index 0000000000..f13989aa25
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion_facade.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+// Allocates memory for members of GlobalMotionData.
+static AOM_INLINE void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) {
+ AV1_COMMON *cm = &cpi->common;
+ GlobalMotionInfo *gm_info = &cpi->gm_info;
+
+ CHECK_MEM_ERROR(cm, gm_data->segment_map,
+ aom_malloc(sizeof(*gm_data->segment_map) *
+ gm_info->segment_map_w * gm_info->segment_map_h));
+
+ av1_zero_array(gm_data->motion_models, RANSAC_NUM_MOTIONS);
+ for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+ CHECK_MEM_ERROR(cm, gm_data->motion_models[m].inliers,
+ aom_malloc(sizeof(*gm_data->motion_models[m].inliers) * 2 *
+ MAX_CORNERS));
+ }
+}
+
+// Deallocates the memory allocated for members of GlobalMotionData.
+static AOM_INLINE void gm_dealloc_data(GlobalMotionData *gm_data) {
+ aom_free(gm_data->segment_map);
+ gm_data->segment_map = NULL;
+ for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+ aom_free(gm_data->motion_models[m].inliers);
+ gm_data->motion_models[m].inliers = NULL;
+ }
+}
+
+void av1_compute_gm_for_valid_ref_frames(
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ MotionModel *motion_models, uint8_t *segment_map, int segment_map_w,
+ int segment_map_h);
+void av1_compute_global_motion_facade(struct AV1_COMP *cpi);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
diff --git a/third_party/aom/av1/encoder/gop_structure.c b/third_party/aom/av1/encoder/gop_structure.c
new file mode 100644
index 0000000000..5078098450
--- /dev/null
+++ b/third_party/aom/av1/encoder/gop_structure.c
@@ -0,0 +1,867 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "av1/common/blockd.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+
+// This function sets gf_group->frame_parallel_level for LF_UPDATE frames based
+// on the value of parallel_frame_count.
+static void set_frame_parallel_level(int *frame_parallel_level,
+ int *parallel_frame_count,
+ int max_parallel_frames) {
+ assert(*parallel_frame_count > 0);
+ // parallel_frame_count > 1 indicates subsequent frame(s) in the current
+ // parallel encode set.
+ *frame_parallel_level = 1 + (*parallel_frame_count > 1);
+ // Update the count of parallel frames.
+ (*parallel_frame_count)++;
+ if (*parallel_frame_count > max_parallel_frames) *parallel_frame_count = 1;
+}
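+
+// For example, assuming parallel_frame_count starts at 1, successive
+// LF_UPDATE frames are assigned levels 1, 2, 1, 2, ... when
+// max_parallel_frames == 2, and 1, 2, 2, 1, 2, 2, ... when
+// max_parallel_frames == 3 (level 1 marks the start of each parallel set).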
+
+// This function sets gf_group->src_offset based on frame_parallel_level.
+// Outputs are gf_group->src_offset and first_frame_index.
+static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index,
+ int cur_frame_idx, int frame_ind) {
+ if (gf_group->frame_parallel_level[frame_ind] > 0) {
+ if (gf_group->frame_parallel_level[frame_ind] == 1) {
+ *first_frame_index = cur_frame_idx;
+ }
+
+ // Obtain the offset of the frame at frame_ind in the lookahead queue by
+ // subtracting the display order hint of the first frame in the parallel
+ // encode set (at first_frame_index) from the display order hint of the
+ // current frame.
+ gf_group->src_offset[frame_ind] =
+ (cur_frame_idx + gf_group->arf_src_offset[frame_ind]) -
+ *first_frame_index;
+ }
+}
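+
+// Worked example (hypothetical indices): if a parallel encode set starts at
+// cur_frame_idx 10 (frame_parallel_level 1), that frame gets src_offset 0,
+// and a level-2 LF_UPDATE frame at cur_frame_idx 11 gets src_offset
+// (11 + 0) - 10 = 1, i.e. its source is one slot further into the lookahead
+// queue.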
+
+// Sets the GF_GROUP params for LF_UPDATE frames.
+static AOM_INLINE void set_params_for_leaf_frames(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+ int *parallel_frame_count, int max_parallel_frames,
+ int do_frame_parallel_encode, int *first_frame_index, int *cur_disp_index,
+ int layer_depth, int start, int end) {
+ gf_group->update_type[*frame_ind] = LF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth);
+ gf_group->display_idx[*frame_ind] = (*cur_disp_index);
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+ end - start, 0, NULL, NULL, 0);
+ ++(*cur_disp_index);
+
+ // Set the level of parallelism for the LF_UPDATE frame.
+ if (do_frame_parallel_encode) {
+ set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind],
+ parallel_frame_count, max_parallel_frames);
+ // Set LF_UPDATE frames as non-reference frames.
+ gf_group->is_frame_non_ref[*frame_ind] = true;
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+
+ ++(*frame_ind);
+ ++(*cur_frame_idx);
+}
+
+// Sets the GF_GROUP params for INTNL_OVERLAY_UPDATE frames.
+static AOM_INLINE void set_params_for_intnl_overlay_frames(
+ GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+ int *first_frame_index, int *cur_disp_index, int layer_depth) {
+ gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->display_idx[*frame_ind] = (*cur_disp_index);
+ ++(*cur_disp_index);
+
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);
+ ++(*cur_frame_idx);
+}
+
+// Sets the GF_GROUP params for INTNL_ARF_UPDATE frames.
+static AOM_INLINE void set_params_for_internal_arfs(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+ int *parallel_frame_count, int max_parallel_frames,
+ int do_frame_parallel_encode, int *first_frame_index, int depth_thr,
+ int *cur_disp_idx, int layer_depth, int arf_src_offset, int offset,
+ int f_frames, int b_frames) {
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = arf_src_offset;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->display_idx[*frame_ind] =
+ (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, offset,
+ f_frames, b_frames, NULL, NULL, 0);
+
+ if (do_frame_parallel_encode) {
+ if (depth_thr != INT_MAX) {
+ assert(depth_thr == 3 || depth_thr == 4);
+ assert(IMPLIES(depth_thr == 3, layer_depth == 4));
+ assert(IMPLIES(depth_thr == 4, layer_depth == 5));
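+      // These reordered internal ARFs are one layer deeper than depth_thr: a
+      // depth_thr of 3 (gf-interval 16) gives layer_depth 4, and a depth_thr
+      // of 4 (gf-interval 32) gives layer_depth 5.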
+ // Set frame_parallel_level of the first frame in the given layer to 1.
+ if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) {
+ gf_group->frame_parallel_level[*frame_ind] = 1;
+ } else {
+        // Set frame_parallel_level of a subsequent frame in the same layer
+        // to 2.
+ assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1);
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+ // Store the display order hints of the past 2 INTNL_ARF_UPDATE
+        // frames which would not have been displayed at the time of encoding
+        // the current frame.
+ gf_group->skip_frame_refresh[*frame_ind][0] =
+ gf_group->display_idx[(*frame_ind) - 1];
+ gf_group->skip_frame_refresh[*frame_ind][1] =
+ gf_group->display_idx[(*frame_ind) - 2];
+        // Set the display_idx of the frame_parallel_level 1 frame in
+ // gf_group->skip_frame_as_ref.
+ gf_group->skip_frame_as_ref[*frame_ind] =
+ gf_group->display_idx[(*frame_ind) - 1];
+ }
+ }
+ // If max_parallel_frames is not exceeded and if the frame will not be
+ // temporally filtered, encode the next internal ARF frame in parallel.
+ if (*parallel_frame_count > 1 &&
+ *parallel_frame_count <= max_parallel_frames) {
+ if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR)
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+ *parallel_frame_count = 1;
+ }
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);
+}
+
+// Set parameters for frames between 'start' and 'end' (excluding both).
+static void set_multi_layer_params_for_fp(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+ RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+ int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+ int max_parallel_frames, int do_frame_parallel_encode,
+ int *first_frame_index, int depth_thr, int *cur_disp_idx, int layer_depth) {
+ const int num_frames_to_process = end - start;
+
+ // Either we are at the last level of the pyramid, or we don't have enough
+  // frames between 'start' and 'end' to create one more level.
+ if (layer_depth > gf_group->max_layer_depth_allowed ||
+ num_frames_to_process < 3) {
+ // Leaf nodes.
+ while (start < end) {
+ set_params_for_leaf_frames(twopass, twopass_frame, p_rc, frame_info,
+ gf_group, cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index,
+ cur_disp_idx, layer_depth, start, end);
+ ++start;
+ }
+ } else {
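+    // Pick the midpoint of (start, end); the frame at 'm' is coded as the
+    // internal ARF for this layer.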
+ const int m = (start + end - 1) / 2;
+
+ // Internal ARF.
+ int arf_src_offset = m - start;
+ set_params_for_internal_arfs(
+ twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+ frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, INT_MAX, cur_disp_idx,
+ layer_depth, arf_src_offset, m, end - m, m - start);
+
+ // If encode reordering is enabled, configure the multi-layers accordingly
+    // and return. For example, the encode order for gf-interval 16 after
+ // reordering would be 0-> 16-> 8-> 4-> 2-> 6-> 1-> 3-> 5-> 7-> 12-> 10->
+ // 14-> 9-> 11-> 13-> 15.
+ if (layer_depth >= depth_thr) {
+ int m1 = (m + start - 1) / 2;
+ int m2 = (m + 1 + end) / 2;
+ int arf_src_offsets[2] = { m1 - start, m2 - start };
+ // Parameters to compute arf_boost.
+ int offset[2] = { m1, m2 };
+ int f_frames[2] = { m - m1, end - m2 };
+ int b_frames[2] = { m1 - start, m2 - (m + 1) };
+
+ // Set GF_GROUP params for INTNL_ARF_UPDATE frames which are reordered.
+ for (int i = 0; i < 2; i++) {
+ set_params_for_internal_arfs(
+ twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+ frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, depth_thr,
+ cur_disp_idx, layer_depth + 1, arf_src_offsets[i], offset[i],
+ f_frames[i], b_frames[i]);
+ }
+
+ // Initialize the start and end indices to configure LF_UPDATE frames.
+ int start_idx[4] = { start, m1 + 1, m + 1, end - 1 };
+ int end_idx[4] = { m1, m, m2, end };
+ int layer_depth_for_intnl_overlay[4] = { layer_depth + 1, layer_depth,
+ layer_depth + 1, INVALID_IDX };
+
+ // Set GF_GROUP params for the rest of LF_UPDATE and INTNL_OVERLAY_UPDATE
+ // frames after reordering.
+ for (int i = 0; i < 4; i++) {
+ set_multi_layer_params_for_fp(
+ twopass, twopass_frame, gf_group, p_rc, rc, frame_info,
+ start_idx[i], end_idx[i], cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames, do_frame_parallel_encode,
+ first_frame_index, depth_thr, cur_disp_idx, layer_depth + 2);
+ if (layer_depth_for_intnl_overlay[i] != INVALID_IDX)
+ set_params_for_intnl_overlay_frames(
+ gf_group, cur_frame_idx, frame_ind, first_frame_index,
+ cur_disp_idx, layer_depth_for_intnl_overlay[i]);
+ }
+ return;
+ }
+
+ // Frames displayed before this internal ARF.
+ set_multi_layer_params_for_fp(
+ twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m,
+ cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+ layer_depth + 1);
+
+ // Overlay for internal ARF.
+ set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+ first_frame_index, cur_disp_idx,
+ layer_depth);
+
+ // Frames displayed after this internal ARF.
+ set_multi_layer_params_for_fp(
+ twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end,
+ cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+ layer_depth + 1);
+ }
+}
+
+// Structure for bookkeeping start, end and display indices to configure
+// INTNL_ARF_UPDATE frames.
+typedef struct {
+ int start;
+ int end;
+ int display_index;
+} FRAME_REORDER_INFO;
+
+// Updates the stats required to configure the GF_GROUP.
+static AOM_INLINE void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats,
+ int arf_frame_index,
+ int display_idx, int start,
+ int end) {
+ arf_frame_stats[arf_frame_index].start = start;
+ arf_frame_stats[arf_frame_index].end = end;
+ arf_frame_stats[arf_frame_index].display_index = display_idx;
+}
+
+// Sets GF_GROUP params for INTNL_ARF_UPDATE frames. Also populates
+// doh_gf_index_map and arf_frame_stats.
+static AOM_INLINE void set_params_for_internal_arfs_in_gf14(
+ GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+ int *cur_frame_idx, int *cur_disp_idx, int *frame_ind,
+ int *count_arf_frames, int *doh_gf_index_map, int start, int end,
+ int layer_depth, int layer_with_parallel_encodes) {
+ int index = (start + end - 1) / 2;
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = index - 1;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->display_idx[*frame_ind] =
+ (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+
+ // Update the display index of the current frame with its gf index.
+ doh_gf_index_map[index] = *frame_ind;
+ if (layer_with_parallel_encodes) {
+ assert(layer_depth == 4);
+ // Set frame_parallel_level of the first frame in the given layer depth
+ // to 1.
+ if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) {
+ gf_group->frame_parallel_level[*frame_ind] = 1;
+ } else {
+      // Set frame_parallel_level of a subsequent frame in the same layer
+      // depth to 2.
+ assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1);
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+      // Set the display_idx of the frame_parallel_level 1 frame in
+ // gf_group->skip_frame_as_ref.
+ gf_group->skip_frame_as_ref[*frame_ind] =
+ gf_group->display_idx[(*frame_ind) - 1];
+ }
+ }
+ ++(*frame_ind);
+
+ // Update arf_frame_stats.
+ fill_arf_frame_stats(arf_frame_stats, *count_arf_frames, index, start, end);
+ ++(*count_arf_frames);
+}
+
+// Sets GF_GROUP params for all INTNL_ARF_UPDATE frames in the given layer
+// depth.
+static AOM_INLINE void set_params_for_cur_layer_frames(
+ GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+ int *cur_frame_idx, int *cur_disp_idx, int *frame_ind,
+ int *count_arf_frames, int *doh_gf_index_map, int num_dir, int node_start,
+ int node_end, int layer_depth) {
+ assert(num_dir < 3);
+ int start, end;
+ // Iterate through the nodes in the previous layer depth.
+ for (int i = node_start; i < node_end; i++) {
+    // For each node, check if a frame can be coded as an INTNL_ARF_UPDATE
+    // frame in either direction.
+ for (int dir = 0; dir < num_dir; dir++) {
+ // Checks for a frame to the left of current node.
+ if (dir == 0) {
+ start = arf_frame_stats[i].start;
+ end = arf_frame_stats[i].display_index;
+ } else {
+        // Checks for a frame to the right of the current node.
+ start = arf_frame_stats[i].display_index + 1;
+ end = arf_frame_stats[i].end;
+ }
+ const int num_frames_to_process = end - start;
+      // Checks if a frame can be coded as an INTNL_ARF_UPDATE frame. If
+ // num_frames_to_process is less than 3, then there are not enough frames
+ // between 'start' and 'end' to create another level.
+ if (num_frames_to_process >= 3) {
+ // Flag to indicate the lower layer depths for which parallel encoding
+ // is enabled. Currently enabled for layer 4 frames.
+ int layer_with_parallel_encodes = layer_depth == 4;
+ set_params_for_internal_arfs_in_gf14(
+ gf_group, arf_frame_stats, cur_frame_idx, cur_disp_idx, frame_ind,
+ count_arf_frames, doh_gf_index_map, start, end, layer_depth,
+ layer_with_parallel_encodes);
+ }
+ }
+ }
+}
+
+// Configures multi-layers of the GF_GROUP when consecutive encoding of frames
+// in the same layer depth is enabled.
+static AOM_INLINE void set_multi_layer_params_for_gf14(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+ int *cur_frame_idx, int *frame_ind, int *count_arf_frames,
+ int *doh_gf_index_map, int *parallel_frame_count, int *first_frame_index,
+ int *cur_disp_index, int gf_interval, int layer_depth,
+ int max_parallel_frames) {
+ assert(layer_depth == 2);
+ assert(gf_group->max_layer_depth_allowed >= 4);
+ int layer, node_start, node_end = 0;
+  // Maximum layer depth excluding LF_UPDATE frames is 4, since this function
+  // applies only to gf-interval 14.
+ const int max_layer_depth = 4;
+  // Iterate through each layer depth, from 2 through 'max_layer_depth'.
+ for (layer = layer_depth; layer <= max_layer_depth; layer++) {
+    // 'node_start' and 'node_end' indicate the range of nodes from the
+    // previous layer depth to be considered. They also correspond to indices
+    // into arf_frame_stats.
+ node_start = node_end;
+ node_end = (*count_arf_frames);
+ // 'num_dir' indicates the number of directions to traverse w.r.t. a given
+ // node in order to choose an INTNL_ARF_UPDATE frame. Layer depth 2 would
+ // have only one frame and hence needs to traverse only in the left
+    // direction w.r.t. the node in the previous layer.
+ int num_dir = layer == 2 ? 1 : 2;
+ set_params_for_cur_layer_frames(gf_group, arf_frame_stats, cur_frame_idx,
+ cur_disp_index, frame_ind, count_arf_frames,
+ doh_gf_index_map, num_dir, node_start,
+ node_end, layer);
+ }
+
+ for (int i = 1; i < gf_interval; i++) {
+ // Since doh_gf_index_map is already populated for all INTNL_ARF_UPDATE
+ // frames in the GF_GROUP, any frame with INVALID_IDX would correspond to an
+ // LF_UPDATE frame.
+ if (doh_gf_index_map[i] == INVALID_IDX) {
+ // LF_UPDATE frames.
+ // TODO(Remya): Correct start and end parameters passed to
+ // set_params_for_leaf_frames() once encode reordering for gf-interval 14
+      // is enabled for parallel encoding of lower layer frames.
+ set_params_for_leaf_frames(
+ twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+ frame_ind, parallel_frame_count, max_parallel_frames, 1,
+ first_frame_index, cur_disp_index, layer, 0, 0);
+ } else {
+ // In order to obtain the layer depths of INTNL_OVERLAY_UPDATE frames, get
+      // the gf index of the corresponding INTNL_ARF_UPDATE frames.
+ int intnl_arf_index = doh_gf_index_map[i];
+ int ld = gf_group->layer_depth[intnl_arf_index];
+ set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+ first_frame_index, cur_disp_index,
+ ld);
+ }
+ }
+}
+
+// Set parameters for frames between 'start' and 'end' (excluding both).
+static void set_multi_layer_params(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+ RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+ int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+ int max_parallel_frames, int do_frame_parallel_encode,
+ int *first_frame_index, int *cur_disp_idx, int layer_depth) {
+ const int num_frames_to_process = end - start;
+
+ // Either we are at the last level of the pyramid, or we don't have enough
+  // frames between 'start' and 'end' to create one more level.
+ if (layer_depth > gf_group->max_layer_depth_allowed ||
+ num_frames_to_process < 3) {
+ // Leaf nodes.
+ while (start < end) {
+ gf_group->update_type[*frame_ind] = LF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->display_idx[*frame_ind] = *cur_disp_idx;
+ gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+ end - start, 0, NULL, NULL, 0);
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->max_layer_depth =
+ AOMMAX(gf_group->max_layer_depth, layer_depth);
+ // Set the level of parallelism for the LF_UPDATE frame.
+ if (do_frame_parallel_encode) {
+ set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind],
+ parallel_frame_count, max_parallel_frames);
+ // Set LF_UPDATE frames as non-reference frames.
+ gf_group->is_frame_non_ref[*frame_ind] = true;
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);
+ ++(*cur_frame_idx);
+ ++(*cur_disp_idx);
+ ++start;
+ }
+ } else {
+ const int m = (start + end - 1) / 2;
+
+ // Internal ARF.
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = m - start;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->display_idx[*frame_ind] =
+ *cur_disp_idx + gf_group->arf_src_offset[*frame_ind];
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+
+ if (do_frame_parallel_encode) {
+ // If max_parallel_frames is not exceeded and if the frame will not be
+ // temporally filtered, encode the next internal ARF frame in parallel.
+ if (*parallel_frame_count > 1 &&
+ *parallel_frame_count <= max_parallel_frames) {
+ if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR)
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+ *parallel_frame_count = 1;
+ }
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+
+ // Get the boost factor for intermediate ARF frames.
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, m, end - m,
+ m - start, NULL, NULL, 0);
+ ++(*frame_ind);
+
+ // Frames displayed before this internal ARF.
+ set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc,
+ frame_info, start, m, cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index,
+ cur_disp_idx, layer_depth + 1);
+
+ // Overlay for internal ARF.
+ gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->display_idx[*frame_ind] = *cur_disp_idx;
+ gf_group->arf_boost[*frame_ind] = 0;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);
+ ++(*cur_frame_idx);
+ ++(*cur_disp_idx);
+
+ // Frames displayed after this internal ARF.
+ set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc,
+ frame_info, m + 1, end, cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index,
+ cur_disp_idx, layer_depth + 1);
+ }
+}
+
+static int construct_multi_layer_gf_structure(
+ AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group,
+ RATE_CONTROL *rc, FRAME_INFO *const frame_info, int baseline_gf_interval,
+ FRAME_UPDATE_TYPE first_frame_update_type) {
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ // TODO(angiebird): Why do we need "-1" here?
+ const int gf_interval = baseline_gf_interval - 1;
+ int frame_index = 0;
+ int cur_frame_index = 0;
+
+ // Set the display order hint for the first frame in the GF_GROUP.
+ int cur_disp_index = (first_frame_update_type == KF_UPDATE)
+ ? 0
+ : cpi->common.current_frame.frame_number;
+
+ // Initialize gf_group->frame_parallel_level, gf_group->is_frame_non_ref,
+ // gf_group->src_offset and gf_group->is_frame_dropped with 0.
+ memset(gf_group->frame_parallel_level, 0,
+ sizeof(gf_group->frame_parallel_level));
+ memset(gf_group->is_frame_non_ref, 0, sizeof(gf_group->is_frame_non_ref));
+ memset(gf_group->src_offset, 0, sizeof(gf_group->src_offset));
+ memset(gf_group->is_frame_dropped, 0, sizeof(gf_group->is_frame_dropped));
+ // Initialize gf_group->skip_frame_refresh and gf_group->skip_frame_as_ref
+ // with INVALID_IDX.
+ memset(gf_group->skip_frame_refresh, INVALID_IDX,
+ sizeof(gf_group->skip_frame_refresh));
+ memset(gf_group->skip_frame_as_ref, INVALID_IDX,
+ sizeof(gf_group->skip_frame_as_ref));
+
+ int kf_decomp = cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1;
+  // This is a patch that fixes https://crbug.com/aomedia/3163.
+  // enable_keyframe_filtering > 1 introduces an extra overlay frame at the
+  // key frame location. However, when
+  // baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH, we can't afford the
+  // extra overlay frame; otherwise gf_group->size would become
+  // MAX_STATIC_GF_GROUP_LENGTH + 1, which causes a memory error.
+  // A cheap solution is to turn off kf_decomp here.
+ // TODO(angiebird): Find a systematic way to solve this issue.
+ if (baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH) {
+ kf_decomp = 0;
+ }
+ if (first_frame_update_type == KF_UPDATE) {
+ gf_group->update_type[frame_index] = kf_decomp ? ARF_UPDATE : KF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 0;
+ gf_group->frame_type[frame_index] = KEY_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_RESET;
+ gf_group->max_layer_depth = 0;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ if (!kf_decomp) cur_disp_index++;
+ ++frame_index;
+
+ if (kf_decomp) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 0;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = 0;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ cur_disp_index++;
+ ++frame_index;
+ }
+ cur_frame_index++;
+ }
+
+ if (first_frame_update_type == GF_UPDATE) {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 0;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = 0;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ cur_disp_index++;
+ ++frame_index;
+ ++cur_frame_index;
+ }
+
+ // ALTREF.
+ const int use_altref = gf_group->max_layer_depth_allowed > 0;
+ int is_fwd_kf = rc->frames_to_fwd_kf == gf_interval;
+
+ if (use_altref) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 1;
+ gf_group->arf_boost[frame_index] = cpi->ppi->p_rc.gfu_boost;
+ gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = 1;
+ gf_group->arf_index = frame_index;
+ gf_group->display_idx[frame_index] =
+ cur_disp_index + gf_group->arf_src_offset[frame_index];
+ ++frame_index;
+ } else {
+ gf_group->arf_index = -1;
+ }
+
+ // Flag to indicate if multi-layer configuration is complete.
+ int is_multi_layer_configured = 0;
+
+  // Running count of frames that are part of a given parallel encode set in a
+  // gf_group. A value of 1 indicates no parallel encode.
+ int parallel_frame_count = 1;
+ // Enable parallel encode of frames if gf_group has a multi-layer pyramid
+ // structure with minimum 4 layers.
+ int do_frame_parallel_encode = (cpi->ppi->num_fp_contexts > 1 && use_altref &&
+ gf_group->max_layer_depth_allowed >= 4);
+
+ int first_frame_index = cur_frame_index;
+ if (do_frame_parallel_encode) {
+ // construct_multi_layer_gf_structure() takes the input parameter
+    // 'gf_interval' as p_rc->baseline_gf_interval - 1. The code below computes the
+ // actual GF_GROUP length by compensating for this offset.
+ int actual_gf_length = ((first_frame_update_type == KF_UPDATE) ||
+ (first_frame_update_type == GF_UPDATE))
+ ? gf_interval
+ : gf_interval + 1;
+
+ // In order to facilitate parallel encoding of frames in lower layer depths,
+ // encode reordering is done. Currently encode reordering is enabled only
+ // for gf-intervals 16 and 32. NOTE: Since the buffer holding the
+ // reference frames is of size 8 (ref_frame_map[REF_FRAMES]), there is a
+ // limitation on the number of hidden frames possible at any given point and
+ // hence the reordering is enabled only for gf-intervals 16 and 32.
+ // Disabling encode reordering for gf-interval 14 since some cross-frame
+    // dependencies related to temporal filtering for FPMT are currently not
+ // handled.
+ int disable_gf14_reorder = 1;
+ if (actual_gf_length == 14 && !disable_gf14_reorder) {
+ // This array holds the gf index of INTNL_ARF_UPDATE frames in the slot
+ // corresponding to their display order hint. This is used while
+ // configuring the LF_UPDATE frames and INTNL_OVERLAY_UPDATE frames.
+ int doh_gf_index_map[FIXED_GF_INTERVAL];
+ // Initialize doh_gf_index_map with INVALID_IDX.
+ memset(&doh_gf_index_map[0], INVALID_IDX,
+ (sizeof(doh_gf_index_map[0]) * FIXED_GF_INTERVAL));
+
+ FRAME_REORDER_INFO arf_frame_stats[REF_FRAMES - 1];
+ // Store the stats corresponding to layer 1 frame.
+ fill_arf_frame_stats(arf_frame_stats, 0, actual_gf_length, 1,
+ actual_gf_length);
+ int count_arf_frames = 1;
+
+ // Sets multi-layer params for gf-interval 14 to consecutively encode
+ // frames in the same layer depth, i.e., encode order would be 0-> 14->
+ // 7-> 3-> 10-> 5-> 12-> 1-> 2-> 4-> 6-> 8-> 9-> 11-> 13.
+ // TODO(Remya): Set GF_GROUP param 'arf_boost' for all frames.
+ set_multi_layer_params_for_gf14(
+ twopass, &cpi->twopass_frame, p_rc, frame_info, gf_group,
+ arf_frame_stats, &cur_frame_index, &frame_index, &count_arf_frames,
+ doh_gf_index_map, &parallel_frame_count, &first_frame_index,
+ &cur_disp_index, actual_gf_length, use_altref + 1,
+ cpi->ppi->num_fp_contexts);
+
+ // Set gf_group->skip_frame_refresh.
+ for (int i = 0; i < actual_gf_length; i++) {
+ int count = 0;
+ if (gf_group->update_type[i] == INTNL_ARF_UPDATE) {
+ for (int j = 0; j < i; j++) {
+ // Store the display order hint of the frames which would not
+ // have been displayed at the encode call of frame 'i'.
+ if ((gf_group->display_idx[j] < gf_group->display_idx[i]) &&
+ gf_group->update_type[j] == INTNL_ARF_UPDATE) {
+ gf_group->skip_frame_refresh[i][count++] =
+ gf_group->display_idx[j];
+ }
+ }
+ }
+ }
+ } else {
+ // Set layer depth threshold for reordering as per the gf length.
+ int depth_thr = (actual_gf_length == 16) ? 3
+ : (actual_gf_length == 32) ? 4
+ : INT_MAX;
+
+ set_multi_layer_params_for_fp(
+ twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info,
+ cur_frame_index, gf_interval, &cur_frame_index, &frame_index,
+ &parallel_frame_count, cpi->ppi->num_fp_contexts,
+ do_frame_parallel_encode, &first_frame_index, depth_thr,
+ &cur_disp_index, use_altref + 1);
+ }
+ is_multi_layer_configured = 1;
+ }
+
+ // Rest of the frames.
+ if (!is_multi_layer_configured)
+ set_multi_layer_params(twopass, &cpi->twopass_frame, gf_group, p_rc, rc,
+ frame_info, cur_frame_index, gf_interval,
+ &cur_frame_index, &frame_index,
+ &parallel_frame_count, cpi->ppi->num_fp_contexts,
+ do_frame_parallel_encode, &first_frame_index,
+ &cur_disp_index, use_altref + 1);
+
+ if (use_altref) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] =
+ is_fwd_kf ? REFBUF_RESET : REFBUF_UPDATE;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ ++frame_index;
+ } else {
+ for (; cur_frame_index <= gf_interval; ++cur_frame_index) {
+ gf_group->update_type[frame_index] = LF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+ set_src_offset(gf_group, &first_frame_index, cur_frame_index,
+ frame_index);
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ cur_disp_index++;
+ ++frame_index;
+ }
+ }
+ if (do_frame_parallel_encode) {
+ // Iterate through the gf_group and reset frame_parallel_level to 0 in case
+ // a frame is marked as frame_parallel_level 1 with no subsequent
+ // frame_parallel_level 2 frame(s).
+ int level1_frame_idx = INT_MAX;
+ int level2_frame_count = 0;
+ for (int frame_idx = 0; frame_idx < frame_index; frame_idx++) {
+ if (gf_group->frame_parallel_level[frame_idx] == 1) {
+ // Set frame_parallel_level to 0 if only one frame is present in a
+ // parallel encode set.
+ if (level1_frame_idx != INT_MAX && !level2_frame_count)
+ gf_group->frame_parallel_level[level1_frame_idx] = 0;
+ // Book-keep frame_idx of frame_parallel_level 1 frame and reset the
+ // count of frame_parallel_level 2 frames in the corresponding parallel
+ // encode set.
+ level1_frame_idx = frame_idx;
+ level2_frame_count = 0;
+ }
+ if (gf_group->frame_parallel_level[frame_idx] == 2) level2_frame_count++;
+ }
+ // If frame_parallel_level is set to 1 for the last LF_UPDATE
+ // frame in the gf_group, reset it to zero since there are no subsequent
+ // frames in the gf_group.
+ if (gf_group->frame_parallel_level[frame_index - 2] == 1) {
+ assert(gf_group->update_type[frame_index - 2] == LF_UPDATE);
+ gf_group->frame_parallel_level[frame_index - 2] = 0;
+ }
+ }
+
+ for (int gf_idx = frame_index; gf_idx < MAX_STATIC_GF_GROUP_LENGTH;
+ ++gf_idx) {
+ gf_group->update_type[gf_idx] = LF_UPDATE;
+ gf_group->arf_src_offset[gf_idx] = 0;
+ gf_group->cur_frame_idx[gf_idx] = gf_idx;
+ gf_group->layer_depth[gf_idx] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[gf_idx] = NORMAL_BOOST;
+ gf_group->frame_type[gf_idx] = INTER_FRAME;
+ gf_group->refbuf_state[gf_idx] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+ }
+
+ return frame_index;
+}
+
+static void set_ld_layer_depth(GF_GROUP *gf_group, int gop_length) {
+ int log_gop_length = 0;
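+  // Compute ceil(log2(gop_length)), i.e. the number of pyramid levels implied
+  // by the GOP length.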
+ while ((1 << log_gop_length) < gop_length) {
+ ++log_gop_length;
+ }
+
+ for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+ int count = 0;
+ // Find the trailing zeros
+ for (; count < MAX_ARF_LAYERS; ++count) {
+ if ((gf_index >> count) & 0x01) break;
+ }
+ gf_group->layer_depth[gf_index] = AOMMAX(log_gop_length - count, 0);
+ }
+ gf_group->max_layer_depth = AOMMIN(log_gop_length, MAX_ARF_LAYERS);
+}
+
+void av1_gop_setup_structure(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ const int key_frame = rc->frames_since_key == 0;
+ FRAME_UPDATE_TYPE first_frame_update_type = ARF_UPDATE;
+
+ if (key_frame) {
+ first_frame_update_type = KF_UPDATE;
+ if (cpi->oxcf.kf_max_pyr_height != -1) {
+ gf_group->max_layer_depth_allowed = AOMMIN(
+ cpi->oxcf.kf_max_pyr_height, gf_group->max_layer_depth_allowed);
+ }
+ } else if (!cpi->ppi->gf_state.arf_gf_boost_lst) {
+ first_frame_update_type = GF_UPDATE;
+ }
+
+ gf_group->size = construct_multi_layer_gf_structure(
+ cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval,
+ first_frame_update_type);
+
+ if (gf_group->max_layer_depth_allowed == 0)
+ set_ld_layer_depth(gf_group, p_rc->baseline_gf_interval);
+}
+
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+ int gf_frame_index) {
+ return gf_group->frame_type[gf_frame_index] == KEY_FRAME &&
+ gf_group->refbuf_state[gf_frame_index] == REFBUF_UPDATE;
+}
+
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index) {
+ const int arf_src_offset = gf_group->arf_src_offset[gf_frame_index];
+  // TODO(angiebird): when gf_group->size == 32, it's possible to
+  // have "two" second arfs. Check if this is acceptable.
+ if (gf_group->update_type[gf_frame_index] == INTNL_ARF_UPDATE &&
+ arf_src_offset >= TF_LOOKAHEAD_IDX_THR) {
+ return 1;
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/gop_structure.h b/third_party/aom/av1/encoder/gop_structure.h
new file mode 100644
index 0000000000..ff22f54136
--- /dev/null
+++ b/third_party/aom/av1/encoder/gop_structure.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+#define AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+#define MIN_ARF_GF_BOOST 240
+#define NORMAL_BOOST 100
+
+/*!\endcond */
+
+/*!\brief Set up the Group-Of-Pictures structure for this GF_GROUP.
+ *
+ *\ingroup rate_control
+ *
+ * This function defines the Group-Of-Pictures structure for this GF_GROUP.
+ * This involves deciding where to place the various FRAME_UPDATE_TYPEs in
+ * the group. It does this primarily by updating entries in
+ * cpi->twopass.gf_group.update_type[].
+ *
+ * \param[in]    cpi          Top-level encoder instance structure
+ *
+ * \remark No return value but this function updates group data structures.
+ */
+void av1_gop_setup_structure(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in]    cpi          Top-level encoder instance structure
+ * \param[in] rc Rate control data
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] is_key_frame Indicates if the first frame in the group is
+ * also a key frame.
+ * \param[in]    use_arf       Are ARF frames enabled or is this a GF-only
+ * uni-directional group.
+ * \param[in] gf_group_bits Bits available to be allocated.
+ *
+ * \remark No return but updates the rate control and group data structures
+ * to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+ GF_GROUP *gf_group, int is_key_frame, int use_arf,
+ int64_t gf_group_bits);
+
+/*!\brief Check whether a frame in the GOP is a forward key frame
+ *
+ *\ingroup rate_control
+ *
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] gf_frame_index GOP index
+ *
+ * \return Return 1 if it is a forward key frame, otherwise return 0
+ */
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+ int gf_frame_index);
+
+/*!\brief Check whether a frame in the GOP is the second arf
+ *
+ *\ingroup rate_control
+ *
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] gf_frame_index GOP index
+ *
+ * \return Return 1 if it is the second arf, otherwise return 0
+ */
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_GOP_STRUCTURE_H_
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
new file mode 100644
index 0000000000..945dc37331
--- /dev/null
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+
+/* Test vectors for emulation of different film grain types.
+ * Note that bit depth would be derived from the bitstream and
+ * not signaled in film grain metadata. The parameters are valid
+ * for any bit depth.
+ */
+static aom_film_grain_t film_grain_test_vectors[16] = {
+ /* Test 1 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 16, 0 },
+ { 25, 136 },
+ { 33, 144 },
+ { 41, 160 },
+ { 48, 168 },
+ { 56, 136 },
+ { 67, 128 },
+ { 82, 144 },
+ { 97, 152 },
+ { 113, 144 },
+ { 128, 176 },
+ { 143, 168 },
+ { 158, 176 },
+ { 178, 184 } },
+ 14 /* num_points_y */,
+ { { 16, 0 },
+ { 20, 64 },
+ { 28, 88 },
+ { 60, 104 },
+ { 90, 136 },
+ { 105, 160 },
+ { 134, 168 },
+ { 168, 208 } },
+ 8 /* num_cb_points */,
+ { { 16, 0 },
+ { 28, 96 },
+ { 56, 80 },
+ { 66, 96 },
+ { 80, 104 },
+ { 108, 96 },
+ { 122, 112 },
+ { 137, 112 },
+ { 169, 176 } },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+ { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+ { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+ 8 /* ar_coeff_shift */,
+ 247 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 18 /* cb_offset */,
+ 229 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 54 /* cr_offset */,
+ 0 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /* chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 2 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 2 /* num_points_y */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_cb_points */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 3 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 192 }, { 255, 192 } },
+ 2 /* num_points_y */,
+ { { 0, 128 }, { 255, 128 } },
+ 2 /* num_cb_points */,
+ { { 0, 128 }, { 255, 128 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19,
+ -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0,
+ },
+ {
+ 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19,
+ -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 1 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 4 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 24, 137 },
+ { 53, 146 },
+ { 63, 155 },
+ { 78, 155 },
+ { 107, 150 },
+ { 122, 147 },
+ { 136, 147 },
+ { 166, 153 },
+ },
+ 9 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 20, 72 },
+ { 27, 82 },
+ { 33, 91 },
+ { 69, 121 },
+ { 95, 143 },
+ { 108, 154 },
+ { 134, 169 },
+ { 147, 177 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 24, 95 },
+ { 54, 93 },
+ { 65, 94 },
+ { 79, 98 },
+ { 109, 107 },
+ { 124, 119 },
+ { 139, 136 },
+ { 169, 170 },
+ },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42,
+ 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113,
+ },
+ {
+ -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5,
+ -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0,
+ },
+ {
+ 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2,
+ -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0,
+ },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 5 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_points_y */,
+ {
+ { 0, 96 },
+ { 32, 90 },
+ { 64, 83 },
+ { 96, 76 },
+ { 128, 68 },
+ { 159, 59 },
+ { 191, 48 },
+ { 223, 34 },
+ { 255, 0 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 0, 0 },
+ { 32, 34 },
+ { 64, 48 },
+ { 96, 59 },
+ { 128, 68 },
+ { 159, 76 },
+ { 191, 83 },
+ { 223, 90 },
+ { 255, 96 },
+ },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2,
+ -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0,
+ },
+ {
+ -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2,
+ 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 1063 /* random_seed */
+ },
+ /* Test 6 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 96 },
+ { 20, 92 },
+ { 39, 88 },
+ { 59, 84 },
+ { 78, 80 },
+ { 98, 75 },
+ { 118, 70 },
+ { 137, 65 },
+ { 157, 60 },
+ { 177, 53 },
+ { 196, 46 },
+ { 216, 38 },
+ { 235, 27 },
+ { 255, 0 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 } },
+ 0 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 2754 /* random_seed */
+ },
+ /* Test 7 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 0 },
+ { 20, 27 },
+ { 39, 38 },
+ { 59, 46 },
+ { 78, 53 },
+ { 98, 60 },
+ { 118, 65 },
+ { 137, 70 },
+ { 157, 75 },
+ { 177, 80 },
+ { 196, 84 },
+ { 216, 88 },
+ { 235, 92 },
+ { 255, 96 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 2 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 8 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 2 /* num_points_y */,
+ { { 0, 62 }, { 255, 62 } },
+ 2 /* num_cb_points */,
+ { { 0, 62 }, { 255, 62 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6,
+ -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69,
+ },
+ {
+ 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8,
+ -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 9 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 48 }, { 255, 48 } },
+ 2 /* num_points_y */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cb_points */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 10 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 48 }, { 255, 48 } },
+ 2 /* num_points_y */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cb_points */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+ { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 11 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_points_y */,
+ {
+ { 0, 48 },
+ { 32, 45 },
+ { 64, 42 },
+ { 96, 38 },
+ { 128, 34 },
+ { 159, 29 },
+ { 191, 24 },
+ { 223, 17 },
+ { 255, 0 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 0, 0 },
+ { 32, 17 },
+ { 64, 24 },
+ { 96, 29 },
+ { 128, 34 },
+ { 159, 38 },
+ { 191, 42 },
+ { 223, 45 },
+ { 255, 48 },
+ },
+ 9 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42,
+ 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113,
+ },
+ {
+ -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5,
+ -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0,
+ },
+ {
+ 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2,
+ -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0,
+ },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 1357 /* random_seed */
+ },
+ /* Test 12 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 24, 49 },
+ { 39, 69 },
+ { 46, 84 },
+ { 53, 91 },
+ { 63, 100 },
+ { 78, 114 },
+ { 92, 134 },
+ { 164, 139 },
+ },
+ 9 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 20, 31 },
+ { 26, 42 },
+ { 33, 54 },
+ { 40, 65 },
+ { 47, 72 },
+ { 56, 85 },
+ { 84, 123 },
+ { 152, 157 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 25, 14 },
+ { 39, 33 },
+ { 47, 40 },
+ { 54, 47 },
+ { 64, 62 },
+ { 79, 76 },
+ { 94, 83 },
+ { 167, 101 },
+ },
+ 9 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+ { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+ { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 0 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 13 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 48 },
+ { 20, 46 },
+ { 39, 44 },
+ { 59, 42 },
+ { 78, 40 },
+ { 98, 38 },
+ { 118, 35 },
+ { 137, 33 },
+ { 157, 30 },
+ { 177, 27 },
+ { 196, 23 },
+ { 216, 19 },
+ { 235, 13 },
+ { 255, 0 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 14 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 0 },
+ { 20, 13 },
+ { 39, 19 },
+ { 59, 23 },
+ { 78, 27 },
+ { 98, 30 },
+ { 118, 33 },
+ { 137, 35 },
+ { 157, 38 },
+ { 177, 40 },
+ { 196, 42 },
+ { 216, 44 },
+ { 235, 46 },
+ { 255, 48 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 15 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 1 /* num_points_y */,
+ { { 0, 96 }, { 255, 96 } },
+ 0 /* num_cb_points */,
+ { { 0, 96 }, { 255, 96 } },
+ 0 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 },
+ { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 },
+ { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 1 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 16 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 58, 126 },
+ { 87, 120 },
+ { 97, 122 },
+ { 112, 125 },
+ { 126, 131 },
+ { 141, 139 },
+ { 199, 153 },
+ },
+ 8 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 59, 68 },
+ { 66, 76 },
+ { 73, 82 },
+ { 79, 85 },
+ { 86, 86 },
+ { 151, 95 },
+ { 192, 101 },
+ },
+ 8 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 59, 64 },
+ { 89, 80 },
+ { 99, 86 },
+ { 114, 90 },
+ { 129, 93 },
+ { 144, 97 },
+ { 203, 85 },
+ },
+ 8 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6,
+ -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69,
+ },
+ {
+ 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8,
+ -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 2 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+};
+#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c
new file mode 100644
index 0000000000..8037b59bef
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/hash.h"
+#include "config/av1_rtcd.h"
+
+static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator,
+ uint8_t *pData, uint32_t dataLength) {
+ for (uint32_t i = 0; i < dataLength; i++) {
+ const uint8_t index = (uint8_t)(
+ (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^
+ pData[i]);
+ p_crc_calculator->remainder <<= 8;
+ p_crc_calculator->remainder ^= p_crc_calculator->table[index];
+ }
+}
+
+static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+ p_crc_calculator->remainder = 0;
+}
+
+static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) {
+ return p_crc_calculator->remainder & p_crc_calculator->final_result_mask;
+}
+
+static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) {
+ const uint32_t high_bit = 1 << (p_crc_calculator->bits - 1);
+ const uint32_t byte_high_bit = 1 << (8 - 1);
+
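+  // For every possible input byte, run the CRC long division bit by bit: feed
+  // the byte into the top of the remainder, then shift and conditionally XOR
+  // in the truncated polynomial.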
+ for (uint32_t value = 0; value < 256; value++) {
+ uint32_t remainder = 0;
+ for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) {
+ if (value & mask) {
+ remainder ^= high_bit;
+ }
+
+ if (remainder & high_bit) {
+ remainder <<= 1;
+ remainder ^= p_crc_calculator->trunc_poly;
+ } else {
+ remainder <<= 1;
+ }
+ }
+ p_crc_calculator->table[value] = remainder;
+ }
+}
+
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+ uint32_t truncPoly) {
+ p_crc_calculator->remainder = 0;
+ p_crc_calculator->bits = bits;
+ p_crc_calculator->trunc_poly = truncPoly;
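+  // The mask keeps only the low 'bits' bits of the remainder when the CRC is
+  // read out.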
+ p_crc_calculator->final_result_mask = (1 << bits) - 1;
+ crc_calculator_init_table(p_crc_calculator);
+}
+
+uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
+ int length) {
+ crc_calculator_reset(p_crc_calculator);
+ crc_calculator_process_data(p_crc_calculator, p, length);
+ return crc_calculator_get_crc(p_crc_calculator);
+}
+
+/* CRC-32C (iSCSI) polynomial in reversed bit order. */
+#define POLY 0x82f63b78
+
+/* Construct table for software CRC-32C calculation. */
+void av1_crc32c_calculator_init(CRC32C *p_crc32c) {
+ uint32_t crc;
+
+ for (int n = 0; n < 256; n++) {
+ crc = n;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ p_crc32c->table[0][n] = crc;
+ }
+ for (int n = 0; n < 256; n++) {
+ crc = p_crc32c->table[0][n];
+ for (int k = 1; k < 8; k++) {
+ crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8);
+ p_crc32c->table[k][n] = crc;
+ }
+ }
+}
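+
+/* Note: this is the standard "slicing" table construction: table[0][n] is
+   the CRC of the single byte n, and table[k][n] extends that CRC with k zero
+   bytes. This is what lets av1_get_crc32c_value_c() below fold eight input
+   bytes per loop iteration. */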
+
+/* Table-driven software version as a fall-back. This is about 15 times slower
+ than using the hardware instructions. This assumes little-endian integers,
+ as is the case on Intel processors that the assembler code here is for. */
+uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len) {
+ const uint8_t *next = (const uint8_t *)(buf);
+ uint64_t crc;
+ CRC32C *p = (CRC32C *)c;
+ crc = 0 ^ 0xffffffff;
+ while (len && ((uintptr_t)next & 7) != 0) {
+ crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ len--;
+ }
+ while (len >= 8) {
+ crc ^= *(uint64_t *)next;
+ crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^
+ p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^
+ p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^
+ p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56];
+ next += 8;
+ len -= 8;
+ }
+ while (len) {
+ crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ len--;
+ }
+ return (uint32_t)crc ^ 0xffffffff;
+}
diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h
new file mode 100644
index 0000000000..d8e8cc3a0b
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_H_
+#define AOM_AV1_ENCODER_HASH_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _crc_calculator {
+ uint32_t remainder;
+ uint32_t trunc_poly;
+ uint32_t bits;
+ uint32_t table[256];
+ uint32_t final_result_mask;
+} CRC_CALCULATOR;
+
+// Initialize the crc calculator. It must be executed at least once before
+// calling av1_get_crc_value().
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+ uint32_t truncPoly);
+uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
+ int length);
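+
+// Illustrative usage sketch (the 24-bit polynomial shown matches the one
+// used for intrabc block hashing in hash_motion.c):
+//
+//   CRC_CALCULATOR calc;
+//   av1_crc_calculator_init(&calc, 24, 0x5D6DCB);
+//   uint8_t data[4] = { 0, 1, 2, 3 };
+//   uint32_t crc = av1_get_crc_value(&calc, data, sizeof(data));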
+
+// CRC32C: POLY = 0x82f63b78;
+typedef struct _CRC32C {
+ /* Table for a quadword-at-a-time software crc. */
+ uint32_t table[8][256];
+} CRC32C;
+
+// Initialize the table for the software version of CRC-32C.
+void av1_crc32c_calculator_init(CRC32C *p_crc32c);
+
+#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HASH_H_
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
new file mode 100644
index 0000000000..8b04e22d6c
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/hash_motion.h"
+
+#define kSrcBits 16
+#define kBlockSizeBits 3
+#define kMaxAddr (1 << (kSrcBits + kBlockSizeBits))
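+
+// Each hash_value1 packs kSrcBits of CRC plus kBlockSizeBits of block-size
+// index, so the lookup table needs 1 << (16 + 3) = 524288 bucket slots.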
+
+// TODO(youzhou@microsoft.com): Is screen content with bit depth higher than
+// 8 bits supported? If so, fix this function.
+static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src,
+ int stride,
+ uint8_t *p_pixels_in1D) {
+ const uint8_t *p_pel = y_src;
+ int index = 0;
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++) {
+ p_pixels_in1D[index++] = p_pel[j];
+ }
+ p_pel += stride;
+ }
+}
+
+static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src,
+ int stride,
+ uint16_t *p_pixels_in1D) {
+ const uint16_t *p_pel = y_src;
+ int index = 0;
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++) {
+ p_pixels_in1D[index++] = p_pel[j];
+ }
+ p_pel += stride;
+ }
+}
+
+static int is_block_2x2_row_same_value(const uint8_t *p) {
+ if (p[0] != p[1] || p[2] != p[3]) {
+ return 0;
+ }
+ return 1;
+}
+
+static int is_block16_2x2_row_same_value(const uint16_t *p) {
+ if (p[0] != p[1] || p[2] != p[3]) {
+ return 0;
+ }
+ return 1;
+}
+
+static int is_block_2x2_col_same_value(const uint8_t *p) {
+ if ((p[0] != p[2]) || (p[1] != p[3])) {
+ return 0;
+ }
+ return 1;
+}
+
+static int is_block16_2x2_col_same_value(const uint16_t *p) {
+ if ((p[0] != p[2]) || (p[1] != p[3])) {
+ return 0;
+ }
+ return 1;
+}
+
+// The hash value hash_value1 consists of two parts: the upper 3 bits encode
+// the block size and the lower 16 bits hold the CRC value. This function
+// returns the 3-bit block-size index.
+static int hash_block_size_to_index(int block_size) {
+ switch (block_size) {
+ case 4: return 0;
+ case 8: return 1;
+ case 16: return 2;
+ case 32: return 3;
+ case 64: return 4;
+ case 128: return 5;
+ default: return -1;
+ }
+}
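+
+// For example, a 64x64 block maps to index 4, so its hash_value1 becomes
+// (4 << kSrcBits) + (crc & 0xffff); see how add_value is formed in
+// av1_add_to_hash_map_by_row_with_precal_data() and
+// av1_get_block_hash_value().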
+
+void av1_hash_table_init(IntraBCHashInfo *intrabc_hash_info) {
+ if (!intrabc_hash_info->g_crc_initialized) {
+ av1_crc_calculator_init(&intrabc_hash_info->crc_calculator1, 24, 0x5D6DCB);
+ av1_crc_calculator_init(&intrabc_hash_info->crc_calculator2, 24, 0x864CFB);
+ intrabc_hash_info->g_crc_initialized = 1;
+ }
+ intrabc_hash_info->intrabc_hash_table.p_lookup_table = NULL;
+}
+
+void av1_hash_table_clear_all(hash_table *p_hash_table) {
+ if (p_hash_table->p_lookup_table == NULL) {
+ return;
+ }
+ for (int i = 0; i < kMaxAddr; i++) {
+ if (p_hash_table->p_lookup_table[i] != NULL) {
+ aom_vector_destroy(p_hash_table->p_lookup_table[i]);
+ aom_free(p_hash_table->p_lookup_table[i]);
+ p_hash_table->p_lookup_table[i] = NULL;
+ }
+ }
+}
+
+void av1_hash_table_destroy(hash_table *p_hash_table) {
+ av1_hash_table_clear_all(p_hash_table);
+ aom_free(p_hash_table->p_lookup_table);
+ p_hash_table->p_lookup_table = NULL;
+}
+
+bool av1_hash_table_create(hash_table *p_hash_table) {
+ if (p_hash_table->p_lookup_table != NULL) {
+ av1_hash_table_clear_all(p_hash_table);
+ return true;
+ }
+ p_hash_table->p_lookup_table =
+ (Vector **)aom_calloc(kMaxAddr, sizeof(p_hash_table->p_lookup_table[0]));
+ if (!p_hash_table->p_lookup_table) return false;
+ return true;
+}
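+
+// Typical lifecycle, as an illustrative sketch (in the encoder this is
+// driven by the intrabc search setup):
+//
+//   hash_table table = { NULL };
+//   if (!av1_hash_table_create(&table)) return;  // allocate buckets
+//   // ... av1_add_to_hash_map_by_row_with_precal_data(&table, ...) ...
+//   av1_hash_table_destroy(&table);  // frees buckets and the lookup table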
+
+static bool hash_table_add_to_table(hash_table *p_hash_table,
+ uint32_t hash_value,
+ block_hash *curr_block_hash) {
+ if (p_hash_table->p_lookup_table[hash_value] == NULL) {
+ p_hash_table->p_lookup_table[hash_value] =
+ aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0]));
+ if (p_hash_table->p_lookup_table[hash_value] == NULL) {
+ return false;
+ }
+ if (aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
+ sizeof(curr_block_hash[0])) == VECTOR_ERROR)
+ return false;
+ if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+ curr_block_hash) == VECTOR_ERROR)
+ return false;
+ } else {
+ if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+ curr_block_hash) == VECTOR_ERROR)
+ return false;
+ }
+ return true;
+}
+
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+ uint32_t hash_value) {
+ if (p_hash_table->p_lookup_table[hash_value] == NULL) {
+ return 0;
+ } else {
+ return (int32_t)(p_hash_table->p_lookup_table[hash_value]->size);
+ }
+}
+
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+ uint32_t hash_value) {
+ assert(av1_hash_table_count(p_hash_table, hash_value) > 0);
+ return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]);
+}
+
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+ uint32_t hash_value2) {
+ if (p_hash_table->p_lookup_table[hash_value1] == NULL) {
+ return 0;
+ }
+ Iterator iterator =
+ aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]);
+ Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]);
+ for (; !aom_iterator_equals(&iterator, &last);
+ aom_iterator_increment(&iterator)) {
+ if ((*(block_hash *)aom_iterator_get(&iterator)).hash_value2 ==
+ hash_value2) {
+ return 1;
+ }
+ }
+ return 0;
+}
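+
+// Illustrative lookup sketch: iterate all candidate positions that share
+// hash_value1 and filter by hash_value2 (this mirrors the loop above):
+//
+//   if (av1_hash_table_count(table, hash_value1) > 0) {
+//     Iterator it = av1_hash_get_first_iterator(table, hash_value1);
+//     Iterator end = aom_vector_end(table->p_lookup_table[hash_value1]);
+//     for (; !aom_iterator_equals(&it, &end); aom_iterator_increment(&it)) {
+//       const block_hash *bh = (const block_hash *)aom_iterator_get(&it);
+//       if (bh->hash_value2 == hash_value2) {
+//         // Candidate match at (bh->x, bh->y).
+//       }
+//     }
+//   }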
+
+void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intrabc_hash_info,
+ const YV12_BUFFER_CONFIG *picture,
+ uint32_t *pic_block_hash[2],
+ int8_t *pic_block_same_info[3]) {
+ const int width = 2;
+ const int height = 2;
+ const int x_end = picture->y_crop_width - width + 1;
+ const int y_end = picture->y_crop_height - height + 1;
+ CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+ CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
+
+ const int length = width * 2;
+ if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t p[4];
+ int pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ get_pixels_in_1D_short_array_by_block_2x2(
+ CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride +
+ x_pos,
+ picture->y_stride, p);
+ pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p);
+ pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p);
+
+ pic_block_hash[0][pos] =
+ av1_get_crc_value(calc_1, (uint8_t *)p, length * sizeof(p[0]));
+ pic_block_hash[1][pos] =
+ av1_get_crc_value(calc_2, (uint8_t *)p, length * sizeof(p[0]));
+ pos++;
+ }
+ pos += width - 1;
+ }
+ } else {
+ uint8_t p[4];
+ int pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ get_pixels_in_1D_char_array_by_block_2x2(
+ picture->y_buffer + y_pos * picture->y_stride + x_pos,
+ picture->y_stride, p);
+ pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p);
+ pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
+
+ pic_block_hash[0][pos] =
+ av1_get_crc_value(calc_1, p, length * sizeof(p[0]));
+ pic_block_hash[1][pos] =
+ av1_get_crc_value(calc_2, p, length * sizeof(p[0]));
+ pos++;
+ }
+ pos += width - 1;
+ }
+ }
+}
+
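+// Builds hashes for block_size x block_size blocks from the half-size hashes
+// computed in a previous pass: the four child hashes at the block's corners
+// are themselves CRC-hashed to form the parent hash, and the row/column
+// "same value" flags are propagated alongside.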
+void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+ const YV12_BUFFER_CONFIG *picture,
+ int block_size,
+ uint32_t *src_pic_block_hash[2],
+ uint32_t *dst_pic_block_hash[2],
+ int8_t *src_pic_block_same_info[3],
+ int8_t *dst_pic_block_same_info[3]) {
+ CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+ CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
+
+ const int pic_width = picture->y_crop_width;
+ const int x_end = picture->y_crop_width - block_size + 1;
+ const int y_end = picture->y_crop_height - block_size + 1;
+
+ const int src_size = block_size >> 1;
+ const int quad_size = block_size >> 2;
+
+ uint32_t p[4];
+ const int length = sizeof(p);
+
+ int pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ p[0] = src_pic_block_hash[0][pos];
+ p[1] = src_pic_block_hash[0][pos + src_size];
+ p[2] = src_pic_block_hash[0][pos + src_size * pic_width];
+ p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size];
+ dst_pic_block_hash[0][pos] =
+ av1_get_crc_value(calc_1, (uint8_t *)p, length);
+
+ p[0] = src_pic_block_hash[1][pos];
+ p[1] = src_pic_block_hash[1][pos + src_size];
+ p[2] = src_pic_block_hash[1][pos + src_size * pic_width];
+ p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size];
+ dst_pic_block_hash[1][pos] =
+ av1_get_crc_value(calc_2, (uint8_t *)p, length);
+
+ dst_pic_block_same_info[0][pos] =
+ src_pic_block_same_info[0][pos] &&
+ src_pic_block_same_info[0][pos + quad_size] &&
+ src_pic_block_same_info[0][pos + src_size] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width + quad_size] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width + src_size];
+
+ dst_pic_block_same_info[1][pos] =
+ src_pic_block_same_info[1][pos] &&
+ src_pic_block_same_info[1][pos + src_size] &&
+ src_pic_block_same_info[1][pos + quad_size * pic_width] &&
+ src_pic_block_same_info[1][pos + quad_size * pic_width + src_size] &&
+ src_pic_block_same_info[1][pos + src_size * pic_width] &&
+ src_pic_block_same_info[1][pos + src_size * pic_width + src_size];
+ pos++;
+ }
+ pos += block_size - 1;
+ }
+
+ if (block_size >= 4) {
+ const int size_minus_1 = block_size - 1;
+ pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ dst_pic_block_same_info[2][pos] =
+ (!dst_pic_block_same_info[0][pos] &&
+ !dst_pic_block_same_info[1][pos]) ||
+ (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0));
+ pos++;
+ }
+ pos += block_size - 1;
+ }
+ }
+}
+
+bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+ uint32_t *pic_hash[2],
+ int8_t *pic_is_same,
+ int pic_width, int pic_height,
+ int block_size) {
+ const int x_end = pic_width - block_size + 1;
+ const int y_end = pic_height - block_size + 1;
+
+ const int8_t *src_is_added = pic_is_same;
+ const uint32_t *src_hash[2] = { pic_hash[0], pic_hash[1] };
+
+ int add_value = hash_block_size_to_index(block_size);
+ assert(add_value >= 0);
+ add_value <<= kSrcBits;
+ const int crc_mask = (1 << kSrcBits) - 1;
+
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ const int pos = y_pos * pic_width + x_pos;
+ // Only add positions flagged as valid.
+ if (src_is_added[pos]) {
+ block_hash curr_block_hash;
+ curr_block_hash.x = x_pos;
+ curr_block_hash.y = y_pos;
+
+ const uint32_t hash_value1 = (src_hash[0][pos] & crc_mask) + add_value;
+ curr_block_hash.hash_value2 = src_hash[1][pos];
+
+ if (!hash_table_add_to_table(p_hash_table, hash_value1,
+ &curr_block_hash)) {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start) {
+ const int stride = picture->y_stride;
+ const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
+
+ if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p16[j] != p16[0]) {
+ return 0;
+ }
+ }
+ p16 += stride;
+ }
+ } else {
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p[j] != p[0]) {
+ return 0;
+ }
+ }
+ p += stride;
+ }
+ }
+
+ return 1;
+}
+
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start) {
+ const int stride = picture->y_stride;
+ const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
+
+ if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p16[j * stride + i] != p16[i]) {
+ return 0;
+ }
+ }
+ }
+ } else {
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p[j * stride + i] != p[i]) {
+ return 0;
+ }
+ }
+ }
+ }
+ return 1;
+}
+
+void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+ const uint8_t *y_src, int stride, int block_size,
+ uint32_t *hash_value1, uint32_t *hash_value2,
+ int use_highbitdepth) {
+ int add_value = hash_block_size_to_index(block_size);
+ assert(add_value >= 0);
+ add_value <<= kSrcBits;
+ const int crc_mask = (1 << kSrcBits) - 1;
+
+ CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+ CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
+ uint32_t **buf_1 = intrabc_hash_info->hash_value_buffer[0];
+ uint32_t **buf_2 = intrabc_hash_info->hash_value_buffer[1];
+
+ // Compute 2x2 subblock hash values within the current block.
+ int sub_block_in_width = (block_size >> 1);
+ if (use_highbitdepth) {
+ uint16_t pixel_to_hash[4];
+ uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src);
+ for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+ for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+ int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+ get_pixels_in_1D_short_array_by_block_2x2(
+ y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
+ assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
+ buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
+ }
+ }
+ } else {
+ uint8_t pixel_to_hash[4];
+ for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+ for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+ int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+ get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
+ stride, pixel_to_hash);
+ assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ buf_1[0][pos] =
+ av1_get_crc_value(calc_1, pixel_to_hash, sizeof(pixel_to_hash));
+ buf_2[0][pos] =
+ av1_get_crc_value(calc_2, pixel_to_hash, sizeof(pixel_to_hash));
+ }
+ }
+ }
+
+ int src_sub_block_in_width = sub_block_in_width;
+ sub_block_in_width >>= 1;
+
+ int src_idx = 1;
+ int dst_idx = 0;
+
+ // Combine subblock hash values bottom-up into the current block's hash,
+ // doubling the subblock width each iteration.
+ uint32_t to_hash[4];
+ for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) {
+ src_idx = 1 - src_idx;
+ dst_idx = 1 - dst_idx;
+
+ int dst_pos = 0;
+ for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) {
+ for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) {
+ int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1);
+
+ assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ assert(srcPos + src_sub_block_in_width + 1 <
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ to_hash[0] = buf_1[src_idx][srcPos];
+ to_hash[1] = buf_1[src_idx][srcPos + 1];
+ to_hash[2] = buf_1[src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = buf_1[src_idx][srcPos + src_sub_block_in_width + 1];
+
+ buf_1[dst_idx][dst_pos] =
+ av1_get_crc_value(calc_1, (uint8_t *)to_hash, sizeof(to_hash));
+
+ to_hash[0] = buf_2[src_idx][srcPos];
+ to_hash[1] = buf_2[src_idx][srcPos + 1];
+ to_hash[2] = buf_2[src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = buf_2[src_idx][srcPos + src_sub_block_in_width + 1];
+ buf_2[dst_idx][dst_pos] =
+ av1_get_crc_value(calc_2, (uint8_t *)to_hash, sizeof(to_hash));
+ dst_pos++;
+ }
+ }
+
+ src_sub_block_in_width = sub_block_in_width;
+ sub_block_in_width >>= 1;
+ }
+
+ *hash_value1 = (buf_1[dst_idx][0] & crc_mask) + add_value;
+ *hash_value2 = buf_2[dst_idx][0];
+}
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
new file mode 100644
index 0000000000..8974ba27cb
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_
+#define AOM_AV1_ENCODER_HASH_MOTION_H_
+
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/hash.h"
+#include "third_party/vector/vector.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Block size used for force_integer_mv decisions
+#define FORCE_INT_MV_DECISION_BLOCK_SIZE 8
+
+// Stores a block's hash info.
+// x and y are the block's position relative to the top-left of the picture.
+// hash_value2 stores the second hash value.
+typedef struct _block_hash {
+ int16_t x;
+ int16_t y;
+ uint32_t hash_value2;
+} block_hash;
+
+typedef struct _hash_table {
+ Vector **p_lookup_table;
+} hash_table;
+
+struct intrabc_hash_info;
+
+typedef struct intrabc_hash_info {
+ // Buffers for hash value calculation of a block, used only in
+ // av1_get_block_hash_value().
+ // Indexed as [first hash/second hash][two buffers used in ping-pong].
+ uint32_t *hash_value_buffer[2][2];
+ hash_table intrabc_hash_table;
+
+ CRC_CALCULATOR crc_calculator1;
+ CRC_CALCULATOR crc_calculator2;
+ int g_crc_initialized;
+} IntraBCHashInfo;
+
+void av1_hash_table_init(IntraBCHashInfo *intra_bc_hash_info);
+void av1_hash_table_clear_all(hash_table *p_hash_table);
+void av1_hash_table_destroy(hash_table *p_hash_table);
+bool av1_hash_table_create(hash_table *p_hash_table);
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+ uint32_t hash_value);
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+ uint32_t hash_value);
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+ uint32_t hash_value2);
+void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intra_bc_hash_info,
+ const YV12_BUFFER_CONFIG *picture,
+ uint32_t *pic_block_hash[2],
+ int8_t *pic_block_same_info[3]);
+void av1_generate_block_hash_value(IntraBCHashInfo *intra_bc_hash_info,
+ const YV12_BUFFER_CONFIG *picture,
+ int block_size,
+ uint32_t *src_pic_block_hash[2],
+ uint32_t *dst_pic_block_hash[2],
+ int8_t *src_pic_block_same_info[3],
+ int8_t *dst_pic_block_same_info[3]);
+bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+ uint32_t *pic_hash[2],
+ int8_t *pic_is_same,
+ int pic_width, int pic_height,
+ int block_size);
+
+// Check whether the block starting at (x_start, y_start) with size
+// block_size x block_size has the same color in every row.
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start);
+// Check whether the block starting at (x_start, y_start) with size
+// block_size x block_size has the same color in every column.
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start);
+
+void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+ const uint8_t *y_src, int stride, int block_size,
+ uint32_t *hash_value1, uint32_t *hash_value2,
+ int use_highbitdepth);
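+
+// Illustrative call, assuming intrabc_hash_info and its hash_value_buffer
+// arrays have already been set up by the encoder:
+//
+//   uint32_t hash1, hash2;
+//   av1_get_block_hash_value(intrabc_hash_info, src, stride,
+//                            /*block_size=*/8, &hash1, &hash2,
+//                            /*use_highbitdepth=*/0);
+//   if (av1_has_exact_match(&intrabc_hash_info->intrabc_hash_table, hash1,
+//                           hash2)) {
+//     // At least one previously hashed 8x8 block matches exactly.
+//   }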
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HASH_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
new file mode 100644
index 0000000000..a108e8148c
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/blockd.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+ pixel.
+ Shared for both high and low bit depth.
+ */
+void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i;
+ tran_high_t a1, b1, c1, d1, e1;
+ const int16_t *ip_pass0 = input;
+ const tran_low_t *ip = NULL;
+ tran_low_t *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip_pass0[0 * stride];
+ b1 = ip_pass0[1 * stride];
+ c1 = ip_pass0[2 * stride];
+ d1 = ip_pass0[3 * stride];
+
+ a1 += b1;
+ d1 = d1 - c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)a1;
+ op[1] = (tran_low_t)c1;
+ op[2] = (tran_low_t)d1;
+ op[3] = (tran_low_t)b1;
+
+ ip_pass0++;
+ op += 4;
+ }
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ b1 = ip[4 * 1];
+ c1 = ip[4 * 2];
+ d1 = ip[4 * 3];
+
+ a1 += b1;
+ d1 -= c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[4 * 0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+ op[4 * 1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+ op[4 * 2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+ op[4 * 3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+ ip++;
+ op++;
+ }
+}
+
+static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ if (txfm_param->lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_fwht4x4(src_diff, coeff, diff_stride);
+ return;
+ }
+ av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_4x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_8x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_16x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_32x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_32x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_32x64(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ bd);
+}
+
+static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_64x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ bd);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_16x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_64x16(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ TxfmParam *txfm_param) {
+ if (txfm_param->bd == 8)
+ av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+ else
+ av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_64X64:
+ highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_32X64:
+ highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_64X32:
+ highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+
+ case TX_32X32:
+ highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X16:
+ highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X8:
+ highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_4X8:
+ highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X4:
+ highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X16:
+ highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X8:
+ highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X32:
+ highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_32X16:
+ highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_4X4:
+ highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param);
+ break;
+#if !CONFIG_REALTIME_ONLY
+ case TX_4X16:
+ highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X4:
+ highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X32:
+ highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_32X8:
+ highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X64:
+ highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_64X16:
+ highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+#endif // !CONFIG_REALTIME_ONLY
+ default: assert(0); break;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff,
+ ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ switch (tx_size) {
+ // As the output transform coefficients of the 4x4 Hadamard transform can
+ // be represented using 15 bits (for a 12-bit clip), use the lowbd variant
+ // of hadamard_4x4.
+ case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
+ case TX_8X8: aom_highbd_hadamard_8x8(src_diff, src_stride, coeff); break;
+ case TX_16X16:
+ aom_highbd_hadamard_16x16(src_diff, src_stride, coeff);
+ break;
+ case TX_32X32:
+ aom_highbd_hadamard_32x32(src_diff, src_stride, coeff);
+ break;
+ default: assert(0);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ switch (tx_size) {
+ case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
+ case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break;
+ case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break;
+ case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break;
+ default: assert(0);
+ }
+}
+
+void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
+ const int16_t *src_diff, int src_stride,
+ tran_low_t *coeff) {
+ if (use_hadamard) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (bd_info.use_highbitdepth_buf) {
+ highbd_wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+ } else {
+ wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+ }
+#else
+ wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ } else {
+ TxfmParam txfm_param;
+ txfm_param.tx_type = DCT_DCT;
+ txfm_param.tx_size = tx_size;
+ txfm_param.lossless = 0;
+ txfm_param.bd = bd_info.bit_depth;
+ txfm_param.is_hbd = bd_info.use_highbitdepth_buf;
+ txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+ av1_fwd_txfm(src_diff, coeff, src_stride, &txfm_param);
+ }
+}
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
new file mode 100644
index 0000000000..30f8a2258b
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ TxfmParam *txfm_param);
+
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param);
+
+/*!\brief Apply Hadamard or DCT transform
+ *
+ * \callergraph
+ * DCT and Hadamard transforms are commonly used for quick RD score estimation.
+ * The coeff buffer's size should be equal to the number of pixels
+ * corresponding to tx_size.
+ */
+void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
+ const int16_t *src_diff, int src_stride, tran_low_t *coeff);
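+
+// Illustrative call for an 8x8 residual block using the Hadamard path;
+// bd_info and src_diff are assumed to be set up by the caller:
+//
+//   DECLARE_ALIGNED(32, tran_low_t, coeff[8 * 8]);
+//   av1_quick_txfm(/*use_hadamard=*/1, TX_8X8, bd_info, src_diff,
+//                  /*src_stride=*/8, coeff);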
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/third_party/aom/av1/encoder/interp_search.c b/third_party/aom/av1/encoder/interp_search.c
new file mode 100644
index 0000000000..27235303c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/interp_search.c
@@ -0,0 +1,801 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+
+// Returns the sum of absolute MV component differences between the stored
+// stats entry and the current mode info, or INT_MAX if they do not match.
+static INLINE int is_interp_filter_good_match(
+ const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi,
+ int skip_level) {
+ const int is_comp = has_second_ref(mi);
+ int i;
+
+ for (i = 0; i < 1 + is_comp; ++i) {
+ if (st->ref_frames[i] != mi->ref_frame[i]) return INT_MAX;
+ }
+
+ if (skip_level == 1 && is_comp) {
+ if (st->comp_type != mi->interinter_comp.type) return INT_MAX;
+ if (st->compound_idx != mi->compound_idx) return INT_MAX;
+ }
+
+ int mv_diff = 0;
+ for (i = 0; i < 1 + is_comp; ++i) {
+ mv_diff += abs(st->mv[i].as_mv.row - mi->mv[i].as_mv.row) +
+ abs(st->mv[i].as_mv.col - mi->mv[i].as_mv.col);
+ }
+ return mv_diff;
+}
+
+static INLINE int save_interp_filter_search_stat(
+ MB_MODE_INFO *const mbmi, int64_t rd, unsigned int pred_sse,
+ INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx) {
+ if (interp_filter_stats_idx < MAX_INTERP_FILTER_STATS) {
+ INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters,
+ { mbmi->mv[0], mbmi->mv[1] },
+ { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] },
+ mbmi->interinter_comp.type,
+ mbmi->compound_idx,
+ rd,
+ pred_sse };
+ interp_filter_stats[interp_filter_stats_idx] = stat;
+ interp_filter_stats_idx++;
+ }
+ return interp_filter_stats_idx;
+}
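+
+// Illustrative pairing (assumed caller pattern): after a filter search, the
+// winner is cached via save_interp_filter_search_stat(); later blocks with
+// the same reference frames and near-identical MVs can then reuse it through
+// find_interp_filter_in_stats() below instead of re-searching.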
+
+static INLINE int find_interp_filter_in_stats(
+ MB_MODE_INFO *const mbmi, INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx, int skip_level) {
+ // [skip_levels][single or comp]
+ const int thr[2][2] = { { 0, 0 }, { 3, 7 } };
+ const int is_comp = has_second_ref(mbmi);
+
+ // Find good enough match.
+ // TODO(yunqing): Separate single-ref mode and comp mode stats for fast
+ // search.
+ int best = INT_MAX;
+ int match = -1;
+ for (int j = 0; j < interp_filter_stats_idx; ++j) {
+ const INTERPOLATION_FILTER_STATS *st = &interp_filter_stats[j];
+ const int mv_diff = is_interp_filter_good_match(st, mbmi, skip_level);
+ // Exact match is found.
+ if (mv_diff == 0) {
+ match = j;
+ break;
+ } else if (mv_diff < best && mv_diff <= thr[skip_level - 1][is_comp]) {
+ best = mv_diff;
+ match = j;
+ }
+ }
+
+ if (match != -1) {
+ mbmi->interp_filters = interp_filter_stats[match].filters;
+ return match;
+ }
+ return -1; // no match result found
+}
+
+int av1_find_interp_filter_match(
+ MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi,
+ const InterpFilter assign_filter, const int need_search,
+ INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx) {
+ int match_found_idx = -1;
+ if (cpi->sf.interp_sf.use_interp_filter && need_search)
+ match_found_idx = find_interp_filter_in_stats(
+ mbmi, interp_filter_stats, interp_filter_stats_idx,
+ cpi->sf.interp_sf.use_interp_filter);
+
+ if (!need_search || match_found_idx == -1)
+ set_default_interp_filters(mbmi, assign_filter);
+ return match_found_idx;
+}
+
+static INLINE int get_switchable_rate(MACROBLOCK *const x,
+ const int_interpfilters filters,
+ const int ctx[2], int dual_filter) {
+ const InterpFilter filter0 = filters.as_filters.y_filter;
+ int inter_filter_cost =
+ x->mode_costs.switchable_interp_costs[ctx[0]][filter0];
+ if (dual_filter) {
+ const InterpFilter filter1 = filters.as_filters.x_filter;
+ inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx[1]][filter1];
+ }
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+}
+
+// Build the inter predictor and calculate the model rd
+// for the given range of planes.
+static INLINE void interp_model_rd_eval(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int plane_from, int plane_to,
+ RD_STATS *rd_stats, int is_skip_build_pred) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_STATS tmp_rd_stats;
+ av1_init_rd_stats(&tmp_rd_stats);
+
+ // Skip inter predictor if the predictor is already available.
+ if (!is_skip_build_pred) {
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ plane_from, plane_to);
+ }
+
+ model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model
+ ? MODELRD_LEGACY
+ : MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate,
+ &tmp_rd_stats.dist, &tmp_rd_stats.skip_txfm, &tmp_rd_stats.sse, NULL,
+ NULL, NULL);
+
+ av1_merge_rd_stats(rd_stats, &tmp_rd_stats);
+}
+
+// Calculate the rdcost of the given interpolation filter.
+static INLINE int64_t interpolation_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd,
+ RD_STATS *rd_stats_luma, RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], int filter_idx, const int switchable_ctx[2],
+ const int skip_pred) {
+ const AV1_COMMON *cm = &cpi->common;
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ RD_STATS this_rd_stats_luma, this_rd_stats;
+
+ // Initialize rd_stats structures to default values.
+ av1_init_rd_stats(&this_rd_stats_luma);
+ this_rd_stats = *rd_stats_luma;
+ const int_interpfilters last_best = mbmi->interp_filters;
+ mbmi->interp_filters = filter_sets[filter_idx];
+ const int tmp_rs =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
+ cm->seq_params->enable_dual_filter);
+
+ int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0);
+ if (min_rd > *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+
+ (void)tile_data;
+
+ assert(skip_pred != 2);
+ assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0));
+ assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0));
+ assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0));
+ assert((rd_stats_luma->skip_txfm == 0) || (rd_stats_luma->skip_txfm == 1));
+ assert((rd_stats->skip_txfm == 0) || (rd_stats->skip_txfm == 1));
+ assert((skip_pred >= 0) &&
+ (skip_pred <= interp_search_flags->default_interp_skip_flags));
+
+ // When skip_pred is equal to default_interp_skip_flags,
+ // skip both luma and chroma MC.
+ // For monochrome images:
+ // num_planes = 1 and cpi->default_interp_skip_flags = 1,
+ // skip_pred = 1: skip both luma and chroma
+ // skip_pred = 0: evaluate luma, and since num_planes = 1,
+ // skip chroma evaluation
+ int tmp_skip_pred =
+ (skip_pred == interp_search_flags->default_interp_skip_flags)
+ ? INTERP_SKIP_LUMA_SKIP_CHROMA
+ : skip_pred;
+
+ switch (tmp_skip_pred) {
+ case INTERP_EVAL_LUMA_EVAL_CHROMA:
+ // skip_pred = 0: Evaluate both luma and chroma.
+ // Luma MC
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y,
+ &this_rd_stats_luma, 0);
+ this_rd_stats = this_rd_stats_luma;
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ AOM_FALLTHROUGH_INTENDED;
+ case INTERP_SKIP_LUMA_EVAL_CHROMA:
+ // skip_pred = 1: skip luma evaluation (retain previous best luma stats)
+ // and do chroma evaluation.
+ for (int plane = 1; plane < num_planes; ++plane) {
+ int64_t tmp_rd =
+ RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist);
+ if (tmp_rd >= *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, plane, plane,
+ &this_rd_stats, 0);
+ }
+ break;
+ case INTERP_SKIP_LUMA_SKIP_CHROMA:
+ // Both luma and chroma evaluations are skipped.
+ this_rd_stats = *rd_stats;
+ break;
+ case INTERP_EVAL_INVALID:
+ default: assert(0); return 0;
+ }
+ int64_t tmp_rd =
+ RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist);
+
+ if (tmp_rd < *rd) {
+ *rd = tmp_rd;
+ *switchable_rate = tmp_rs;
+ if (skip_pred != interp_search_flags->default_interp_skip_flags) {
+ if (skip_pred == INTERP_EVAL_LUMA_EVAL_CHROMA) {
+ // Overwrite the data as current filter is the best one
+ *rd_stats_luma = this_rd_stats_luma;
+ *rd_stats = this_rd_stats;
+ // As luma MC data is computed, no need to recompute after the search
+ x->recalc_luma_mc_data = 0;
+ } else if (skip_pred == INTERP_SKIP_LUMA_EVAL_CHROMA) {
+ // As luma MC data is not computed, update of luma data can be skipped
+ *rd_stats = this_rd_stats;
+ // As luma MC data is not recomputed and current filter is the best,
+ // indicate the possibility of recomputing MC data
+ // If current buffer contains valid MC data, toggle to indicate that
+ // luma MC data needs to be recomputed
+ x->recalc_luma_mc_data ^= 1;
+ }
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ }
+ return 1;
+ }
+ mbmi->interp_filters = last_best;
+ return 0;
+}
+
+static INLINE INTERP_PRED_TYPE is_pred_filter_search_allowed(
+ const AV1_COMP *const cpi, MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int_interpfilters *af, int_interpfilters *lf) {
+ const AV1_COMMON *cm = &cpi->common;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int bsl = mi_size_wide_log2[bsize];
+ int is_horiz_eq = 0, is_vert_eq = 0;
+
+ if (above_mbmi && is_inter_block(above_mbmi))
+ *af = above_mbmi->interp_filters;
+
+ if (left_mbmi && is_inter_block(left_mbmi)) *lf = left_mbmi->interp_filters;
+
+ if (af->as_filters.x_filter != INTERP_INVALID)
+ is_horiz_eq = af->as_filters.x_filter == lf->as_filters.x_filter;
+ if (af->as_filters.y_filter != INTERP_INVALID)
+ is_vert_eq = af->as_filters.y_filter == lf->as_filters.y_filter;
+
+ INTERP_PRED_TYPE pred_filter_type = (is_vert_eq << 1) + is_horiz_eq;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int pred_filter_enable =
+ cpi->sf.interp_sf.cb_pred_filter_search
+ ? (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_frame.frame_number)) &
+ 0x1
+ : 0;
+ pred_filter_enable &= is_horiz_eq || is_vert_eq;
+ // pred_filter_search = 0: pred_filter is disabled
+ // pred_filter_search = 1: pred_filter is enabled and only horz pred matching
+ // pred_filter_search = 2: pred_filter is enabled and only vert pred matching
+ // pred_filter_search = 3: pred_filter is enabled and
+ // both vert, horz pred matching
+ return pred_filter_enable * pred_filter_type;
+}
+
+static DUAL_FILTER_TYPE find_best_interp_rd_facade(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_pred, uint16_t allow_interp_mask, int is_w4_or_h4) {
+ int tmp_skip_pred = skip_pred;
+ DUAL_FILTER_TYPE best_filt_type = REG_REG;
+
+ // If no filters are set to be evaluated, return early.
+ if (allow_interp_mask == 0x0) return best_filt_type;
+ // When the block width or height is 4, skip the pred evaluation of
+ // SHARP_SHARP.
+ tmp_skip_pred = is_w4_or_h4
+ ? cpi->interp_search_flags.default_interp_skip_flags
+ : skip_pred;
+
+ // Loop over all filter types and evaluate only the allowed ones.
+ for (int filt_type = SHARP_SHARP; filt_type >= REG_REG; --filt_type) {
+ const int is_filter_allowed =
+ get_interp_filter_allowed_mask(allow_interp_mask, filt_type);
+ if (is_filter_allowed)
+ if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate,
+ dst_bufs, filt_type, switchable_ctx,
+ tmp_skip_pred))
+ best_filt_type = filt_type;
+ tmp_skip_pred = skip_pred;
+ }
+ return best_filt_type;
+}
+
+static INLINE void pred_dual_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_pred, INTERP_PRED_TYPE pred_filt_type, int_interpfilters *af,
+ int_interpfilters *lf) {
+ (void)lf;
+ assert(pred_filt_type > INTERP_HORZ_NEQ_VERT_NEQ);
+ assert(pred_filt_type < INTERP_PRED_TYPE_ALL);
+ uint16_t allowed_interp_mask = 0;
+
+ if (pred_filt_type == INTERP_HORZ_EQ_VERT_NEQ) {
+ // pred_filter_search = 1: Only horizontal filter is matching
+ allowed_interp_mask =
+ av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.x_filter];
+ } else if (pred_filt_type == INTERP_HORZ_NEQ_VERT_EQ) {
+ // pred_filter_search = 2: Only vertical filter is matching
+ allowed_interp_mask =
+ av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.y_filter];
+ } else {
+ // pred_filter_search = 3: Both horizontal and vertical filter are matching
+ int filt_type =
+ af->as_filters.x_filter + af->as_filters.y_filter * SWITCHABLE_FILTERS;
+ set_interp_filter_allowed_mask(&allowed_interp_mask, filt_type);
+ }
+ // REG_REG has already been evaluated at the beginning.
+ reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG);
+ find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y,
+ rd_stats, switchable_rate, dst_bufs,
+ switchable_ctx, skip_pred, allowed_interp_mask, 0);
+}
+
+// Evaluate dual filter types:
+// a) using the above and left blocks' interp filters, or
+// b) finding the best horizontal filter and then evaluating the
+// corresponding vertical filters.
+static INLINE void fast_dual_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_hor, const int skip_ver) {
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+ int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+ int_interpfilters lf = af;
+
+ if (!have_newmv_in_inter_mode(mbmi->mode)) {
+ pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+ }
+
+ if (pred_filter_type) {
+ pred_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ switchable_ctx, (skip_hor & skip_ver),
+ pred_filter_type, &af, &lf);
+ } else {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ int best_dual_mode = 0;
+ int skip_pred =
+ bw <= 4 ? interp_search_flags->default_interp_skip_flags : skip_hor;
+ // TODO(any): Make use of find_best_interp_rd_facade()
+ // if speed impact is negligible
+ for (int i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
+ if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate,
+ dst_bufs, i, switchable_ctx, skip_pred)) {
+ best_dual_mode = i;
+ }
+ skip_pred = skip_hor;
+ }
+ // Starting from the best horizontal filter, evaluate the corresponding
+ // vertical filter modes.
+ skip_pred =
+ bh <= 4 ? interp_search_flags->default_interp_skip_flags : skip_ver;
+ for (int i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
+ i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
+ interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ i, switchable_ctx, skip_pred);
+ skip_pred = skip_ver;
+ }
+ }
+}
+
+// Find the best interp filter if dual_interp_filter = 0
+static INLINE void find_best_non_dual_interp_filter(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_ver, const int skip_hor) {
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ int8_t i;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ uint16_t interp_filter_search_mask =
+ interp_search_flags->interp_filter_search_mask;
+
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0);
+ const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1);
+ int use_actual_frame_probs = 1;
+ const int *switchable_interp_p0;
+ const int *switchable_interp_p1;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ switchable_interp_p0 = (int *)cpi->ppi->temp_frame_probs
+ .switchable_interp_probs[update_type][ctx0];
+ switchable_interp_p1 = (int *)cpi->ppi->temp_frame_probs
+ .switchable_interp_probs[update_type][ctx1];
+ }
+#endif // CONFIG_FPMT_TEST
+ if (use_actual_frame_probs) {
+ switchable_interp_p0 =
+ cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx0];
+ switchable_interp_p1 =
+ cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx1];
+ }
+ static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 };
+ const int thresh = thr[update_type];
+ for (i = 0; i < SWITCHABLE_FILTERS; i++) {
+ // For non-dual case, the 2 dir's prob should be identical.
+ assert(switchable_interp_p0[i] == switchable_interp_p1[i]);
+ if (switchable_interp_p0[i] < thresh &&
+ switchable_interp_p1[i] < thresh) {
+ DUAL_FILTER_TYPE filt_type = i + SWITCHABLE_FILTERS * i;
+ reset_interp_filter_allowed_mask(&interp_filter_search_mask, filt_type);
+ }
+ }
+ }
+
+ // Regular filter evaluation should already have been done, and hence it
+ // should be the current winner.
+ assert(x->e_mbd.mi[0]->interp_filters.as_int == filter_sets[0].as_int);
+ if ((skip_hor & skip_ver) != interp_search_flags->default_interp_skip_flags) {
+ INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+ int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+ int_interpfilters lf = af;
+
+ pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+ if (pred_filter_type) {
+ assert(af.as_filters.x_filter != INTERP_INVALID);
+ int filter_idx = SWITCHABLE * af.as_filters.x_filter;
+ // This assert tells that (filter_x == filter_y) for non-dual filter case
+ assert(filter_sets[filter_idx].as_filters.x_filter ==
+ filter_sets[filter_idx].as_filters.y_filter);
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+ !(get_interp_filter_allowed_mask(interp_filter_search_mask,
+ filter_idx))) {
+ return;
+ }
+ if (filter_idx) {
+ interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ filter_idx, switchable_ctx,
+ (skip_hor & skip_ver));
+ }
+ return;
+ }
+ }
+  // Reuse the regular filter's modeled rd data for the sharp filter in the
+  // following cases:
+  // 1) When bsize is 4x4
+  // 2) When block width is 4 (i.e. 4x8/4x16 blocks) and the MV in the
+  // vertical direction is full-pel
+  // 3) When block height is 4 (i.e. 8x4/16x4 blocks) and the MV in the
+  // horizontal direction is full-pel
+  // TODO(any): Optimize cases 2 and 3 further if the luma MV in the relevant
+  // direction alone is full-pel
+
+ if ((bsize == BLOCK_4X4) ||
+ (block_size_wide[bsize] == 4 &&
+ skip_ver == interp_search_flags->default_interp_skip_flags) ||
+ (block_size_high[bsize] == 4 &&
+ skip_hor == interp_search_flags->default_interp_skip_flags)) {
+ int skip_pred = skip_hor & skip_ver;
+ uint16_t allowed_interp_mask = 0;
+
+ // REG_REG filter type is evaluated beforehand, hence skip it
+ set_interp_filter_allowed_mask(&allowed_interp_mask, SHARP_SHARP);
+ set_interp_filter_allowed_mask(&allowed_interp_mask, SMOOTH_SMOOTH);
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search)
+ allowed_interp_mask &= interp_filter_search_mask;
+
+ find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ switchable_ctx, skip_pred, allowed_interp_mask,
+ 1);
+ } else {
+ int skip_pred = (skip_hor & skip_ver);
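+    // Illustrative note (assuming SWITCHABLE_FILTERS == 3): this loop visits
+    // the diagonal entries i == 4 (SMOOTH_SMOOTH) and i == 8 (SHARP_SHARP);
+    // REG_REG (i == 0) was already evaluated beforehand.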
+ for (i = (SWITCHABLE_FILTERS + 1); i < DUAL_FILTER_SET_SIZE;
+ i += (SWITCHABLE_FILTERS + 1)) {
+      // This assert ensures that filter_x == filter_y in the non-dual filter
+      // case.
+ assert(filter_sets[i].as_filters.x_filter ==
+ filter_sets[i].as_filters.y_filter);
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+ !(get_interp_filter_allowed_mask(interp_filter_search_mask, i))) {
+ continue;
+ }
+ interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ i, switchable_ctx, skip_pred);
+      // In the first iteration, the smooth filter is evaluated. If the smooth
+      // filter (which is less sharp) wins among the regular and smooth
+      // filters, the sharp filter evaluation is skipped.
+ // TODO(any): Refine this gating based on modelled rd only (i.e., by not
+ // accounting switchable filter rate)
+ if (cpi->sf.interp_sf.skip_sharp_interp_filter_search &&
+ skip_pred != interp_search_flags->default_interp_skip_flags) {
+ if (mbmi->interp_filters.as_int == filter_sets[SMOOTH_SMOOTH].as_int)
+ break;
+ }
+ }
+ }
+}
+
+static INLINE void calc_interp_skip_pred_flag(MACROBLOCK *const x,
+ const AV1_COMP *const cpi,
+ int *skip_hor, int *skip_ver) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+ const int is_compound = has_second_ref(mbmi);
+ assert(is_intrabc_block(mbmi) == 0);
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, mbmi->ref_frame[ref]);
+ // TODO(any): Refine skip flag calculation considering scaling
+ if (av1_is_scaled(sf)) {
+ *skip_hor = 0;
+ *skip_ver = 0;
+ break;
+ }
+ const MV mv = mbmi->mv[ref].as_mv;
+ int skip_hor_plane = 0;
+ int skip_ver_plane = 0;
+ for (int plane_idx = 0; plane_idx < AOMMAX(1, (num_planes - 1));
+ ++plane_idx) {
+ struct macroblockd_plane *const pd = &xd->plane[plane_idx];
+ const int bw = pd->width;
+ const int bh = pd->height;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ skip_hor_plane |= ((sub_x == 0) << plane_idx);
+ skip_ver_plane |= ((sub_y == 0) << plane_idx);
+ }
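+    // Illustrative example (assuming SUBPEL_MASK == 15, i.e. q4 MV units): an
+    // MV column of 16 is a full-pel position, so sub_x == 0 and the
+    // horizontal skip bit is set for that plane, whereas an MV column of 18
+    // is sub-pel and leaves the bit clear.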
+ *skip_hor &= skip_hor_plane;
+ *skip_ver &= skip_ver_plane;
+ // It is not valid that "luma MV is sub-pel, whereas chroma MV is not"
+ assert(*skip_hor != 2);
+ assert(*skip_ver != 2);
+ }
+  // When the compound prediction type is compound segment wedge, luma MC and
+  // chroma MC need to go hand in hand, as the mask generated during luma MC
+  // is required for chroma MC. If skip_hor = 0 and skip_ver = 1, the mask
+  // used for chroma MC during the vertical filter decision may be incorrect,
+  // as the temporary MC evaluation overwrites the mask. Set skip_ver to 0 in
+  // this case so that the mask is populated during luma MC.
+ if (is_compound && mbmi->compound_idx == 1 &&
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+ assert(mbmi->comp_group_idx == 1);
+ if (*skip_hor == 0 && *skip_ver == 1) *skip_ver = 0;
+ }
+}
+
+/*!\brief AV1 interpolation filter search
+ *
+ * \ingroup inter_mode_search
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding.
+ * \param[in]     x              Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] bsize Current block size.
+ * \param[in] tmp_dst A temporary prediction buffer to hold a
+ * computed prediction.
+ * \param[in,out] orig_dst A prediction buffer to hold a computed
+ * prediction. This will eventually hold the
+ * final prediction, and the tmp_dst info will
+ * be copied here.
+ * \param[in,out] rd The RD cost associated with the selected
+ * interpolation filter parameters.
+ * \param[in,out] switchable_rate The rate associated with using a SWITCHABLE
+ * filter mode.
+ * \param[in,out] skip_build_pred Indicates whether or not to build the inter
+ * predictor. If this is 0, the inter predictor
+ * has already been built and thus we can avoid
+ * repeating computation.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ *
+ * \return Returns INT64_MAX if the filter parameters are invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * parameter search is a success.
+ */
+int64_t av1_interpolation_filter_search(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+ int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
+ HandleInterModeArgs *args, int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int need_search = av1_is_interp_needed(xd);
+ const int ref_frame = xd->mi[0]->ref_frame[0];
+ RD_STATS rd_stats_luma, rd_stats;
+
+ // Initialization of rd_stats structures with default values
+ av1_init_rd_stats(&rd_stats_luma);
+ av1_init_rd_stats(&rd_stats);
+
+ int match_found_idx = -1;
+ const InterpFilter assign_filter = cm->features.interp_filter;
+
+ match_found_idx = av1_find_interp_filter_match(
+ mbmi, cpi, assign_filter, need_search, args->interp_filter_stats,
+ args->interp_filter_stats_idx);
+
+ if (match_found_idx != -1) {
+ *rd = args->interp_filter_stats[match_found_idx].rd;
+ x->pred_sse[ref_frame] =
+ args->interp_filter_stats[match_found_idx].pred_sse;
+ *skip_build_pred = 0;
+ return 0;
+ }
+
+ int switchable_ctx[2];
+ switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
+ switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
+ *switchable_rate =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
+ cm->seq_params->enable_dual_filter);
+
+ // Do MC evaluation for default filter_type.
+ // Luma MC
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y,
+ &rd_stats_luma, *skip_build_pred);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ // Chroma MC
+ if (num_planes > 1) {
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_U, AOM_PLANE_V,
+ &rd_stats, *skip_build_pred);
+ }
+ *skip_build_pred = 1;
+
+ av1_merge_rd_stats(&rd_stats, &rd_stats_luma);
+
+ assert(rd_stats.rate >= 0);
+
+ *rd = RDCOST(x->rdmult, *switchable_rate + rd_stats.rate, rd_stats.dist);
+ x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4);
+
+ if (assign_filter != SWITCHABLE || match_found_idx != -1) {
+ return 0;
+ }
+ if (!need_search) {
+ int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ assert(mbmi->interp_filters.as_int == filters.as_int);
+ (void)filters;
+ return 0;
+ }
+ if (args->modelled_rd != NULL) {
+ if (has_second_ref(mbmi)) {
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+ const int mode0 = compound_ref0_mode(mbmi->mode);
+ const int mode1 = compound_ref1_mode(mbmi->mode);
+ const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+ return INT64_MAX;
+ }
+ }
+ }
+
+ x->recalc_luma_mc_data = 0;
+  // skip_flag=xx (in binary form)
+  // Setting the 0th bit corresponds to skipping luma MC and setting the 1st
+  // bit corresponds to skipping chroma MC.
+  // skip_flag=0 corresponds to "Don't skip luma and chroma MC"
+  // skip_flag=1 corresponds to "Skip luma MC only"
+  // skip_flag=2 is not a valid case
+  // skip_flag=3 corresponds to "Skip both luma and chroma MC"
+ int skip_hor = interp_search_flags->default_interp_skip_flags;
+ int skip_ver = interp_search_flags->default_interp_skip_flags;
+ calc_interp_skip_pred_flag(x, cpi, &skip_hor, &skip_ver);
+
+ // do interp_filter search
+ restore_dst_buf(xd, *tmp_dst, num_planes);
+ const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
+ // Evaluate dual interp filters
+ if (cm->seq_params->enable_dual_filter) {
+ if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) {
+ fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ &rd_stats_luma, &rd_stats, switchable_rate,
+ dst_bufs, switchable_ctx, skip_hor, skip_ver);
+ } else {
+ // Use full interpolation filter search
+ uint16_t allowed_interp_mask = ALLOW_ALL_INTERP_FILT_MASK;
+      // REG_REG filter type is evaluated beforehand, so the loop runs over
+      // REG_SMOOTH to SHARP_SHARP for the full interpolation filter search
+ reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG);
+ find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+ &rd_stats_luma, &rd_stats, switchable_rate,
+ dst_bufs, switchable_ctx,
+ (skip_hor & skip_ver), allowed_interp_mask, 0);
+ }
+ } else {
+ // Evaluate non-dual interp filters
+ find_best_non_dual_interp_filter(
+ x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats,
+ switchable_rate, dst_bufs, switchable_ctx, skip_ver, skip_hor);
+ }
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ // Recompute final MC data if required
+ if (x->recalc_luma_mc_data == 1) {
+    // Recomputing the final luma MC data is required only if it was skipped
+    // in either of the directions. The condition below is necessary, but not
+    // sufficient.
+ assert((skip_hor == 1) || (skip_ver == 1));
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+ x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4);
+
+ // save search results
+ if (cpi->sf.interp_sf.use_interp_filter) {
+ assert(match_found_idx == -1);
+ args->interp_filter_stats_idx = save_interp_filter_search_stat(
+ mbmi, *rd, x->pred_sse[ref_frame], args->interp_filter_stats,
+ args->interp_filter_stats_idx);
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/interp_search.h b/third_party/aom/av1/encoder/interp_search.h
new file mode 100644
index 0000000000..9815e0bcfb
--- /dev/null
+++ b/third_party/aom/av1/encoder/interp_search.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
+#define AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define MAX_INTERP_FILTER_STATS 128
+#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
+
+typedef struct {
+ int_interpfilters filters;
+ int_mv mv[2];
+ int8_t ref_frames[2];
+ COMPOUND_TYPE comp_type;
+ int compound_idx;
+ int64_t rd;
+ unsigned int pred_sse;
+} INTERPOLATION_FILTER_STATS;
+/*!\endcond */
+
+/*!\brief Miscellaneous arguments for inter mode search.
+ */
+typedef struct HandleInterModeArgs {
+ /*!
+ * Buffer for the above predictor in OBMC
+ */
+ uint8_t *above_pred_buf[MAX_MB_PLANE];
+ /*!
+ * Stride for the above predictor in OBMC
+ */
+ int above_pred_stride[MAX_MB_PLANE];
+ /*!
+ * Buffer for the left predictor in OBMC
+ */
+ uint8_t *left_pred_buf[MAX_MB_PLANE];
+ /*!
+ * Stride for the left predictor in OBMC
+ */
+ int left_pred_stride[MAX_MB_PLANE];
+ /*!
+ * Pointer to the first member in a 2D array which holds
+ * single reference mode motion vectors to be used as a starting
+ * point in the mv search for compound modes. Each array is length REF_FRAMES,
+ * meaning there is a slot for a single reference motion vector for
+ * each possible reference frame. The 2D array consists of N of these arrays,
+ * where N is the length of the reference mv stack computed for the single
+ * reference case for that particular reference frame.
+ */
+ int_mv (*single_newmv)[REF_FRAMES];
+ /*!
+ * Pointer to the first array of a 2D array with the same setup as
+ * single_newmv array above. This is a 2D array to hold the rate
+ * corresponding to each of the single reference mode motion vectors
+ * held in single_newmv.
+ */
+ int (*single_newmv_rate)[REF_FRAMES];
+ /*!
+ * Pointer to the first array of a 2D array with the same setup as
+ * single_newmv array above. This is a 2D array to hold a 0 or 1
+ * validity value corresponding to each of the single reference mode motion
+ * vectors held in single_newmv.
+ */
+ int (*single_newmv_valid)[REF_FRAMES];
+ /*!
+ * Pointer to the first array in a 3D array of predicted rate-distortion.
+ * The dimensions of this structure are:
+ * (number of possible inter modes) X
+ * (number of reference MVs) X
+ * (number of reference frames).
+ */
+ int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+ /*!
+ * Holds an estimated entropy cost for picking the current reference frame.
+ * This is used to compute an rd estimate.
+ */
+ int ref_frame_cost;
+ /*!
+ * Holds an estimated entropy cost for picking single or compound
+ * reference. This is used to compute an rd estimate.
+ */
+ int single_comp_cost;
+ /*!
+ * Pointer to the first element in a 3D array holding rd's of
+ * SIMPLE_TRANSLATION used to prune out the motion mode search in single ref
+ * modes used to determine compound ref modes. The full structure is:
+ * (number of inter modes) X (length of refmv list) X (number of ref frames)
+ */
+ int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+ /*!
+ * An integer value 0 or 1 which indicates whether or not to skip the motion
+ * mode search and default to SIMPLE_TRANSLATION as a speed feature.
+ */
+ int skip_motion_mode;
+ /*!
+ * Initialized to false. If true, skips interpolation filter search and uses
+ * the default EIGHTTAP_REGULAR.
+ */
+ bool skip_ifs;
+ /*!
+ * A pointer to the first element in an array of INTERINTRA_MODE types. This
+ * contains the best inter_intra mode for each reference frame.
+ */
+ INTERINTRA_MODE *inter_intra_mode;
+ /*!
+ * Array of saved interpolation filter stats collected to avoid repeating
+ * an interpolation filter search when the mv and ref_frame are the same
+ * as a previous search.
+ */
+ INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS];
+
+ /*!
+ * Stack to store full pixel search start mv of NEWMV mode.
+ */
+ FULLPEL_MV start_mv_stack[(MAX_REF_MV_SEARCH - 1) * 2];
+
+ /*!
+ * Stack to store ref_mv_idx of NEWMV mode.
+ */
+ uint8_t ref_mv_idx_stack[(MAX_REF_MV_SEARCH - 1) * 2];
+
+ /*!
+ * Count of mvs in start mv stack.
+ */
+ int start_mv_cnt;
+
+ /*!
+ * Index of the last set of saved stats in the interp_filter_stats array.
+ */
+ int interp_filter_stats_idx;
+ /*!
+ * Estimated wedge index.
+ */
+ int wedge_index;
+ /*!
+ * Estimated wedge sign.
+ */
+ int wedge_sign;
+ /*!
+ * Estimated diff wtd index.
+ */
+ int diffwtd_index;
+ /*!
+ * Estimated cmp mode.
+ */
+ int cmp_mode[MODE_CTX_REF_FRAMES];
+ /*!
+ * The best sse during single new_mv search. Note that the sse here comes from
+ * single_motion_search, and not from interpolation_filter_search. This has
+ * two implications:
+   * 1. The mv used to calculate the sse here does not have to be the one
+   *    with the best sse found in handle_inter_mode.
+   * 2. Even if the mvs agree, the sse here can differ from the sse in \ref
+   *    MACROBLOCK::pred_sse due to the different interpolation filters used.
+ */
+ unsigned int best_single_sse_in_refs[REF_FRAMES];
+ /*!
+ * Holds the sse of best mode so far in the mode evaluation process. This is
+ * used in intermediate termination of NEWMV mode evaluation.
+ */
+ unsigned int best_pred_sse;
+} HandleInterModeArgs;
+
+/*!\cond */
+static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = {
+ { 0x00000000 }, { 0x00010000 }, { 0x00020000 }, // y = 0
+ { 0x00000001 }, { 0x00010001 }, { 0x00020001 }, // y = 1
+ { 0x00000002 }, { 0x00010002 }, { 0x00020002 }, // y = 2
+};
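+// Illustrative mapping (assuming int_interpfilters packs y_filter in the low
+// 16 bits and x_filter in the high 16 bits): filter_sets[i] corresponds to
+// x_filter == i % SWITCHABLE_FILTERS and y_filter == i / SWITCHABLE_FILTERS,
+// e.g. filter_sets[5] == 0x00020001 pairs a sharp x filter with a smooth y
+// filter.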
+
+int av1_find_interp_filter_match(
+ MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi,
+ const InterpFilter assign_filter, const int need_search,
+ INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx);
+
+int64_t av1_interpolation_filter_search(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+ int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
+ HandleInterModeArgs *args, int64_t ref_best_rd);
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/intra_mode_search.c b/third_party/aom/av1/encoder/intra_mode_search.c
new file mode 100644
index 0000000000..99b0af2f8e
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search.c
@@ -0,0 +1,1739 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cfl.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/tx_search.h"
+
+// Even though there are 7 delta angles, this macro is set to 9 to facilitate
+// the rd threshold check that prunes the -3 and 3 delta angles.
+#define SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY (2 * MAX_ANGLE_DELTA + 3)
+
+// The order for evaluating delta angles while processing the luma directional
+// intra modes. Currently, this order of evaluation is applicable only when
+// speed feature prune_luma_odd_delta_angles_in_intra is enabled. In this case,
+// even angles are evaluated first in order to facilitate the pruning of odd
+// delta angles based on the rd costs of the neighboring delta angles.
+static const int8_t luma_delta_angles_order[2 * MAX_ANGLE_DELTA] = {
+ -2, 2, -3, -1, 1, 3,
+};
+
+/*!\cond */
+static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
+ DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED,
+ SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED,
+ D67_PRED, D113_PRED, D45_PRED,
+};
+
+static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
+ UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED,
+ UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
+ UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED,
+ UV_D113_PRED, UV_D45_PRED,
+};
+
+// The bitmask corresponds to the filter intra modes as defined in enums.h
+// FILTER_INTRA_MODE enumeration type. Setting a bit to 0 in the mask means to
+// disable the evaluation of corresponding filter intra mode. The table
+// av1_derived_filter_intra_mode_used_flag is used when speed feature
+// prune_filter_intra_level is 1. The evaluated filter intra modes are union
+// of the following:
+// 1) FILTER_DC_PRED
+// 2) mode that corresponds to best mode so far of DC_PRED, V_PRED, H_PRED,
+// D157_PRED and PAETH_PRED. (Eg: FILTER_V_PRED if best mode so far is V_PRED).
+static const uint8_t av1_derived_filter_intra_mode_used_flag[INTRA_MODES] = {
+ 0x01, // DC_PRED: 0000 0001
+ 0x03, // V_PRED: 0000 0011
+ 0x05, // H_PRED: 0000 0101
+ 0x01, // D45_PRED: 0000 0001
+ 0x01, // D135_PRED: 0000 0001
+ 0x01, // D113_PRED: 0000 0001
+ 0x09, // D157_PRED: 0000 1001
+ 0x01, // D203_PRED: 0000 0001
+ 0x01, // D67_PRED: 0000 0001
+ 0x01, // SMOOTH_PRED: 0000 0001
+ 0x01, // SMOOTH_V_PRED: 0000 0001
+ 0x01, // SMOOTH_H_PRED: 0000 0001
+ 0x11 // PAETH_PRED: 0001 0001
+};
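+// Illustrative example: if the best mode so far is V_PRED, the mask 0x03
+// enables bit 0 (FILTER_DC_PRED) and bit 1 (FILTER_V_PRED), so only those two
+// filter intra modes are evaluated.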
+
+// The bitmask corresponds to the chroma intra modes as defined in enums.h
+// UV_PREDICTION_MODE enumeration type. Setting a bit to 0 in the mask means to
+// disable the evaluation of corresponding chroma intra mode. The table
+// av1_derived_chroma_intra_mode_used_flag is used when speed feature
+// prune_chroma_modes_using_luma_winner is enabled. The evaluated chroma
+// intra modes are union of the following:
+// 1) UV_DC_PRED
+// 2) UV_SMOOTH_PRED
+// 3) UV_CFL_PRED
+// 4) mode that corresponds to luma intra mode winner (Eg : UV_V_PRED if luma
+// intra mode winner is V_PRED).
+static const uint16_t av1_derived_chroma_intra_mode_used_flag[INTRA_MODES] = {
+ 0x2201, // DC_PRED: 0010 0010 0000 0001
+ 0x2203, // V_PRED: 0010 0010 0000 0011
+ 0x2205, // H_PRED: 0010 0010 0000 0101
+ 0x2209, // D45_PRED: 0010 0010 0000 1001
+ 0x2211, // D135_PRED: 0010 0010 0001 0001
+ 0x2221, // D113_PRED: 0010 0010 0010 0001
+ 0x2241, // D157_PRED: 0010 0010 0100 0001
+ 0x2281, // D203_PRED: 0010 0010 1000 0001
+ 0x2301, // D67_PRED: 0010 0011 0000 0001
+ 0x2201, // SMOOTH_PRED: 0010 0010 0000 0001
+ 0x2601, // SMOOTH_V_PRED: 0010 0110 0000 0001
+ 0x2a01, // SMOOTH_H_PRED: 0010 1010 0000 0001
+ 0x3201 // PAETH_PRED: 0011 0010 0000 0001
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, all_zeros[MAX_SB_SIZE]) = { 0 };
+DECLARE_ALIGNED(16, static const uint16_t,
+ highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf,
+ const int stride, const int is_hbd) {
+ unsigned int sse;
+
+ if (is_hbd)
+ return vf(buf, stride, CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse);
+ else
+ return vf(buf, stride, all_zeros, 0, &sse);
+}
+
+// Computes average of log(1 + variance) across 4x4 sub-blocks for source and
+// reconstructed blocks.
+static void compute_avg_log_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const BLOCK_SIZE bs,
+ double *avg_log_src_variance,
+ double *avg_log_recon_variance) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+ const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1);
+ const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1);
+ const int right_overflow =
+ (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+ const int bottom_overflow =
+ (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+ const int bw = (MI_SIZE * mi_size_wide[bs] - right_overflow);
+ const int bh = (MI_SIZE * mi_size_high[bs] - bottom_overflow);
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ for (int i = 0; i < bh; i += MI_SIZE) {
+ const int r = mi_row_in_sb + (i >> MI_SIZE_LOG2);
+ for (int j = 0; j < bw; j += MI_SIZE) {
+ const int c = mi_col_in_sb + (j >> MI_SIZE_LOG2);
+ const int mi_offset = r * mi_size_wide[sb_size] + c;
+ Block4x4VarInfo *block_4x4_var_info =
+ &x->src_var_info_of_4x4_sub_blocks[mi_offset];
+ int src_var = block_4x4_var_info->var;
+ double log_src_var = block_4x4_var_info->log_var;
+      // Compute the average of log(1 + variance) for the source block from
+      // 4x4 sub-block variance values. Calculate and store the 4x4 sub-block
+      // variance and log(1 + variance) if the values present in
+      // src_var_info_of_4x4_sub_blocks are invalid; otherwise reuse the
+      // readily available valid values.
+ if (src_var < 0) {
+ src_var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride, is_hbd);
+ block_4x4_var_info->var = src_var;
+ log_src_var = log1p(src_var / 16.0);
+ block_4x4_var_info->log_var = log_src_var;
+ } else {
+ // When source variance is already calculated and available for
+ // retrieval, check if log(1 + variance) is also available. If it is
+ // available, then retrieve from buffer. Else, calculate the same and
+ // store to the buffer.
+ if (log_src_var < 0) {
+ log_src_var = log1p(src_var / 16.0);
+ block_4x4_var_info->log_var = log_src_var;
+ }
+ }
+ *avg_log_src_variance += log_src_var;
+
+ const int recon_var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+ xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j,
+ xd->plane[0].dst.stride, is_hbd);
+ *avg_log_recon_variance += log1p(recon_var / 16.0);
+ }
+ }
+
+ const int blocks = (bw * bh) / 16;
+ *avg_log_src_variance /= (double)blocks;
+ *avg_log_recon_variance /= (double)blocks;
+}
+
+// Returns a factor to be applied to the RD value based on how well the
+// reconstructed block variance matches the source variance.
+static double intra_rd_variance_factor(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
+ double threshold = INTRA_RD_VAR_THRESH(cpi->oxcf.speed);
+ // For non-positive threshold values, the comparison of source and
+ // reconstructed variances with threshold evaluates to false
+  // (src_var < threshold / rec_var < threshold) as these metrics are greater
+  // than 0. Hence further calculations are skipped.
+ if (threshold <= 0) return 1.0;
+
+ double variance_rd_factor = 1.0;
+ double avg_log_src_variance = 0.0;
+ double avg_log_recon_variance = 0.0;
+ double var_diff = 0.0;
+
+ compute_avg_log_variance(cpi, x, bs, &avg_log_src_variance,
+ &avg_log_recon_variance);
+
+  // Don't allow 0, to prevent division by 0 below.
+ avg_log_src_variance += 0.000001;
+ avg_log_recon_variance += 0.000001;
+
+ if (avg_log_src_variance >= avg_log_recon_variance) {
+ var_diff = (avg_log_src_variance - avg_log_recon_variance);
+ if ((var_diff > 0.5) && (avg_log_recon_variance < threshold)) {
+ variance_rd_factor = 1.0 + ((var_diff * 2) / avg_log_src_variance);
+ }
+ } else {
+ var_diff = (avg_log_recon_variance - avg_log_src_variance);
+ if ((var_diff > 0.5) && (avg_log_src_variance < threshold)) {
+ variance_rd_factor = 1.0 + (var_diff / (2 * avg_log_src_variance));
+ }
+ }
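+  // Worked example (hypothetical numbers): with avg_log_src_variance == 2.0,
+  // avg_log_recon_variance == 1.0 and threshold == 2.5, var_diff == 1.0
+  // exceeds 0.5 and the recon variance is below the threshold, so
+  // variance_rd_factor == 1.0 + (1.0 * 2) / 2.0 == 2.0.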
+
+  // Limit the adjustment.
+ variance_rd_factor = AOMMIN(3.0, variance_rd_factor);
+
+ return variance_rd_factor;
+}
+/*!\endcond */
+
+/*!\brief Search for the best filter_intra mode when coding intra frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \return Returns 1 if a new filter_intra mode is selected; 0 otherwise.
+ */
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, int mode_cost,
+ PREDICTION_MODE best_mode_so_far,
+ int64_t *best_rd, int64_t *best_model_rd,
+ PICK_MODE_CONTEXT *ctx) {
+ // Skip the evaluation of filter intra modes.
+ if (cpi->sf.intra_sf.prune_filter_intra_level == 2) return 0;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int filter_intra_selected_flag = 0;
+ FILTER_INTRA_MODE mode;
+ TX_SIZE best_tx_size = TX_8X8;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ av1_zero(filter_intra_mode_info);
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+
+ // Skip the evaluation of filter-intra if cached MB_MODE_INFO does not have
+ // filter-intra as winner.
+ if (x->use_mb_mode_cache &&
+ !x->mb_mode_cache->filter_intra_mode_info.use_filter_intra)
+ return 0;
+
+ for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+ int64_t this_rd;
+ RD_STATS tokenonly_rd_stats;
+ mbmi->filter_intra_mode_info.filter_intra_mode = mode;
+
+ if ((cpi->sf.intra_sf.prune_filter_intra_level == 1) &&
+ !(av1_derived_filter_intra_mode_used_flag[best_mode_so_far] &
+ (1 << mode)))
+ continue;
+
+ // Skip the evaluation of modes that do not match with the winner mode in
+ // x->mb_mode_cache.
+ if (x->use_mb_mode_cache &&
+ mode != x->mb_mode_cache->filter_intra_mode_info.filter_intra_mode)
+ continue;
+
+ if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) {
+ continue;
+ }
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ const int this_rate =
+ tokenonly_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+ // Visual quality adjustment based on recon vs source variance.
+ if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) {
+ this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize));
+ }
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ best_tx_size = mbmi->tx_size;
+ filter_intra_mode_info = mbmi->filter_intra_mode_info;
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip_txfm;
+ filter_intra_selected_flag = 1;
+ }
+ }
+
+ if (filter_intra_selected_flag) {
+ mbmi->mode = DC_PRED;
+ mbmi->tx_size = best_tx_size;
+ mbmi->filter_intra_mode_info = filter_intra_mode_info;
+ av1_copy_array(ctx->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+ int *val_count, int *num_colors) {
+ const int max_pix_val = 1 << 8;
+ memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ const int this_val = src[r * stride + c];
+ assert(this_val < max_pix_val);
+ ++val_count[this_val];
+ }
+ }
+ int n = 0;
+ for (int i = 0; i < max_pix_val; ++i) {
+ if (val_count[i]) ++n;
+ }
+ *num_colors = n;
+}
+
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+ int cols, int bit_depth, int *val_count,
+ int *bin_val_count, int *num_color_bins,
+ int *num_colors) {
+ assert(bit_depth <= 12);
+ const int max_bin_val = 1 << 8;
+ const int max_pix_val = 1 << bit_depth;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ memset(bin_val_count, 0, max_bin_val * sizeof(val_count[0]));
+ if (val_count != NULL)
+ memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+      /*
+       * Down-convert the pixels to the 8-bit domain before counting.
+       * This provides consistency of behavior for palette search
+       * between lbd and hbd encodes. These down-converted pixels
+       * are only used for calculating the threshold (n).
+       */
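+      // For example (hypothetical values): with bit_depth == 10, pixels 612
+      // and 615 both fall in bin 153 (612 >> 2 == 615 >> 2), counting as one
+      // color bin but as two distinct high bit-depth colors.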
+ const int this_val = ((src[r * stride + c]) >> (bit_depth - 8));
+ assert(this_val < max_bin_val);
+ if (this_val >= max_bin_val) continue;
+ ++bin_val_count[this_val];
+ if (val_count != NULL) ++val_count[(src[r * stride + c])];
+ }
+ }
+ int n = 0;
+ // Count the colors based on 8-bit domain used to gate the palette path
+ for (int i = 0; i < max_bin_val; ++i) {
+ if (bin_val_count[i]) ++n;
+ }
+ *num_color_bins = n;
+
+ // Count the actual hbd colors used to create top_colors
+ n = 0;
+ if (val_count != NULL) {
+ for (int i = 0; i < max_pix_val; ++i) {
+ if (val_count[i]) ++n;
+ }
+ *num_colors = n;
+ }
+}
+
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi,
+ int reorder_delta_angle_eval) {
+ if (mode_idx < INTRA_MODE_END) {
+ mbmi->mode = intra_rd_search_mode_order[mode_idx];
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ } else {
+ mbmi->mode = (mode_idx - INTRA_MODE_END) / (MAX_ANGLE_DELTA * 2) + V_PRED;
+ int delta_angle_eval_idx =
+ (mode_idx - INTRA_MODE_END) % (MAX_ANGLE_DELTA * 2);
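+    // Illustrative mapping (assuming INTRA_MODE_END == 13 and
+    // MAX_ANGLE_DELTA == 3): mode_idx == 14 gives mbmi->mode == V_PRED with
+    // delta_angle_eval_idx == 1, which maps to angle delta -2 in the default
+    // order below.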
+ if (reorder_delta_angle_eval) {
+ mbmi->angle_delta[PLANE_TYPE_Y] =
+ luma_delta_angles_order[delta_angle_eval_idx];
+ } else {
+ mbmi->angle_delta[PLANE_TYPE_Y] =
+ (delta_angle_eval_idx < 3 ? (delta_angle_eval_idx - 3)
+ : (delta_angle_eval_idx - 2));
+ }
+ }
+}
+
+static AOM_INLINE int get_model_rd_index_for_pruning(
+ const MACROBLOCK *const x,
+ const INTRA_MODE_SPEED_FEATURES *const intra_sf) {
+ const int top_intra_model_count_allowed =
+ intra_sf->top_intra_model_count_allowed;
+ if (!intra_sf->adapt_top_model_rd_count_using_neighbors)
+ return top_intra_model_count_allowed - 1;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const PREDICTION_MODE mode = xd->mi[0]->mode;
+ int model_rd_index_for_pruning = top_intra_model_count_allowed - 1;
+ int is_left_mode_neq_cur_mode = 0, is_above_mode_neq_cur_mode = 0;
+ if (xd->left_available)
+ is_left_mode_neq_cur_mode = xd->left_mbmi->mode != mode;
+ if (xd->up_available)
+ is_above_mode_neq_cur_mode = xd->above_mbmi->mode != mode;
+ // The pruning of luma intra modes is made more aggressive at lower quantizers
+ // and vice versa. The value for model_rd_index_for_pruning is derived as
+ // follows.
+ // qidx 0 to 127: Reduce the index of a candidate used for comparison only if
+ // the current mode does not match either of the available neighboring modes.
+ // qidx 128 to 255: Reduce the index of a candidate used for comparison only
+ // if the current mode does not match both the available neighboring modes.
+ if (x->qindex <= 127) {
+ if (is_left_mode_neq_cur_mode || is_above_mode_neq_cur_mode)
+ model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0);
+ } else {
+ if (is_left_mode_neq_cur_mode && is_above_mode_neq_cur_mode)
+ model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0);
+ }
+ return model_rd_index_for_pruning;
+}
+
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+ int64_t top_intra_model_rd[], int max_model_cnt_allowed,
+ int model_rd_index_for_pruning) {
+ const double thresh_best = 1.50;
+ const double thresh_top = 1.00;
+ for (int i = 0; i < max_model_cnt_allowed; i++) {
+ if (this_model_rd < top_intra_model_rd[i]) {
+ for (int j = max_model_cnt_allowed - 1; j > i; j--) {
+ top_intra_model_rd[j] = top_intra_model_rd[j - 1];
+ }
+ top_intra_model_rd[i] = this_model_rd;
+ break;
+ }
+ }
+ if (top_intra_model_rd[model_rd_index_for_pruning] != INT64_MAX &&
+ this_model_rd >
+ thresh_top * top_intra_model_rd[model_rd_index_for_pruning])
+ return 1;
+
+ if (this_model_rd != INT64_MAX &&
+ this_model_rd > thresh_best * (*best_model_rd))
+ return 1;
+ if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+ return 0;
+}
+
+// Run the RD calculation with the given chroma intra prediction angle, and
+// return the RD cost. Update the best mode info if the RD cost is the best
+// so far.
+static int64_t pick_intra_angle_routine_sbuv(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+ int *best_angle_delta, int64_t *best_rd) {
+ MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+ assert(!is_inter_block(mbmi));
+ int this_rate;
+ int64_t this_rd;
+ RD_STATS tokenonly_rd_stats;
+
+ if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
+ return INT64_MAX;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+ *rate = this_rate;
+ rd_stats->rate = tokenonly_rd_stats.rate;
+ rd_stats->dist = tokenonly_rd_stats.dist;
+ rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm;
+ }
+ return this_rd;
+}
+
+/*!\brief Search for the best angle delta for chroma prediction
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Given a chroma directional intra prediction mode, this function will try to
+ * estimate the best delta_angle.
+ *
+ * \returns Return if there is a new mode with smaller rdcost than best_rd.
+ */
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int rate_overhead,
+ int64_t best_rd, int *rate,
+ RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ int i, angle_delta, best_angle_delta = 0;
+ int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+ rd_stats->rate = INT_MAX;
+ rd_stats->skip_txfm = 0;
+ rd_stats->dist = INT64_MAX;
+ for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
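+  // rd_cost[2 * d + i] holds the cost for angle_delta == (1 - 2 * i) * d,
+  // i.e. i == 0 stores +d and i == 1 stores -d; e.g. rd_cost[5] is the cost
+  // of angle delta -2.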
+
+ for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (i = 0; i < 2; ++i) {
+ best_rd_in = (best_rd == INT64_MAX)
+ ? INT64_MAX
+ : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
+ mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+ this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
+ best_rd_in, rate, rd_stats,
+ &best_angle_delta, &best_rd);
+ rd_cost[2 * angle_delta + i] = this_rd;
+ if (angle_delta == 0) {
+ if (this_rd == INT64_MAX) return 0;
+ rd_cost[1] = this_rd;
+ break;
+ }
+ }
+ }
+
+ assert(best_rd != INT64_MAX);
+ for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ int64_t rd_thresh;
+ for (i = 0; i < 2; ++i) {
+ int skip_search = 0;
+ rd_thresh = best_rd + (best_rd >> 5);
+ if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+ rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+ skip_search = 1;
+ if (!skip_search) {
+ mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+ pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ rate, rd_stats, &best_angle_delta,
+ &best_rd);
+ }
+ }
+ }
+
+ mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
+ return rd_stats->rate != INT_MAX;
+}
+
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+ (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
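+// Illustrative example (assuming CFL_SIGN_ZERO == 0, CFL_SIGN_NEG == 1,
+// CFL_SIGN_POS == 2 and CFL_SIGNS == 3): for the U plane with sign a and a
+// dummy V sign b, the joint sign is a * 3 + b - 1; the (ZERO, ZERO)
+// combination maps to -1 and is therefore never a valid joint sign.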
+
+static void cfl_idx_to_sign_and_alpha(int cfl_idx, CFL_SIGN_TYPE *cfl_sign,
+ int *cfl_alpha) {
+ int cfl_linear_idx = cfl_idx - CFL_INDEX_ZERO;
+ if (cfl_linear_idx == 0) {
+ *cfl_sign = CFL_SIGN_ZERO;
+ *cfl_alpha = 0;
+ } else {
+ *cfl_sign = cfl_linear_idx > 0 ? CFL_SIGN_POS : CFL_SIGN_NEG;
+ *cfl_alpha = abs(cfl_linear_idx) - 1;
+ }
+}
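+// Illustrative examples (assuming CFL_INDEX_ZERO == 16): cfl_idx == 16 maps
+// to (CFL_SIGN_ZERO, alpha 0), cfl_idx == 20 to (CFL_SIGN_POS, alpha 3) and
+// cfl_idx == 10 to (CFL_SIGN_NEG, alpha 5).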
+
+static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int plane, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, int cfl_idx,
+ int fast_mode, RD_STATS *rd_stats) {
+ assert(IMPLIES(fast_mode, rd_stats == NULL));
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int cfl_plane = get_cfl_pred_type(plane);
+ CFL_SIGN_TYPE cfl_sign;
+ int cfl_alpha;
+ cfl_idx_to_sign_and_alpha(cfl_idx, &cfl_sign, &cfl_alpha);
+  // We only build CFL for a given plane; the other plane's sign is a dummy.
+ int dummy_sign = CFL_SIGN_NEG;
+ const int8_t orig_cfl_alpha_signs = mbmi->cfl_alpha_signs;
+ const uint8_t orig_cfl_alpha_idx = mbmi->cfl_alpha_idx;
+ mbmi->cfl_alpha_signs =
+ PLANE_SIGN_TO_JOINT_SIGN(cfl_plane, cfl_sign, dummy_sign);
+ mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha;
+ int64_t cfl_cost;
+ if (fast_mode) {
+ cfl_cost =
+ intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hadamard=*/0);
+ } else {
+ av1_init_rd_stats(rd_stats);
+ av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize,
+ tx_size, FTXS_NONE, 0);
+ av1_rd_cost_update(x->rdmult, rd_stats);
+ cfl_cost = rd_stats->rdcost;
+ }
+ mbmi->cfl_alpha_signs = orig_cfl_alpha_signs;
+ mbmi->cfl_alpha_idx = orig_cfl_alpha_idx;
+ return cfl_cost;
+}
+
+static const int cfl_dir_ls[2] = { 1, -1 };
+
+// If cfl_search_range is CFL_MAGS_SIZE, return CFL_INDEX_ZERO. Otherwise
+// return the index of the best alpha found using intra_model_rd().
+static int cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int plane, TX_SIZE tx_size,
+ int cfl_search_range) {
+ assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+
+ if (cfl_search_range == CFL_MAGS_SIZE) return CFL_INDEX_ZERO;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->uv_mode == UV_CFL_PRED);
+ const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+ int est_best_cfl_idx = CFL_INDEX_ZERO;
+ int fast_mode = 1;
+ int start_cfl_idx = CFL_INDEX_ZERO;
+ int64_t best_cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize,
+ start_cfl_idx, fast_mode, NULL);
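+  // Greedy search sketch: starting from alpha index 0, walk outward in each
+  // direction and stop at the first index whose estimated (fast-mode) cost
+  // no longer improves on the best seen so far.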
+ for (int si = 0; si < 2; ++si) {
+ const int dir = cfl_dir_ls[si];
+ for (int i = 1; i < CFL_MAGS_SIZE; ++i) {
+ int cfl_idx = start_cfl_idx + dir * i;
+ if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
+ int64_t cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize,
+ cfl_idx, fast_mode, NULL);
+ if (cfl_cost < best_cfl_cost) {
+ best_cfl_cost = cfl_cost;
+ est_best_cfl_idx = cfl_idx;
+ } else {
+ break;
+ }
+ }
+ }
+ return est_best_cfl_idx;
+}
+
+static AOM_INLINE void set_invalid_cfl_parameters(
+ uint8_t *best_cfl_alpha_idx, int8_t *best_cfl_alpha_signs) {
+ *best_cfl_alpha_idx = 0;
+ *best_cfl_alpha_signs = 0;
+}
+
+static void cfl_pick_plane_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int plane, TX_SIZE tx_size, int cfl_search_range,
+ RD_STATS cfl_rd_arr[CFL_MAGS_SIZE],
+ int est_best_cfl_idx) {
+ assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->uv_mode == UV_CFL_PRED);
+ const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+ for (int cfl_idx = 0; cfl_idx < CFL_MAGS_SIZE; ++cfl_idx) {
+ av1_invalid_rd_stats(&cfl_rd_arr[cfl_idx]);
+ }
+
+ int fast_mode = 0;
+ int start_cfl_idx = est_best_cfl_idx;
+ cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode,
+ &cfl_rd_arr[start_cfl_idx]);
+
+ if (cfl_search_range == 1) return;
+
+ for (int si = 0; si < 2; ++si) {
+ const int dir = cfl_dir_ls[si];
+ for (int i = 1; i < cfl_search_range; ++i) {
+ int cfl_idx = start_cfl_idx + dir * i;
+ if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
+ cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode,
+ &cfl_rd_arr[cfl_idx]);
+ }
+ }
+}
+
+/*!\brief Pick the optimal parameters for Chroma to Luma (CFL) component
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ *
+ * This function will use DCT_DCT followed by computing SATD (sum of absolute
+ * transformed differences) to estimate the RD score and find the best possible
+ * CFL parameter.
+ *
+ * Then the function will apply a full RD search near the best possible CFL
+ * parameter to find the best actual CFL parameter.
+ *
+ * Side effect:
+ * We use the buffers in x->plane[] and xd->plane[] as throw-away buffers for
+ * RD
+ * search.
+ *
+ * \param[in] x Encoder prediction block structure.
+ * \param[in] cpi Top-level encoder instance structure.
+ * \param[in] tx_size Transform size.
+ * \param[in] ref_best_rd Reference best RD.
+ * \param[in] cfl_search_range The search range of full RD search near the
+ * estimated best CFL parameter.
+ *
+ * \param[out] best_rd_stats RD stats of the best CFL parameter
+ * \param[out] best_cfl_alpha_idx Best CFL alpha index
+ * \param[out] best_cfl_alpha_signs Best CFL joint signs
+ *
+ */
+static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
+ TX_SIZE tx_size, int64_t ref_best_rd,
+ int cfl_search_range, RD_STATS *best_rd_stats,
+ uint8_t *best_cfl_alpha_idx,
+ int8_t *best_cfl_alpha_signs) {
+ assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+ const ModeCosts *mode_costs = &x->mode_costs;
+ RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE];
+ RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE];
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int est_best_cfl_idx_u, est_best_cfl_idx_v;
+
+ av1_invalid_rd_stats(best_rd_stats);
+
+  // As the dc pred data is the same for different values of alpha, enable the
+ // caching of dc pred data. Call clear_cfl_dc_pred_cache_flags() before
+ // returning to avoid the unintentional usage of cached dc pred data.
+ xd->cfl.use_dc_pred_cache = true;
+ // Evaluate alpha parameter of each chroma plane.
+ est_best_cfl_idx_u =
+ cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range);
+ est_best_cfl_idx_v =
+ cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range);
+
+ if (cfl_search_range == 1) {
+    // For cfl_search_range=1, further refinement of alpha is not enabled.
+    // Hence CfL index=0 for both the chroma planes implies an invalid CfL
+    // mode.
+ if (est_best_cfl_idx_u == CFL_INDEX_ZERO &&
+ est_best_cfl_idx_v == CFL_INDEX_ZERO) {
+ set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+ clear_cfl_dc_pred_cache_flags(&xd->cfl);
+ return 0;
+ }
+
+ int cfl_alpha_u, cfl_alpha_v;
+ CFL_SIGN_TYPE cfl_sign_u, cfl_sign_v;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ cfl_idx_to_sign_and_alpha(est_best_cfl_idx_u, &cfl_sign_u, &cfl_alpha_u);
+ cfl_idx_to_sign_and_alpha(est_best_cfl_idx_v, &cfl_sign_v, &cfl_alpha_v);
+ const int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
+ // Compute alpha and mode signaling rate.
+ const int rate_overhead =
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u] +
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v] +
+ mode_costs
+ ->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_CFL_PRED];
+ // Skip the CfL mode evaluation if the RD cost derived using the rate needed
+ // to signal the CfL mode and alpha parameter exceeds the ref_best_rd.
+ if (RDCOST(x->rdmult, rate_overhead, 0) > ref_best_rd) {
+ set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+ clear_cfl_dc_pred_cache_flags(&xd->cfl);
+ return 0;
+ }
+ }
+
+ // Compute the rd cost of each chroma plane using the alpha parameters which
+ // were already evaluated.
+ cfl_pick_plane_rd(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u,
+ est_best_cfl_idx_u);
+ cfl_pick_plane_rd(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v,
+ est_best_cfl_idx_v);
+
+ clear_cfl_dc_pred_cache_flags(&xd->cfl);
+
+ for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) {
+ if (cfl_rd_arr_u[ui].rate == INT_MAX) continue;
+ int cfl_alpha_u;
+ CFL_SIGN_TYPE cfl_sign_u;
+ cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u);
+ for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) {
+ if (cfl_rd_arr_v[vi].rate == INT_MAX) continue;
+ int cfl_alpha_v;
+ CFL_SIGN_TYPE cfl_sign_v;
+ cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v);
+ // cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO is not a
+ // valid parameter for CFL
+ if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue;
+ int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
+ RD_STATS rd_stats = cfl_rd_arr_u[ui];
+ av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]);
+ if (rd_stats.rate != INT_MAX) {
+ rd_stats.rate +=
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u];
+ rd_stats.rate +=
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v];
+ }
+ av1_rd_cost_update(x->rdmult, &rd_stats);
+ if (rd_stats.rdcost < best_rd_stats->rdcost) {
+ *best_rd_stats = rd_stats;
+ *best_cfl_alpha_idx =
+ (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v;
+ *best_cfl_alpha_signs = joint_sign;
+ }
+ }
+ }
+ if (best_rd_stats->rdcost >= ref_best_rd) {
+ av1_invalid_rd_stats(best_rd_stats);
+ // Set invalid CFL parameters here since the rdcost is not better than
+ // ref_best_rd.
+ set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+ return 0;
+ }
+ return 1;
+}
+
+static bool should_prune_chroma_smooth_pred_based_on_source_variance(
+ const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize) {
+ if (!cpi->sf.intra_sf.prune_smooth_intra_mode_for_chroma) return false;
+
+ // If the source variance of both chroma planes is less than 20 (empirically
+ // derived), prune UV_SMOOTH_PRED.
+ for (int i = AOM_PLANE_U; i < av1_num_planes(&cpi->common); i++) {
+ const unsigned int variance = av1_get_perpixel_variance_facade(
+ cpi, &x->e_mbd, &x->plane[i].src, bsize, i);
+ if (variance >= 20) return false;
+ }
+ return true;
+}
+
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ MB_MODE_INFO best_mbmi = *mbmi;
+ int64_t best_rd = INT64_MAX, this_rd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+
+ init_sbuv_mode(mbmi);
+
+ // Return if the current block does not correspond to a chroma block.
+ if (!xd->is_chroma_ref) {
+ *rate = 0;
+ *rate_tokenonly = 0;
+ *distortion = 0;
+ *skippable = 1;
+ return INT64_MAX;
+ }
+
+ // Only store reconstructed luma when there's chroma RDO. When there's no
+ // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+ xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+ if (xd->cfl.store_y) {
+ // Restore reconstructed luma values.
+ // TODO(chiyotsai@google.com): right now we are re-computing the txfm in
+    // this function every time we search through uv modes. There is some
+ // potential speed up here if we cache the result to avoid redundant
+ // computation.
+ av1_encode_intra_block_plane(cpi, x, mbmi->bsize, AOM_PLANE_Y,
+ DRY_RUN_NORMAL,
+ cpi->optimize_seg_arr[mbmi->segment_id]);
+ xd->cfl.store_y = 0;
+ }
+ IntraModeSearchState intra_search_state;
+ init_intra_mode_search_state(&intra_search_state);
+ const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+
+ // Search through all non-palette modes.
+ for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
+ int this_rate;
+ RD_STATS tokenonly_rd_stats;
+ UV_PREDICTION_MODE uv_mode = uv_rd_search_mode_order[mode_idx];
+
+ // Skip the current mode evaluation if the RD cost derived using the mode
+ // signaling rate exceeds the best_rd so far.
+ const int mode_rate =
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
+ if (RDCOST(x->rdmult, mode_rate, 0) > best_rd) continue;
+
+ PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ const int is_diagonal_mode = av1_is_diagonal_mode(intra_mode);
+ const int is_directional_mode = av1_is_directional_mode(intra_mode);
+
+ if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra)
+ continue;
+ if (is_directional_mode &&
+ !cpi->oxcf.intra_mode_cfg.enable_directional_intra)
+ continue;
+
+ if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
+ (1 << uv_mode)))
+ continue;
+ if (!intra_mode_cfg->enable_smooth_intra && uv_mode >= UV_SMOOTH_PRED &&
+ uv_mode <= UV_SMOOTH_H_PRED)
+ continue;
+
+ if (!intra_mode_cfg->enable_paeth_intra && uv_mode == UV_PAETH_PRED)
+ continue;
+
+ assert(mbmi->mode < INTRA_MODES);
+ if (cpi->sf.intra_sf.prune_chroma_modes_using_luma_winner &&
+ !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << uv_mode)))
+ continue;
+
+ mbmi->uv_mode = uv_mode;
+
+ // Init variables for cfl and angle delta
+ const SPEED_FEATURES *sf = &cpi->sf;
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ if (uv_mode == UV_CFL_PRED) {
+ if (!cfl_allowed || !intra_mode_cfg->enable_cfl_intra) continue;
+ assert(!is_directional_mode);
+ const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+ if (!cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd,
+ sf->intra_sf.cfl_search_range, &tokenonly_rd_stats,
+ &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs)) {
+ continue;
+ }
+ } else if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) &&
+ intra_mode_cfg->enable_angle_delta) {
+ if (sf->intra_sf.chroma_intra_pruning_with_hog &&
+ !intra_search_state.dir_mode_skip_mask_ready) {
+ static const float thresh[2][4] = {
+ { -1.2f, 0.0f, 0.0f, 1.2f }, // Interframe
+ { -1.2f, -1.2f, -0.6f, 0.4f }, // Intraframe
+ };
+ const int is_chroma = 1;
+ const int is_intra_frame = frame_is_intra_only(cm);
+ prune_intra_mode_with_hog(
+ x, bsize, cm->seq_params->sb_size,
+ thresh[is_intra_frame]
+ [sf->intra_sf.chroma_intra_pruning_with_hog - 1],
+ intra_search_state.directional_mode_skip_mask, is_chroma);
+ intra_search_state.dir_mode_skip_mask_ready = 1;
+ }
+ if (intra_search_state.directional_mode_skip_mask[uv_mode]) {
+ continue;
+ }
+
+ // Search through angle delta
+ const int rate_overhead =
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
+ if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ &this_rate, &tokenonly_rd_stats))
+ continue;
+ } else {
+ if (uv_mode == UV_SMOOTH_PRED &&
+ should_prune_chroma_smooth_pred_based_on_source_variance(cpi, x,
+ bsize))
+ continue;
+
+ // Predict directly if we don't need to search for angle delta.
+ if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
+ continue;
+ }
+ }
+ const int mode_cost =
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip_txfm;
+ }
+ }
+
+ // Search palette mode
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mbmi->bsize);
+ if (try_palette) {
+ uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
+ av1_rd_pick_palette_intra_sbuv(
+ cpi, x,
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][UV_DC_PRED],
+ best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
+ distortion, skippable);
+ }
+
+ *mbmi = best_mbmi;
+ // Make sure we actually chose a mode
+ assert(best_rd < INT64_MAX);
+ return best_rd;
+}
+
+// Searches palette mode for luma channel in inter frame.
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
+ int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int rate2 = 0;
+ int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd;
+ int skippable = 0;
+ uint8_t *const best_palette_color_map =
+ x->palette_buffer->best_palette_color_map;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ MB_MODE_INFO best_mbmi_palette = *mbmi;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const intra_mode_cost =
+ mode_costs->mbmode_cost[size_group_lookup[bsize]];
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ av1_zero(pmi->palette_size);
+
+ RD_STATS rd_stats_y;
+ av1_invalid_rd_stats(&rd_stats_y);
+ av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map,
+ &best_rd_palette, &rd_stats_y.rate, NULL,
+ &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
+ ctx, best_blk_skip, best_tx_type_map);
+ if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) {
+ this_rd_cost->rdcost = INT64_MAX;
+ return skippable;
+ }
+
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+
+ skippable = rd_stats_y.skip_txfm;
+ distortion2 = rd_stats_y.dist;
+ rate2 = rd_stats_y.rate + ref_frame_cost;
+ if (num_planes > 1) {
+ if (intra_search_state->rate_uv_intra == INT_MAX) {
+ // We have not found any good uv mode yet, so we need to search for it.
+ TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+ av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra,
+ &intra_search_state->rate_uv_tokenonly,
+ &intra_search_state->dist_uvs,
+ &intra_search_state->skip_uvs, bsize, uv_tx);
+ intra_search_state->mode_uv = mbmi->uv_mode;
+ intra_search_state->pmi_uv = *pmi;
+ intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+ }
+
+    // At this point at least one good uv mode has been found, so copy its
+    // cached statistics over.
+ mbmi->uv_mode = intra_search_state->mode_uv;
+ pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
+ if (pmi->palette_size[1] > 0) {
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+ skippable = skippable && intra_search_state->skip_uvs;
+ distortion2 += intra_search_state->dist_uvs;
+ rate2 += intra_search_state->rate_uv_intra;
+ }
+
+ if (skippable) {
+ rate2 -= rd_stats_y.rate;
+ if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly;
+ rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1];
+ } else {
+ rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
+ }
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+ this_rd_cost->rate = rate2;
+ this_rd_cost->dist = distortion2;
+ this_rd_cost->rdcost = this_rd;
+ return skippable;
+}
+
+void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx,
+ RD_STATS *this_rd_cost, int64_t best_rd) {
+ MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int64_t best_rd_palette = best_rd, this_rd;
+ uint8_t *const best_palette_color_map =
+ x->palette_buffer->best_palette_color_map;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ MB_MODE_INFO best_mbmi_palette = *mbmi;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const intra_mode_cost =
+ mode_costs->mbmode_cost[size_group_lookup[bsize]];
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ av1_zero(pmi->palette_size);
+
+ RD_STATS rd_stats_y;
+ av1_invalid_rd_stats(&rd_stats_y);
+ av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map,
+ &best_rd_palette, &rd_stats_y.rate, NULL,
+ &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
+ ctx, best_blk_skip, best_tx_type_map);
+ if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) {
+ this_rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+
+ rd_stats_y.rate += ref_frame_cost;
+
+ if (rd_stats_y.skip_txfm) {
+ rd_stats_y.rate =
+ ref_frame_cost +
+ mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1];
+ } else {
+ rd_stats_y.rate +=
+ mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
+ }
+ this_rd = RDCOST(x->rdmult, rd_stats_y.rate, rd_stats_y.dist);
+ this_rd_cost->rate = rd_stats_y.rate;
+ this_rd_cost->dist = rd_stats_y.dist;
+ this_rd_cost->rdcost = this_rd;
+ this_rd_cost->skip_txfm = rd_stats_y.skip_txfm;
+}
+
+/*!\brief Get the intra prediction by searching through tx_type and tx_size.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Currently this function is only used in the intra frame code path for
+ * winner-mode processing.
+ *
+ * \return Returns whether the current mode is an improvement over best_rd.
+ */
+static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, const int *bmode_costs,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable,
+ MB_MODE_INFO *best_mbmi,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ RD_STATS rd_stats;
+  // To improve the txfm search, avoid rd based breakouts during winner mode
+  // evaluation. Hence ref_best_rd is passed as INT64_MAX by default when the
+  // speed feature use_rd_based_breakout_for_intra_tx_search is disabled.
+ int64_t ref_best_rd = cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search
+ ? *best_rd
+ : INT64_MAX;
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, ref_best_rd);
+ if (rd_stats.rate == INT_MAX) return 0;
+ int this_rate_tokenonly = rd_stats.rate;
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+ // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
+ // in the tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+ const int this_rate =
+ rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0);
+ const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_mbmi = *mbmi;
+ *best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = rd_stats.dist;
+ *skippable = rd_stats.skip_txfm;
+ av1_copy_array(ctx->blk_skip, x->txfm_search_info.blk_skip,
+ ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ return 1;
+ }
+ return 0;
+}
+
+/*!\brief Search for the best filter_intra mode when coding inter frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \remark Returns nothing, but updates the mbmi and rd_stats.
+ */
+static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ const PICK_MODE_CONTEXT *ctx,
+ RD_STATS *rd_stats_y, int mode_cost,
+ int64_t best_rd,
+ int64_t best_rd_so_far) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->mode == DC_PRED &&
+ av1_filter_intra_allowed_bsize(&cpi->common, bsize));
+
+ RD_STATS rd_stats_y_fi;
+ int filter_intra_selected_flag = 0;
+ TX_SIZE best_tx_size = mbmi->tx_size;
+ FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES;
+ ++fi_mode) {
+ mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, best_rd);
+ if (rd_stats_y_fi.rate == INT_MAX) continue;
+ const int this_rate_tmp =
+ rd_stats_y_fi.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ const int64_t this_rd_tmp =
+ RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+
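+    // Stop the filter intra mode search early when the current rd cost is
+    // already more than twice best_rd; the remaining filter intra modes are
+    // unlikely to close that gap.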
+ if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) {
+ break;
+ }
+ if (this_rd_tmp < best_rd_so_far) {
+ best_tx_size = mbmi->tx_size;
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+ best_fi_mode = fi_mode;
+ *rd_stats_y = rd_stats_y_fi;
+ filter_intra_selected_flag = 1;
+ best_rd_so_far = this_rd_tmp;
+ }
+ }
+
+ mbmi->tx_size = best_tx_size;
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+
+ if (filter_intra_selected_flag) {
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
+ } else {
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ }
+}
+
+// Evaluate a given luma intra-mode in inter frames.
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+ int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+ int64_t *best_model_rd,
+ int64_t top_intra_model_rd[]) {
+ const AV1_COMMON *cm = &cpi->common;
+ const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->ref_frame[0] == INTRA_FRAME);
+ const PREDICTION_MODE mode = mbmi->mode;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int mode_cost =
+ mode_costs->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+
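+  // Compute a lower bound on the rate of this mode from the terms that are
+  // already known: the mode cost, the intra cost penalty (added for all modes
+  // except DC_PRED and PAETH_PRED) and the cheaper of the two skip_txfm
+  // signalling costs. With zero distortion, RDCOST() of this rate bounds the
+  // final rd cost from below, so the intra search can be terminated early if
+  // the bound already exceeds best_rd.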
+ int known_rate = mode_cost;
+ const int intra_cost_penalty = av1_get_intra_cost_penalty(
+ cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q,
+ cm->seq_params->bit_depth);
+
+ if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty;
+ known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0],
+ mode_costs->skip_txfm_cost[skip_ctx][1]);
+ const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
+ if (known_rd > best_rd) {
+ intra_search_state->skip_intra_modes = 1;
+ return 0;
+ }
+
+ const int is_directional_mode = av1_is_directional_mode(mode);
+ if (is_directional_mode && av1_use_angle_delta(bsize) &&
+ cpi->oxcf.intra_mode_cfg.enable_angle_delta) {
+ if (intra_sf->intra_pruning_with_hog &&
+ !intra_search_state->dir_mode_skip_mask_ready) {
+ const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f };
+ const int is_chroma = 0;
+ prune_intra_mode_with_hog(x, bsize, cm->seq_params->sb_size,
+ thresh[intra_sf->intra_pruning_with_hog - 1],
+ intra_search_state->directional_mode_skip_mask,
+ is_chroma);
+ intra_search_state->dir_mode_skip_mask_ready = 1;
+ }
+ if (intra_search_state->directional_mode_skip_mask[mode]) return 0;
+ }
+ const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+ const int64_t this_model_rd =
+ intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+
+ const int model_rd_index_for_pruning =
+ get_model_rd_index_for_pruning(x, intra_sf);
+
+ if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd,
+ intra_sf->top_intra_model_count_allowed,
+ model_rd_index_for_pruning))
+ return 0;
+ av1_init_rd_stats(rd_stats_y);
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd);
+
+ // Pick filter intra modes.
+ if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+ int try_filter_intra = 1;
+ int64_t best_rd_so_far = INT64_MAX;
+ if (rd_stats_y->rate != INT_MAX) {
+ // best_rd_so_far is the rdcost of DC_PRED without using filter_intra.
+ // Later, in filter intra search, best_rd_so_far is used for comparison.
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ const int tmp_rate =
+ rd_stats_y->rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
+ try_filter_intra = (best_rd_so_far / 2) <= best_rd;
+ } else if (intra_sf->skip_filter_intra_in_inter_frames >= 1) {
+      // As the rd cost of the luma intra DC mode exceeds best_rd (i.e.,
+      // rd_stats_y->rate == INT_MAX), skip the evaluation of filter intra
+      // modes.
+ try_filter_intra = 0;
+ }
+
+ if (try_filter_intra) {
+ handle_filter_intra_mode(cpi, x, bsize, ctx, rd_stats_y, mode_cost,
+ best_rd, best_rd_so_far);
+ }
+ }
+
+ if (rd_stats_y->rate == INT_MAX) return 0;
+
+ *mode_cost_y = intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ const int rate_y = rd_stats_y->skip_txfm
+ ? mode_costs->skip_txfm_cost[skip_ctx][1]
+ : rd_stats_y->rate;
+ *rd_y = RDCOST(x->rdmult, rate_y + *mode_cost_y, rd_stats_y->dist);
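+  // Skip the remaining intra modes when the luma rd cost alone already
+  // exceeds best_rd by more than 25%. The INT64_MAX / 2 check guards the
+  // threshold computation against overflow.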
+ if (best_rd < (INT64_MAX / 2) && *rd_y > (best_rd + (best_rd >> 2))) {
+ intra_search_state->skip_intra_modes = 1;
+ return 0;
+ }
+
+ return 1;
+}
+
+int av1_search_intra_uv_modes_in_interframe(
+ IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+ const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->ref_frame[0] == INTRA_FRAME);
+
+ // TODO(chiyotsai@google.com): Consolidate the chroma search code here with
+ // the one in av1_search_palette_mode.
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cm->features.allow_screen_content_tools, mbmi->bsize);
+
+ assert(intra_search_state->rate_uv_intra == INT_MAX);
+ if (intra_search_state->rate_uv_intra == INT_MAX) {
+ // If no good uv-predictor had been found, search for it.
+ const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+ av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra,
+ &intra_search_state->rate_uv_tokenonly,
+ &intra_search_state->dist_uvs,
+ &intra_search_state->skip_uvs, bsize, uv_tx);
+ intra_search_state->mode_uv = mbmi->uv_mode;
+ if (try_palette) intra_search_state->pmi_uv = *pmi;
+ intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+
+ const int uv_rate = intra_search_state->rate_uv_tokenonly;
+ const int64_t uv_dist = intra_search_state->dist_uvs;
+ const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+ if (uv_rd > best_rd) {
+ // If there is no good intra uv-mode available, we can skip all intra
+ // modes.
+ intra_search_state->skip_intra_modes = 1;
+ return 0;
+ }
+ }
+
+ // If we are here, then the encoder has found at least one good intra uv
+ // predictor, so we can directly copy its statistics over.
+  // TODO(any): the stats here are not correct if the best uv mode is CFL but
+  // the best y mode is palette.
+ rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly;
+ rd_stats_uv->dist = intra_search_state->dist_uvs;
+ rd_stats_uv->skip_txfm = intra_search_state->skip_uvs;
+ rd_stats->skip_txfm = rd_stats_y->skip_txfm && rd_stats_uv->skip_txfm;
+ mbmi->uv_mode = intra_search_state->mode_uv;
+ if (try_palette) {
+ pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+
+ return 1;
+}
+
+// Checks if odd delta angles can be pruned based on rdcosts of even delta
+// angles of the corresponding directional mode.
+static AOM_INLINE int prune_luma_odd_delta_angles_using_rd_cost(
+ const MB_MODE_INFO *const mbmi, const int64_t *const intra_modes_rd_cost,
+ int64_t best_rd, int prune_luma_odd_delta_angles_in_intra) {
+ const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y];
+ if (!prune_luma_odd_delta_angles_in_intra ||
+ !av1_is_directional_mode(mbmi->mode) || !(abs(luma_delta_angle) & 1) ||
+ best_rd == INT64_MAX)
+ return 0;
+
+ const int64_t rd_thresh = best_rd + (best_rd >> 3);
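+  // That is, rd_thresh = 1.125 * best_rd.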
+
+ // Neighbour rdcosts are considered for pruning of odd delta angles as
+ // mentioned below:
+ // Delta angle Delta angle rdcost
+ // to be pruned to be considered
+ // -3 -2
+ // -1 -2, 0
+ // 1 0, 2
+ // 3 2
+ return intra_modes_rd_cost[luma_delta_angle + MAX_ANGLE_DELTA] > rd_thresh &&
+ intra_modes_rd_cost[luma_delta_angle + MAX_ANGLE_DELTA + 2] >
+ rd_thresh;
+}
+
+// Finds the best non-intrabc mode on an intra frame.
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, int64_t best_rd,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ int64_t best_model_rd = INT64_MAX;
+ int is_directional_mode;
+ uint8_t directional_mode_skip_mask[INTRA_MODES] = { 0 };
+  // Flag indicating whether the rd of any intra mode evaluated here beats the
+  // best_rd passed to this function.
+ int beat_best_rd = 0;
+ const int *bmode_costs;
+ const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mbmi->bsize);
+ uint8_t *best_palette_color_map =
+ try_palette ? x->palette_buffer->best_palette_color_map : NULL;
+ const MB_MODE_INFO *above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *left_mi = xd->left_mbmi;
+ const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[A];
+ const int left_ctx = intra_mode_context[L];
+ bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
+
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf;
+ if (intra_sf->intra_pruning_with_hog) {
+ // Less aggressive thresholds are used here than those used in inter frame
+ // encoding in av1_handle_intra_y_mode() because we want key frames/intra
+ // frames to have higher quality.
+ const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f };
+ const int is_chroma = 0;
+ prune_intra_mode_with_hog(x, bsize, cpi->common.seq_params->sb_size,
+ thresh[intra_sf->intra_pruning_with_hog - 1],
+ directional_mode_skip_mask, is_chroma);
+ }
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ pmi->palette_size[0] = 0;
+
+ // Set params for mode evaluation
+ set_mode_eval_params(cpi, x, MODE_EVAL);
+
+ MB_MODE_INFO best_mbmi = *mbmi;
+ const int max_winner_mode_count =
+ winner_mode_count_allowed[cpi->sf.winner_mode_sf.multi_winner_mode_type];
+ zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats);
+ x->winner_mode_count = 0;
+
+ // Searches the intra-modes except for intrabc, palette, and filter_intra.
+ int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+ for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+ top_intra_model_rd[i] = INT64_MAX;
+ }
+
+ // Initialize the rdcost corresponding to all the directional and
+ // non-directional intra modes.
+ // 1. For directional modes, it stores the rdcost values for delta angles -4,
+ // -3, ..., 3, 4.
+ // 2. The rdcost value for luma_delta_angle is stored at index
+ // luma_delta_angle + MAX_ANGLE_DELTA + 1.
+ // 3. The rdcost values for fictitious/nonexistent luma_delta_angle -4 and 4
+ // (array indices 0 and 8) are always set to INT64_MAX (the initial value).
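+  // For example, with MAX_ANGLE_DELTA == 3, a delta angle of -3 is stored at
+  // index 1 and a delta angle of +3 at index 7.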
+ int64_t intra_modes_rd_cost[INTRA_MODE_END]
+ [SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY];
+ for (int i = 0; i < INTRA_MODE_END; i++) {
+ for (int j = 0; j < SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY; j++) {
+ intra_modes_rd_cost[i][j] = INT64_MAX;
+ }
+ }
+
+ for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT;
+ ++mode_idx) {
+ set_y_mode_and_delta_angle(mode_idx, mbmi,
+ intra_sf->prune_luma_odd_delta_angles_in_intra);
+ RD_STATS this_rd_stats;
+ int this_rate, this_rate_tokenonly, s;
+ int is_diagonal_mode;
+ int64_t this_distortion, this_rd;
+ const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y];
+
+ is_diagonal_mode = av1_is_diagonal_mode(mbmi->mode);
+ if (is_diagonal_mode && !intra_mode_cfg->enable_diagonal_intra) continue;
+ if (av1_is_directional_mode(mbmi->mode) &&
+ !intra_mode_cfg->enable_directional_intra)
+ continue;
+
+ // The smooth prediction mode appears to be more frequently picked
+ // than horizontal / vertical smooth prediction modes. Hence treat
+ // them differently in speed features.
+ if ((!intra_mode_cfg->enable_smooth_intra ||
+ intra_sf->disable_smooth_intra) &&
+ (mbmi->mode == SMOOTH_H_PRED || mbmi->mode == SMOOTH_V_PRED))
+ continue;
+ if (!intra_mode_cfg->enable_smooth_intra && mbmi->mode == SMOOTH_PRED)
+ continue;
+
+    // The functionalities of filter intra modes and smooth prediction
+    // overlap. Hence smooth prediction is pruned only if all the
+ // filter intra modes are enabled.
+ if (intra_sf->disable_smooth_intra &&
+ intra_sf->prune_filter_intra_level == 0 && mbmi->mode == SMOOTH_PRED)
+ continue;
+ if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED)
+ continue;
+
+ // Skip the evaluation of modes that do not match with the winner mode in
+ // x->mb_mode_cache.
+ if (x->use_mb_mode_cache && mbmi->mode != x->mb_mode_cache->mode) continue;
+
+ is_directional_mode = av1_is_directional_mode(mbmi->mode);
+ if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
+ if (is_directional_mode &&
+ !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) &&
+ luma_delta_angle != 0)
+ continue;
+
+ // Use intra_y_mode_mask speed feature to skip intra mode evaluation.
+ if (!(intra_sf->intra_y_mode_mask[max_txsize_lookup[bsize]] &
+ (1 << mbmi->mode)))
+ continue;
+
+ if (prune_luma_odd_delta_angles_using_rd_cost(
+ mbmi, intra_modes_rd_cost[mbmi->mode], best_rd,
+ intra_sf->prune_luma_odd_delta_angles_in_intra))
+ continue;
+
+ const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+ const int64_t this_model_rd =
+ intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+
+ const int model_rd_index_for_pruning =
+ get_model_rd_index_for_pruning(x, intra_sf);
+
+ if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd,
+ intra_sf->top_intra_model_count_allowed,
+ model_rd_index_for_pruning))
+ continue;
+
+    // Builds the actual prediction. The model rd computed by intra_model_rd()
+    // above was just an estimate that did not take the effect of the txfm
+    // pipeline into account, so the prediction needs to be redone for real
+    // here.
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+ this_rate_tokenonly = this_rd_stats.rate;
+ this_distortion = this_rd_stats.dist;
+ s = this_rd_stats.skip_txfm;
+
+ if (this_rate_tokenonly == INT_MAX) continue;
+
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+ // av1_pick_uniform_tx_size_type_yrd above includes the cost of the
+ // tx_size in the tokenonly rate, but for intra blocks, tx_size is always
+ // coded (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+ this_rate =
+ this_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0);
+ this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+
+ // Visual quality adjustment based on recon vs source variance.
+ if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) {
+ this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize));
+ }
+
+ intra_modes_rd_cost[mbmi->mode][luma_delta_angle + MAX_ANGLE_DELTA + 1] =
+ this_rd;
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+      // Set the beat_best_rd flag because the current mode rd is better than
+      // the best_rd passed to this function.
+ beat_best_rd = 1;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+ }
+
+ // Searches palette
+ if (try_palette) {
+ av1_rd_pick_palette_intra_sby(
+ cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
+ &best_rd, rate, rate_tokenonly, distortion, skippable, &beat_best_rd,
+ ctx, ctx->blk_skip, ctx->tx_type_map);
+ }
+
+ // Searches filter_intra
+ if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
+ if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+ skippable, bsize, bmode_costs[DC_PRED],
+ best_mbmi.mode, &best_rd, &best_model_rd,
+ ctx)) {
+ best_mbmi = *mbmi;
+ }
+ }
+
+  // No mode was identified with a smaller rd value than the best_rd passed to
+  // this function. In this case winner mode processing is not necessary, so
+  // return INT64_MAX to indicate that no best mode was identified.
+ if (!beat_best_rd) return INT64_MAX;
+
+  // In multi-winner mode processing, perform the tx search for the few best
+  // modes identified during mode evaluation. Winner mode processing uses the
+  // best tx configuration for the tx search.
+ if (cpi->sf.winner_mode_sf.multi_winner_mode_type) {
+ int best_mode_idx = 0;
+ int block_width, block_height;
+ uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map;
+ av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width,
+ &block_height, NULL, NULL);
+
+ for (int mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) {
+ *mbmi = x->winner_mode_stats[mode_idx].mbmi;
+ if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) {
+ // Restore color_map of palette mode before winner mode processing
+ if (mbmi->palette_mode_info.palette_size[0] > 0) {
+ uint8_t *color_map_src =
+ x->winner_mode_stats[mode_idx].color_index_map;
+ memcpy(color_map_dst, color_map_src,
+ block_width * block_height * sizeof(*color_map_src));
+ }
+ // Set params for winner mode evaluation
+ set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+
+ // Winner mode processing
+ // If previous searches use only the default tx type/no R-D optimization
+ // of quantized coeffs, do an extra search for the best tx type/better
+ // R-D optimization of quantized coeffs
+ if (intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate,
+ rate_tokenonly, distortion, skippable, &best_mbmi,
+ ctx))
+ best_mode_idx = mode_idx;
+ }
+ }
+ // Copy color_map of palette mode for final winner mode
+ if (best_mbmi.palette_mode_info.palette_size[0] > 0) {
+ uint8_t *color_map_src =
+ x->winner_mode_stats[best_mode_idx].color_index_map;
+ memcpy(color_map_dst, color_map_src,
+ block_width * block_height * sizeof(*color_map_src));
+ }
+ } else {
+ // If previous searches use only the default tx type/no R-D optimization of
+ // quantized coeffs, do an extra search for the best tx type/better R-D
+ // optimization of quantized coeffs
+ if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) {
+ // Set params for winner mode evaluation
+ set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+ *mbmi = best_mbmi;
+ intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate,
+ rate_tokenonly, distortion, skippable, &best_mbmi, ctx);
+ }
+ }
+ *mbmi = best_mbmi;
+ av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
+ return best_rd;
+}
diff --git a/third_party/aom/av1/encoder/intra_mode_search.h b/third_party/aom/av1/encoder/intra_mode_search.h
new file mode 100644
index 0000000000..75289c4e3c
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search.h
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares high level functions to search through intra modes.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief Variables related to intra-mode search during inter frame coding.
+ *
+ * \ingroup intra_mode_search
+ * This is a set of variables used during intra-mode search for inter frames.
+ * This includes a histogram of gradients used by the pruning speed features
+ * and a cache of the uv prediction to avoid repeatedly searching for the
+ * chroma prediction.
+ */
+typedef struct IntraModeSearchState {
+ /*!
+ * \brief The best luma intra-mode found so far
+ */
+ PREDICTION_MODE best_intra_mode;
+
+ /** \name Speed feature variables
+ * Variables to help with pruning some luma intra-modes during inter frame
+ * coding process.
+ */
+ /**@{*/
+ /*!
+ * \brief Whether to terminate all intra mode search.
+ */
+ int skip_intra_modes;
+ /*!
+ * \brief Whether a directional mode is pruned.
+ */
+ uint8_t directional_mode_skip_mask[INTRA_MODES];
+ /*!
+ * \brief Whether \ref directional_mode_skip_mask is valid for pruning.
+ */
+ int dir_mode_skip_mask_ready;
+ /**@}*/
+
+ /** \name Chroma mode search cache
+ * A cache of the best chroma prediction mode to avoid having to search for
+ * chroma predictions repeatedly in \ref
+ * av1_search_intra_uv_modes_in_interframe()
+ */
+ /**@{*/
+ int rate_uv_intra; /*!< \brief Total rate to transmit uv_mode */
+  int rate_uv_tokenonly; /*!< \brief Rate to transmit txfm tokens */
+ int64_t dist_uvs; /*!< \brief Distortion of the uv_mode's recon */
+ uint8_t skip_uvs; /*!< \brief Whether the uv txfm is skippable */
+ UV_PREDICTION_MODE mode_uv; /*!< \brief The best uv mode */
+ PALETTE_MODE_INFO pmi_uv; /*!< \brief Color map if mode_uv is palette */
+  int8_t uv_angle_delta; /*!< \brief Angle delta if mode_uv is directional */
+ /**@}*/
+} IntraModeSearchState;
+
+/*!\brief Evaluate a given luma intra-mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles an intra-mode luma prediction when the current frame
+ * is an inter frame. This is the intra-mode counterpart of handle_inter_mode.
+ * This function performs an intra luma prediction using the mode specified by
+ * x->e_mbd.mi[0]->mode. This function does *not* support palette mode
+ * prediction in the luma channel.
+ *
+ * \param[in,out] intra_search_state Structure holding the intra search state.
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to structure holding all the
+ * data for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[in] ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] ctx Structure to hold the number of 4x4 blks
+ * to copy tx_type and txfm_skip arrays.
+ * \param[out] rd_stats_y Struct to keep track of the current
+ * intra-mode's rd_stats (luma only).
+ * \param[in] best_rd Best RD seen for this block so far.
+ * \param[out] mode_cost_y The cost needed to signal the current
+ * intra mode.
+ * \param[out] rd_y The rdcost of the chosen mode.
+ * \param[in]    best_model_rd      Best model RD seen for this block so far.
+ * \param[in] top_intra_model_rd Top intra model RD seen for this
+ * block so far.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and
+ * rd_y are also updated. Moreover, in the first evaluation with a directional
+ * mode, a prune mask computed from the histogram of gradients is also stored
+ * in intra_search_state.
+ */
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+ int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+ int64_t *best_model_rd,
+ int64_t top_intra_model_rd[]);
+
+/*!\brief Search through all chroma intra-modes for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles intra-mode chroma prediction when the current frame
+ * is an inter frame. This is done by calling \ref av1_rd_pick_intra_sbuv_mode
+ * with some additional book-keeping.
+ *
+ * \param[in,out] intra_search_state Structure holding the intra search state.
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to structure holding all the
+ * data for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[out] rd_stats Struct to keep track of the current
+ * intra-mode's rd_stats (all planes).
+ * \param[out] rd_stats_y Struct to keep track of the current
+ * intra-mode's rd_stats (luma only).
+ * \param[out] rd_stats_uv Struct to keep track of the current
+ * intra-mode's rd_stats (chroma only).
+ * \param[in] best_rd Best RD seen for this block so far.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats(_y|_uv) are also
+ * updated. Moreover, in the first invocation of the function, the chroma intra
+ * mode result is cached in intra_search_state to be used in subsequent calls.
+ */
+int av1_search_intra_uv_modes_in_interframe(
+ IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+ const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd);
+
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame.
+ *
+ * \param[in] intra_search_state Structure to hold the best luma intra mode
+ * and cache chroma prediction for speed up.
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[in] ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] ctx Structure to hold the number of 4x4 blks to
+ * copy the tx_type and txfm_skip arrays.
+ * \param[in] this_rd_cost Struct to keep track of palette mode's
+ * rd_stats.
+ * \param[in] best_rd Best RD seen for this block so far.
+ *
+ * \return Returns whether luma palette mode can skip the txfm. The
+ * corresponding mbmi, this_rd_cost, intra_search_state, and tx_type arrays in
+ * ctx are also updated.
+ */
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
+ int64_t best_rd);
+
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[in] ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] ctx Structure to hold the number of 4x4 blks to
+ * copy the tx_type and txfm_skip arrays.
+ * \param[in] this_rd_cost Struct to keep track of palette mode's
+ * rd_stats.
+ * \param[in] best_rd Best RD seen for this block so far.
+ */
+void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx,
+ RD_STATS *this_rd_cost, int64_t best_rd);
+
+/*!\brief Perform intra-mode search on luma channels for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function performs intra-mode search on the luma channel when the
+ * current frame is intra-only. This function does not search intrabc mode,
+ * but it does search palette and filter_intra.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[out]   rate               The total rate needed to predict the current
+ *                                  luma block.
+ * \param[out]   rate_tokenonly     The rate without the cost of sending the
+ *                                  prediction modes.
+ * \param[out]   distortion         The distortion of the best luma prediction
+ *                                  after the reconstruction.
+ * \param[out]   skippable          Whether we can skip the txfm process.
+ * \param[in] bsize Current partition block size.
+ * \param[in] best_rd Best RD seen for this block so far.
+ * \param[in] ctx Structure to hold the number of 4x4 blks to
+ * copy the tx_type and txfm_skip arrays.
+ *
+ * \return Returns the rd_cost if this function finds a mode better than
+ * best_rd, otherwise returns INT64_MAX. This also updates the mbmi, the rate
+ * and distortion, and the tx_type arrays in ctx.
+ */
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, int64_t best_rd,
+ PICK_MODE_CONTEXT *ctx);
+
+/*!\brief Perform intra-mode search on chroma channels.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function performs intra-mode search on the chroma channels. Just like
+ * \ref av1_rd_pick_intra_sby_mode(), this function searches over palette mode
+ * (filter_intra is not available on chroma planes). Unlike \ref
+ * av1_rd_pick_intra_sby_mode() this function is used by both inter and intra
+ * frames.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[out]   rate               The total rate needed to predict the current
+ *                                  chroma block.
+ * \param[out]   rate_tokenonly     The rate without the cost of sending the
+ *                                  prediction modes.
+ * \param[out]   distortion         The chroma distortion of the best prediction
+ *                                  after the reconstruction.
+ * \param[out]   skippable          Whether we can skip the txfm process.
+ * \param[in] bsize Current partition block size.
+ * \param[in] max_tx_size The maximum tx_size available
+ *
+ * \return Returns the rd_cost of the best uv mode found. This also updates the
+ * mbmi, the rate, and the distortion.
+ */
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+
+/*! \brief Return the number of colors in src. Used by palette mode.
+ */
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+ int *val_count, int *num_colors);
+
+/*! \brief See \ref av1_count_colors(), but for highbd.
+ */
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+ int cols, int bit_depth, int *val_count,
+ int *val_count_8bit, int *num_color_bins,
+ int *num_colors);
+
+/*! \brief Initializes the \ref IntraModeSearchState struct.
+ */
+static AOM_INLINE void init_intra_mode_search_state(
+ IntraModeSearchState *intra_search_state) {
+ memset(intra_search_state, 0, sizeof(*intra_search_state));
+ intra_search_state->rate_uv_intra = INT_MAX;
+}
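+
+/* A minimal usage sketch (illustrative only; the surrounding caller code is
+ * hypothetical):
+ *
+ *   IntraModeSearchState intra_search_state;
+ *   init_intra_mode_search_state(&intra_search_state);
+ *   // rate_uv_intra is now INT_MAX, so the first chroma search will run and
+ *   // populate the uv cache; later calls reuse the cached result.
+ */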
+
+/*! \brief Sets the luma intra mode and delta angles for a given mode index.
+ * The total number of luma intra modes is LUMA_MODE_COUNT = 61.
+ * The first 13 modes are from DC_PRED to PAETH_PRED, followed by directional
+ * modes. Each of the 8 main directional modes has 6 = MAX_ANGLE_DELTA * 2
+ * nonzero delta angles.
+ * \param[in] mode_idx mode index in intra mode decision
+ * process.
+ * \param[in] mbmi Pointer to structure holding the mode
+ * info for the current macroblock.
+ * \param[in] reorder_delta_angle_eval Indicates whether to reorder the
+ * evaluation of delta angle modes.
+ */
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi,
+ int reorder_delta_angle_eval);
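+// Illustrative mapping (a sketch; the exact delta angle order depends on
+// reorder_delta_angle_eval): mode_idx 0..12 cover DC_PRED through PAETH_PRED
+// with a zero delta angle, and each subsequent group of 6 indices covers the
+// nonzero delta angles of one directional mode.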
+
+/*! \brief Prunes the given luma intra mode based on the model rd.
+ * \param[in] this_model_rd model rd for current mode.
+ * \param[in] best_model_rd Best model RD seen for this block so
+ * far.
+ * \param[in] top_intra_model_rd Top intra model RD seen for this
+ * block so far.
+ * \param[in] max_model_cnt_allowed The maximum number of top intra
+ * model RD allowed.
+ * \param[in] model_rd_index_for_pruning Index of the candidate used for
+ * pruning based on model rd.
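+ *
+ * \return Returns 1 if the given luma intra mode can be pruned based on the
+ *         model rd, 0 otherwise.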
+ */
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+ int64_t top_intra_model_rd[], int max_model_cnt_allowed,
+ int model_rd_index_for_pruning);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/intra_mode_search_utils.h b/third_party/aom/av1/encoder/intra_mode_search_utils.h
new file mode 100644
index 0000000000..107c2236f8
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search_utils.h
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Defines utility functions used in intra mode search.
+ *
+ * This includes rdcost estimations, histogram based pruning, etc.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+
+#include "av1/common/enums.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+// Macro for computing the speed-preset dependent threshold which is used for
+// deciding whether to enable/disable variance calculations in
+// intra_rd_variance_factor().
+#define INTRA_RD_VAR_THRESH(X) (1.0 - (0.25 * (X)))
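+// For example, INTRA_RD_VAR_THRESH(0) evaluates to 1.0 and
+// INTRA_RD_VAR_THRESH(2) to 0.5, so higher speed presets use a lower
+// threshold.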
+
+#define BINS 32
+static const float av1_intra_hog_model_bias[DIRECTIONAL_MODES] = {
+ 0.450578f, 0.695518f, -0.717944f, -0.639894f,
+ -0.602019f, -0.453454f, 0.055857f, -0.465480f,
+};
+
+static const float av1_intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = {
+ -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f,
+ -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f,
+ -0.434156f, 0.322868f, 2.260546f, 3.368715f, 3.989290f, 3.308487f,
+ 2.277893f, 0.923793f, 0.026412f, -0.385174f, -0.718622f, -1.408867f,
+ -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f,
+ -2.985709f, -3.447155f, 3.758139f, 3.204353f, 2.170998f, 0.826587f,
+ -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f,
+ -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f,
+ -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f,
+ -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f,
+ -0.088058f, 0.753494f, 2.092413f, 3.215266f, -3.300277f, -2.748658f,
+ -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f,
+ -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f,
+ -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f,
+ 0.813112f, 1.702213f, 2.653045f, 3.351749f, 3.243554f, 3.199409f,
+ 2.437856f, 1.468854f, 0.533039f, -0.099065f, -0.622643f, -2.200732f,
+ -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f, 1.975043f,
+ 3.179528f, 3.939064f, 3.454379f, 3.689386f, 3.116411f, 1.970991f,
+ 0.798406f, -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f,
+ -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f,
+ -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
+ -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
+ -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f, 1.416882f,
+ 2.572884f, 3.607755f, 3.974820f, 3.997783f, 2.970459f, 0.791687f,
+ -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
+ -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
+ -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
+ 2.794130f, 3.685984f, 3.745195f, 3.252444f, 2.316108f, 1.399146f,
+ -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
+ -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
+ -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
+ -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
+ -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
+ -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
+ -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
+ -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
+ 0.716997f, 1.481393f, 2.216702f, 2.737986f, 3.109809f, 3.226084f,
+ 2.490098f, -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
+ -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
+ -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
+ -1.430687f, 0.872896f, 2.766550f, 3.610080f, 3.578041f, 3.334928f,
+ 2.586680f, 1.895721f, 1.122195f, 0.488519f, -0.140689f, -0.799076f,
+ -1.222860f, -1.502437f, -1.900969f, -3.206816f,
+};
+
+static const NN_CONFIG av1_intra_hog_model_nnconfig = {
+ BINS, // num_inputs
+ DIRECTIONAL_MODES, // num_outputs
+ 0, // num_hidden_layers
+ { 0 },
+ {
+ av1_intra_hog_model_weights,
+ },
+ {
+ av1_intra_hog_model_bias,
+ },
+};
+
+#define FIX_PREC_BITS (16)
+static AOM_INLINE int get_hist_bin_idx(int dx, int dy) {
+ const int32_t ratio = (dy * (1 << FIX_PREC_BITS)) / dx;
+
+ // Find index by bisection
+ static const int thresholds[BINS] = {
+ -1334015, -441798, -261605, -183158, -138560, -109331, -88359, -72303,
+ -59392, -48579, -39272, -30982, -23445, -16400, -9715, -3194,
+ 3227, 9748, 16433, 23478, 31015, 39305, 48611, 59425,
+ 72336, 88392, 109364, 138593, 183191, 261638, 441831, INT32_MAX
+ };
+
+ int lo_idx = 0, hi_idx = BINS - 1;
+  // Dividing into segments of size 8 gives better performance than a binary
+  // search here.
+ if (ratio <= thresholds[7]) {
+ lo_idx = 0;
+ hi_idx = 7;
+ } else if (ratio <= thresholds[15]) {
+ lo_idx = 8;
+ hi_idx = 15;
+ } else if (ratio <= thresholds[23]) {
+ lo_idx = 16;
+ hi_idx = 23;
+ } else {
+ lo_idx = 24;
+ hi_idx = 31;
+ }
+
+ for (int idx = lo_idx; idx <= hi_idx; idx++) {
+ if (ratio <= thresholds[idx]) {
+ return idx;
+ }
+ }
+ assert(0 && "No valid histogram bin found!");
+ return BINS - 1;
+}
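+
+// Worked example: a 45 degree gradient with dx == 1 and dy == 1 gives
+// ratio = 1 << FIX_PREC_BITS == 65536, which lies between thresholds[23]
+// (59425) and thresholds[24] (72336), so the sample falls into bin 24.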
+#undef FIX_PREC_BITS
+
+// Normalizes the hog data.
+static AOM_INLINE void normalize_hog(float total, float *hist) {
+ for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride,
+ int rows, int cols, float *hist) {
+ float total = 0.1f;
+ src += stride;
+ for (int r = 1; r < rows - 1; ++r) {
+ for (int c = 1; c < cols - 1; ++c) {
+ const uint8_t *above = &src[c - stride];
+ const uint8_t *below = &src[c + stride];
+ const uint8_t *left = &src[c - 1];
+ const uint8_t *right = &src[c + 1];
+ // Calculate gradient using Sobel filters.
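+      // The 3x3 kernels are [-1 0 +1; -2 0 +2; -1 0 +1] for dx and
+      // [-1 -2 -1; 0 0 0; +1 +2 +1] for dy.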
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+ (left[-stride] + 2 * left[0] + left[stride]);
+ const int dy = (below[-1] + 2 * below[0] + below[1]) -
+ (above[-1] + 2 * above[0] + above[1]);
+ if (dx == 0 && dy == 0) continue;
+ const int temp = abs(dx) + abs(dy);
+ if (!temp) continue;
+ total += temp;
+ if (dx == 0) {
+ hist[0] += temp / 2;
+ hist[BINS - 1] += temp / 2;
+ } else {
+ const int idx = get_hist_bin_idx(dx, dy);
+ assert(idx >= 0 && idx < BINS);
+ hist[idx] += temp;
+ }
+ }
+ src += stride;
+ }
+
+ normalize_hog(total, hist);
+}
+
+// Computes and stores pixel level gradient information of a given superblock
+// for LBD encode.
+static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x,
+ BLOCK_SIZE sb_size,
+ PLANE_TYPE plane) {
+ PixelLevelGradientInfo *const grad_info_sb =
+ x->pixel_gradient_info + plane * MAX_SB_SQUARE;
+ const uint8_t *src = x->plane[plane].src.buf;
+ const int stride = x->plane[plane].src.stride;
+ const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+ const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+ const int sb_height = block_size_high[sb_size] >> ss_y;
+ const int sb_width = block_size_wide[sb_size] >> ss_x;
+ src += stride;
+ for (int r = 1; r < sb_height - 1; ++r) {
+ for (int c = 1; c < sb_width - 1; ++c) {
+ const uint8_t *above = &src[c - stride];
+ const uint8_t *below = &src[c + stride];
+ const uint8_t *left = &src[c - 1];
+ const uint8_t *right = &src[c + 1];
+ // Calculate gradient using Sobel filters.
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+ (left[-stride] + 2 * left[0] + left[stride]);
+ const int dy = (below[-1] + 2 * below[0] + below[1]) -
+ (above[-1] + 2 * above[0] + above[1]);
+ grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0);
+ grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum =
+ (uint16_t)(abs(dx) + abs(dy));
+ grad_info_sb[r * sb_width + c].hist_bin_idx =
+ (dx != 0) ? get_hist_bin_idx(dx, dy) : -1;
+ }
+ src += stride;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride,
+ int rows, int cols, float *hist) {
+ float total = 0.1f;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ src += stride;
+ for (int r = 1; r < rows - 1; ++r) {
+ for (int c = 1; c < cols - 1; ++c) {
+ const uint16_t *above = &src[c - stride];
+ const uint16_t *below = &src[c + stride];
+ const uint16_t *left = &src[c - 1];
+ const uint16_t *right = &src[c + 1];
+ // Calculate gradient using Sobel filters.
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+ (left[-stride] + 2 * left[0] + left[stride]);
+ const int dy = (below[-1] + 2 * below[0] + below[1]) -
+ (above[-1] + 2 * above[0] + above[1]);
+ if (dx == 0 && dy == 0) continue;
+ const int temp = abs(dx) + abs(dy);
+ if (!temp) continue;
+ total += temp;
+ if (dx == 0) {
+ hist[0] += temp / 2;
+ hist[BINS - 1] += temp / 2;
+ } else {
+ const int idx = get_hist_bin_idx(dx, dy);
+ assert(idx >= 0 && idx < BINS);
+ hist[idx] += temp;
+ }
+ }
+ src += stride;
+ }
+
+ normalize_hog(total, hist);
+}
+
+// Computes and stores pixel level gradient information of a given superblock
+// for HBD encode.
+static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x,
+ BLOCK_SIZE sb_size,
+ PLANE_TYPE plane) {
+ PixelLevelGradientInfo *const grad_info_sb =
+ x->pixel_gradient_info + plane * MAX_SB_SQUARE;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf);
+ const int stride = x->plane[plane].src.stride;
+ const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+ const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+ const int sb_height = block_size_high[sb_size] >> ss_y;
+ const int sb_width = block_size_wide[sb_size] >> ss_x;
+ src += stride;
+ for (int r = 1; r < sb_height - 1; ++r) {
+ for (int c = 1; c < sb_width - 1; ++c) {
+ const uint16_t *above = &src[c - stride];
+ const uint16_t *below = &src[c + stride];
+ const uint16_t *left = &src[c - 1];
+ const uint16_t *right = &src[c + 1];
+ // Calculate gradient using Sobel filters.
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+ (left[-stride] + 2 * left[0] + left[stride]);
+ const int dy = (below[-1] + 2 * below[0] + below[1]) -
+ (above[-1] + 2 * above[0] + above[1]);
+ grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0);
+ grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum =
+ (uint16_t)(abs(dx) + abs(dy));
+ grad_info_sb[r * sb_width + c].hist_bin_idx =
+ (dx != 0) ? get_hist_bin_idx(dx, dy) : -1;
+ }
+ src += stride;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows,
+ int cols, float *hist, int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd) {
+ highbd_generate_hog(src8, stride, rows, cols, hist);
+ return;
+ }
+#else
+ (void)highbd;
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ lowbd_generate_hog(src8, stride, rows, cols, hist);
+}
+
+static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x,
+ BLOCK_SIZE sb_size,
+ PLANE_TYPE plane) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(&x->e_mbd)) {
+ highbd_compute_gradient_info_sb(x, sb_size, plane);
+ return;
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ lowbd_compute_gradient_info_sb(x, sb_size, plane);
+}
+
+// Gradient caching at superblock level is allowed only if all of the following
+// conditions are satisfied:
+// (1) The current frame is an intra only frame
+// (2) Non-RD mode decisions are not enabled
+// (3) The sf partition_search_type is set to SEARCH_PARTITION
+// (4) Either intra_pruning_with_hog or chroma_intra_pruning_with_hog is enabled
+//
+// SB level caching of gradient data may not help in speedup for the following
+// cases:
+// (1) Inter frames (due to early intra gating)
+// (2) When partition_search_type is not SEARCH_PARTITION
+// Hence, gradient data is computed at block level in such cases.
+static AOM_INLINE bool is_gradient_caching_for_hog_enabled(
+ const AV1_COMP *const cpi) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ return frame_is_intra_only(&cpi->common) && !sf->rt_sf.use_nonrd_pick_mode &&
+ (sf->part_sf.partition_search_type == SEARCH_PARTITION) &&
+ (sf->intra_sf.intra_pruning_with_hog ||
+ sf->intra_sf.chroma_intra_pruning_with_hog);
+}
+
+// Generates pixel level gradient information for a given superblock. Sets the
+// 'is_sb_gradient_cached' flag for a plane type when its gradient info has
+// been generated.
+static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE sb_size, int mi_row,
+ int mi_col) {
+ // Initialise flags related to hog data caching.
+ x->is_sb_gradient_cached[PLANE_TYPE_Y] = false;
+ x->is_sb_gradient_cached[PLANE_TYPE_UV] = false;
+ if (!is_gradient_caching_for_hog_enabled(cpi)) return;
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const int num_planes = av1_num_planes(&cpi->common);
+
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
+
+ if (sf->intra_sf.intra_pruning_with_hog) {
+ compute_gradient_info_sb(x, sb_size, PLANE_TYPE_Y);
+ x->is_sb_gradient_cached[PLANE_TYPE_Y] = true;
+ }
+ if (sf->intra_sf.chroma_intra_pruning_with_hog && num_planes > 1) {
+ compute_gradient_info_sb(x, sb_size, PLANE_TYPE_UV);
+ x->is_sb_gradient_cached[PLANE_TYPE_UV] = true;
+ }
+}
+
+// Reuses the pixel level gradient data generated at superblock level for block
+// level histogram computation.
+static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x,
+ int rows, int cols,
+ BLOCK_SIZE sb_size,
+ PLANE_TYPE plane,
+ float *hist) {
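+  // Seed the total with a small non-zero value so that the normalization at
+  // the end of this function never divides by zero when the block has no
+  // gradient energy.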
+ float total = 0.1f;
+ const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+ const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+ const int sb_width = block_size_wide[sb_size] >> ss_x;
+
+  // Derive the offset from the start of the superblock in order to locate
+  // the block level gradient data in the cache.
+ const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1);
+ const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1);
+ const int block_offset_in_grad_cache =
+ sb_width * (mi_row_in_sb << (MI_SIZE_LOG2 - ss_y)) +
+ (mi_col_in_sb << (MI_SIZE_LOG2 - ss_x));
+ const PixelLevelGradientInfo *grad_info_blk = x->pixel_gradient_info +
+ plane * MAX_SB_SQUARE +
+ block_offset_in_grad_cache;
+
+ // Retrieve the cached gradient information and generate the histogram.
+ for (int r = 1; r < rows - 1; ++r) {
+ for (int c = 1; c < cols - 1; ++c) {
+ const uint16_t abs_dx_abs_dy_sum =
+ grad_info_blk[r * sb_width + c].abs_dx_abs_dy_sum;
+ if (!abs_dx_abs_dy_sum) continue;
+ total += abs_dx_abs_dy_sum;
+ const bool is_dx_zero = grad_info_blk[r * sb_width + c].is_dx_zero;
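+      // A zero dx means the gradient angle is +/-90 degrees, the two ends of
+      // the orientation range, so the weight is split evenly between the
+      // first and last bins.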
+ if (is_dx_zero) {
+ hist[0] += abs_dx_abs_dy_sum >> 1;
+ hist[BINS - 1] += abs_dx_abs_dy_sum >> 1;
+ } else {
+ const int8_t idx = grad_info_blk[r * sb_width + c].hist_bin_idx;
+ assert(idx >= 0 && idx < BINS);
+ hist[idx] += abs_dx_abs_dy_sum;
+ }
+ }
+ }
+ normalize_hog(total, hist);
+}
+
+static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize,
+ BLOCK_SIZE sb_size, int plane, float *hog) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const int bh = block_size_high[bsize];
+ const int bw = block_size_wide[bsize];
+ const int rows =
+ ((xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh) >>
+ ss_y;
+ const int cols =
+ ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >>
+ ss_x;
+
+ // If gradient data is already generated at SB level, reuse the cached data.
+ // Otherwise, compute the data.
+ if (x->is_sb_gradient_cached[plane]) {
+ generate_hog_using_gradient_cache(x, rows, cols, sb_size, plane, hog);
+ } else {
+ const uint8_t *src = x->plane[plane].src.buf;
+ const int src_stride = x->plane[plane].src.stride;
+ generate_hog(src, src_stride, rows, cols, hog, is_cur_buf_hbd(xd));
+ }
+
+  // Scale the hog so that luma and chroma are on the same scale.
+ for (int b = 0; b < BINS; ++b) {
+ hog[b] *= (1 + ss_x) * (1 + ss_y);
+ }
+}
+
+static AOM_INLINE void prune_intra_mode_with_hog(
+ const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th,
+ uint8_t *directional_mode_skip_mask, int is_chroma) {
+ const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y;
+ float hist[BINS] = { 0.0f };
+ collect_hog_data(x, bsize, sb_size, plane, hist);
+
+  // Make a prediction for each directional mode.
+ float scores[DIRECTIONAL_MODES] = { 0.0f };
+ av1_nn_predict(hist, &av1_intra_hog_model_nnconfig, 1, scores);
+ for (UV_PREDICTION_MODE uv_mode = UV_V_PRED; uv_mode <= UV_D67_PRED;
+ uv_mode++) {
+ if (scores[uv_mode - UV_V_PRED] <= th) {
+ directional_mode_skip_mask[uv_mode] = 1;
+ }
+ }
+}
+#undef BINS
+
+int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf,
+ const int stride, const int is_hbd);
+
+// Returns whether caching of source variance for 4x4 sub-blocks is allowed.
+static AOM_INLINE bool is_src_var_for_4x4_sub_blocks_caching_enabled(
+ const AV1_COMP *const cpi) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ if (cpi->oxcf.mode != ALLINTRA) return false;
+
+ if (sf->part_sf.partition_search_type == SEARCH_PARTITION) return true;
+
+ if (INTRA_RD_VAR_THRESH(cpi->oxcf.speed) <= 0 ||
+ (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode))
+ return false;
+
+ return true;
+}
+
+// Initialize the members of the Block4x4VarInfo structure to -1 at the start
+// of every superblock.
+static AOM_INLINE void init_src_var_info_of_4x4_sub_blocks(
+ const AV1_COMP *const cpi, Block4x4VarInfo *src_var_info_of_4x4_sub_blocks,
+ const BLOCK_SIZE sb_size) {
+ if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return;
+
+ const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size];
+ for (int i = 0; i < mi_count_in_sb; i++) {
+ src_var_info_of_4x4_sub_blocks[i].var = -1;
+ src_var_info_of_4x4_sub_blocks[i].log_var = -1.0;
+ }
+}
+
+// Returns the cost needed to send a uniformly distributed r.v.
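+// This is truncated binary coding: with l = get_unsigned_bits(n) and
+// m = (1 << l) - n, the first m symbols cost l - 1 bits and the remaining
+// n - m symbols cost l bits. For example, n = 5 gives l = 3 and m = 3, so
+// v in {0, 1, 2} costs 2 bits while v in {3, 4} costs 3 bits.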
+static AOM_INLINE int write_uniform_cost(int n, int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return 0;
+ if (v < m)
+ return av1_cost_literal(l - 1);
+ else
+ return av1_cost_literal(l);
+}
+/*!\endcond */
+
+/*!\brief Returns the rate cost for luma prediction mode info of intra blocks.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi,
+ const MACROBLOCK *x,
+ const MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int mode_cost,
+ int discount_color_cost) {
+ int total_rate = mode_cost;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
+ const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
+ const int use_intrabc = mbmi->use_intrabc;
+ // Can only activate one mode.
+ assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
+ use_filter_intra) <= 1);
+ const int try_palette = av1_allow_palette(
+ cpi->common.features.allow_screen_content_tools, mbmi->bsize);
+ if (try_palette && mbmi->mode == DC_PRED) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+ const int mode_ctx = av1_get_palette_mode_ctx(xd);
+ total_rate +=
+ mode_costs->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
+ if (use_palette) {
+ const uint8_t *const color_map = xd->plane[0].color_index_map;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+ const int plt_size = mbmi->palette_mode_info.palette_size[0];
+ int palette_mode_cost =
+ mode_costs
+ ->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+ write_uniform_cost(plt_size, color_map[0]);
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+ palette_mode_cost +=
+ av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
+ n_cache, cpi->common.seq_params->bit_depth);
+ if (!discount_color_cost)
+ palette_mode_cost +=
+ av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+
+ total_rate += palette_mode_cost;
+ }
+ }
+ if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
+ total_rate += mode_costs->filter_intra_cost[mbmi->bsize][use_filter_intra];
+ if (use_filter_intra) {
+ total_rate +=
+ mode_costs->filter_intra_mode_cost[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ }
+ }
+ if (av1_is_directional_mode(mbmi->mode)) {
+ if (av1_use_angle_delta(bsize)) {
+ total_rate +=
+ mode_costs->angle_delta_cost[mbmi->mode - V_PRED]
+ [MAX_ANGLE_DELTA +
+ mbmi->angle_delta[PLANE_TYPE_Y]];
+ }
+ }
+ if (av1_allow_intrabc(&cpi->common))
+ total_rate += mode_costs->intrabc_cost[use_intrabc];
+ return total_rate;
+}
+
+/*!\brief Return the rate cost for chroma prediction mode info of intra blocks.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi,
+ const MACROBLOCK *x,
+ const MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int mode_cost) {
+ int total_rate = mode_cost;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ // Can only activate one mode.
+ assert(((uv_mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+ const int try_palette = av1_allow_palette(
+ cpi->common.features.allow_screen_content_tools, mbmi->bsize);
+ if (try_palette && uv_mode == UV_DC_PRED) {
+ const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+ total_rate +=
+ mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
+ if (use_palette) {
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+ const int plt_size = pmi->palette_size[1];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const uint8_t *const color_map = xd->plane[1].color_index_map;
+ int palette_mode_cost =
+ mode_costs
+ ->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+ write_uniform_cost(plt_size, color_map[0]);
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ palette_mode_cost += av1_palette_color_cost_uv(
+ pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth);
+ palette_mode_cost +=
+ av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+ total_rate += palette_mode_cost;
+ }
+ }
+ const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ if (av1_is_directional_mode(intra_mode)) {
+ if (av1_use_angle_delta(bsize)) {
+ total_rate +=
+ mode_costs->angle_delta_cost[intra_mode - V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_UV] +
+ MAX_ANGLE_DELTA];
+ }
+ }
+ return total_rate;
+}
+
+/*!\cond */
+// Makes a quick intra prediction and estimates the rdcost with a model,
+// without going through the whole txfm/quantize/itxfm process.
+static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x,
+ int plane, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int use_hadamard) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ int row, col;
+ assert(!is_inter_block(xd->mi[0]));
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ const int txbw = tx_size_wide[tx_size];
+ const int txbh = tx_size_high[tx_size];
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ int64_t satd_cost = 0;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ // Prediction.
+ for (row = 0; row < max_blocks_high; row += stepr) {
+ for (col = 0; col < max_blocks_wide; col += stepc) {
+ av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+      // Here we use p->src_diff and p->coeff as temporary buffers for the
+      // prediction residue and transform coefficients. The buffers are only
+      // used within this loop, so we do not need to add the per-block offsets
+      // to them.
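+      // Note: 'row' and 'col' are in units of 4x4 blocks, so the << 2 below
+      // converts them to pixel offsets.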
+ av1_subtract_block(
+ bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize],
+ p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride,
+ pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride);
+ av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff,
+ block_size_wide[plane_bsize], p->coeff);
+ satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]);
+ }
+ }
+ return satd_cost;
+}
+/*!\endcond */
+
+/*!\brief Estimate the luma rdcost of a given intra mode and try to prune it.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function first makes a quick luma prediction and estimates the rdcost
+ * with a model without going through the txfm, then tries to prune the
+ * current mode if the new estimate y_rd > 1.25 * best_model_rd.
+ *
+ * \return Returns 1 if the given mode is pruned; 0 otherwise.
+ */
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize,
+ int64_t *best_model_rd) {
+ const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+ const int plane = 0;
+ const AV1_COMMON *cm = &cpi->common;
+ const int64_t this_model_rd =
+ intra_model_rd(cm, x, plane, bsize, tx_size, /*use_hadamard=*/1);
+ if (*best_model_rd != INT64_MAX &&
+ this_model_rd > *best_model_rd + (*best_model_rd >> 2)) {
+ return 1;
+ } else if (this_model_rd < *best_model_rd) {
+ *best_model_rd = this_model_rd;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h
new file mode 100644
index 0000000000..4be2038a6f
--- /dev/null
+++ b/third_party/aom/av1/encoder/k_means_template.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#ifndef AV1_K_MEANS_DIM
+#error "This template requires AV1_K_MEANS_DIM to be defined"
+#endif
+
+#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
+#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
+
+// Though we want to compute the smallest L2 norm, in 1 dimension it is
+// equivalent to finding the smallest L1 norm and then squaring it.
+// This is preferable for speed, especially on the SIMD side.
+static int RENAME(calc_dist)(const int16_t *p1, const int16_t *p2) {
+#if AV1_K_MEANS_DIM == 1
+ return abs(p1[0] - p2[0]);
+#else
+ int dist = 0;
+ for (int i = 0; i < AV1_K_MEANS_DIM; ++i) {
+ const int diff = p1[i] - p2[i];
+ dist += diff * diff;
+ }
+ return dist;
+#endif
+}
+
+void RENAME(av1_calc_indices)(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *dist, int n, int k) {
+ if (dist) {
+ *dist = 0;
+ }
+ for (int i = 0; i < n; ++i) {
+ int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids);
+ indices[i] = 0;
+ for (int j = 1; j < k; ++j) {
+ const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
+ centroids + j * AV1_K_MEANS_DIM);
+ if (this_dist < min_dist) {
+ min_dist = this_dist;
+ indices[i] = j;
+ }
+ }
+ if (dist) {
+#if AV1_K_MEANS_DIM == 1
+ *dist += min_dist * min_dist;
+#else
+ *dist += min_dist;
+#endif
+ }
+ }
+}
+
+static void RENAME(calc_centroids)(const int16_t *data, int16_t *centroids,
+ const uint8_t *indices, int n, int k) {
+ int i, j;
+ int count[PALETTE_MAX_SIZE] = { 0 };
+ int centroids_sum[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE];
+ unsigned int rand_state = (unsigned int)data[0];
+ assert(n <= 32768);
+ memset(centroids_sum, 0, sizeof(centroids_sum[0]) * k * AV1_K_MEANS_DIM);
+
+ for (i = 0; i < n; ++i) {
+ const int index = indices[i];
+ assert(index < k);
+ ++count[index];
+ for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+ centroids_sum[index * AV1_K_MEANS_DIM + j] +=
+ data[i * AV1_K_MEANS_DIM + j];
+ }
+ }
+
+ for (i = 0; i < k; ++i) {
+ if (count[i] == 0) {
+ memcpy(centroids + i * AV1_K_MEANS_DIM,
+ data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM,
+ sizeof(centroids[0]) * AV1_K_MEANS_DIM);
+ } else {
+ for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+ centroids[i * AV1_K_MEANS_DIM + j] =
+ DIVIDE_AND_ROUND(centroids_sum[i * AV1_K_MEANS_DIM + j], count[i]);
+ }
+ }
+ }
+}
+
+void RENAME(av1_k_means)(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k, int max_itr) {
+ int16_t centroids_tmp[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE];
+ uint8_t indices_tmp[MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT];
+ int16_t *meta_centroids[2] = { centroids, centroids_tmp };
+ uint8_t *meta_indices[2] = { indices, indices_tmp };
+ int i, l = 0, prev_l, best_l = 0;
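+  // Iterations ping-pong between the caller's buffers (index 0) and the
+  // temporary buffers (index 1); 'best_l' tracks which pair holds the best
+  // clustering found so far.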
+ int64_t this_dist;
+
+ assert(n <= MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT);
+
+#if AV1_K_MEANS_DIM == 1
+ av1_calc_indices_dim1(data, centroids, indices, &this_dist, n, k);
+#else
+ av1_calc_indices_dim2(data, centroids, indices, &this_dist, n, k);
+#endif
+
+ for (i = 0; i < max_itr; ++i) {
+ const int64_t prev_dist = this_dist;
+ prev_l = l;
+ l = (l == 1) ? 0 : 1;
+
+ RENAME(calc_centroids)(data, meta_centroids[l], meta_indices[prev_l], n, k);
+ if (!memcmp(meta_centroids[l], meta_centroids[prev_l],
+ sizeof(centroids[0]) * k * AV1_K_MEANS_DIM)) {
+ break;
+ }
+#if AV1_K_MEANS_DIM == 1
+ av1_calc_indices_dim1(data, meta_centroids[l], meta_indices[l], &this_dist,
+ n, k);
+#else
+ av1_calc_indices_dim2(data, meta_centroids[l], meta_indices[l], &this_dist,
+ n, k);
+#endif
+
+ if (this_dist > prev_dist) {
+ best_l = prev_l;
+ break;
+ }
+ }
+ if (i == max_itr) best_l = l;
+ if (best_l != 0) {
+ memcpy(centroids, meta_centroids[1],
+ sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
+ memcpy(indices, meta_indices[1], sizeof(indices[0]) * n);
+ }
+}
+#undef RENAME_
+#undef RENAME
diff --git a/third_party/aom/av1/encoder/level.c b/third_party/aom/av1/encoder/level.c
new file mode 100644
index 0000000000..5d5fe9ce96
--- /dev/null
+++ b/third_party/aom/av1/encoder/level.c
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/level.h"
+
+#define UNDEFINED_LEVEL \
+ { \
+ .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0, \
+ .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0, \
+ .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \
+ .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0 \
+ }
+
+static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = {
+ { .level = SEQ_LEVEL_2_0,
+ .max_picture_size = 147456,
+ .max_h_size = 2048,
+ .max_v_size = 1152,
+ .max_display_rate = 4423680L,
+ .max_decode_rate = 5529600L,
+ .max_header_rate = 150,
+ .main_mbps = 1.5,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 8,
+ .max_tile_cols = 4 },
+ { .level = SEQ_LEVEL_2_1,
+ .max_picture_size = 278784,
+ .max_h_size = 2816,
+ .max_v_size = 1584,
+ .max_display_rate = 8363520L,
+ .max_decode_rate = 10454400L,
+ .max_header_rate = 150,
+ .main_mbps = 3.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 8,
+ .max_tile_cols = 4 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_3_0,
+ .max_picture_size = 665856,
+ .max_h_size = 4352,
+ .max_v_size = 2448,
+ .max_display_rate = 19975680L,
+ .max_decode_rate = 24969600L,
+ .max_header_rate = 150,
+ .main_mbps = 6.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 16,
+ .max_tile_cols = 6 },
+ { .level = SEQ_LEVEL_3_1,
+ .max_picture_size = 1065024,
+ .max_h_size = 5504,
+ .max_v_size = 3096,
+ .max_display_rate = 31950720L,
+ .max_decode_rate = 39938400L,
+ .max_header_rate = 150,
+ .main_mbps = 10.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 16,
+ .max_tile_cols = 6 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_4_0,
+ .max_picture_size = 2359296,
+ .max_h_size = 6144,
+ .max_v_size = 3456,
+ .max_display_rate = 70778880L,
+ .max_decode_rate = 77856768L,
+ .max_header_rate = 300,
+ .main_mbps = 12.0,
+ .high_mbps = 30.0,
+ .main_cr = 4.0,
+ .high_cr = 4.0,
+ .max_tiles = 32,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_4_1,
+ .max_picture_size = 2359296,
+ .max_h_size = 6144,
+ .max_v_size = 3456,
+ .max_display_rate = 141557760L,
+ .max_decode_rate = 155713536L,
+ .max_header_rate = 300,
+ .main_mbps = 20.0,
+ .high_mbps = 50.0,
+ .main_cr = 4.0,
+ .high_cr = 4.0,
+ .max_tiles = 32,
+ .max_tile_cols = 8 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_5_0,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 267386880L,
+ .max_decode_rate = 273715200L,
+ .max_header_rate = 300,
+ .main_mbps = 30.0,
+ .high_mbps = 100.0,
+ .main_cr = 6.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_1,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 534773760L,
+ .max_decode_rate = 547430400L,
+ .max_header_rate = 300,
+ .main_mbps = 40.0,
+ .high_mbps = 160.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_2,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1094860800L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_3,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1176502272L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_6_0,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1176502272L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_1,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 2139095040L,
+ .max_decode_rate = 2189721600L,
+ .max_header_rate = 300,
+ .main_mbps = 100.0,
+ .high_mbps = 480.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_2,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4379443200L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_3,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4706009088L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+#if CONFIG_CWG_C013
+ { .level = SEQ_LEVEL_7_0,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4706009088L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_1,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 8556380160L,
+ .max_decode_rate = 8758886400L,
+ .max_header_rate = 300,
+ .main_mbps = 200.0,
+ .high_mbps = 960.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_2,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 17517772800L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_3,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 18824036352L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_8_0,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 18824036352L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_1,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 34225520640L,
+ .max_decode_rate = 34910031052L,
+ .max_header_rate = 300,
+ .main_mbps = 400.0,
+ .high_mbps = 1920.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_2,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 68451041280L,
+ .max_decode_rate = 69820062105L,
+ .max_header_rate = 300,
+ .main_mbps = 640.0,
+ .high_mbps = 3200.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_3,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 68451041280L,
+ .max_decode_rate = 75296145408L,
+ .max_header_rate = 300,
+ .main_mbps = 640.0,
+ .high_mbps = 3200.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+#else // !CONFIG_CWG_C013
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+#endif // CONFIG_CWG_C013
+};
+
+typedef enum {
+ LUMA_PIC_SIZE_TOO_LARGE,
+ LUMA_PIC_H_SIZE_TOO_LARGE,
+ LUMA_PIC_V_SIZE_TOO_LARGE,
+ LUMA_PIC_H_SIZE_TOO_SMALL,
+ LUMA_PIC_V_SIZE_TOO_SMALL,
+ TOO_MANY_TILE_COLUMNS,
+ TOO_MANY_TILES,
+ TILE_RATE_TOO_HIGH,
+ TILE_TOO_LARGE,
+ SUPERRES_TILE_WIDTH_TOO_LARGE,
+ CROPPED_TILE_WIDTH_TOO_SMALL,
+ CROPPED_TILE_HEIGHT_TOO_SMALL,
+ TILE_WIDTH_INVALID,
+ FRAME_HEADER_RATE_TOO_HIGH,
+ DISPLAY_RATE_TOO_HIGH,
+ DECODE_RATE_TOO_HIGH,
+ CR_TOO_SMALL,
+ TILE_SIZE_HEADER_RATE_TOO_HIGH,
+ BITRATE_TOO_HIGH,
+ DECODER_MODEL_FAIL,
+
+ TARGET_LEVEL_FAIL_IDS,
+ TARGET_LEVEL_OK,
+} TARGET_LEVEL_FAIL_ID;
+
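+// Keep this table in sync with the TARGET_LEVEL_FAIL_ID enum above.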
+static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
+ "The picture size is too large.",
+ "The picture width is too large.",
+ "The picture height is too large.",
+ "The picture width is too small.",
+ "The picture height is too small.",
+ "Too many tile columns are used.",
+ "Too many tiles are used.",
+ "The tile rate is too high.",
+ "The tile size is too large.",
+ "The superres tile width is too large.",
+ "The cropped tile width is less than 8.",
+ "The cropped tile height is less than 8.",
+ "The tile width is invalid.",
+ "The frame header rate is too high.",
+ "The display luma sample rate is too high.",
+ "The decoded luma sample rate is too high.",
+ "The compression ratio is too small.",
+ "The product of max tile size and header rate is too high.",
+ "The bitrate is too high.",
+ "The decoder model fails.",
+};
+
+static double get_max_bitrate(const AV1LevelSpec *const level_spec, int tier,
+ BITSTREAM_PROFILE profile) {
+ if (level_spec->level < SEQ_LEVEL_4_0) tier = 0;
+ const double bitrate_basis =
+ (tier ? level_spec->high_mbps : level_spec->main_mbps) * 1e6;
+ const double bitrate_profile_factor =
+ profile == PROFILE_0 ? 1.0 : (profile == PROFILE_1 ? 2.0 : 3.0);
+ return bitrate_basis * bitrate_profile_factor;
+}
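+// For example, SEQ_LEVEL_4_0 at Main tier in PROFILE_0 allows
+// 12.0 * 1e6 * 1.0 = 12 Mbps, and PROFILE_2 scales that limit by 3x.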
+
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+ BITSTREAM_PROFILE profile) {
+ assert(is_valid_seq_level_idx(level_index));
+ return get_max_bitrate(&av1_level_defs[level_index], tier, profile);
+}
+
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+ int *const max_tile_cols) {
+ assert(is_valid_seq_level_idx(level_index));
+ const AV1LevelSpec *const level_spec = &av1_level_defs[level_index];
+ *max_tiles = level_spec->max_tiles;
+ *max_tile_cols = level_spec->max_tile_cols;
+}
+
+// We assume time t to be valid if and only if t >= 0.0.
+// So INVALID_TIME can be defined as anything less than 0.
+#define INVALID_TIME (-1.0)
+
+// This corresponds to "free_buffer" in the spec.
+static void release_buffer(DECODER_MODEL *const decoder_model, int idx) {
+ assert(idx >= 0 && idx < BUFFER_POOL_MAX_SIZE);
+ FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx];
+ this_buffer->decoder_ref_count = 0;
+ this_buffer->player_ref_count = 0;
+ this_buffer->display_index = -1;
+ this_buffer->presentation_time = INVALID_TIME;
+}
+
+static void initialize_buffer_pool(DECODER_MODEL *const decoder_model) {
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ release_buffer(decoder_model, i);
+ }
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ decoder_model->vbi[i] = -1;
+ }
+}
+
+static int get_free_buffer(DECODER_MODEL *const decoder_model) {
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ const FRAME_BUFFER *const this_buffer =
+ &decoder_model->frame_buffer_pool[i];
+ if (this_buffer->decoder_ref_count == 0 &&
+ this_buffer->player_ref_count == 0)
+ return i;
+ }
+ return -1;
+}
+
+static void update_ref_buffers(DECODER_MODEL *const decoder_model, int idx,
+ int refresh_frame_flags) {
+ FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx];
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (refresh_frame_flags & (1 << i)) {
+ const int pre_idx = decoder_model->vbi[i];
+ if (pre_idx != -1) {
+ --decoder_model->frame_buffer_pool[pre_idx].decoder_ref_count;
+ }
+ decoder_model->vbi[i] = idx;
+ ++this_buffer->decoder_ref_count;
+ }
+ }
+}
+
+// The time (in seconds) required to decode a frame.
+static double time_to_decode_frame(const AV1_COMMON *const cm,
+ int64_t max_decode_rate) {
+ if (cm->show_existing_frame) return 0.0;
+
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ int luma_samples = 0;
+ if (frame_type == KEY_FRAME || frame_type == INTRA_ONLY_FRAME) {
+ luma_samples = cm->superres_upscaled_width * cm->height;
+ } else {
+ const int spatial_layer_dimensions_present_flag = 0;
+ if (spatial_layer_dimensions_present_flag) {
+ assert(0 && "Spatial layer dimensions not supported yet.");
+ } else {
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int max_frame_width = seq_params->max_frame_width;
+ const int max_frame_height = seq_params->max_frame_height;
+ luma_samples = max_frame_width * max_frame_height;
+ }
+ }
+
+ return luma_samples / (double)max_decode_rate;
+}
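+// For example, a 3840x2160 keyframe decoded at the SEQ_LEVEL_5_1 max decode
+// rate of 547430400 samples/s takes 8294400 / 547430400 ~= 15.2 ms.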
+
+// Release frame buffers that are no longer needed for decode or display.
+// It corresponds to "start_decode_at_removal_time" in the spec.
+static void release_processed_frames(DECODER_MODEL *const decoder_model,
+ double removal_time) {
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i];
+ if (this_buffer->player_ref_count > 0) {
+ if (this_buffer->presentation_time >= 0.0 &&
+ this_buffer->presentation_time <= removal_time) {
+ this_buffer->player_ref_count = 0;
+ if (this_buffer->decoder_ref_count == 0) {
+ release_buffer(decoder_model, i);
+ }
+ }
+ }
+ }
+}
+
+static int frames_in_buffer_pool(const DECODER_MODEL *const decoder_model) {
+ int frames_in_pool = 0;
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ const FRAME_BUFFER *const this_buffer =
+ &decoder_model->frame_buffer_pool[i];
+ if (this_buffer->decoder_ref_count > 0 ||
+ this_buffer->player_ref_count > 0) {
+ ++frames_in_pool;
+ }
+ }
+ return frames_in_pool;
+}
+
+static double get_presentation_time(const DECODER_MODEL *const decoder_model,
+ int display_index) {
+ if (decoder_model->mode == SCHEDULE_MODE) {
+ assert(0 && "SCHEDULE_MODE NOT SUPPORTED");
+ return INVALID_TIME;
+ } else {
+ const double initial_presentation_delay =
+ decoder_model->initial_presentation_delay;
+ // Can't decide presentation time until the initial presentation delay is
+ // known.
+ if (initial_presentation_delay < 0.0) return INVALID_TIME;
+
+ return initial_presentation_delay +
+ display_index * decoder_model->num_ticks_per_picture *
+ decoder_model->display_clock_tick;
+ }
+}
+
+#define MAX_TIME 1e16
+static double time_next_buffer_is_free(int num_decoded_frame,
+ int decoder_buffer_delay,
+ const FRAME_BUFFER *frame_buffer_pool,
+ double current_time) {
+ if (num_decoded_frame == 0) {
+ return (double)decoder_buffer_delay / 90000.0;
+ }
+
+ double buf_free_time = MAX_TIME;
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ const FRAME_BUFFER *const this_buffer = &frame_buffer_pool[i];
+ if (this_buffer->decoder_ref_count == 0) {
+ if (this_buffer->player_ref_count == 0) {
+ return current_time;
+ }
+ const double presentation_time = this_buffer->presentation_time;
+ if (presentation_time >= 0.0 && presentation_time < buf_free_time) {
+ buf_free_time = presentation_time;
+ }
+ }
+ }
+ return buf_free_time < MAX_TIME ? buf_free_time : INVALID_TIME;
+}
+#undef MAX_TIME
+
+static double get_removal_time(int mode, int num_decoded_frame,
+ int decoder_buffer_delay,
+ const FRAME_BUFFER *frame_buffer_pool,
+ double current_time) {
+ if (mode == SCHEDULE_MODE) {
+ assert(0 && "SCHEDULE_MODE IS NOT SUPPORTED YET");
+ return INVALID_TIME;
+ } else {
+ return time_next_buffer_is_free(num_decoded_frame, decoder_buffer_delay,
+ frame_buffer_pool, current_time);
+ }
+}
+
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) {
+ printf(
+ "\n status %d, num_frame %3d, num_decoded_frame %3d, "
+ "num_shown_frame %3d, current time %6.2f, frames in buffer %2d, "
+ "presentation delay %6.2f, total interval %6.2f\n",
+ decoder_model->status, decoder_model->num_frame,
+ decoder_model->num_decoded_frame, decoder_model->num_shown_frame,
+ decoder_model->current_time, frames_in_buffer_pool(decoder_model),
+ decoder_model->initial_presentation_delay,
+ decoder_model->dfg_interval_queue.total_interval);
+ for (int i = 0; i < 10; ++i) {
+ const FRAME_BUFFER *const this_buffer =
+ &decoder_model->frame_buffer_pool[i];
+ printf("buffer %d, decode count %d, display count %d, present time %6.4f\n",
+ i, this_buffer->decoder_ref_count, this_buffer->player_ref_count,
+ this_buffer->presentation_time);
+ }
+}
+
+// op_index is the operating point index.
+void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level,
+ int op_index, DECODER_MODEL *const decoder_model) {
+ decoder_model->status = DECODER_MODEL_OK;
+ decoder_model->level = level;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ decoder_model->bit_rate = get_max_bitrate(
+ av1_level_defs + level, seq_params->tier[op_index], seq_params->profile);
+
+ // TODO(huisu or anyone): implement SCHEDULE_MODE.
+ decoder_model->mode = RESOURCE_MODE;
+ decoder_model->encoder_buffer_delay = 20000;
+ decoder_model->decoder_buffer_delay = 70000;
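+  // Buffer delays are expressed in units of a 90 kHz clock, so 20000 ticks
+  // ~= 222 ms of encoder buffering and 70000 ticks ~= 778 ms of decoder
+  // buffering.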
+ decoder_model->is_low_delay_mode = false;
+
+ decoder_model->first_bit_arrival_time = 0.0;
+ decoder_model->last_bit_arrival_time = 0.0;
+ decoder_model->coded_bits = 0;
+
+ decoder_model->removal_time = INVALID_TIME;
+ decoder_model->presentation_time = INVALID_TIME;
+ decoder_model->decode_samples = 0;
+ decoder_model->display_samples = 0;
+ decoder_model->max_decode_rate = 0.0;
+ decoder_model->max_display_rate = 0.0;
+
+ decoder_model->num_frame = -1;
+ decoder_model->num_decoded_frame = -1;
+ decoder_model->num_shown_frame = -1;
+ decoder_model->current_time = 0.0;
+
+ initialize_buffer_pool(decoder_model);
+
+ DFG_INTERVAL_QUEUE *const dfg_interval_queue =
+ &decoder_model->dfg_interval_queue;
+ dfg_interval_queue->total_interval = 0.0;
+ dfg_interval_queue->head = 0;
+ dfg_interval_queue->size = 0;
+
+ if (seq_params->timing_info_present) {
+ decoder_model->num_ticks_per_picture =
+ seq_params->timing_info.num_ticks_per_picture;
+ decoder_model->display_clock_tick =
+ seq_params->timing_info.num_units_in_display_tick /
+ seq_params->timing_info.time_scale;
+ } else {
+ decoder_model->num_ticks_per_picture = 1;
+ decoder_model->display_clock_tick = 1.0 / cpi->framerate;
+ }
+
+ decoder_model->initial_display_delay =
+ seq_params->op_params[op_index].initial_display_delay;
+ decoder_model->initial_presentation_delay = INVALID_TIME;
+ decoder_model->decode_rate = av1_level_defs[level].max_decode_rate;
+}
+
+DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf(
+ const AV1_COMP *const cpi, size_t coded_bits,
+ const DECODER_MODEL *const decoder_model) {
+ DECODER_MODEL_STATUS status = DECODER_MODEL_OK;
+
+ if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) {
+ return status;
+ }
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int show_existing_frame = cm->show_existing_frame;
+
+ size_t cur_coded_bits = decoder_model->coded_bits + coded_bits;
+ int num_decoded_frame = decoder_model->num_decoded_frame;
+ if (!show_existing_frame) ++num_decoded_frame;
+
+ if (show_existing_frame) {
+ return status;
+ } else {
+ const double removal_time = get_removal_time(
+ decoder_model->mode, num_decoded_frame,
+ decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool,
+ decoder_model->current_time);
+ if (removal_time < 0.0) {
+ status = DECODE_FRAME_BUF_UNAVAILABLE;
+ return status;
+ }
+
+    // A frame with show_existing_frame being false indicates the end of a DFG
+    // (decodable frame group). Update the bits arrival time of this DFG.
+ const double buffer_delay = (decoder_model->encoder_buffer_delay +
+ decoder_model->decoder_buffer_delay) /
+ 90000.0;
+ const double latest_arrival_time = removal_time - buffer_delay;
+ const double first_bit_arrival_time =
+ AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time);
+ const double last_bit_arrival_time =
+ first_bit_arrival_time +
+ (double)cur_coded_bits / decoder_model->bit_rate;
+ // Smoothing buffer underflows if the last bit arrives after the removal
+ // time.
+ if (last_bit_arrival_time > removal_time &&
+ !decoder_model->is_low_delay_mode) {
+ status = SMOOTHING_BUFFER_UNDERFLOW;
+ return status;
+ }
+
+ // Check if the smoothing buffer overflows.
+ const DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue;
+ if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) {
+ assert(0);
+ }
+
+ double total_interval = queue->total_interval;
+ int qhead = queue->head;
+ int qsize = queue->size;
+ // Remove the DFGs with removal time earlier than last_bit_arrival_time.
+ while (queue->buf[qhead].removal_time <= last_bit_arrival_time &&
+ qsize > 0) {
+ if (queue->buf[qhead].removal_time - first_bit_arrival_time +
+ total_interval >
+ 1.0) {
+ status = SMOOTHING_BUFFER_OVERFLOW;
+ return status;
+ }
+ total_interval -= queue->buf[qhead].last_bit_arrival_time -
+ queue->buf[qhead].first_bit_arrival_time;
+ qhead = (qhead + 1) % DFG_INTERVAL_QUEUE_SIZE;
+ --qsize;
+ }
+ total_interval += last_bit_arrival_time - first_bit_arrival_time;
+ // The smoothing buffer can hold at most "bit_rate" bits, which is
+ // equivalent to 1 second of total interval.
+ if (total_interval > 1.0) {
+ status = SMOOTHING_BUFFER_OVERFLOW;
+ return status;
+ }
+
+ return status;
+ }
+}
+
+void av1_decoder_model_process_frame(const AV1_COMP *const cpi,
+ size_t coded_bits,
+ DECODER_MODEL *const decoder_model) {
+ if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int luma_pic_size = cm->superres_upscaled_width * cm->height;
+ const int show_existing_frame = cm->show_existing_frame;
+ const int show_frame = cm->show_frame || show_existing_frame;
+ ++decoder_model->num_frame;
+ if (!show_existing_frame) ++decoder_model->num_decoded_frame;
+ if (show_frame) ++decoder_model->num_shown_frame;
+ decoder_model->coded_bits += coded_bits;
+
+ int display_idx = -1;
+ if (show_existing_frame) {
+ display_idx = decoder_model->vbi[cpi->existing_fb_idx_to_show];
+ if (display_idx < 0) {
+ decoder_model->status = DECODE_EXISTING_FRAME_BUF_EMPTY;
+ return;
+ }
+ if (decoder_model->frame_buffer_pool[display_idx].frame_type == KEY_FRAME) {
+ update_ref_buffers(decoder_model, display_idx, 0xFF);
+ }
+ } else {
+ const double removal_time = get_removal_time(
+ decoder_model->mode, decoder_model->num_decoded_frame,
+ decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool,
+ decoder_model->current_time);
+ if (removal_time < 0.0) {
+ decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+ return;
+ }
+
+ const int previous_decode_samples = decoder_model->decode_samples;
+ const double previous_removal_time = decoder_model->removal_time;
+ assert(previous_removal_time < removal_time);
+ decoder_model->removal_time = removal_time;
+ decoder_model->decode_samples = luma_pic_size;
+ const double this_decode_rate =
+ previous_decode_samples / (removal_time - previous_removal_time);
+ decoder_model->max_decode_rate =
+ AOMMAX(decoder_model->max_decode_rate, this_decode_rate);
+
+ // A frame with show_existing_frame being false indicates the end of a DFG.
+ // Update the bits arrival time of this DFG.
+ const double buffer_delay = (decoder_model->encoder_buffer_delay +
+ decoder_model->decoder_buffer_delay) /
+ 90000.0;
+ const double latest_arrival_time = removal_time - buffer_delay;
+ decoder_model->first_bit_arrival_time =
+ AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time);
+ decoder_model->last_bit_arrival_time =
+ decoder_model->first_bit_arrival_time +
+ (double)decoder_model->coded_bits / decoder_model->bit_rate;
+ // Smoothing buffer underflows if the last bit arrives after the removal
+ // time.
+ if (decoder_model->last_bit_arrival_time > removal_time &&
+ !decoder_model->is_low_delay_mode) {
+ decoder_model->status = SMOOTHING_BUFFER_UNDERFLOW;
+ return;
+ }
+ // Reset the coded bits for the next DFG.
+ decoder_model->coded_bits = 0;
+
+ // Check if the smoothing buffer overflows.
+ DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue;
+ if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) {
+ assert(0);
+ }
+ const double first_bit_arrival_time = decoder_model->first_bit_arrival_time;
+ const double last_bit_arrival_time = decoder_model->last_bit_arrival_time;
+ // Remove the DFGs with removal time earlier than last_bit_arrival_time.
+ while (queue->buf[queue->head].removal_time <= last_bit_arrival_time &&
+ queue->size > 0) {
+ if (queue->buf[queue->head].removal_time - first_bit_arrival_time +
+ queue->total_interval >
+ 1.0) {
+ decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+ return;
+ }
+ queue->total_interval -= queue->buf[queue->head].last_bit_arrival_time -
+ queue->buf[queue->head].first_bit_arrival_time;
+ queue->head = (queue->head + 1) % DFG_INTERVAL_QUEUE_SIZE;
+ --queue->size;
+ }
+ // Push current DFG into the queue.
+ const int queue_index =
+ (queue->head + queue->size++) % DFG_INTERVAL_QUEUE_SIZE;
+ queue->buf[queue_index].first_bit_arrival_time = first_bit_arrival_time;
+ queue->buf[queue_index].last_bit_arrival_time = last_bit_arrival_time;
+ queue->buf[queue_index].removal_time = removal_time;
+ queue->total_interval += last_bit_arrival_time - first_bit_arrival_time;
+ // The smoothing buffer can hold at most "bit_rate" bits, which is
+ // equivalent to 1 second of total interval.
+ if (queue->total_interval > 1.0) {
+ decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+ return;
+ }
+
+ release_processed_frames(decoder_model, removal_time);
+ decoder_model->current_time =
+ removal_time + time_to_decode_frame(cm, decoder_model->decode_rate);
+
+ const int cfbi = get_free_buffer(decoder_model);
+ if (cfbi < 0) {
+ decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+ return;
+ }
+    const CurrentFrame *const current_frame = &cm->current_frame;
+    decoder_model->frame_buffer_pool[cfbi].frame_type =
+        current_frame->frame_type;
+ display_idx = cfbi;
+ update_ref_buffers(decoder_model, cfbi, current_frame->refresh_frame_flags);
+
+ if (decoder_model->initial_presentation_delay < 0.0) {
+      // Display can begin after the required number of frames have been
+      // buffered.
+ if (frames_in_buffer_pool(decoder_model) >=
+ decoder_model->initial_display_delay - 1) {
+ decoder_model->initial_presentation_delay = decoder_model->current_time;
+ // Update presentation time for each shown frame in the frame buffer.
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ FRAME_BUFFER *const this_buffer =
+ &decoder_model->frame_buffer_pool[i];
+ if (this_buffer->player_ref_count == 0) continue;
+ assert(this_buffer->display_index >= 0);
+ this_buffer->presentation_time =
+ get_presentation_time(decoder_model, this_buffer->display_index);
+ }
+ }
+ }
+ }
+
+ // Display.
+ if (show_frame) {
+ assert(display_idx >= 0 && display_idx < BUFFER_POOL_MAX_SIZE);
+ FRAME_BUFFER *const this_buffer =
+ &decoder_model->frame_buffer_pool[display_idx];
+ ++this_buffer->player_ref_count;
+ this_buffer->display_index = decoder_model->num_shown_frame;
+ const double presentation_time =
+ get_presentation_time(decoder_model, this_buffer->display_index);
+ this_buffer->presentation_time = presentation_time;
+ if (presentation_time >= 0.0 &&
+ decoder_model->current_time > presentation_time) {
+ decoder_model->status = DISPLAY_FRAME_LATE;
+ return;
+ }
+
+ const int previous_display_samples = decoder_model->display_samples;
+ const double previous_presentation_time = decoder_model->presentation_time;
+ decoder_model->display_samples = luma_pic_size;
+ decoder_model->presentation_time = presentation_time;
+ if (presentation_time >= 0.0 && previous_presentation_time >= 0.0) {
+ assert(previous_presentation_time < presentation_time);
+ const double this_display_rate =
+ previous_display_samples /
+ (presentation_time - previous_presentation_time);
+ decoder_model->max_display_rate =
+ AOMMAX(decoder_model->max_display_rate, this_display_rate);
+ }
+ }
+}
+
+void av1_init_level_info(AV1_COMP *cpi) {
+ for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) {
+ AV1LevelInfo *const this_level_info =
+ cpi->ppi->level_params.level_info[op_index];
+ if (!this_level_info) continue;
+ memset(this_level_info, 0, sizeof(*this_level_info));
+ AV1LevelSpec *const level_spec = &this_level_info->level_spec;
+ level_spec->level = SEQ_LEVEL_MAX;
+ AV1LevelStats *const level_stats = &this_level_info->level_stats;
+ level_stats->min_cropped_tile_width = INT_MAX;
+ level_stats->min_cropped_tile_height = INT_MAX;
+ level_stats->min_frame_width = INT_MAX;
+ level_stats->min_frame_height = INT_MAX;
+ level_stats->tile_width_is_valid = 1;
+ level_stats->min_cr = 1e8;
+
+ FrameWindowBuffer *const frame_window_buffer =
+ &this_level_info->frame_window_buffer;
+ frame_window_buffer->num = 0;
+ frame_window_buffer->start = 0;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int upscaled_width = cm->superres_upscaled_width;
+ const int height = cm->height;
+ const int pic_size = upscaled_width * height;
+ for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+ DECODER_MODEL *const this_model = &this_level_info->decoder_models[level];
+ const AV1LevelSpec *const spec = &av1_level_defs[level];
+ if (upscaled_width > spec->max_h_size || height > spec->max_v_size ||
+ pic_size > spec->max_picture_size) {
+ // Turn off decoder model for this level as the frame size already
+ // exceeds level constraints.
+ this_model->status = DECODER_MODEL_DISABLED;
+ } else {
+ av1_decoder_model_init(cpi, level, op_index, this_model);
+ }
+ }
+ }
+}
+
+static double get_min_cr(const AV1LevelSpec *const level_spec, int tier,
+ int is_still_picture, int64_t decoded_sample_rate) {
+ if (is_still_picture) return 0.8;
+ if (level_spec->level < SEQ_LEVEL_4_0) tier = 0;
+ const double min_cr_basis = tier ? level_spec->high_cr : level_spec->main_cr;
+ const double speed_adj =
+ (double)decoded_sample_rate / level_spec->max_display_rate;
+ return AOMMAX(min_cr_basis * speed_adj, 0.8);
+}
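+// For example, at SEQ_LEVEL_5_1 Main tier (main_cr = 8.0), a decoded sample
+// rate of half the level's max display rate yields a minimum compression
+// ratio of AOMMAX(8.0 * 0.5, 0.8) = 4.0.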
+
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+ int is_still_picture) {
+ assert(is_valid_seq_level_idx(level_index));
+ const AV1LevelSpec *const level_spec = &av1_level_defs[level_index];
+ return get_min_cr(level_spec, tier, is_still_picture,
+ level_spec->max_decode_rate);
+}
+
+static void get_temporal_parallel_params(int scalability_mode_idc,
+ int *temporal_parallel_num,
+ int *temporal_parallel_denom) {
+ if (scalability_mode_idc < 0) {
+ *temporal_parallel_num = 1;
+ *temporal_parallel_denom = 1;
+ return;
+ }
+
+ // TODO(huisu@): handle scalability cases.
+ if (scalability_mode_idc == SCALABILITY_SS) {
+ (void)scalability_mode_idc;
+ } else {
+ (void)scalability_mode_idc;
+ }
+}
+
+#define MIN_CROPPED_TILE_WIDTH 8
+#define MIN_CROPPED_TILE_HEIGHT 8
+#define MIN_FRAME_WIDTH 16
+#define MIN_FRAME_HEIGHT 16
+#define MAX_TILE_SIZE_HEADER_RATE_PRODUCT 588251136
+
+static TARGET_LEVEL_FAIL_ID check_level_constraints(
+ const AV1LevelInfo *const level_info, AV1_LEVEL level, int tier,
+ int is_still_picture, BITSTREAM_PROFILE profile, int check_bitrate) {
+ const DECODER_MODEL *const decoder_model = &level_info->decoder_models[level];
+ const DECODER_MODEL_STATUS decoder_model_status = decoder_model->status;
+ if (decoder_model_status != DECODER_MODEL_OK &&
+ decoder_model_status != DECODER_MODEL_DISABLED) {
+ return DECODER_MODEL_FAIL;
+ }
+
+ const AV1LevelSpec *const level_spec = &level_info->level_spec;
+ const AV1LevelSpec *const target_level_spec = &av1_level_defs[level];
+ const AV1LevelStats *const level_stats = &level_info->level_stats;
+ TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK;
+ do {
+ if (level_spec->max_picture_size > target_level_spec->max_picture_size) {
+ fail_id = LUMA_PIC_SIZE_TOO_LARGE;
+ break;
+ }
+
+ if (level_spec->max_h_size > target_level_spec->max_h_size) {
+ fail_id = LUMA_PIC_H_SIZE_TOO_LARGE;
+ break;
+ }
+
+ if (level_spec->max_v_size > target_level_spec->max_v_size) {
+ fail_id = LUMA_PIC_V_SIZE_TOO_LARGE;
+ break;
+ }
+
+ if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) {
+ fail_id = TOO_MANY_TILE_COLUMNS;
+ break;
+ }
+
+ if (level_spec->max_tiles > target_level_spec->max_tiles) {
+ fail_id = TOO_MANY_TILES;
+ break;
+ }
+
+ if (level_spec->max_header_rate > target_level_spec->max_header_rate) {
+ fail_id = FRAME_HEADER_RATE_TOO_HIGH;
+ break;
+ }
+
+ if (decoder_model->max_display_rate >
+ (double)target_level_spec->max_display_rate) {
+ fail_id = DISPLAY_RATE_TOO_HIGH;
+ break;
+ }
+
+    // TODO(huisu): we are not using the max decode rate calculated by the
+    // decoder model because the model in resource availability mode always
+    // returns MaxDecodeRate (as in the level definitions) as the max decode
+    // rate.
+ if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) {
+ fail_id = DECODE_RATE_TOO_HIGH;
+ break;
+ }
+
+ if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) {
+ fail_id = TILE_RATE_TOO_HIGH;
+ break;
+ }
+
+#if CONFIG_CWG_C013
+ const int max_tile_size = (level >= SEQ_LEVEL_7_0 && level <= SEQ_LEVEL_8_3)
+ ? MAX_TILE_AREA_LEVEL_7_AND_ABOVE
+ : MAX_TILE_AREA;
+#else
+ const int max_tile_size = MAX_TILE_AREA;
+#endif
+ if (level_stats->max_tile_size > max_tile_size) {
+ fail_id = TILE_TOO_LARGE;
+ break;
+ }
+
+ if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) {
+ fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE;
+ break;
+ }
+
+ if (level_stats->min_cropped_tile_width < MIN_CROPPED_TILE_WIDTH) {
+ fail_id = CROPPED_TILE_WIDTH_TOO_SMALL;
+ break;
+ }
+
+ if (level_stats->min_cropped_tile_height < MIN_CROPPED_TILE_HEIGHT) {
+ fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL;
+ break;
+ }
+
+ if (level_stats->min_frame_width < MIN_FRAME_WIDTH) {
+ fail_id = LUMA_PIC_H_SIZE_TOO_SMALL;
+ break;
+ }
+
+ if (level_stats->min_frame_height < MIN_FRAME_HEIGHT) {
+ fail_id = LUMA_PIC_V_SIZE_TOO_SMALL;
+ break;
+ }
+
+ if (!level_stats->tile_width_is_valid) {
+ fail_id = TILE_WIDTH_INVALID;
+ break;
+ }
+
+ const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture,
+ level_spec->max_decode_rate);
+ if (level_stats->min_cr < min_cr) {
+ fail_id = CR_TOO_SMALL;
+ break;
+ }
+
+ if (check_bitrate) {
+ // Check average bitrate instead of max_bitrate.
+ const double bitrate_limit =
+ get_max_bitrate(target_level_spec, tier, profile);
+ const double avg_bitrate = level_stats->total_compressed_size * 8.0 /
+ level_stats->total_time_encoded;
+ if (avg_bitrate > bitrate_limit) {
+ fail_id = BITRATE_TOO_HIGH;
+ break;
+ }
+ }
+
+ if (target_level_spec->level > SEQ_LEVEL_5_1) {
+ int temporal_parallel_num;
+ int temporal_parallel_denom;
+ const int scalability_mode_idc = -1;
+ get_temporal_parallel_params(scalability_mode_idc, &temporal_parallel_num,
+ &temporal_parallel_denom);
+ const int val = level_stats->max_tile_size * level_spec->max_header_rate *
+ temporal_parallel_denom / temporal_parallel_num;
+ if (val > MAX_TILE_SIZE_HEADER_RATE_PRODUCT) {
+ fail_id = TILE_SIZE_HEADER_RATE_TOO_HIGH;
+ break;
+ }
+ }
+ } while (0);
+
+ return fail_id;
+}
+
+static void get_tile_stats(const AV1_COMMON *const cm,
+ const TileDataEnc *const tile_data,
+ int *max_tile_size, int *max_superres_tile_width,
+ int *min_cropped_tile_width,
+ int *min_cropped_tile_height,
+ int *tile_width_valid) {
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ const int superres_scale_denominator = cm->superres_scale_denominator;
+
+ *max_tile_size = 0;
+ *max_superres_tile_width = 0;
+ *min_cropped_tile_width = INT_MAX;
+ *min_cropped_tile_height = INT_MAX;
+ *tile_width_valid = 1;
+
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ const TileInfo *const tile_info =
+ &tile_data[tile_row * cm->tiles.cols + tile_col].tile_info;
+ const int tile_width =
+ (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE;
+ const int tile_height =
+ (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+ const int tile_size = tile_width * tile_height;
+ *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+
+      const int superres_tile_width =
+          tile_width * superres_scale_denominator / SCALE_NUMERATOR;
+      *max_superres_tile_width =
+          AOMMAX(*max_superres_tile_width, superres_tile_width);
+
+ const int cropped_tile_width =
+ cm->width - tile_info->mi_col_start * MI_SIZE;
+ const int cropped_tile_height =
+ cm->height - tile_info->mi_row_start * MI_SIZE;
+ *min_cropped_tile_width =
+ AOMMIN(*min_cropped_tile_width, cropped_tile_width);
+ *min_cropped_tile_height =
+ AOMMIN(*min_cropped_tile_height, cropped_tile_height);
+
+ const int is_right_most_tile =
+ tile_info->mi_col_end == cm->mi_params.mi_cols;
+ if (!is_right_most_tile) {
+ if (av1_superres_scaled(cm))
+ *tile_width_valid &= tile_width >= 128;
+ else
+ *tile_width_valid &= tile_width >= 64;
+ }
+ }
+ }
+}
+
+static int store_frame_record(int64_t ts_start, int64_t ts_end,
+ size_t encoded_size, int pic_size,
+ int frame_header_count, int tiles, int show_frame,
+ int show_existing_frame,
+ FrameWindowBuffer *const buffer) {
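+  // The window is a ring buffer: once it is full, the oldest record is
+  // overwritten by advancing 'start'.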
+ if (buffer->num < FRAME_WINDOW_SIZE) {
+ ++buffer->num;
+ } else {
+ buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE;
+ }
+ const int new_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+ FrameRecord *const record = &buffer->buf[new_idx];
+ record->ts_start = ts_start;
+ record->ts_end = ts_end;
+ record->encoded_size_in_bytes = encoded_size;
+ record->pic_size = pic_size;
+ record->frame_header_count = frame_header_count;
+ record->tiles = tiles;
+ record->show_frame = show_frame;
+ record->show_existing_frame = show_existing_frame;
+
+ return new_idx;
+}
+
+// Count the number of frames encoded in the last "duration" ticks, in display
+// time.
+static int count_frames(const FrameWindowBuffer *const buffer,
+ int64_t duration) {
+ const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+ // Assume current frame is shown frame.
+ assert(buffer->buf[current_idx].show_frame);
+
+ const int64_t current_time = buffer->buf[current_idx].ts_end;
+ const int64_t time_limit = AOMMAX(current_time - duration, 0);
+ int num_frames = 1;
+ int index = current_idx - 1;
+ for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) {
+ if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+ const FrameRecord *const record = &buffer->buf[index];
+ if (!record->show_frame) continue;
+ const int64_t ts_start = record->ts_start;
+ if (ts_start < time_limit) break;
+ }
+
+ return num_frames;
+}
+
+// Scan previously encoded frames and update level metrics accordingly.
+static void scan_past_frames(const FrameWindowBuffer *const buffer,
+ int num_frames_to_scan,
+ AV1LevelSpec *const level_spec,
+ AV1LevelStats *const level_stats) {
+ const int num_frames_in_buffer = buffer->num;
+ int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE;
+ int frame_headers = 0;
+ int tiles = 0;
+ int64_t display_samples = 0;
+ int64_t decoded_samples = 0;
+ size_t encoded_size_in_bytes = 0;
+ for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) {
+ const FrameRecord *const record = &buffer->buf[index];
+ if (!record->show_existing_frame) {
+ frame_headers += record->frame_header_count;
+ decoded_samples += record->pic_size;
+ }
+ if (record->show_frame) {
+ display_samples += record->pic_size;
+ }
+ tiles += record->tiles;
+ encoded_size_in_bytes += record->encoded_size_in_bytes;
+ --index;
+ if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+ }
+ level_spec->max_header_rate =
+ AOMMAX(level_spec->max_header_rate, frame_headers);
+  // TODO(huisu): we can now compute the max display rate with the decoder
+  // model, so these lines can be removed. Keep them here for a while for
+  // debugging purposes.
+ level_spec->max_display_rate =
+ AOMMAX(level_spec->max_display_rate, display_samples);
+ level_spec->max_decode_rate =
+ AOMMAX(level_spec->max_decode_rate, decoded_samples);
+ level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles);
+ level_stats->max_bitrate =
+ AOMMAX(level_stats->max_bitrate,
+ (int)AOMMIN(encoded_size_in_bytes * 8, (size_t)INT_MAX));
+}
+
+void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
+ int64_t ts_end) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1LevelParams *const level_params = &cpi->ppi->level_params;
+
+ const int upscaled_width = cm->superres_upscaled_width;
+ const int width = cm->width;
+ const int height = cm->height;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ const int tiles = tile_cols * tile_rows;
+ const int luma_pic_size = upscaled_width * height;
+ const int frame_header_count = cpi->frame_header_count;
+ const int show_frame = cm->show_frame;
+ const int show_existing_frame = cm->show_existing_frame;
+
+ int max_tile_size;
+ int min_cropped_tile_width;
+ int min_cropped_tile_height;
+ int max_superres_tile_width;
+ int tile_width_is_valid;
+ get_tile_stats(cm, cpi->tile_data, &max_tile_size, &max_superres_tile_width,
+ &min_cropped_tile_width, &min_cropped_tile_height,
+ &tile_width_is_valid);
+
+ const double compression_ratio = av1_get_compression_ratio(cm, size);
+
+ const int temporal_layer_id = cm->temporal_layer_id;
+ const int spatial_layer_id = cm->spatial_layer_id;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ const int is_still_picture = seq_params->still_picture;
+ // update level_stats
+  // TODO(kyslov@) fix the implementation according to the buffer model
+ for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) {
+ if (!is_in_operating_point(seq_params->operating_point_idc[i],
+ temporal_layer_id, spatial_layer_id) ||
+ !((level_params->keep_level_stats >> i) & 1)) {
+ continue;
+ }
+
+ AV1LevelInfo *const level_info = level_params->level_info[i];
+ assert(level_info != NULL);
+ AV1LevelStats *const level_stats = &level_info->level_stats;
+
+ level_stats->max_tile_size =
+ AOMMAX(level_stats->max_tile_size, max_tile_size);
+ level_stats->max_superres_tile_width =
+ AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width);
+ level_stats->min_cropped_tile_width =
+ AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width);
+ level_stats->min_cropped_tile_height =
+ AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height);
+ level_stats->tile_width_is_valid &= tile_width_is_valid;
+ level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width);
+ level_stats->min_frame_height =
+ AOMMIN(level_stats->min_frame_height, height);
+ level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio);
+ level_stats->total_compressed_size += (double)size;
+
+ // update level_spec
+ // TODO(kyslov@) update all spec fields
+ AV1LevelSpec *const level_spec = &level_info->level_spec;
+ level_spec->max_picture_size =
+ AOMMAX(level_spec->max_picture_size, luma_pic_size);
+ level_spec->max_h_size =
+ AOMMAX(level_spec->max_h_size, cm->superres_upscaled_width);
+ level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height);
+ level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols);
+ level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles);
+
+    // Store the current frame's info in the FrameWindowBuffer.
+ FrameWindowBuffer *const buffer = &level_info->frame_window_buffer;
+ store_frame_record(ts_start, ts_end, size, luma_pic_size,
+ frame_header_count, tiles, show_frame,
+ show_existing_frame, buffer);
+ if (show_frame) {
+ // Count the number of frames encoded in the past 1 second.
+      const int encoded_frames_in_last_second =
+          count_frames(buffer, TICKS_PER_SEC);
+ scan_past_frames(buffer, encoded_frames_in_last_second, level_spec,
+ level_stats);
+ level_stats->total_time_encoded +=
+ (cpi->time_stamps.prev_ts_end - cpi->time_stamps.prev_ts_start) /
+ (double)TICKS_PER_SEC;
+ }
+
+ DECODER_MODEL *const decoder_models = level_info->decoder_models;
+ for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+ av1_decoder_model_process_frame(cpi, size << 3, &decoder_models[level]);
+ }
+
+ // Check whether target level is met.
+ const AV1_LEVEL target_level = level_params->target_seq_level_idx[i];
+ if (target_level < SEQ_LEVELS && cpi->oxcf.strict_level_conformance) {
+ assert(is_valid_seq_level_idx(target_level));
+ const int tier = seq_params->tier[i];
+ const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+ level_info, target_level, tier, is_still_picture, profile, 0);
+ if (fail_id != TARGET_LEVEL_OK) {
+ const int target_level_major = 2 + (target_level >> 2);
+ const int target_level_minor = target_level & 3;
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Failed to encode to the target level %d_%d. %s",
+ target_level_major, target_level_minor,
+ level_fail_messages[fail_id]);
+ }
+ }
+ }
+}
+
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+ const AV1LevelParams *level_params,
+ int *seq_level_idx) {
+ const int is_still_picture = seq_params->still_picture;
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+ seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+ if (!((level_params->keep_level_stats >> op) & 1)) continue;
+ const int tier = seq_params->tier[op];
+ const AV1LevelInfo *const level_info = level_params->level_info[op];
+ assert(level_info != NULL);
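+    // Scan levels from the lowest upward and report the first one whose
+    // constraints are all satisfied.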
+ for (int level = 0; level < SEQ_LEVELS; ++level) {
+ if (!is_valid_seq_level_idx(level)) continue;
+ const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+ level_info, level, tier, is_still_picture, profile, 1);
+ if (fail_id == TARGET_LEVEL_OK) {
+ seq_level_idx[op] = level;
+ break;
+ }
+ }
+ }
+
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params,
+ const AV1LevelParams *level_params,
+ int *target_seq_level_idx) {
+ for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+ target_seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+ if (!((level_params->keep_level_stats >> op) & 1)) continue;
+ target_seq_level_idx[op] = level_params->target_seq_level_idx[op];
+ }
+
+ return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/encoder/level.h b/third_party/aom/av1/encoder/level.h
new file mode 100644
index 0000000000..ebf2a1c19d
--- /dev/null
+++ b/third_party/aom/av1/encoder/level.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LEVEL_H_
+#define AOM_AV1_ENCODER_LEVEL_H_
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+
+// AV1 Level Specifications
+typedef struct {
+ AV1_LEVEL level;
+ int max_picture_size;
+ int max_h_size;
+ int max_v_size;
+ int max_header_rate;
+ int max_tile_rate;
+ int max_tiles;
+ int max_tile_cols;
+ int64_t max_display_rate;
+ int64_t max_decode_rate;
+ double main_mbps;
+ double high_mbps;
+ double main_cr;
+ double high_cr;
+} AV1LevelSpec;
+
+typedef struct {
+ int64_t ts_start;
+ int64_t ts_end;
+ size_t encoded_size_in_bytes;
+ int pic_size;
+ int frame_header_count;
+ int tiles;
+ int show_frame;
+ int show_existing_frame;
+} FrameRecord;
+
+// Record frame info in a rolling window.
+#define FRAME_WINDOW_SIZE 256
+typedef struct {
+ FrameRecord buf[FRAME_WINDOW_SIZE];
+ int num; // Number of FrameRecord stored in the buffer.
+ int start; // Buffer index of the first FrameRecord.
+} FrameWindowBuffer;
+
+typedef struct {
+ int max_bitrate; // Max bitrate in any 1-second window, in bps.
+ int max_tile_size;
+ int max_superres_tile_width;
+ int min_cropped_tile_width;
+ int min_cropped_tile_height;
+ int tile_width_is_valid;
+ int min_frame_width;
+ int min_frame_height;
+ double total_compressed_size; // In bytes.
+ double total_time_encoded; // In seconds.
+ double min_cr;
+} AV1LevelStats;
+
+// The following data structures are for the decoder model.
+typedef struct {
+ int decoder_ref_count;
+ int player_ref_count;
+ int display_index;
+ FRAME_TYPE frame_type;
+ double presentation_time;
+} FRAME_BUFFER;
+
+// Interval of bit transmission for a DFG (Decodable Frame Group).
+typedef struct {
+ double first_bit_arrival_time; // Time when the first bit arrives.
+ double last_bit_arrival_time; // Time when the last bit arrives.
+ // Removal time means the time when the bits to be decoded are removed from
+ // the smoothing buffer. Removal time is essentially the time when the
+ // decoding of the frame starts.
+ double removal_time;
+} DFG_INTERVAL;
+
+#define DFG_INTERVAL_QUEUE_SIZE 64
+typedef struct {
+ int head;
+ int size;
+ double total_interval;
+ DFG_INTERVAL buf[DFG_INTERVAL_QUEUE_SIZE];
+} DFG_INTERVAL_QUEUE;
+
+enum {
+ RESOURCE_MODE = 0, // Resource availability mode.
+ SCHEDULE_MODE // Decoding schedule mode.
+} UENUM1BYTE(DECODER_MODEL_MODE);
+
+enum {
+ DECODER_MODEL_OK = 0,
+ DECODE_BUFFER_AVAILABLE_LATE,
+ DECODE_FRAME_BUF_UNAVAILABLE,
+ DECODE_EXISTING_FRAME_BUF_EMPTY,
+ DISPLAY_FRAME_LATE,
+ SMOOTHING_BUFFER_UNDERFLOW,
+ SMOOTHING_BUFFER_OVERFLOW,
+ DECODER_MODEL_DISABLED
+} UENUM1BYTE(DECODER_MODEL_STATUS);
+
+#define BUFFER_POOL_MAX_SIZE 10
+typedef struct {
+ DECODER_MODEL_STATUS status;
+ DECODER_MODEL_MODE mode;
+ bool is_low_delay_mode;
+ AV1_LEVEL level;
+ int encoder_buffer_delay; // In units of 1/90000 seconds.
+ int decoder_buffer_delay; // In units of 1/90000 seconds.
+ int num_ticks_per_picture;
+ int initial_display_delay; // In units of frames.
+ int64_t decode_rate;
+ double display_clock_tick; // In units of seconds.
+ double current_time; // In units of seconds.
+ double initial_presentation_delay; // In units of seconds.
+ double bit_rate; // Bits per second.
+
+ int num_frame;
+ int num_decoded_frame;
+ int num_shown_frame;
+ int vbi[REF_FRAMES]; // Virtual buffer index.
+ FRAME_BUFFER frame_buffer_pool[BUFFER_POOL_MAX_SIZE];
+ DFG_INTERVAL_QUEUE dfg_interval_queue;
+
+  // Information for the DFG (Decodable Frame Group) being processed.
+ double first_bit_arrival_time;
+ double last_bit_arrival_time;
+ size_t coded_bits;
+
+ // Information for the frame being processed.
+ double removal_time;
+ double presentation_time;
+ int decode_samples;
+ int display_samples;
+
+ double max_display_rate;
+ double max_decode_rate;
+} DECODER_MODEL;
+
+typedef struct {
+ AV1LevelStats level_stats;
+ AV1LevelSpec level_spec;
+ FrameWindowBuffer frame_window_buffer;
+ DECODER_MODEL decoder_models[SEQ_LEVELS];
+} AV1LevelInfo;
+
+typedef struct AV1LevelParams {
+ // Specifies the level that the coded video sequence conforms to for each
+ // operating point.
+ AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+ // Bit mask to indicate whether to keep level stats for corresponding
+ // operating points.
+ uint32_t keep_level_stats;
+ // Level information for each operating point.
+ AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS];
+} AV1LevelParams;
+
+static INLINE int is_in_operating_point(int operating_point,
+ int temporal_layer_id,
+ int spatial_layer_id) {
+ if (!operating_point) return 1;
+
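+  // For example, operating_point = 0x103 selects temporal layers 0-1
+  // (bits 0-1) and spatial layer 0 (bit 8), so (0x103, tid = 1, sid = 0)
+  // returns 1 while (0x103, tid = 2, sid = 0) returns 0.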
+ return ((operating_point >> temporal_layer_id) & 1) &&
+ ((operating_point >> (spatial_layer_id + 8)) & 1);
+}
+
+void av1_init_level_info(struct AV1_COMP *cpi);
+
+void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start,
+ int64_t ts_end);
+
+// Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS].
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+ const AV1LevelParams *level_params,
+ int *seq_level_idx);
+
+aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params,
+ const AV1LevelParams *level_params,
+ int *target_seq_level_idx);
+
+// Print the status of the decoder model (for debugging).
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_init(const struct AV1_COMP *const cpi, AV1_LEVEL level,
+ int op_index, DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_process_frame(const struct AV1_COMP *const cpi,
+ size_t coded_bits,
+ DECODER_MODEL *const decoder_model);
+
+// This function uses the decoder model to check whether there could be
+// SMOOTHING_BUFFER_UNDERFLOW or SMOOTHING_BUFFER_OVERFLOW. It does not
+// update the content of decoder_model, and can be used to target certain
+// encoding level in the recode loop.
+DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf(
+ const struct AV1_COMP *const cpi, size_t coded_bits,
+ const DECODER_MODEL *const decoder_model);
+
+// Return the max bitrate (in bps) for the given level.
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+ BITSTREAM_PROFILE profile);
+
+// Get the max number of tiles and tile columns for the given level.
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+ int *const max_tile_cols);
+
+// Return the minimum compression ratio for the given level.
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+ int is_still_picture);
+#endif // AOM_AV1_ENCODER_LEVEL_H_
diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c
new file mode 100644
index 0000000000..9ef9b88675
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/lookahead.h"
+
+/* Return the buffer at the given absolute index and increment the index */
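+/* For example, with max_sz = 5 and *idx = 4, this returns ctx->buf + 4 and
+ * wraps *idx back to 0. */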
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+ int index = *idx;
+ struct lookahead_entry *buf = ctx->buf + index;
+
+ assert(index < ctx->max_sz);
+ if (++index >= ctx->max_sz) index -= ctx->max_sz;
+ *idx = index;
+ return buf;
+}
+
+void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
+ if (ctx) {
+ if (ctx->buf) {
+ int i;
+
+ for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img);
+ free(ctx->buf);
+ }
+ free(ctx);
+ }
+}
+
+struct lookahead_ctx *av1_lookahead_init(
+ unsigned int width, unsigned int height, unsigned int subsampling_x,
+ unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+ const int border_in_pixels, int byte_alignment, int num_lap_buffers,
+ bool is_all_intra, int num_pyramid_levels) {
+ int lag_in_frames = AOMMAX(1, depth);
+
+ // For all-intra frame encoding, previous source frames are not required.
+ // Hence max_pre_frames is set to 0 in this case. As previous source frames
+ // are accessed using a negative index to av1_lookahead_peek(), setting
+ // max_pre_frames to 0 will cause av1_lookahead_peek() to return NULL for a
+ // negative index.
+ const uint8_t max_pre_frames = is_all_intra ? 0 : MAX_PRE_FRAMES;
+
+ // Add the lags to depth and clamp
+ depth += num_lap_buffers;
+ depth = clamp(depth, 1, MAX_TOTAL_BUFFERS);
+
+ // Allocate memory to keep previous source frames available.
+ depth += max_pre_frames;
+
+ // Allocate the lookahead structures
+ struct lookahead_ctx *ctx = calloc(1, sizeof(*ctx));
+ if (ctx) {
+ unsigned int i;
+ ctx->max_sz = depth;
+ ctx->push_frame_count = 0;
+ ctx->max_pre_frames = max_pre_frames;
+ ctx->read_ctxs[ENCODE_STAGE].pop_sz = ctx->max_sz - ctx->max_pre_frames;
+ ctx->read_ctxs[ENCODE_STAGE].valid = 1;
+ if (num_lap_buffers) {
+ ctx->read_ctxs[LAP_STAGE].pop_sz = lag_in_frames;
+ ctx->read_ctxs[LAP_STAGE].valid = 1;
+ }
+ ctx->buf = calloc(depth, sizeof(*ctx->buf));
+ if (!ctx->buf) goto fail;
+ for (i = 0; i < depth; i++) {
+ if (aom_realloc_frame_buffer(
+ &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
+ use_highbitdepth, border_in_pixels, byte_alignment, NULL, NULL,
+ NULL, num_pyramid_levels, 0)) {
+ goto fail;
+ }
+ }
+ }
+ return ctx;
+fail:
+ av1_lookahead_destroy(ctx);
+ return NULL;
+}
+
+int av1_lookahead_full(const struct lookahead_ctx *ctx) {
+ // TODO(angiebird): Test this function.
+ return ctx->read_ctxs[ENCODE_STAGE].sz >= ctx->read_ctxs[ENCODE_STAGE].pop_sz;
+}
+
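+// Returns 0 on success, or 1 if the queue is full or a frame-buffer
+// (re)allocation or metadata copy fails.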
+int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+ int num_pyramid_levels, aom_enc_frame_flags_t flags) {
+ int width = src->y_crop_width;
+ int height = src->y_crop_height;
+ int uv_width = src->uv_crop_width;
+ int uv_height = src->uv_crop_height;
+ int subsampling_x = src->subsampling_x;
+ int subsampling_y = src->subsampling_y;
+ int larger_dimensions, new_dimensions;
+
+ assert(ctx->read_ctxs[ENCODE_STAGE].valid == 1);
+ if (ctx->read_ctxs[ENCODE_STAGE].sz + ctx->max_pre_frames > ctx->max_sz)
+ return 1;
+
+ ctx->read_ctxs[ENCODE_STAGE].sz++;
+ if (ctx->read_ctxs[LAP_STAGE].valid) {
+ ctx->read_ctxs[LAP_STAGE].sz++;
+ }
+
+ struct lookahead_entry *buf = pop(ctx, &ctx->write_idx);
+
+ new_dimensions = width != buf->img.y_crop_width ||
+ height != buf->img.y_crop_height ||
+ uv_width != buf->img.uv_crop_width ||
+ uv_height != buf->img.uv_crop_height;
+ larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
+ uv_width > buf->img.uv_width ||
+ uv_height > buf->img.uv_height;
+ assert(!larger_dimensions || new_dimensions);
+
+ if (larger_dimensions) {
+ YV12_BUFFER_CONFIG new_img;
+ memset(&new_img, 0, sizeof(new_img));
+ if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+ subsampling_y, use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, 0, num_pyramid_levels, 0))
+ return 1;
+ aom_free_frame_buffer(&buf->img);
+ buf->img = new_img;
+ } else if (new_dimensions) {
+ buf->img.y_crop_width = src->y_crop_width;
+ buf->img.y_crop_height = src->y_crop_height;
+ buf->img.uv_crop_width = src->uv_crop_width;
+ buf->img.uv_crop_height = src->uv_crop_height;
+ buf->img.subsampling_x = src->subsampling_x;
+ buf->img.subsampling_y = src->subsampling_y;
+ }
+ // Partial copy not implemented yet
+ av1_copy_and_extend_frame(src, &buf->img);
+
+ buf->ts_start = ts_start;
+ buf->ts_end = ts_end;
+ buf->display_idx = ctx->push_frame_count;
+ buf->flags = flags;
+ ++ctx->push_frame_count;
+ aom_remove_metadata_from_frame_buffer(&buf->img);
+ if (src->metadata &&
+ aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata)) {
+ return 1;
+ }
+ return 0;
+}
+
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+ COMPRESSOR_STAGE stage) {
+ struct lookahead_entry *buf = NULL;
+ if (ctx) {
+ struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+ assert(read_ctx->valid == 1);
+ if (read_ctx->sz && (drain || read_ctx->sz == read_ctx->pop_sz)) {
+ buf = pop(ctx, &read_ctx->read_idx);
+ read_ctx->sz--;
+ }
+ }
+ return buf;
+}
+
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+ COMPRESSOR_STAGE stage) {
+ struct lookahead_entry *buf = NULL;
+ if (ctx == NULL) {
+ return buf;
+ }
+
+ struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+ assert(read_ctx->valid == 1);
+ if (index >= 0) {
+ // Forward peek
+ if (index < read_ctx->sz) {
+ index += read_ctx->read_idx;
+ if (index >= ctx->max_sz) index -= ctx->max_sz;
+ buf = ctx->buf + index;
+ }
+ } else if (index < 0) {
+ // Backward peek
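+    // For example, with read_idx = 0, max_sz = 5 and max_pre_frames >= 1,
+    // index = -1 returns ctx->buf + 4.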
+ if (-index <= ctx->max_pre_frames) {
+ index += (int)(read_ctx->read_idx);
+ if (index < 0) index += (int)(ctx->max_sz);
+ buf = ctx->buf + index;
+ }
+ }
+
+ return buf;
+}
+
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+ COMPRESSOR_STAGE stage) {
+ assert(ctx != NULL);
+
+ struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+ assert(read_ctx->valid == 1);
+ return read_ctx->sz;
+}
+
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) {
+ assert(ctx != NULL);
+
+ struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+ assert(read_ctx->valid == 1);
+ return read_ctx->pop_sz;
+}
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
new file mode 100644
index 0000000000..c0e6d222f5
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes look ahead buffer operations.
+ */
+#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
+#define AOM_AV1_ENCODER_LOOKAHEAD_H_
+
+#include <stdbool.h>
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define MAX_LAG_BUFFERS 48
+#define MAX_LAP_BUFFERS 48
+#define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS)
+#define LAP_LAG_IN_FRAMES 17
+
+struct lookahead_entry {
+ YV12_BUFFER_CONFIG img;
+ int64_t ts_start;
+ int64_t ts_end;
+ int display_idx;
+ aom_enc_frame_flags_t flags;
+};
+
+// The maximum number of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+enum { ENCODE_STAGE, LAP_STAGE, MAX_STAGES } UENUM1BYTE(COMPRESSOR_STAGE);
+
+struct read_ctx {
+ int sz; /* Number of buffers currently in the queue */
+ int read_idx; /* Read index */
+ int pop_sz; /* Size to check for pop condition */
+ int valid; /* Is this ctx valid? */
+};
+
+struct lookahead_ctx {
+ int max_sz; /* Absolute size of the queue */
+ int write_idx; /* Write index */
+ struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */
+ struct lookahead_entry *buf; /* Buffer list */
+  int push_frame_count; /* Number of frames that have been pushed into the
+                           queue */
+ uint8_t
+ max_pre_frames; /* Maximum number of past frames allowed in the queue */
+};
+/*!\endcond */
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *av1_lookahead_init(
+ unsigned int width, unsigned int height, unsigned int subsampling_x,
+ unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+ const int border_in_pixels, int byte_alignment, int num_lap_buffers,
+ bool is_all_intra, int num_pyramid_levels);
+
+/**\brief Destroys the lookahead stage
+ */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx);
+
+/**\brief Check if lookahead buffer is full
+ */
+int av1_lookahead_full(const struct lookahead_ctx *ctx);
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] src Pointer to the image to enqueue
+ * \param[in] ts_start Timestamp for the start of this frame
+ * \param[in] ts_end Timestamp for the end of this frame
+ * \param[in] use_highbitdepth Whether high bitdepth is used
+ * \param[in] num_pyramid_levels Number of pyramid levels to allocate
+ *                               for each frame buffer
+ * \param[in] flags Flags set on this frame
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+ int num_pyramid_levels, aom_enc_frame_flags_t flags);
+
+/**\brief Get the next source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] drain Flag indicating the buffer should be drained
+ * (return a buffer regardless of the current queue depth)
+ * \param[in] stage Encoder stage
+ *
+ * \retval NULL if drain is set and the queue is empty, or if drain is not set
+ * and the queue has not reached the configured depth.
+ */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+ COMPRESSOR_STAGE stage);
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] index Index of the frame to be returned, 0 == next frame
+ * \param[in] stage Encoder stage
+ *
+ * \retval NULL if no buffer exists at the specified index
+ */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+ COMPRESSOR_STAGE stage);
+
+/**\brief Get the number of frames currently in the lookahead queue
+ */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+ COMPRESSOR_STAGE stage);
+
+/**\brief Get pop_sz value
+ */
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
new file mode 100644
index 0000000000..4e53447379
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -0,0 +1,3998 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params,
+ const MvCosts *mv_costs,
+ const MV *ref_mv, int errorperbit,
+ int sadperbit) {
+ mv_cost_params->ref_mv = ref_mv;
+ mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv);
+ mv_cost_params->mv_cost_type = MV_COST_ENTROPY;
+ mv_cost_params->error_per_bit = errorperbit;
+ mv_cost_params->sad_per_bit = sadperbit;
+ // For allintra encoding mode, 'mv_costs' is not allocated. Hence, the
+ // population of mvjcost and mvcost are avoided. In case of IntraBC, these
+ // values are populated from 'dv_costs' in av1_set_ms_to_intra_mode().
+ if (mv_costs != NULL) {
+ mv_cost_params->mvjcost = mv_costs->nmv_joint_cost;
+ mv_cost_params->mvcost[0] = mv_costs->mv_cost_stack[0];
+ mv_cost_params->mvcost[1] = mv_costs->mv_cost_stack[1];
+ }
+}
+
+static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
+ ms_buffers->ref = &x->e_mbd.plane[0].pre[0];
+ ms_buffers->src = &x->plane[0].src;
+
+ av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0);
+
+ ms_buffers->wsrc = x->obmc_buffer.wsrc;
+ ms_buffers->obmc_mask = x->obmc_buffer.mask;
+}
+
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
+ obmc_buffer->wsrc = NULL;
+ obmc_buffer->mask = NULL;
+ obmc_buffer->above_pred = NULL;
+ obmc_buffer->left_pred = NULL;
+}
+
+void av1_make_default_fullpel_ms_params(
+ FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
+ const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+ SEARCH_METHODS search_method, int fine_search_interval) {
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const int is_key_frame =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE;
+
+ // High level params
+ ms_params->bsize = bsize;
+ ms_params->vfp = &cpi->ppi->fn_ptr[bsize];
+
+ init_ms_buffers(&ms_params->ms_buffers, x);
+
+ av1_set_mv_search_method(ms_params, search_sites, search_method);
+
+ ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
+ ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns;
+ ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh;
+ ms_params->prune_mesh_search =
+ (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_2) ? 1 : 0;
+ ms_params->mesh_search_mv_diff_threshold = 4;
+ ms_params->run_mesh_search = 0;
+ ms_params->fine_search_interval = fine_search_interval;
+
+ ms_params->is_intra_mode = 0;
+
+ ms_params->fast_obmc_search = mv_sf->obmc_full_pixel_search_level;
+
+ ms_params->mv_limits = x->mv_limits;
+ av1_set_mv_search_range(&ms_params->mv_limits, ref_mv);
+
+ // Mvcost params
+ init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
+ x->errorperbit, x->sadperbit);
+
+ ms_params->sdf = ms_params->vfp->sdf;
+ ms_params->sdx4df = ms_params->vfp->sdx4df;
+ ms_params->sdx3df = ms_params->vfp->sdx3df;
+
+ if (mv_sf->use_downsampled_sad == 2 && block_size_high[bsize] >= 16) {
+ ms_params->sdf = ms_params->vfp->sdsf;
+ ms_params->sdx4df = ms_params->vfp->sdsx4df;
+    // The skip version of sadx3 is not available yet; reuse sdsx4df instead.
+ ms_params->sdx3df = ms_params->vfp->sdsx4df;
+ } else if (mv_sf->use_downsampled_sad == 1 && block_size_high[bsize] >= 16 &&
+ !is_key_frame) {
+ FULLPEL_MV start_mv_clamped = start_mv;
+ // adjust start_mv to make sure it is within MV range
+ clamp_fullmv(&start_mv_clamped, &ms_params->mv_limits);
+
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const int ref_stride = ref->stride;
+ const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv_clamped);
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+
+ unsigned int start_mv_sad_even_rows, start_mv_sad_odd_rows;
+ start_mv_sad_even_rows =
+ ms_params->vfp->sdsf(src_buf, src_stride, best_address, ref_stride);
+ start_mv_sad_odd_rows =
+ ms_params->vfp->sdsf(src_buf + src_stride, src_stride,
+ best_address + ref_stride, ref_stride);
+
+ // If the absolute SAD difference computed between the pred-to-src of even
+ // and odd rows is small, skip every other row in sad computation.
+ const int odd_to_even_diff_sad =
+ abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows);
+ const int mult_thresh = 4;
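+    // That is, rows are skipped only when the even/odd mismatch is below a
+    // quarter of the even-row SAD.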
+ if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) {
+ ms_params->sdf = ms_params->vfp->sdsf;
+ ms_params->sdx4df = ms_params->vfp->sdsx4df;
+ ms_params->sdx3df = ms_params->vfp->sdsx4df;
+ }
+ }
+}
+
+void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const IntraBCMVCosts *dv_costs) {
+ ms_params->is_intra_mode = 1;
+
+ MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+
+ mv_cost_params->mvjcost = dv_costs->joint_mv;
+ mv_cost_params->mvcost[0] = dv_costs->dv_costs[0];
+ mv_cost_params->mvcost[1] = dv_costs->dv_costs[1];
+}
+
+void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct AV1_COMP *cpi,
+ const MACROBLOCK *x, BLOCK_SIZE bsize,
+ const MV *ref_mv, const int *cost_list) {
+ const AV1_COMMON *cm = &cpi->common;
+ // High level params
+ ms_params->allow_hp = cm->features.allow_high_precision_mv;
+ ms_params->forced_stop = cpi->sf.mv_sf.subpel_force_stop;
+ ms_params->iters_per_step = cpi->sf.mv_sf.subpel_iters_per_step;
+ ms_params->cost_list = cond_cost_list_const(cpi, cost_list);
+
+ av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv);
+
+ // Mvcost params
+ init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
+ x->errorperbit, x->sadperbit);
+
+ // Subpel variance params
+ ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize];
+ ms_params->var_params.subpel_search_type =
+ cpi->sf.mv_sf.use_accurate_subpel_search;
+ ms_params->var_params.w = block_size_wide[bsize];
+ ms_params->var_params.h = block_size_high[bsize];
+
+ // Ref and src buffers
+ MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers;
+ init_ms_buffers(ms_buffers, x);
+}
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) {
+ // Calculate the outermost full-pixel MVs which are inside the limits set by
+ // av1_set_subpel_mv_search_range().
+ //
+ // The subpel limits are simply mv->col +/- 8*MAX_FULL_PEL_VAL, and similar
+ // for mv->row. We can then divide by 8 to find the fullpel MV limits. But
+ // we have to be careful about the rounding. We want these bounds to be
+ // at least as tight as the subpel limits, which means that we must round
+ // the minimum values up and the maximum values down when dividing.
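+  // For example, with mv->col = 5 and M = MAX_FULL_PEL_VAL, the subpel window
+  // is [5 - 8 * M, 5 + 8 * M], so the fullpel window below is [1 - M, M]:
+  // ((5 + 7) >> 3) - M = 1 - M rounds the lower bound up, and
+  // (5 >> 3) + M = M rounds the upper bound down.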
+ int col_min = ((mv->col + 7) >> 3) - MAX_FULL_PEL_VAL;
+ int row_min = ((mv->row + 7) >> 3) - MAX_FULL_PEL_VAL;
+ int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
+ int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+
+ col_min = AOMMAX(col_min, (MV_LOW >> 3) + 1);
+ row_min = AOMMAX(row_min, (MV_LOW >> 3) + 1);
+ col_max = AOMMIN(col_max, (MV_UPP >> 3) - 1);
+ row_max = AOMMIN(row_max, (MV_UPP >> 3) - 1);
+
+ // Get intersection of UMV window and valid MV window to reduce # of checks
+ // in diamond search.
+ if (mv_limits->col_min < col_min) mv_limits->col_min = col_min;
+ if (mv_limits->col_max > col_max) mv_limits->col_max = col_max;
+ if (mv_limits->row_min < row_min) mv_limits->row_min = row_min;
+ if (mv_limits->row_max > row_max) mv_limits->row_max = row_max;
+
+ mv_limits->col_max = AOMMAX(mv_limits->col_min, mv_limits->col_max);
+ mv_limits->row_max = AOMMAX(mv_limits->row_min, mv_limits->row_max);
+}
+
+int av1_init_search_range(int size) {
+ int sr = 0;
+  // Minimum search size no matter what the passed-in value is.
+ size = AOMMAX(16, size);
+
+ while ((size << sr) < MAX_FULL_PEL_VAL) sr++;
+
+ sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2);
+ return sr;
+}
+
+// ============================================================================
+// Cost of motion vectors
+// ============================================================================
+// TODO(any): Adaptively adjust the regularization strength based on image size
+// and motion activity instead of using hard-coded values. It seems like we
+// roughly halve the lambda for each increase in resolution.
+// These are the multipliers used to perform regularization in motion
+// compensation when x->mv_cost_type is set to MV_COST_L1.
+// LOWRES
+#define SSE_LAMBDA_LOWRES 2 // Used by mv_cost_err_fn
+#define SAD_LAMBDA_LOWRES 32 // Used by mvsad_err_cost during full pixel search
+// MIDRES
+#define SSE_LAMBDA_MIDRES 0 // Used by mv_cost_err_fn
+#define SAD_LAMBDA_MIDRES 15 // Used by mvsad_err_cost during full pixel search
+// HDRES
+#define SSE_LAMBDA_HDRES 1 // Used by mv_cost_err_fn
+#define SAD_LAMBDA_HDRES 8 // Used by mvsad_err_cost during full pixel search
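+// In both cases the resulting L1 cost is lambda * (|diff.row| + |diff.col|)
+// >> 3, i.e. lambda / 8 per unit of absolute MV difference.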
+
+// Returns the rate of encoding the current motion vector based on the
+// joint_cost and comp_cost. joint_cost covers the cost of transmitting
+// JOINT_MV, and comp_cost covers the cost of transmitting the actual motion
+// vector.
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+ const int *const comp_cost[2]) {
+ return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+ comp_cost[1][mv->col];
+}
+
+#define CONVERT_TO_CONST_MVCOST(ptr) ((const int *const *)(ptr))
+// Returns the cost of encoding the motion vector diff := *mv - *ref. The cost
+// is the rate required to encode diff, multiplied by weight and then divided
+// by 2 ** 7 with rounding.
+// This is NOT used during motion compensation.
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
+ int *const mvcost[2], int weight) {
+ const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
+ return ROUND_POWER_OF_TWO(
+ mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7);
+}
+
+// Returns the cost of using the current mv during the motion search. This is
+// used when var is used as the error metric.
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv,
+ const int *mvjcost, const int *const mvcost[2],
+ int error_per_bit, MV_COST_TYPE mv_cost_type) {
+ const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
+ const MV abs_diff = { abs(diff.row), abs(diff.col) };
+
+ switch (mv_cost_type) {
+ case MV_COST_ENTROPY:
+ if (mvcost) {
+ return (int)ROUND_POWER_OF_TWO_64(
+ (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+ RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT +
+ PIXEL_TRANSFORM_ERROR_SCALE);
+ }
+ return 0;
+ case MV_COST_L1_LOWRES:
+ return (SSE_LAMBDA_LOWRES * (abs_diff.row + abs_diff.col)) >> 3;
+ case MV_COST_L1_MIDRES:
+ return (SSE_LAMBDA_MIDRES * (abs_diff.row + abs_diff.col)) >> 3;
+ case MV_COST_L1_HDRES:
+ return (SSE_LAMBDA_HDRES * (abs_diff.row + abs_diff.col)) >> 3;
+ case MV_COST_NONE: return 0;
+    default: assert(0 && "Invalid mv_cost_type"); return 0;
+ }
+}
+
+static INLINE int mv_err_cost_(const MV *mv,
+ const MV_COST_PARAMS *mv_cost_params) {
+ if (mv_cost_params->mv_cost_type == MV_COST_NONE) {
+ return 0;
+ }
+ return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost,
+ mv_cost_params->mvcost, mv_cost_params->error_per_bit,
+ mv_cost_params->mv_cost_type);
+}
+
+// Returns the cost of using the current mv during the motion search. This is
+// only used during full pixel motion search when sad is used as the error
+// metric
+static INLINE int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv,
+ const int *mvjcost, const int *const mvcost[2],
+ int sad_per_bit, MV_COST_TYPE mv_cost_type) {
+ const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row),
+ GET_MV_SUBPEL(mv->col - ref_mv->col) };
+
+ switch (mv_cost_type) {
+ case MV_COST_ENTROPY:
+ return ROUND_POWER_OF_TWO(
+ (unsigned)mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) *
+ sad_per_bit,
+ AV1_PROB_COST_SHIFT);
+ case MV_COST_L1_LOWRES:
+ return (SAD_LAMBDA_LOWRES * (abs(diff.row) + abs(diff.col))) >> 3;
+ case MV_COST_L1_MIDRES:
+ return (SAD_LAMBDA_MIDRES * (abs(diff.row) + abs(diff.col))) >> 3;
+ case MV_COST_L1_HDRES:
+ return (SAD_LAMBDA_HDRES * (abs(diff.row) + abs(diff.col))) >> 3;
+ case MV_COST_NONE: return 0;
+    default: assert(0 && "Invalid mv_cost_type"); return 0;
+ }
+}
+
+static INLINE int mvsad_err_cost_(const FULLPEL_MV *mv,
+ const MV_COST_PARAMS *mv_cost_params) {
+ return mvsad_err_cost(mv, &mv_cost_params->full_ref_mv,
+ mv_cost_params->mvjcost, mv_cost_params->mvcost,
+ mv_cost_params->sad_per_bit,
+ mv_cost_params->mv_cost_type);
+}
+
+// =============================================================================
+// Fullpixel Motion Search: Translational
+// =============================================================================
+#define MAX_PATTERN_SCALES 11
+#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
+
+// Search site initialization for DIAMOND / CLAMPED_DIAMOND search methods.
+// level = 0: DIAMOND, level = 1: CLAMPED_DIAMOND.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+ int level) {
+ int num_search_steps = 0;
+ int stage_index = MAX_MVSEARCH_STEPS - 1;
+
+ cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
+ cfg->site[stage_index][0].offset = 0;
+ cfg->stride = stride;
+
+ // Choose the initial step size depending on level.
+ const int first_step = (level > 0) ? (MAX_FIRST_STEP / 4) : MAX_FIRST_STEP;
+
+ for (int radius = first_step; radius > 0;) {
+ int num_search_pts = 8;
+
+ const FULLPEL_MV search_site_mvs[13] = {
+ { 0, 0 }, { -radius, 0 }, { radius, 0 },
+ { 0, -radius }, { 0, radius }, { -radius, -radius },
+ { radius, radius }, { -radius, radius }, { radius, -radius },
+ };
+
+ int i;
+ for (i = 0; i <= num_search_pts; ++i) {
+ search_site *const site = &cfg->site[stage_index][i];
+ site->mv = search_site_mvs[i];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ cfg->searches_per_step[stage_index] = num_search_pts;
+ cfg->radius[stage_index] = radius;
+ // Update the search radius based on level.
+    if (!level || stage_index < 9) radius /= 2;
+ --stage_index;
+ ++num_search_steps;
+ }
+ cfg->num_search_steps = num_search_steps;
+}
+
+void av1_init_motion_fpf(search_site_config *cfg, int stride) {
+ int num_search_steps = 0;
+ int stage_index = MAX_MVSEARCH_STEPS - 1;
+
+ cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
+ cfg->site[stage_index][0].offset = 0;
+ cfg->stride = stride;
+
+ for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) {
+    // Generate offsets for 12 search sites per step (8 when the radius is 1),
+    // plus the center point.
+ int tan_radius = AOMMAX((int)(0.41 * radius), 1);
+ int num_search_pts = 12;
+ if (radius == 1) num_search_pts = 8;
+
+ const FULLPEL_MV search_site_mvs[13] = {
+ { 0, 0 },
+ { -radius, 0 },
+ { radius, 0 },
+ { 0, -radius },
+ { 0, radius },
+ { -radius, -tan_radius },
+ { radius, tan_radius },
+ { -tan_radius, radius },
+ { tan_radius, -radius },
+ { -radius, tan_radius },
+ { radius, -tan_radius },
+ { tan_radius, radius },
+ { -tan_radius, -radius },
+ };
+
+ int i;
+ for (i = 0; i <= num_search_pts; ++i) {
+ search_site *const site = &cfg->site[stage_index][i];
+ site->mv = search_site_mvs[i];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ cfg->searches_per_step[stage_index] = num_search_pts;
+ cfg->radius[stage_index] = radius;
+ --stage_index;
+ ++num_search_steps;
+ }
+ cfg->num_search_steps = num_search_steps;
+}
+
+// Search site initialization for NSTEP / NSTEP_8PT search methods.
+// level = 0: NSTEP, level = 1: NSTEP_8PT.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+ int level) {
+ int num_search_steps = 0;
+ int stage_index = 0;
+ cfg->stride = stride;
+ int radius = 1;
+ const int num_stages = (level > 0) ? 16 : 15;
+ for (stage_index = 0; stage_index < num_stages; ++stage_index) {
+ int tan_radius = AOMMAX((int)(0.41 * radius), 1);
+ int num_search_pts = 12;
+ if ((radius <= 5) || (level > 0)) {
+ tan_radius = radius;
+ num_search_pts = 8;
+ }
+ const FULLPEL_MV search_site_mvs[13] = {
+ { 0, 0 },
+ { -radius, 0 },
+ { radius, 0 },
+ { 0, -radius },
+ { 0, radius },
+ { -radius, -tan_radius },
+ { radius, tan_radius },
+ { -tan_radius, radius },
+ { tan_radius, -radius },
+ { -radius, tan_radius },
+ { radius, -tan_radius },
+ { tan_radius, radius },
+ { -tan_radius, -radius },
+ };
+
+ for (int i = 0; i <= num_search_pts; ++i) {
+ search_site *const site = &cfg->site[stage_index][i];
+ site->mv = search_site_mvs[i];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ cfg->searches_per_step[stage_index] = num_search_pts;
+ cfg->radius[stage_index] = radius;
+ ++num_search_steps;
+ if (stage_index < 12)
+ radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1);
+ }
+ cfg->num_search_steps = num_search_steps;
+}
+
+// Search site initialization for BIGDIA / FAST_BIGDIA / FAST_DIAMOND
+// search methods.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+ int level) {
+ (void)level;
+ cfg->stride = stride;
+  // The first scale has the 4 closest points; the rest have 8 points in a
+  // diamond shape at increasing scales.
+ static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+ 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+
+ // BIGDIA search method candidates.
+  // Note that the largest candidate step at each scale is 2^scale.
+ /* clang-format off */
+ static const FULLPEL_MV
+ site_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 } },
+ { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+ { -1, 1 }, { -2, 0 } },
+ { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+ { -2, 2 }, { -4, 0 } },
+ { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+ { -4, 4 }, { -8, 0 } },
+ { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+ { -8, 8 }, { -16, 0 } },
+ { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+ { 0, 32 }, { -16, 16 }, { -32, 0 } },
+ { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+ { 0, 64 }, { -32, 32 }, { -64, 0 } },
+ { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+ { 0, 128 }, { -64, 64 }, { -128, 0 } },
+ { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 },
+ { 128, 128 }, { 0, 256 }, { -128, 128 }, { -256, 0 } },
+ { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 },
+ { 256, 256 }, { 0, 512 }, { -256, 256 }, { -512, 0 } },
+ { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+ { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
+ };
+
+ /* clang-format on */
+ int radius = 1;
+ for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+ cfg->searches_per_step[i] = bigdia_num_candidates[i];
+ cfg->radius[i] = radius;
+ for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
+ search_site *const site = &cfg->site[i][j];
+ site->mv = site_candidates[i][j];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ radius *= 2;
+ }
+ cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Search site initialization for SQUARE search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+ int level) {
+ (void)level;
+ cfg->stride = stride;
+  // All scales have the 8 closest points in a square shape.
+ static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+
+ // Square search method candidates.
+ // Note that the largest candidate step at each scale is 2^scale.
+ /* clang-format off */
+ static const FULLPEL_MV
+ square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+ { -2, 2 }, { -2, 0 } },
+ { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+ { -4, 4 }, { -4, 0 } },
+ { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+ { -8, 8 }, { -8, 0 } },
+ { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+ { 0, 16 }, { -16, 16 }, { -16, 0 } },
+ { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+ { 0, 32 }, { -32, 32 }, { -32, 0 } },
+ { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+ { 0, 64 }, { -64, 64 }, { -64, 0 } },
+ { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 },
+ { 128, 128 }, { 0, 128 }, { -128, 128 }, { -128, 0 } },
+ { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 },
+ { 256, 256 }, { 0, 256 }, { -256, 256 }, { -256, 0 } },
+ { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 },
+ { 512, 512 }, { 0, 512 }, { -512, 512 }, { -512, 0 } },
+ { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+ { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
+ };
+
+ /* clang-format on */
+ int radius = 1;
+ for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+ cfg->searches_per_step[i] = square_num_candidates[i];
+ cfg->radius[i] = radius;
+ for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
+ search_site *const site = &cfg->site[i][j];
+ site->mv = square_candidates[i][j];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ radius *= 2;
+ }
+ cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Search site initialization for HEX / FAST_HEX search methods.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+ int level) {
+ (void)level;
+ cfg->stride = stride;
+  // The first scale has the 8 closest points; the rest have 6 points in a
+  // hex shape at increasing scales.
+ static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6 };
+ // Note that the largest candidate step at each scale is 2^scale.
+ /* clang-format off */
+ static const FULLPEL_MV
+ hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
+ { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
+ { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
+ { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 },
+ { -8, 16 }, { -16, 0 } },
+ { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
+ { -32, 0 } },
+ { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
+ { -64, 0 } },
+ { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 },
+ { -64, 128 }, { -128, 0 } },
+ { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 },
+ { -128, 256 }, { -256, 0 } },
+ { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 },
+ { -256, 512 }, { -512, 0 } },
+ { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+ { -512, 1024 }, { -1024, 0 } },
+ };
+
+ /* clang-format on */
+ int radius = 1;
+ for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+ cfg->searches_per_step[i] = hex_num_candidates[i];
+ cfg->radius[i] = radius;
+ for (int j = 0; j < hex_num_candidates[i]; ++j) {
+ search_site *const site = &cfg->site[i][j];
+ site->mv = hex_candidates[i][j];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ radius *= 2;
+ }
+ cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+const av1_init_search_site_config
+ av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS] = {
+ av1_init_dsmotion_compensation, av1_init_motion_compensation_nstep,
+ av1_init_motion_compensation_nstep, av1_init_dsmotion_compensation,
+ av1_init_motion_compensation_hex, av1_init_motion_compensation_bigdia,
+ av1_init_motion_compensation_square
+ };
+
+// Checks whether the mv is within range of the mv_limits
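+// The 0/1 comparison results are combined with bitwise '&' to keep the check
+// branch-free.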
+static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col,
+ int range) {
+ return ((row - range) >= mv_limits->row_min) &
+ ((row + range) <= mv_limits->row_max) &
+ ((col - range) >= mv_limits->col_min) &
+ ((col + range) <= mv_limits->col_max);
+}
+
+static INLINE int get_mvpred_var_cost(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
+ FULLPEL_MV_STATS *mv_stats) {
+ const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+ const MV sub_this_mv = get_mv_from_fullmv(this_mv);
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ int bestsme;
+
+ bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
+ ref_stride, &mv_stats->sse);
+ mv_stats->distortion = bestsme;
+
+ mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+ bestsme += mv_stats->err_cost;
+
+ return bestsme;
+}
+
+static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct buf_2d *const src,
+ const uint8_t *const ref_address,
+ const int ref_stride) {
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+
+ return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
+}
+
+static INLINE int get_mvpred_compound_var_cost(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
+ FULLPEL_MV_STATS *mv_stats) {
+ const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ const uint8_t *mask = ms_params->ms_buffers.mask;
+ const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
+ const int mask_stride = ms_params->ms_buffers.mask_stride;
+ const int invert_mask = ms_params->ms_buffers.inv_mask;
+ int bestsme;
+
+ if (mask) {
+ bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
+ src_buf, src_stride, second_pred, mask, mask_stride,
+ invert_mask, &mv_stats->sse);
+ } else if (second_pred) {
+ bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
+ src_buf, src_stride, &mv_stats->sse, second_pred);
+ } else {
+ bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
+ ref_stride, &mv_stats->sse);
+ }
+ mv_stats->distortion = bestsme;
+
+ const MV sub_this_mv = get_mv_from_fullmv(this_mv);
+ mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+ bestsme += mv_stats->err_cost;
+
+ return bestsme;
+}
+
+static INLINE int get_mvpred_compound_sad(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct buf_2d *const src, const uint8_t *const ref_address,
+ const int ref_stride) {
+ const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+
+ const uint8_t *mask = ms_params->ms_buffers.mask;
+ const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
+ const int mask_stride = ms_params->ms_buffers.mask_stride;
+ const int invert_mask = ms_params->ms_buffers.inv_mask;
+
+ if (mask) {
+ return vfp->msdf(src_buf, src_stride, ref_address, ref_stride, second_pred,
+ mask, mask_stride, invert_mask);
+ } else if (second_pred) {
+ return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred);
+ } else {
+ return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
+ }
+}
+
+// Calculates and returns a sad+mvcost list around an integer best pel during
+// fullpixel motion search. The resulting list can be used to speed up subpel
+// motion search later.
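+// The layout is: cost_list[0] holds the center (best) MV cost, and
+// cost_list[1..4] hold the left, below, right and above neighbors, in that
+// order.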
+#define USE_SAD_COSTLIST 1
+
+// calc_int_cost_list uses var to populate the costlist, which is more accurate
+// than sad but slightly slower.
+static AOM_FORCE_INLINE void calc_int_cost_list(
+ const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ int *cost_list) {
+ static const FULLPEL_MV neighbors[4] = {
+ { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+ };
+ const int br = best_mv.row;
+ const int bc = best_mv.col;
+
+ FULLPEL_MV_STATS mv_stats;
+ cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv, &mv_stats);
+
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ cost_list[i + 1] =
+ get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
+ }
+ } else {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) {
+ cost_list[i + 1] = INT_MAX;
+ } else {
+ cost_list[i + 1] =
+ get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
+ }
+ }
+ }
+}
+
+// calc_int_sad_list uses sad to populate the costlist, which is less accurate
+// than var but faster.
+static AOM_FORCE_INLINE void calc_int_sad_list(
+ const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ int *cost_list, int costlist_has_sad) {
+ static const FULLPEL_MV neighbors[4] = {
+ { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+ };
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const int ref_stride = ref->stride;
+ const int br = best_mv.row;
+ const int bc = best_mv.col;
+
+ assert(av1_is_fullmv_in_range(&ms_params->mv_limits, best_mv));
+
+ // Refresh the costlist if it does not contain valid sad values.
+ if (!costlist_has_sad) {
+ cost_list[0] = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &best_mv), ref_stride);
+
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV this_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ cost_list[i + 1] = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+ }
+ } else {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV this_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ cost_list[i + 1] = INT_MAX;
+ } else {
+ cost_list[i + 1] = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+ }
+ }
+ }
+ }
+
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ cost_list[0] += mvsad_err_cost_(&best_mv, mv_cost_params);
+
+ for (int idx = 0; idx < 4; idx++) {
+ if (cost_list[idx + 1] != INT_MAX) {
+ const FULLPEL_MV this_mv = { br + neighbors[idx].row,
+ bc + neighbors[idx].col };
+ cost_list[idx + 1] += mvsad_err_cost_(&this_mv, mv_cost_params);
+ }
+ }
+}
+
+// Computes motion vector cost and adds to the sad cost.
+// Then updates the best sad and motion vectors.
+// Inputs:
+// this_sad: the sad to be evaluated.
+// mv: the current motion vector.
+// mv_cost_params: a structure containing information to compute mv cost.
+// best_sad: the current best sad.
+// raw_best_sad (optional): the current best sad without calculating mv cost.
+// best_mv: the current best motion vector.
+// second_best_mv (optional): the second best motion vector up to now.
+// Modifies:
+ // best_sad, raw_best_sad, best_mv, second_best_mv,
+ // if the current sad plus its mv cost is lower than the current best sad.
+// Returns:
+// Whether the input sad (mv) is better than the current best.
+static AOM_INLINE int update_mvs_and_sad(const unsigned int this_sad,
+ const FULLPEL_MV *mv,
+ const MV_COST_PARAMS *mv_cost_params,
+ unsigned int *best_sad,
+ unsigned int *raw_best_sad,
+ FULLPEL_MV *best_mv,
+ FULLPEL_MV *second_best_mv) {
+ if (this_sad >= *best_sad) return 0;
+
+ // Add the motion vector cost.
+ const unsigned int sad = this_sad + mvsad_err_cost_(mv, mv_cost_params);
+ if (sad < *best_sad) {
+ if (raw_best_sad) *raw_best_sad = this_sad;
+ *best_sad = sad;
+ if (second_best_mv) *second_best_mv = *best_mv;
+ *best_mv = *mv;
+ return 1;
+ }
+ return 0;
+}
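+
+// Worked example of the two-stage test above (hypothetical numbers): with
+// *best_sad == 1000, a candidate with raw sad 1100 is rejected by the first
+// comparison without ever paying for mvsad_err_cost_(); a candidate with raw
+// sad 900 and mv cost 150 passes the first test but fails the second
+// (900 + 150 >= 1000), so nothing is updated; a candidate with raw sad 900
+// and mv cost 50 wins and updates all four outputs.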
+
+ // Calculate 4 sads in one call and update the best mv information
+ // in the FAST_DIAMOND search method.
+static AOM_INLINE void calc_sad4_update_bestmv(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+ const FULLPEL_MV center_mv, const uint8_t *center_address,
+ unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+ int *best_site, int cand_start, int *cost_list) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site *site = ms_params->search_sites->site[search_step];
+
+ unsigned char const *block_offset[4];
+ unsigned int sads_buf[4];
+ unsigned int *sads;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+ if (cost_list) {
+ sads = (unsigned int *)(cost_list + 1);
+ } else {
+ sads = sads_buf;
+ }
+ // Loop over number of candidates.
+ for (int j = 0; j < 4; j++)
+ block_offset[j] = site[cand_start + j].offset + center_address;
+
+ // 4-point sad calculation.
+ ms_params->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads);
+
+ for (int j = 0; j < 4; j++) {
+ const FULLPEL_MV this_mv = { center_mv.row + site[cand_start + j].mv.row,
+ center_mv.col + site[cand_start + j].mv.col };
+ const int found_better_mv = update_mvs_and_sad(
+ sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+ /*second_best_mv=*/NULL);
+ if (found_better_mv) *best_site = cand_start + j;
+ }
+}
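+
+// Note on the batching above: sdx4df computes the sads of four reference
+// blocks against the same source block in one (typically SIMD) call, so
+// callers walk the candidate list in groups of four via cand_start; any
+// leftover candidates (num_candidates % 4) go through the scalar path in
+// calc_sad_update_bestmv below.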
+
+static AOM_INLINE void calc_sad3_update_bestmv(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+ FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad,
+ unsigned int *raw_bestsad, int search_step, int *best_site,
+ const int *chkpts_indices, int *cost_list) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site *site = ms_params->search_sites->site[search_step];
+ unsigned char const *block_offset[4] = {
+ center_address + site[chkpts_indices[0]].offset,
+ center_address + site[chkpts_indices[1]].offset,
+ center_address + site[chkpts_indices[2]].offset,
+ center_address,
+ };
+ unsigned int sads[4];
+ ms_params->sdx3df(src->buf, src->stride, block_offset, ref->stride, sads);
+ for (int j = 0; j < 3; j++) {
+ const int index = chkpts_indices[j];
+ const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row,
+ center_mv.col + site[index].mv.col };
+ const int found_better_mv = update_mvs_and_sad(
+ sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+ /*second_best_mv=*/NULL);
+ if (found_better_mv) *best_site = j;
+ }
+ if (cost_list) {
+ for (int j = 0; j < 3; j++) {
+ int index = chkpts_indices[j];
+ cost_list[index + 1] = sads[j];
+ }
+ }
+}
+
+ // Calculate sad and update the best mv information
+ // in the FAST_DIAMOND search method.
+static AOM_INLINE void calc_sad_update_bestmv(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+ const FULLPEL_MV center_mv, const uint8_t *center_address,
+ unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+ int *best_site, const int num_candidates, int cand_start, int *cost_list) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site *site = ms_params->search_sites->site[search_step];
+ // Loop over number of candidates.
+ for (int i = cand_start; i < num_candidates; i++) {
+ const FULLPEL_MV this_mv = { center_mv.row + site[i].mv.row,
+ center_mv.col + site[i].mv.col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue;
+ int thissad = get_mvpred_sad(ms_params, src,
+ center_address + site[i].offset, ref->stride);
+ if (cost_list) {
+ cost_list[i + 1] = thissad;
+ }
+ const int found_better_mv = update_mvs_and_sad(
+ thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+ /*second_best_mv=*/NULL);
+ if (found_better_mv) *best_site = i;
+ }
+}
+
+static AOM_INLINE void calc_sad_update_bestmv_with_indices(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+ const FULLPEL_MV center_mv, const uint8_t *center_address,
+ unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+ int *best_site, const int num_candidates, const int *chkpts_indices,
+ int *cost_list) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site *site = ms_params->search_sites->site[search_step];
+ // Loop over number of candidates.
+ for (int i = 0; i < num_candidates; i++) {
+ int index = chkpts_indices[i];
+ const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row,
+ center_mv.col + site[index].mv.col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ if (cost_list) {
+ cost_list[index + 1] = INT_MAX;
+ }
+ continue;
+ }
+ const int thissad = get_mvpred_sad(
+ ms_params, src, center_address + site[index].offset, ref->stride);
+ if (cost_list) {
+ cost_list[index + 1] = thissad;
+ }
+ const int found_better_mv = update_mvs_and_sad(
+ thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+ /*second_best_mv=*/NULL);
+ if (found_better_mv) *best_site = i;
+ }
+}
+
+// Generic pattern search function that searches over multiple scales.
+// Each scale can have a different number of candidates and shape of
+// candidates, as indicated in the num_candidates and candidates arrays
+// passed into this function.
+static int pattern_search(FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ static const int search_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, s, t;
+
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site_config *search_sites = ms_params->search_sites;
+ const int *num_candidates = search_sites->searches_per_step;
+ const int ref_stride = ref->stride;
+ const int last_is_4 = num_candidates[0] == 4;
+ int br, bc;
+ unsigned int bestsad = UINT_MAX, raw_bestsad = UINT_MAX;
+ int k = -1;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ search_step = AOMMIN(search_step, MAX_MVSEARCH_STEPS - 1);
+ assert(search_step >= 0);
+ int best_init_s = search_steps[search_step];
+ // Clamp start_mv to make sure it is within the MV range.
+ clamp_fullmv(&start_mv, &ms_params->mv_limits);
+ br = start_mv.row;
+ bc = start_mv.col;
+ if (cost_list != NULL) {
+ cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+ INT_MAX;
+ }
+ int costlist_has_sad = 0;
+
+ // Work out the start point for the search
+ raw_bestsad = get_mvpred_sad(ms_params, src,
+ get_buf_from_fullmv(ref, &start_mv), ref_stride);
+ bestsad = raw_bestsad + mvsad_err_cost_(&start_mv, mv_cost_params);
+
+ // Search all possible scales up to the search param around the center
+ // point, and pick the scale of the best point as the starting scale for
+ // further steps around it.
+ const uint8_t *center_address = get_buf_from_fullmv(ref, &start_mv);
+ if (do_init_search) {
+ s = best_init_s;
+ best_init_s = -1;
+ for (t = 0; t <= s; ++t) {
+ int best_site = -1;
+ FULLPEL_MV center_mv = { br, bc };
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) {
+ // Call 4-point sad for multiples of 4 candidates.
+ const int no_of_4_cand_loops = num_candidates[t] >> 2;
+ for (i = 0; i < no_of_4_cand_loops; i++) {
+ calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, t,
+ &best_site, i * 4, /*cost_list=*/NULL);
+ }
+ // Rest of the candidates
+ const int remaining_cand = num_candidates[t] % 4;
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, t,
+ &best_site, remaining_cand,
+ no_of_4_cand_loops * 4, NULL);
+ } else {
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, t,
+ &best_site, num_candidates[t], 0, NULL);
+ }
+ if (best_site == -1) {
+ continue;
+ } else {
+ best_init_s = t;
+ k = best_site;
+ }
+ }
+ if (best_init_s != -1) {
+ br += search_sites->site[best_init_s][k].mv.row;
+ bc += search_sites->site[best_init_s][k].mv.col;
+ center_address += search_sites->site[best_init_s][k].offset;
+ }
+ }
+
+ // If the center point is still the best, just skip this and move to
+ // the refinement step.
+ if (best_init_s != -1) {
+ const int last_s = (last_is_4 && cost_list != NULL);
+ int best_site = -1;
+ s = best_init_s;
+
+ for (; s >= last_s; s--) {
+ // No need to search all points the 1st time if initial search was used
+ if (!do_init_search || s != best_init_s) {
+ FULLPEL_MV center_mv = { br, bc };
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+ // Call 4-point sad for multiples of 4 candidates.
+ const int no_of_4_cand_loops = num_candidates[s] >> 2;
+ for (i = 0; i < no_of_4_cand_loops; i++) {
+ calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv,
+ center_mv, center_address, &bestsad,
+ &raw_bestsad, s, &best_site, i * 4,
+ /*cost_list=*/NULL);
+ }
+ // Rest of the candidates
+ const int remaining_cand = num_candidates[s] % 4;
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, remaining_cand,
+ no_of_4_cand_loops * 4, NULL);
+ } else {
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, num_candidates[s], 0, NULL);
+ }
+
+ if (best_site == -1) {
+ continue;
+ } else {
+ br += search_sites->site[s][best_site].mv.row;
+ bc += search_sites->site[s][best_site].mv.col;
+ center_address += search_sites->site[s][best_site].offset;
+ k = best_site;
+ }
+ }
+
+ do {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+ FULLPEL_MV center_mv = { br, bc };
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+ calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, next_chkpts_indices, NULL);
+ } else {
+ calc_sad_update_bestmv_with_indices(
+ ms_params, mv_cost_params, best_mv, center_mv, center_address,
+ &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF,
+ next_chkpts_indices, NULL);
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += search_sites->site[s][k].mv.row;
+ bc += search_sites->site[s][k].mv.col;
+ center_address += search_sites->site[s][k].offset;
+ }
+ } while (best_site != -1);
+ }
+ // Note: If we enter the if below, then cost_list must be non-NULL.
+ if (s == 0) {
+ cost_list[0] = raw_bestsad;
+ costlist_has_sad = 1;
+ assert(num_candidates[s] == 4);
+ if (!do_init_search || s != best_init_s) {
+ FULLPEL_MV center_mv = { br, bc };
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+ calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, 0, cost_list);
+ } else {
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, /*num_candidates=*/4,
+ /*cand_start=*/0, cost_list);
+ }
+
+ if (best_site != -1) {
+ br += search_sites->site[s][best_site].mv.row;
+ bc += search_sites->site[s][best_site].mv.col;
+ center_address += search_sites->site[s][best_site].offset;
+ k = best_site;
+ }
+ }
+ while (best_site != -1) {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+ cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+ cost_list[((k + 2) % 4) + 1] = cost_list[0];
+ cost_list[0] = raw_bestsad;
+
+ FULLPEL_MV center_mv = { br, bc };
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+ assert(PATTERN_CANDIDATES_REF == 3);
+ calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, next_chkpts_indices, cost_list);
+ } else {
+ calc_sad_update_bestmv_with_indices(
+ ms_params, mv_cost_params, best_mv, center_mv, center_address,
+ &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF,
+ next_chkpts_indices, cost_list);
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += search_sites->site[s][k].mv.row;
+ bc += search_sites->site[s][k].mv.col;
+ center_address += search_sites->site[s][k].offset;
+ }
+ }
+ }
+ }
+ best_mv->row = br;
+ best_mv->col = bc;
+
+ assert(center_address == get_buf_from_fullmv(ref, best_mv) &&
+ "center address is out of sync with best_mv!\n");
+
+ // Returns the one-away integer pel cost/sad around the best as follows:
+ // cost_list[0]: cost/sad at the best integer pel
+ // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel
+ // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+ // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel
+ // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel
+ if (cost_list) {
+ if (USE_SAD_COSTLIST) {
+ calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+ } else {
+ calc_int_cost_list(*best_mv, ms_params, cost_list);
+ }
+ }
+
+ const int var_cost = get_mvpred_var_cost(ms_params, best_mv, best_mv_stats);
+ return var_cost;
+}
+
+// For the following foo_search, the input arguments are:
+// start_mv: where we are starting our motion search
+// ms_params: a collection of motion search parameters
+// search_step: how many steps to skip in our motion search. For example,
+// a value 3 suggests that 3 search steps have already taken place prior to
+// this function call, so we jump directly to step 4 of the search process
+// do_init_search: if on, do an initial search of all possible scales around the
+// start_mv, and then pick the best scale.
+// cost_list: used to hold the cost around the best full mv so we can use it
+// to speed up subpel search later.
+// best_mv: the best mv found in the motion search
+static int hex_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return pattern_search(start_mv, ms_params, search_step, do_init_search,
+ cost_list, best_mv, best_mv_stats);
+}
+
+static int bigdia_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return pattern_search(start_mv, ms_params, search_step, do_init_search,
+ cost_list, best_mv, best_mv_stats);
+}
+
+static int square_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return pattern_search(start_mv, ms_params, search_step, do_init_search,
+ cost_list, best_mv, best_mv_stats);
+}
+
+static int fast_hex_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return hex_search(start_mv, ms_params,
+ AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search,
+ cost_list, best_mv, best_mv_stats);
+}
+
+static int vfast_dia_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return bigdia_search(start_mv, ms_params,
+ AOMMAX(MAX_MVSEARCH_STEPS - 1, search_step),
+ do_init_search, cost_list, best_mv, best_mv_stats);
+}
+
+static int fast_dia_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return bigdia_search(start_mv, ms_params,
+ AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step),
+ do_init_search, cost_list, best_mv, best_mv_stats);
+}
+
+static int fast_bigdia_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return bigdia_search(start_mv, ms_params,
+ AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step),
+ do_init_search, cost_list, best_mv, best_mv_stats);
+}
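+
+// How the fast wrappers above trade quality for speed: pattern_search maps
+// search_step through its search_steps[] table, so forcing a larger value
+// searches fewer scales. hex/square/bigdia start from the caller's
+// search_step; fast_hex and fast_dia force at least MAX_MVSEARCH_STEPS - 2
+// (roughly the two finest scales), fast_bigdia at least
+// MAX_MVSEARCH_STEPS - 3, and vfast_dia at least MAX_MVSEARCH_STEPS - 1
+// (the finest scale only).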
+
+static int diamond_search_sad(FULLPEL_MV start_mv, unsigned int start_mv_sad,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, int *num00,
+ FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
+#define UPDATE_SEARCH_STEP \
+ do { \
+ if (best_site != 0) { \
+ tmp_second_best_mv = *best_mv; \
+ best_mv->row += site[best_site].mv.row; \
+ best_mv->col += site[best_site].mv.col; \
+ best_address += site[best_site].offset; \
+ is_off_center = 1; \
+ } \
+ \
+ if (is_off_center == 0) num_center_steps++; \
+ \
+ if (best_site == 0 && step > 2) { \
+ int next_step_size = cfg->radius[step - 1]; \
+ while (next_step_size == cfg->radius[step] && step > 2) { \
+ num_center_steps++; \
+ --step; \
+ next_step_size = cfg->radius[step - 1]; \
+ } \
+ } \
+ } while (0)
+
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+
+ const search_site_config *cfg = ms_params->search_sites;
+
+ int is_off_center = 0;
+ // Number of times that we have stayed in the middle. This is used to skip
+ // search steps in the future if diamond_search_sad is called again.
+ int num_center_steps = 0;
+
+ // search_step determines the length of the initial step and hence the number
+ // of iterations.
+ const int tot_steps = cfg->num_search_steps - search_step;
+ FULLPEL_MV tmp_second_best_mv;
+ if (second_best_mv) {
+ tmp_second_best_mv = *second_best_mv;
+ }
+
+ *best_mv = start_mv;
+
+ // Check the starting position
+ const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv);
+ unsigned int bestsad = start_mv_sad;
+
+ // TODO(chiyotsai@google.com): Implement 4-point search for msdf & sdaf.
+ if (ms_params->ms_buffers.second_pred) {
+ for (int step = tot_steps - 1; step >= 0; --step) {
+ const search_site *site = cfg->site[step];
+ const int num_searches = cfg->searches_per_step[step];
+ int best_site = 0;
+
+ for (int idx = 1; idx <= num_searches; idx++) {
+ const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row,
+ best_mv->col + site[idx].mv.col };
+
+ if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ const uint8_t *const check_here = site[idx].offset + best_address;
+ unsigned int thissad =
+ get_mvpred_compound_sad(ms_params, src, check_here, ref_stride);
+
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost_(&this_mv, mv_cost_params);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = idx;
+ }
+ }
+ }
+ }
+ UPDATE_SEARCH_STEP;
+ }
+ } else {
+ for (int step = tot_steps - 1; step >= 0; --step) {
+ const search_site *site = cfg->site[step];
+ const int num_searches = cfg->searches_per_step[step];
+ int best_site = 0;
+
+ int all_in = 1;
+ // Trap illegal vectors
+ all_in &= best_mv->row + site[1].mv.row >= ms_params->mv_limits.row_min;
+ all_in &= best_mv->row + site[2].mv.row <= ms_params->mv_limits.row_max;
+ all_in &= best_mv->col + site[3].mv.col >= ms_params->mv_limits.col_min;
+ all_in &= best_mv->col + site[4].mv.col <= ms_params->mv_limits.col_max;
+
+ if (all_in) {
+ for (int idx = 1; idx <= num_searches; idx += 4) {
+ unsigned char const *block_offset[4];
+ unsigned int sads[4];
+
+ for (int j = 0; j < 4; j++)
+ block_offset[j] = site[idx + j].offset + best_address;
+
+ ms_params->sdx4df(src_buf, src_stride, block_offset, ref_stride,
+ sads);
+ for (int j = 0; j < 4; j++) {
+ if (sads[j] < bestsad) {
+ const FULLPEL_MV this_mv = { best_mv->row + site[idx + j].mv.row,
+ best_mv->col +
+ site[idx + j].mv.col };
+ unsigned int thissad =
+ sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = idx + j;
+ }
+ }
+ }
+ }
+ } else {
+ for (int idx = 1; idx <= num_searches; idx++) {
+ const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row,
+ best_mv->col + site[idx].mv.col };
+
+ if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ const uint8_t *const check_here = site[idx].offset + best_address;
+ unsigned int thissad =
+ get_mvpred_sad(ms_params, src, check_here, ref_stride);
+
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost_(&this_mv, mv_cost_params);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = idx;
+ }
+ }
+ }
+ }
+ }
+ UPDATE_SEARCH_STEP;
+ }
+ }
+
+ *num00 = num_center_steps;
+ if (second_best_mv) {
+ *second_best_mv = tmp_second_best_mv;
+ }
+
+ return bestsad;
+
+#undef UPDATE_SEARCH_STEP
+}
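+
+// Example of the num00 bookkeeping (see UPDATE_SEARCH_STEP above): if the
+// center stays the best site for the first three steps, num_center_steps
+// reaches 3 and *num00 == 3 on return. full_pixel_diamond below then
+// advances its loop counter by num00, skipping repeat searches that would
+// start over from the same point with the same effective radius.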
+
+static INLINE unsigned int get_start_mvpred_sad_cost(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv);
+
+ unsigned int start_mv_sad =
+ mvsad_err_cost_(&start_mv, &ms_params->mv_cost_params);
+
+ if (ms_params->ms_buffers.second_pred)
+ start_mv_sad +=
+ get_mvpred_compound_sad(ms_params, src, best_address, ref->stride);
+ else
+ start_mv_sad += get_mvpred_sad(ms_params, src, best_address, ref->stride);
+
+ return start_mv_sad;
+}
+
+static int full_pixel_diamond(FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, int *cost_list,
+ FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv) {
+ const search_site_config *cfg = ms_params->search_sites;
+ int thissme, n, num00 = 0;
+
+ // Clamp start mv and calculate the cost
+ clamp_fullmv(&start_mv, &ms_params->mv_limits);
+ unsigned int start_mv_sad = get_start_mvpred_sad_cost(ms_params, start_mv);
+
+ diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param, &n, best_mv,
+ second_best_mv);
+
+ int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv, best_mv_stats);
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ const int further_steps = cfg->num_search_steps - 1 - step_param;
+ while (n < further_steps) {
+ ++n;
+
+ // TODO(chiyotsai@google.com): There is another bug here where the second
+ // best mv gets incorrectly overwritten. Fix it later.
+ FULLPEL_MV tmp_best_mv;
+ FULLPEL_MV_STATS tmp_best_mv_stats;
+ diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param + n,
+ &num00, &tmp_best_mv, second_best_mv);
+
+ thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv,
+ &tmp_best_mv_stats);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *best_mv = tmp_best_mv;
+ *best_mv_stats = tmp_best_mv_stats;
+ }
+
+ if (num00) {
+ // Advance the loop by num00 steps
+ n += num00;
+ num00 = 0;
+ }
+ }
+
+ // Return cost list.
+ if (cost_list) {
+ if (USE_SAD_COSTLIST) {
+ const int costlist_has_sad = 0;
+ calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+ } else {
+ calc_int_cost_list(*best_mv, ms_params, cost_list);
+ }
+ }
+ return bestsme;
+}
+
+// Exhaustive motion search around a given centre position with a given
+// step size.
+static int exhaustive_mesh_search(FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int range, const int step,
+ FULLPEL_MV *best_mv,
+ FULLPEL_MV *second_best_mv) {
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const int ref_stride = ref->stride;
+ unsigned int best_sad = INT_MAX;
+ int r, c, i;
+ int start_col, end_col, start_row, end_row;
+ const int col_step = (step > 1) ? step : 4;
+
+ assert(step >= 1);
+
+ clamp_fullmv(&start_mv, &ms_params->mv_limits);
+ *best_mv = start_mv;
+ best_sad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv),
+ ref_stride);
+ best_sad += mvsad_err_cost_(&start_mv, mv_cost_params);
+ start_row = AOMMAX(-range, ms_params->mv_limits.row_min - start_mv.row);
+ start_col = AOMMAX(-range, ms_params->mv_limits.col_min - start_mv.col);
+ end_row = AOMMIN(range, ms_params->mv_limits.row_max - start_mv.row);
+ end_col = AOMMIN(range, ms_params->mv_limits.col_max - start_mv.col);
+
+ for (r = start_row; r <= end_row; r += step) {
+ for (c = start_col; c <= end_col; c += col_step) {
+ // Step > 1 means we are not checking every location in this pass.
+ if (step > 1) {
+ const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c };
+ unsigned int sad = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+ update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+ /*raw_best_sad=*/NULL, best_mv, second_best_mv);
+ } else {
+ // 4 sads in a single call if we are checking every location
+ if (c + 3 <= end_col) {
+ unsigned int sads[4];
+ const uint8_t *addrs[4];
+ for (i = 0; i < 4; ++i) {
+ const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+ addrs[i] = get_buf_from_fullmv(ref, &mv);
+ }
+
+ ms_params->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
+
+ for (i = 0; i < 4; ++i) {
+ if (sads[i] < best_sad) {
+ const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+ update_mvs_and_sad(sads[i], &mv, mv_cost_params, &best_sad,
+ /*raw_best_sad=*/NULL, best_mv,
+ second_best_mv);
+ }
+ }
+ } else {
+ for (i = 0; i < end_col - c; ++i) {
+ const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+ unsigned int sad = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+ update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+ /*raw_best_sad=*/NULL, best_mv, second_best_mv);
+ }
+ }
+ }
+ }
+ }
+
+ return best_sad;
+}
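+
+// Illustrative cost of the mesh above (hypothetical numbers): with
+// range = 16 and step = 4 the double loop visits a 9x9 grid of offsets
+// {-16, -12, ..., +16} in each direction, i.e. 81 sad evaluations; with
+// step = 1 every position in the window is checked, but the columns are
+// batched four at a time through sdx4df, cutting the number of sad calls
+// roughly by a factor of four.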
+
+// Runs a limited-range exhaustive mesh search using a pattern set chosen
+// according to the encoder speed profile.
+static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct MESH_PATTERN *const mesh_patterns,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *mv_stats,
+ FULLPEL_MV *second_best_mv) {
+ const int kMinRange = 7;
+ const int kMaxRange = 256;
+ const int kMinInterval = 1;
+
+ int bestsme;
+ int i;
+ int interval = mesh_patterns[0].interval;
+ int range = mesh_patterns[0].range;
+ int baseline_interval_divisor;
+
+ // TODO(chiyotsai@google.com): Currently exhaustive search calls single ref
+ // version of sad and variance function. We still need to check the
+ // performance when compound ref exhaustive search is enabled.
+ assert(!ms_params->ms_buffers.second_pred &&
+ "Mesh search does not support compound mode!");
+
+ *best_mv = start_mv;
+
+ // Trap illegal values for interval and range for this function.
+ if ((range < kMinRange) || (range > kMaxRange) || (interval < kMinInterval) ||
+ (interval > range))
+ return INT_MAX;
+
+ baseline_interval_divisor = range / interval;
+
+ // Check size of proposed first range against magnitude of the centre
+ // value used as a starting point.
+ range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4);
+ range = AOMMIN(range, kMaxRange);
+ interval = AOMMAX(interval, range / baseline_interval_divisor);
+ // Use a small search step/interval for certain kinds of clips, e.g.
+ // screen content clips with a lot of text. A large interval could lead to
+ // a false matching position, and then the search cannot recover the best
+ // global candidate in the following iterations due to the reduced search
+ // range. The solution here is to use a small search interval in the
+ // beginning and thus reduce the chance of missing the best candidate.
+ if (ms_params->fine_search_interval) {
+ interval = AOMMIN(interval, 4);
+ }
+
+ // initial search
+ bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval,
+ best_mv, second_best_mv);
+
+ if ((interval > kMinInterval) && (range > kMinRange)) {
+ // Progressive searches with range and step size decreasing each time
+ // till we reach a step size of 1. Then break out.
+ for (i = 1; i < MAX_MESH_STEP; ++i) {
+ // First pass with coarser step and longer range
+ bestsme = exhaustive_mesh_search(
+ *best_mv, ms_params, mesh_patterns[i].range,
+ mesh_patterns[i].interval, best_mv, second_best_mv);
+
+ if (mesh_patterns[i].interval == 1) break;
+ }
+ }
+
+ if (bestsme < INT_MAX) {
+ bestsme = get_mvpred_var_cost(ms_params, best_mv, mv_stats);
+ }
+
+ // Return cost list.
+ if (cost_list) {
+ if (USE_SAD_COSTLIST) {
+ const int costlist_has_sad = 0;
+ calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+ } else {
+ calc_int_cost_list(*best_mv, ms_params, cost_list);
+ }
+ }
+ return bestsme;
+}
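+
+// Illustrative progression (hypothetical mesh_patterns): with patterns
+// { {64, 4}, {16, 1} } the first pass scans +/-64 around the start at
+// interval 4, the second pass scans +/-16 around the refined best at
+// interval 1, and the loop stops as soon as a pattern with interval == 1
+// has run.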
+
+// This function is called when we do joint motion search in comp_inter_inter
+// mode, or when searching for one component of an ext-inter compound mode.
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const FULLPEL_MV start_mv, FULLPEL_MV *best_mv) {
+ static const search_neighbors neighbors[8] = {
+ { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
+ { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 },
+ { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 },
+ { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
+ { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
+ };
+
+ uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P *
+ SEARCH_GRID_STRIDE_8P] = { 0 };
+ int grid_center = SEARCH_GRID_CENTER_8P;
+ int grid_coord = grid_center;
+
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const FullMvLimits *mv_limits = &ms_params->mv_limits;
+ const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+ const struct buf_2d *src = ms_buffers->src;
+ const struct buf_2d *ref = ms_buffers->ref;
+ const int ref_stride = ref->stride;
+
+ *best_mv = start_mv;
+ clamp_fullmv(best_mv, mv_limits);
+
+ unsigned int best_sad = get_mvpred_compound_sad(
+ ms_params, src, get_buf_from_fullmv(ref, best_mv), ref_stride);
+ best_sad += mvsad_err_cost_(best_mv, mv_cost_params);
+
+ do_refine_search_grid[grid_coord] = 1;
+
+ for (int i = 0; i < SEARCH_RANGE_8P; ++i) {
+ int best_site = -1;
+
+ for (int j = 0; j < 8; ++j) {
+ grid_coord = grid_center + neighbors[j].coord_offset;
+ if (do_refine_search_grid[grid_coord] == 1) {
+ continue;
+ }
+ const FULLPEL_MV mv = { best_mv->row + neighbors[j].coord.row,
+ best_mv->col + neighbors[j].coord.col };
+
+ do_refine_search_grid[grid_coord] = 1;
+ if (av1_is_fullmv_in_range(mv_limits, mv)) {
+ unsigned int sad;
+ sad = get_mvpred_compound_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost_(&mv, mv_cost_params);
+
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ best_mv->row += neighbors[best_site].coord.row;
+ best_mv->col += neighbors[best_site].coord.col;
+ grid_center += neighbors[best_site].coord_offset;
+ }
+ }
+ return best_sad;
+}
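+
+// Note on do_refine_search_grid above: the byte grid records every visited
+// offset relative to the start, so across the SEARCH_RANGE_8P iterations
+// each point is evaluated at most once. For example, after moving one step
+// to the right, five of the eight neighbors of the new center were already
+// checked in the previous iteration and are skipped.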
+
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, int *cost_list,
+ FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv) {
+ const BLOCK_SIZE bsize = ms_params->bsize;
+ const SEARCH_METHODS search_method = ms_params->search_method;
+
+ const int is_intra_mode = ms_params->is_intra_mode;
+ int run_mesh_search = ms_params->run_mesh_search;
+
+ int var = 0;
+ MARK_MV_INVALID(best_mv);
+ if (second_best_mv) {
+ MARK_MV_INVALID(second_best_mv);
+ }
+
+ if (cost_list) {
+ cost_list[0] = INT_MAX;
+ cost_list[1] = INT_MAX;
+ cost_list[2] = INT_MAX;
+ cost_list[3] = INT_MAX;
+ cost_list[4] = INT_MAX;
+ }
+
+ assert(ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride);
+ assert(ms_params->ms_buffers.ref->width == ms_params->ms_buffers.src->width);
+
+ switch (search_method) {
+ case FAST_BIGDIA:
+ var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case VFAST_DIAMOND:
+ var = vfast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case FAST_DIAMOND:
+ var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case FAST_HEX:
+ var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case HEX:
+ var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv,
+ best_mv_stats);
+ break;
+ case SQUARE:
+ var = square_search(start_mv, ms_params, step_param, 1, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case BIGDIA:
+ var = bigdia_search(start_mv, ms_params, step_param, 1, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case NSTEP:
+ case NSTEP_8PT:
+ case DIAMOND:
+ case CLAMPED_DIAMOND:
+ var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
+ best_mv, best_mv_stats, second_best_mv);
+ break;
+ default: assert(0 && "Invalid search method.");
+ }
+
+ // Should we allow a follow-on exhaustive search?
+ if (!run_mesh_search &&
+ ((search_method == NSTEP) || (search_method == NSTEP_8PT)) &&
+ !ms_params->ms_buffers.second_pred) {
+ int exhaustive_thr = ms_params->force_mesh_thresh;
+ exhaustive_thr >>=
+ 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ // Threshold variance for an exhaustive full search.
+ if (var > exhaustive_thr) run_mesh_search = 1;
+ }
+
+ // TODO(yunqing): the following is used to reduce mesh search in temporal
+ // filtering. Can extend it to intrabc.
+ if (!is_intra_mode && ms_params->prune_mesh_search) {
+ const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row),
+ abs(start_mv.col - best_mv->col));
+ if (full_pel_mv_diff <= ms_params->mesh_search_mv_diff_threshold) {
+ run_mesh_search = 0;
+ }
+ }
+
+ if (ms_params->sdf != ms_params->vfp->sdf) {
+ // If we are skipping rows when we perform the motion search, we need to
+ // check the quality of skipping. If it's bad, then we run mesh search with
+ // skip row features off.
+ // TODO(chiyotsai@google.com): Handle the case where we have a vertical
+ // offset of 1 before we hit this statement to avoid having to redo
+ // motion search.
+ const struct buf_2d *src = ms_params->ms_buffers.src;
+ const struct buf_2d *ref = ms_params->ms_buffers.ref;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ const uint8_t *src_address = src->buf;
+ const uint8_t *best_address = get_buf_from_fullmv(ref, best_mv);
+ const int sad =
+ ms_params->vfp->sdf(src_address, src_stride, best_address, ref_stride);
+ const int skip_sad =
+ ms_params->vfp->sdsf(src_address, src_stride, best_address, ref_stride);
+ // We will keep the result of skipping rows if it's good enough. Here, good
+ // enough means the error is less than 1 per pixel.
+ const int kSADThresh =
+ 1 << (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= AOMMAX(sad, 1) * 9) {
+ // There is a large discrepancy between skipping and not skipping, so we
+ // need to redo the motion search.
+ FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params;
+ new_ms_params.sdf = new_ms_params.vfp->sdf;
+ new_ms_params.sdx4df = new_ms_params.vfp->sdx4df;
+ new_ms_params.sdx3df = new_ms_params.vfp->sdx3df;
+
+ return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
+ cost_list, best_mv, best_mv_stats,
+ second_best_mv);
+ }
+ }
+
+ if (run_mesh_search) {
+ int var_ex;
+ FULLPEL_MV tmp_mv_ex;
+ FULLPEL_MV_STATS tmp_mv_stats;
+ // Pick the mesh pattern for exhaustive search based on the toolset
+ // (intraBC or non-intraBC).
+ // TODO(chiyotsai@google.com): the second best mv is not set correctly by
+ // full_pixel_exhaustive, which can incorrectly override it without
+ // actually comparing the rdcost.
+ const MESH_PATTERN *const mesh_patterns =
+ ms_params->mesh_patterns[is_intra_mode];
+ var_ex =
+ full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, cost_list,
+ &tmp_mv_ex, &tmp_mv_stats, second_best_mv);
+ if (var_ex < var) {
+ var = var_ex;
+ *best_mv_stats = tmp_mv_stats;
+ *best_mv = tmp_mv_ex;
+ }
+ }
+
+ return var;
+}
+
+int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ IntraBCHashInfo *intrabc_hash_info,
+ FULLPEL_MV *best_mv) {
+ if (!av1_use_hash_me(cpi)) return INT_MAX;
+
+ const BLOCK_SIZE bsize = ms_params->bsize;
+ const int block_width = block_size_wide[bsize];
+ const int block_height = block_size_high[bsize];
+
+ if (block_width != block_height) return INT_MAX;
+
+ const FullMvLimits *mv_limits = &ms_params->mv_limits;
+ const MSBuffers *ms_buffer = &ms_params->ms_buffers;
+
+ const uint8_t *src = ms_buffer->src->buf;
+ const int src_stride = ms_buffer->src->stride;
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int x_pos = mi_col * MI_SIZE;
+ const int y_pos = mi_row * MI_SIZE;
+
+ uint32_t hash_value1, hash_value2;
+ int best_hash_cost = INT_MAX;
+
+ // The hash table holding block hashes for intraBC matching.
+ hash_table *ref_frame_hash = &intrabc_hash_info->intrabc_hash_table;
+
+ av1_get_block_hash_value(intrabc_hash_info, src, src_stride, block_width,
+ &hash_value1, &hash_value2, is_cur_buf_hbd(xd));
+
+ const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
+ if (count <= 1) {
+ return INT_MAX;
+ }
+
+ Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1);
+ for (int i = 0; i < count; i++, aom_iterator_increment(&iterator)) {
+ block_hash ref_block_hash = *(block_hash *)(aom_iterator_get(&iterator));
+ if (hash_value2 == ref_block_hash.hash_value2) {
+ // Make sure the prediction is from valid area.
+ const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos),
+ GET_MV_SUBPEL(ref_block_hash.x - x_pos) };
+ if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize,
+ cpi->common.seq_params->mib_size_log2))
+ continue;
+
+ FULLPEL_MV hash_mv;
+ hash_mv.col = ref_block_hash.x - x_pos;
+ hash_mv.row = ref_block_hash.y - y_pos;
+ if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue;
+ FULLPEL_MV_STATS mv_stats;
+ const int refCost = get_mvpred_var_cost(ms_params, &hash_mv, &mv_stats);
+ if (refCost < best_hash_cost) {
+ best_hash_cost = refCost;
+ *best_mv = hash_mv;
+ }
+ }
+ }
+
+ return best_hash_cost;
+}
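+
+// Note on the two-level hashing above: hash_value1 selects the bucket in
+// the frame-level hash table (all blocks whose first hash collides), while
+// hash_value2 acts as a verifier -- only entries whose second hash also
+// matches are treated as candidate duplicates and scored with
+// get_mvpred_var_cost(), so false bucket collisions are filtered cheaply.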
+
+static int vector_match(int16_t *ref, int16_t *src, int bwl, int search_size,
+ int full_search, int *sad) {
+ int best_sad = INT_MAX;
+ int this_sad;
+ int d;
+ int center, offset = 0;
+ int bw = search_size << 1;
+
+ if (full_search) {
+ for (d = 0; d <= bw; d++) {
+ this_sad = aom_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+ *sad = best_sad;
+ return (center - (bw >> 1));
+ }
+
+ for (d = 0; d <= bw; d += 16) {
+ this_sad = aom_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+
+ for (d = -8; d <= 8; d += 16) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -4; d <= 4; d += 8) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -2; d <= 2; d += 4) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -1; d <= 1; d += 2) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ *sad = best_sad;
+ return (center - (bw >> 1));
+}
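+
+// Illustrative trace of vector_match (hypothetical values): with
+// search_size = 32 (so bw = 64) and full_search == 0, the coarse scan tests
+// offsets 0, 16, 32, 48 and 64; say 32 wins. The refinement stages then
+// probe +/-8, +/-4, +/-2 and +/-1 around the running best, locating the
+// minimum with at most 5 + 8 = 13 aom_vector_var() calls instead of the 65
+// a full scan would need. The return value is re-centered on the window:
+// center - (bw >> 1).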
+
+// A special fast version of motion search used in rt mode.
+// The search window along columns and rows is given by:
+// +/- me_search_size_col/row.
+unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, const MV *ref_mv,
+ unsigned int *y_sad_zero,
+ int me_search_size_col,
+ int me_search_size_row) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mi = xd->mi[0];
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ int idx;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ const int full_search = is_screen;
+ const bool screen_scroll_superblock =
+ is_screen && bsize == cm->seq_params->sb_size;
+ // Keep border a multiple of 16.
+ const int border = (cpi->oxcf.border_in_pixels >> 4) << 4;
+ int search_size_width = me_search_size_col;
+ int search_size_height = me_search_size_row;
+ // Adjust based on boundary.
+ if (((mi_col << 2) - search_size_width < -border) ||
+ ((mi_col << 2) + search_size_width > cm->width + border))
+ search_size_width = border;
+ if (((mi_row << 2) - search_size_height < -border) ||
+ ((mi_row << 2) + search_size_height > cm->height + border))
+ search_size_height = border;
+ const int src_stride = x->plane[0].src.stride;
+ const int ref_stride = xd->plane[0].pre[0].stride;
+ uint8_t const *ref_buf, *src_buf;
+ int_mv *best_int_mv = &xd->mi[0]->mv[0];
+ unsigned int best_sad, tmp_sad, this_sad[4];
+ int best_sad_col, best_sad_row;
+ const int row_norm_factor = mi_size_high_log2[bsize] + 1;
+ const int col_norm_factor = 3 + (bw >> 5);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+ static const MV search_pos[4] = {
+ { -1, 0 },
+ { 0, -1 },
+ { 0, 1 },
+ { 1, 0 },
+ };
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+ av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+ MAX_MB_PLANE);
+ }
+
+ if (xd->bd != 8) {
+ best_int_mv->as_fullmv = kZeroFullMv;
+ best_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ return best_sad;
+ }
+ const int width_ref_buf = (search_size_width << 1) + bw;
+ const int height_ref_buf = (search_size_height << 1) + bh;
+ int16_t *hbuf = (int16_t *)aom_malloc(width_ref_buf * sizeof(*hbuf));
+ int16_t *vbuf = (int16_t *)aom_malloc(height_ref_buf * sizeof(*vbuf));
+ int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf));
+ int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf));
+ if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) {
+ aom_free(hbuf);
+ aom_free(vbuf);
+ aom_free(src_hbuf);
+ aom_free(src_vbuf);
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf");
+ }
+
+ // Set up prediction 1-D reference set for rows.
+ ref_buf = xd->plane[0].pre[0].buf - search_size_width;
+ aom_int_pro_row(hbuf, ref_buf, ref_stride, width_ref_buf, bh,
+ row_norm_factor);
+
+ // Set up prediction 1-D reference set for cols
+ ref_buf = xd->plane[0].pre[0].buf - search_size_height * ref_stride;
+ aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, height_ref_buf,
+ col_norm_factor);
+
+ // Set up src 1-D reference set
+ src_buf = x->plane[0].src.buf;
+ aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor);
+ aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor);
+
+ // Find the best match per 1-D search
+ best_int_mv->as_fullmv.col =
+ vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width,
+ full_search, &best_sad_col);
+ best_int_mv->as_fullmv.row =
+ vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height,
+ full_search, &best_sad_row);
+
+ // For screen: select between horiz or vert motion.
+ if (is_screen) {
+ if (best_sad_col < best_sad_row)
+ best_int_mv->as_fullmv.row = 0;
+ else
+ best_int_mv->as_fullmv.col = 0;
+ }
+
+ FULLPEL_MV this_mv = best_int_mv->as_fullmv;
+ src_buf = x->plane[0].src.buf;
+ ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
+ best_sad =
+ cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+ // Evaluate zero MV if found MV is non-zero.
+ if (best_int_mv->as_int != 0) {
+ tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+ *y_sad_zero = tmp_sad;
+ if (tmp_sad < best_sad) {
+ best_int_mv->as_fullmv = kZeroFullMv;
+ this_mv = best_int_mv->as_fullmv;
+ ref_buf = xd->plane[0].pre[0].buf;
+ best_sad = tmp_sad;
+ }
+ } else {
+ *y_sad_zero = best_sad;
+ }
+
+ if (!screen_scroll_superblock) {
+ const uint8_t *const pos[4] = {
+ ref_buf - ref_stride,
+ ref_buf - 1,
+ ref_buf + 1,
+ ref_buf + ref_stride,
+ };
+
+ cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride,
+ this_sad);
+
+ for (idx = 0; idx < 4; ++idx) {
+ if (this_sad[idx] < best_sad) {
+ best_sad = this_sad[idx];
+ best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
+ best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
+ }
+ }
+
+ if (this_sad[0] < this_sad[3])
+ this_mv.row -= 1;
+ else
+ this_mv.row += 1;
+
+ if (this_sad[1] < this_sad[2])
+ this_mv.col -= 1;
+ else
+ this_mv.col += 1;
+
+ ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
+
+ tmp_sad =
+ cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+ if (best_sad > tmp_sad) {
+ best_int_mv->as_fullmv = this_mv;
+ best_sad = tmp_sad;
+ }
+ }
+
+ FullMvLimits mv_limits = x->mv_limits;
+ av1_set_mv_search_range(&mv_limits, ref_mv);
+ clamp_fullmv(&best_int_mv->as_fullmv, &mv_limits);
+
+ convert_fullmv_to_mv(best_int_mv);
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+
+ aom_free(hbuf);
+ aom_free(vbuf);
+ aom_free(src_hbuf);
+ aom_free(src_vbuf);
+ return best_sad;
+}
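+
+// The projection trick used above in one line: aom_int_pro_row/col collapse
+// the 2-D pixel data into normalized 1-D sums (one value per column,
+// respectively per row), so the search becomes two independent 1-D matches
+// over horizontal and vertical shifts -- O(W + H) work per candidate instead
+// of O(W * H) -- followed by a handful of exact 2-D sad checks around the
+// combined winner.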
+
+// =============================================================================
+// Fullpixel Motion Search: OBMC
+// =============================================================================
+static INLINE int get_obmc_mvpred_var(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
+ const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+ const int32_t *wsrc = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const struct buf_2d *ref_buf = ms_buffers->ref;
+
+ const MV mv = get_mv_from_fullmv(this_mv);
+ unsigned int unused;
+
+ return vfp->ovf(get_buf_from_fullmv(ref_buf, this_mv), ref_buf->stride, wsrc,
+ mask, &unused) +
+ mv_err_cost_(&mv, mv_cost_params);
+}
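+
+// Note on the OBMC buffers above (roughly speaking; see the callers that
+// fill wsrc and obmc_mask): wsrc holds the overlap-weighted source and mask
+// the per-pixel weights, both as 32-bit arrays, and ovf/osdf score a
+// reference block against them directly, so no explicit second predictor is
+// blended here.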
+
+static int obmc_refining_search_sad(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV *best_mv) {
+ const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+ const int32_t *wsrc = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const struct buf_2d *ref_buf = ms_buffers->ref;
+ const FULLPEL_MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+ const int kSearchRange = 8;
+
+ unsigned int best_sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, best_mv),
+ ref_buf->stride, wsrc, mask) +
+ mvsad_err_cost_(best_mv, mv_cost_params);
+
+ for (int i = 0; i < kSearchRange; i++) {
+ int best_site = -1;
+
+ for (int j = 0; j < 4; j++) {
+ const FULLPEL_MV mv = { best_mv->row + neighbors[j].row,
+ best_mv->col + neighbors[j].col };
+ if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) {
+ unsigned int sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, &mv),
+ ref_buf->stride, wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost_(&mv, mv_cost_params);
+
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ best_mv->row += neighbors[best_site].row;
+ best_mv->col += neighbors[best_site].col;
+ }
+ }
+ return best_sad;
+}
+
+static int obmc_diamond_search_sad(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv,
+ FULLPEL_MV *best_mv, int search_step, int *num00) {
+ const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp;
+ const search_site_config *cfg = ms_params->search_sites;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+ const int32_t *wsrc = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const struct buf_2d *const ref_buf = ms_buffers->ref;
+
+ // search_step determines the length of the initial step and hence the number
+ // of iterations.
+ const int tot_steps = cfg->num_search_steps - search_step;
+ const uint8_t *best_address, *init_ref;
+ int best_sad = INT_MAX;
+ int best_site = 0;
+
+ clamp_fullmv(&start_mv, &ms_params->mv_limits);
+ best_address = init_ref = get_buf_from_fullmv(ref_buf, &start_mv);
+ *num00 = 0;
+ *best_mv = start_mv;
+
+ // Check the starting position
+ best_sad = fn_ptr->osdf(best_address, ref_buf->stride, wsrc, mask) +
+ mvsad_err_cost_(best_mv, mv_cost_params);
+
+ for (int step = tot_steps - 1; step >= 0; --step) {
+ const search_site *const site = cfg->site[step];
+ best_site = 0;
+ for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) {
+ const FULLPEL_MV mv = { best_mv->row + site[idx].mv.row,
+ best_mv->col + site[idx].mv.col };
+ if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) {
+ int sad = fn_ptr->osdf(best_address + site[idx].offset, ref_buf->stride,
+ wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost_(&mv, mv_cost_params);
+
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = idx;
+ }
+ }
+ }
+ }
+
+ if (best_site != 0) {
+ best_mv->row += site[best_site].mv.row;
+ best_mv->col += site[best_site].mv.col;
+ best_address += site[best_site].offset;
+ } else if (best_address == init_ref) {
+ (*num00)++;
+ }
+ }
+ return best_sad;
+}
+
+static int obmc_full_pixel_diamond(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv,
+ int step_param, FULLPEL_MV *best_mv) {
+ const search_site_config *cfg = ms_params->search_sites;
+ FULLPEL_MV tmp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme =
+ obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, step_param, &n);
+ if (bestsme < INT_MAX) bestsme = get_obmc_mvpred_var(ms_params, &tmp_mv);
+ *best_mv = tmp_mv;
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ const int further_steps = cfg->num_search_steps - 1 - step_param;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ num00--;
+ } else {
+ thissme = obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv,
+ step_param + n, &num00);
+ if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *best_mv = tmp_mv;
+ }
+ }
+ }
+
+ return bestsme;
+}
+
+int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, FULLPEL_MV *best_mv) {
+ if (!ms_params->fast_obmc_search) {
+ const int bestsme =
+ obmc_full_pixel_diamond(ms_params, start_mv, step_param, best_mv);
+ return bestsme;
+ } else {
+ *best_mv = start_mv;
+ clamp_fullmv(best_mv, &ms_params->mv_limits);
+ int thissme = obmc_refining_search_sad(ms_params, best_mv);
+ if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, best_mv);
+ return thissme;
+ }
+}
+
+// =============================================================================
+// Subpixel Motion Search: Translational
+// =============================================================================
+#define INIT_SUBPEL_STEP_SIZE (4)
+/*
+ * To avoid the penalty of a cache-line-crossing read, preload the reference
+ * area into a small buffer that is aligned so reads from it never cross a
+ * cache line. This reduces the CPU cycles spent on reading ref data in the
+ * sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, copy a
+ * 22-row x 32-col area, which is enough for a 16x16 macroblock. Later, for
+ * SPLITMV, we could reduce the area.
+ */
+
+// Returns the subpel offset used by various subpel variance functions [m]sv[a]f
+static INLINE int get_subpel_part(int x) { return x & 7; }
+
+// Gets the address of the ref buffer at subpel location (r, c), rounded
+// toward -infinity to the nearest fullpel position.
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+ const MV mv) {
+ const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3);
+ return &buf->buf[offset];
+}
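+// Example (illustrative): mv components are in 1/8-pel units, so mv.col = 21
+// splits into fullpel 21 >> 3 = 2 and subpel 21 & 7 = 5, i.e. 2 + 5/8 pel.
+// For negative components the arithmetic shift floors, e.g. -3 >> 3 = -1,
+// which is the rounding toward -infinity mentioned above.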
+
+// Estimates the variance of prediction residue using a bilinear filter for
+// fast search.
+static INLINE int estimated_pref_error(
+ const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ unsigned int *sse) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const uint8_t *src = ms_buffers->src->buf;
+ const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+ const int src_stride = ms_buffers->src->stride;
+ const int ref_stride = ms_buffers->ref->stride;
+ const uint8_t *second_pred = ms_buffers->second_pred;
+ const uint8_t *mask = ms_buffers->mask;
+ const int mask_stride = ms_buffers->mask_stride;
+ const int invert_mask = ms_buffers->inv_mask;
+
+ const int subpel_x_q3 = get_subpel_part(this_mv->col);
+ const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+ if (second_pred == NULL) {
+ return vfp->svf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ sse);
+ } else if (mask) {
+ return vfp->msvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ second_pred, mask, mask_stride, invert_mask, sse);
+ } else {
+ return vfp->svaf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ sse, second_pred);
+ }
+}
+
+// Calculates the variance of prediction residue.
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
+ const MV *this_mv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ unsigned int *sse) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+ const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const uint8_t *src = ms_buffers->src->buf;
+ const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+ const int src_stride = ms_buffers->src->stride;
+ const int ref_stride = ms_buffers->ref->stride;
+ const uint8_t *second_pred = ms_buffers->second_pred;
+ const uint8_t *mask = ms_buffers->mask;
+ const int mask_stride = ms_buffers->mask_stride;
+ const int invert_mask = ms_buffers->inv_mask;
+ const int w = var_params->w;
+ const int h = var_params->h;
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int subpel_x_q3 = get_subpel_part(this_mv->col);
+ const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+ unsigned int besterr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
+ if (second_pred != NULL) {
+ if (mask) {
+ aom_highbd_comp_mask_upsampled_pred(
+ xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+ invert_mask, xd->bd, subpel_search_type);
+ } else {
+ aom_highbd_comp_avg_upsampled_pred(
+ xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+ subpel_search_type);
+ }
+ } else {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ xd->bd, subpel_search_type);
+ }
+ besterr = vfp->vf(pred8, w, src, src_stride, sse);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ if (second_pred != NULL) {
+ if (mask) {
+ aom_comp_mask_upsampled_pred(
+ xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+ invert_mask, subpel_search_type);
+ } else {
+ aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+ second_pred, w, h, subpel_x_q3, subpel_y_q3,
+ ref, ref_stride, subpel_search_type);
+ }
+ } else {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search_type);
+ }
+
+ besterr = vfp->vf(pred, w, src, src_stride, sse);
+ }
+#else
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ if (second_pred != NULL) {
+ if (mask) {
+ aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+ second_pred, w, h, subpel_x_q3, subpel_y_q3,
+ ref, ref_stride, mask, mask_stride,
+ invert_mask, subpel_search_type);
+ } else {
+ aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+ second_pred, w, h, subpel_x_q3, subpel_y_q3,
+ ref, ref_stride, subpel_search_type);
+ }
+ } else {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+ subpel_y_q3, ref, ref_stride, subpel_search_type);
+ }
+
+ besterr = vfp->vf(pred, w, src, src_stride, sse);
+#endif
+ return besterr;
+}
+
+// Estimates whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account. It is suffixed "fast"
+// because it uses a bilinear filter to estimate the prediction.
+static INLINE unsigned int check_better_fast(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+ const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int *has_better_mv, int is_scaled) {
+ unsigned int cost;
+ if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+ unsigned int sse;
+ int thismse;
+ if (is_scaled) {
+ thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
+ } else {
+ thismse = estimated_pref_error(this_mv, var_params, &sse);
+ }
+ cost = mv_err_cost_(this_mv, mv_cost_params);
+ cost += thismse;
+
+ if (cost < *besterr) {
+ *besterr = cost;
+ *best_mv = *this_mv;
+ *distortion = thismse;
+ *sse1 = sse;
+ *has_better_mv |= 1;
+ }
+ } else {
+ cost = INT_MAX;
+ }
+ return cost;
+}
+
+// Checks whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account.
+static AOM_FORCE_INLINE unsigned int check_better(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+ const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int *is_better) {
+ unsigned int cost;
+ if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+ unsigned int sse;
+ int thismse;
+ thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
+ cost = mv_err_cost_(this_mv, mv_cost_params);
+ cost += thismse;
+ if (cost < *besterr) {
+ *besterr = cost;
+ *best_mv = *this_mv;
+ *distortion = thismse;
+ *sse1 = sse;
+ *is_better |= 1;
+ }
+ } else {
+ cost = INT_MAX;
+ }
+ return cost;
+}
+
+static INLINE MV get_best_diag_step(int step_size, unsigned int left_cost,
+ unsigned int right_cost,
+ unsigned int up_cost,
+ unsigned int down_cost) {
+ const MV diag_step = { up_cost <= down_cost ? -step_size : step_size,
+ left_cost <= right_cost ? -step_size : step_size };
+
+ return diag_step;
+}
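+// E.g. (illustrative): if right_cost < left_cost and down_cost < up_cost,
+// the returned step is { step_size, step_size }, pointing into the
+// bottom-right quadrant; ties are broken toward the negative direction.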
+
+// Searches the four cardinal directions for a better mv, then follows up with
+// a search in the best quadrant. This uses a bilinear filter to speed up the
+// calculation.
+static AOM_FORCE_INLINE MV first_level_check_fast(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+ int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int is_scaled) {
+ // Check the four cardinal directions
+ const MV left_mv = { this_mv.row, this_mv.col - hstep };
+ int dummy = 0;
+ const unsigned int left = check_better_fast(
+ xd, cm, &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+ sse1, distortion, &dummy, is_scaled);
+
+ const MV right_mv = { this_mv.row, this_mv.col + hstep };
+ const unsigned int right = check_better_fast(
+ xd, cm, &right_mv, best_mv, mv_limits, var_params, mv_cost_params,
+ besterr, sse1, distortion, &dummy, is_scaled);
+
+ const MV top_mv = { this_mv.row - hstep, this_mv.col };
+ const unsigned int up = check_better_fast(
+ xd, cm, &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+ sse1, distortion, &dummy, is_scaled);
+
+ const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+ const unsigned int down = check_better_fast(
+ xd, cm, &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params,
+ besterr, sse1, distortion, &dummy, is_scaled);
+
+ const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+ const MV diag_mv = { this_mv.row + diag_step.row,
+ this_mv.col + diag_step.col };
+
+ // Check the diagonal direction with the best mv
+ check_better_fast(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ return diag_step;
+}
+
+// Performs a follow-up search after first_level_check_fast is called. This
+// performs two extra chess-pattern searches in the best quadrant.
+static AOM_FORCE_INLINE void second_level_check_fast(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, const MV diag_step,
+ MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int is_scaled) {
+ assert(diag_step.row == hstep || diag_step.row == -hstep);
+ assert(diag_step.col == hstep || diag_step.col == -hstep);
+ const int tr = this_mv.row;
+ const int tc = this_mv.col;
+ const int br = best_mv->row;
+ const int bc = best_mv->col;
+ int dummy = 0;
+ if (tr != br && tc != bc) {
+ assert(diag_step.col == bc - tc);
+ assert(diag_step.row == br - tr);
+ const MV chess_mv_1 = { br, bc + diag_step.col };
+ const MV chess_mv_2 = { br + diag_step.row, bc };
+ check_better_fast(xd, cm, &chess_mv_1, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ check_better_fast(xd, cm, &chess_mv_2, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ } else if (tr == br && tc != bc) {
+ assert(diag_step.col == bc - tc);
+ // Continue searching in the best direction
+ const MV bottom_long_mv = { br + hstep, bc + diag_step.col };
+ const MV top_long_mv = { br - hstep, bc + diag_step.col };
+ check_better_fast(xd, cm, &bottom_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ // Search in the direction opposite of the best quadrant
+ const MV rev_mv = { br - diag_step.row, bc };
+ check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ } else if (tr != br && tc == bc) {
+ assert(diag_step.row == br - tr);
+ // Continue searching in the best direction
+ const MV right_long_mv = { br + diag_step.row, bc + hstep };
+ const MV left_long_mv = { br + diag_step.row, bc - hstep };
+ check_better_fast(xd, cm, &right_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &left_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ // Search in the direction opposite of the best quadrant
+ const MV rev_mv = { br, bc - diag_step.col };
+ check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ }
+}
+
+// Combines the first level check and the second level check when applicable.
+// This first searches the four cardinal directions, then performs several
+// diagonal/chess-pattern searches in the best quadrant.
+static AOM_FORCE_INLINE void two_level_checks_fast(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+ int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int iters, int is_scaled) {
+ const MV diag_step = first_level_check_fast(
+ xd, cm, this_mv, best_mv, hstep, mv_limits, var_params, mv_cost_params,
+ besterr, sse1, distortion, is_scaled);
+ if (iters > 1) {
+ second_level_check_fast(xd, cm, this_mv, diag_step, best_mv, hstep,
+ mv_limits, var_params, mv_cost_params, besterr,
+ sse1, distortion, is_scaled);
+ }
+}
+
+static AOM_FORCE_INLINE MV
+first_level_check(MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv,
+ MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion) {
+ int dummy = 0;
+ const MV left_mv = { this_mv.row, this_mv.col - hstep };
+ const MV right_mv = { this_mv.row, this_mv.col + hstep };
+ const MV top_mv = { this_mv.row - hstep, this_mv.col };
+ const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+
+ const unsigned int left =
+ check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int right =
+ check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int up =
+ check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int down =
+ check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+
+ const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+ const MV diag_mv = { this_mv.row + diag_step.row,
+ this_mv.col + diag_step.col };
+
+ // Check the diagonal direction with the best mv
+ check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params,
+ besterr, sse1, distortion, &dummy);
+
+ return diag_step;
+}
+
+// A newer version of second level check that gives better quality.
+// TODO(chiyotsai@google.com): evaluate this on subpel_search_types different
+// from av1_find_best_sub_pixel_tree
+static AOM_FORCE_INLINE void second_level_check_v2(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step,
+ MV *best_mv, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int is_scaled) {
+ assert(best_mv->row == this_mv.row + diag_step.row ||
+ best_mv->col == this_mv.col + diag_step.col);
+ if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
+ return;
+ } else if (this_mv.row == best_mv->row) {
+ // Search away from diagonal step since diagonal search did not provide any
+ // improvement
+ diag_step.row *= -1;
+ } else if (this_mv.col == best_mv->col) {
+ diag_step.col *= -1;
+ }
+
+ const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col };
+ const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col };
+ const MV diag_bias_mv = { best_mv->row + diag_step.row,
+ best_mv->col + diag_step.col };
+ int has_better_mv = 0;
+
+ if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+ check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+ check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+
+ // Do an additional search if the second iteration gives a better mv
+ if (has_better_mv) {
+ check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+ }
+ } else {
+ check_better_fast(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv,
+ is_scaled);
+ check_better_fast(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv,
+ is_scaled);
+
+ // Do an additional search if the second iteration gives a better mv
+ if (has_better_mv) {
+ check_better_fast(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv, is_scaled);
+ }
+ }
+}
+
+// Gets the error at the beginning when the mv has fullpel precision
+static unsigned int setup_center_error(
+ const MACROBLOCKD *xd, const MV *bestmv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+ const int w = var_params->w;
+ const int h = var_params->h;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const uint8_t *src = ms_buffers->src->buf;
+ const uint8_t *y = get_buf_from_mv(ms_buffers->ref, *bestmv);
+ const int src_stride = ms_buffers->src->stride;
+ const int y_stride = ms_buffers->ref->stride;
+ const uint8_t *second_pred = ms_buffers->second_pred;
+ const uint8_t *mask = ms_buffers->mask;
+ const int mask_stride = ms_buffers->mask_stride;
+ const int invert_mask = ms_buffers->inv_mask;
+
+ unsigned int besterr;
+
+ if (second_pred != NULL) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+ uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
+ if (mask) {
+ aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride,
+ mask, mask_stride, invert_mask);
+ } else {
+ aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+ if (mask) {
+ aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+ mask_stride, invert_mask);
+ } else {
+ aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ }
+#else
+ (void)xd;
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+ if (mask) {
+ aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+ mask_stride, invert_mask);
+ } else {
+ aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+#endif
+ } else {
+ besterr = vfp->vf(y, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
+ besterr += mv_err_cost_(bestmv, mv_cost_params);
+ return besterr;
+}
+
+// Gets the error at the beginning when the mv has fullpel precision
+static unsigned int upsampled_setup_center_error(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+ unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, var_params, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost_(bestmv, mv_cost_params);
+ return besterr;
+}
+
+static INLINE int divide_and_round(int n, int d) {
+ return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
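+// E.g. divide_and_round(7, 2) == 4 and divide_and_round(-7, 2) == -4: the
+// quotient is rounded to the nearest integer, with halves away from zero.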
+
+static INLINE int is_cost_list_wellbehaved(const int *cost_list) {
+ return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] &&
+ cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4];
+}
+
+// Returns an estimate of the cost-surface minimum at a precision of 1/2^bits.
+// Assumes a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minima (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic,
+ int bits) {
+ *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
+ (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
+ *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
+ (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+}
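+// Worked example (illustrative): with cost_list = { S0, S1, S2, S3, S4 } =
+// { 100, 120, 106, 104, 118 } and bits = 1 (half-pel units), the surface is
+// well-behaved (S0 is the smallest) and
+//   ic = divide_and_round((120 - 104) * 1, 120 - 200 + 104)
+//      = divide_and_round(16, 24) = 1
+//   ir = divide_and_round((118 - 106) * 1, 118 - 200 + 106)
+//      = divide_and_round(12, 24) = 1
+// so the estimated minimum lies one half-pel step below and to the right of
+// the center (S3 < S1 pulls x right, S2 < S4 pulls y down).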
+
+// Checks the list of mvs searched in the last iteration and sees if we are
+// repeating it. If so, returns 1. Otherwise, updates the last_mv_search_list
+// with current_mv and returns 0.
+static INLINE int check_repeated_mv_and_update(int_mv *last_mv_search_list,
+ const MV current_mv, int iter) {
+ if (last_mv_search_list) {
+ if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) {
+ return 1;
+ }
+
+ last_mv_search_list[iter].as_mv = current_mv;
+ }
+ return 0;
+}
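+// E.g. (illustrative): if a previous invocation already visited mv { 2, -5 }
+// at iter == 1 and the current invocation arrives at the same mv at the same
+// iter, the search has entered a cycle and the caller bails out (the callers
+// below return INT_MAX in that case).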
+
+static AOM_INLINE int setup_center_error_facade(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *bestmv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion,
+ int is_scaled) {
+ if (is_scaled) {
+ return upsampled_setup_center_error(xd, cm, bestmv, var_params,
+ mv_cost_params, sse1, distortion);
+ } else {
+ return setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+ distortion);
+ }
+}
+
+int av1_find_best_sub_pixel_tree_pruned_more(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+ unsigned int *sse1, int_mv *last_mv_search_list) {
+ (void)cm;
+ const int allow_hp = ms_params->allow_hp;
+ const int forced_stop = ms_params->forced_stop;
+ const int iters_per_step = ms_params->iters_per_step;
+ const int *cost_list = ms_params->cost_list;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching. Iter 0 corresponds to the
+  // fullpel mv, iter 1 to half-pel, and so on
+ int iter = 0;
+ int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel
+ unsigned int besterr = INT_MAX;
+ *bestmv = start_mv;
+
+ const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+ ? &cm->sf_identity
+ : xd->block_ref_scale_factors[0];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (start_mv_stats != NULL && !is_scaled) {
+ besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+ *distortion = start_mv_stats->distortion;
+ *sse1 = start_mv_stats->sse;
+ } else {
+ besterr =
+ setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+ sse1, distortion, is_scaled);
+ }
+
+ // If forced_stop is FULL_PEL, return.
+ if (forced_stop == FULL_PEL) return besterr;
+
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ int ir, ic;
+ get_cost_surf_min(cost_list, &ir, &ic, 1);
+ if (ir != 0 || ic != 0) {
+ const MV this_mv = { start_mv.row + ir * hstep,
+ start_mv.col + ic * hstep };
+ int dummy = 0;
+ check_better_fast(xd, cm, &this_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ }
+ } else {
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two if the diagonal was selected). Continue to 1/4-pel.
+ if (forced_stop < HALF_PEL) {
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ hstep >>= 1;
+ start_mv = *bestmv;
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+ if (allow_hp && forced_stop == EIGHTH_PEL) {
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ hstep >>= 1;
+ start_mv = *bestmv;
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+ return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+ unsigned int *sse1, int_mv *last_mv_search_list) {
+ (void)cm;
+ (void)start_mv_stats;
+ const int allow_hp = ms_params->allow_hp;
+ const int forced_stop = ms_params->forced_stop;
+ const int iters_per_step = ms_params->iters_per_step;
+ const int *cost_list = ms_params->cost_list;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching. Iter 0 corresponds to the
+  // fullpel mv, iter 1 to half-pel, and so on
+ int iter = 0;
+ int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel
+ unsigned int besterr = INT_MAX;
+ *bestmv = start_mv;
+
+ const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+ ? &cm->sf_identity
+ : xd->block_ref_scale_factors[0];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (start_mv_stats != NULL && !is_scaled) {
+ besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+ *distortion = start_mv_stats->distortion;
+ *sse1 = start_mv_stats->sse;
+ } else {
+ besterr =
+ setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+ sse1, distortion, is_scaled);
+ }
+
+ // If forced_stop is FULL_PEL, return.
+ if (forced_stop == FULL_PEL) return besterr;
+
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX) {
+ const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+ (cost_list[2] < cost_list[4] ? 0 : 2);
+
+ const MV left_mv = { start_mv.row, start_mv.col - hstep };
+ const MV right_mv = { start_mv.row, start_mv.col + hstep };
+ const MV bottom_mv = { start_mv.row + hstep, start_mv.col };
+ const MV top_mv = { start_mv.row - hstep, start_mv.col };
+
+ const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep };
+ const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep };
+ const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep };
+ const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep };
+
+ int dummy = 0;
+
+ switch (whichdir) {
+ case 0: // bottom left quadrant
+ check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &bottom_left_mv, bestmv, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, &dummy, is_scaled);
+ break;
+ case 1: // bottom right quadrant
+ check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &bottom_right_mv, bestmv, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, &dummy, is_scaled);
+ break;
+ case 2: // top left quadrant
+ check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_left_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ break;
+ case 3: // top right quadrant
+ check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_right_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ break;
+ }
+ } else {
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two if the diagonal was selected). Continue to 1/4-pel.
+ if (forced_stop < HALF_PEL) {
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ hstep >>= 1;
+ start_mv = *bestmv;
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+ if (allow_hp && forced_stop == EIGHTH_PEL) {
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ hstep >>= 1;
+ start_mv = *bestmv;
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+ return besterr;
+}
+
+int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion,
+ unsigned int *sse1,
+ int_mv *last_mv_search_list) {
+ (void)start_mv_stats;
+ const int allow_hp = ms_params->allow_hp;
+ const int forced_stop = ms_params->forced_stop;
+ const int iters_per_step = ms_params->iters_per_step;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+ const SUBPEL_SEARCH_TYPE subpel_search_type =
+ ms_params->var_params.subpel_search_type;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+ // How many steps to take. A round of 0 means fullpel search only, 1 means
+ // half-pel, and so on.
+ const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
+ int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel
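+  // E.g. (illustrative): forced_stop == EIGHTH_PEL with allow_hp set gives
+  // round == 3, so hstep takes the values 4, 2, 1 and the loop below searches
+  // at 1/2-, 1/4- and then 1/8-pel precision.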
+
+ unsigned int besterr = INT_MAX;
+
+ *bestmv = start_mv;
+
+ const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+ ? &cm->sf_identity
+ : xd->block_ref_scale_factors[0];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (start_mv_stats != NULL && !is_scaled) {
+ besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+ *distortion = start_mv_stats->distortion;
+ *sse1 = start_mv_stats->sse;
+ } else {
+ if (subpel_search_type != USE_2_TAPS_ORIG) {
+ besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
+ mv_cost_params, sse1, distortion);
+ } else {
+ besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+ distortion);
+ }
+ }
+
+ // If forced_stop is FULL_PEL, return.
+ if (!round) return besterr;
+
+ for (int iter = 0; iter < round; ++iter) {
+ MV iter_center_mv = *bestmv;
+ if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv,
+ iter)) {
+ return INT_MAX;
+ }
+
+ MV diag_step;
+ if (subpel_search_type != USE_2_TAPS_ORIG) {
+ diag_step = first_level_check(xd, cm, iter_center_mv, bestmv, hstep,
+ mv_limits, var_params, mv_cost_params,
+ &besterr, sse1, distortion);
+ } else {
+ diag_step = first_level_check_fast(xd, cm, iter_center_mv, bestmv, hstep,
+ mv_limits, var_params, mv_cost_params,
+ &besterr, sse1, distortion, is_scaled);
+ }
+
+ // Check diagonal sub-pixel position
+ if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
+ second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
+ mv_limits, var_params, mv_cost_params, &besterr,
+ sse1, distortion, is_scaled);
+ }
+
+ hstep >>= 1;
+ }
+
+ return besterr;
+}
+
+// Note(yunqingwang): The following 2 functions are only used in the motion
+// vector unit test; they return the extreme motion vectors allowed by the MV
+// limits.
+// Returns the maximum MV.
+int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion, unsigned int *sse1,
+ int_mv *last_mv_search_list) {
+ (void)xd;
+ (void)cm;
+ (void)start_mv;
+ (void)start_mv_stats;
+ (void)sse1;
+ (void)distortion;
+ (void)last_mv_search_list;
+
+ const int allow_hp = ms_params->allow_hp;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+ bestmv->row = mv_limits->row_max;
+ bestmv->col = mv_limits->col_max;
+
+ unsigned int besterr = 0;
+
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp, 0);
+ return besterr;
+}
+
+// Returns the minimum MV.
+int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion, unsigned int *sse1,
+ int_mv *last_mv_search_list) {
+ (void)xd;
+ (void)cm;
+ (void)start_mv;
+ (void)start_mv_stats;
+ (void)sse1;
+ (void)distortion;
+ (void)last_mv_search_list;
+
+ const int allow_hp = ms_params->allow_hp;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+ bestmv->row = mv_limits->row_min;
+ bestmv->col = mv_limits->col_min;
+
+ unsigned int besterr = 0;
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp, 0);
+ return besterr;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Computes the cost of the current predictor by going through the whole
+// av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv
+// during motion_mode_rd. We go through the whole av1_enc_build_inter_predictor
+// pipeline because we might have changed the interpolation filter, etc.,
+// before motion_mode_rd is called.
+static INLINE unsigned int compute_motion_cost(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize,
+ const MV *this_mv) {
+ unsigned int mse;
+ unsigned int sse;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+
+ const uint8_t *const src = ms_buffers->src->buf;
+ const int src_stride = ms_buffers->src->stride;
+ const uint8_t *const dst = xd->plane[0].dst.buf;
+ const int dst_stride = xd->plane[0].dst.stride;
+ const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp;
+
+ mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
+ mse += mv_err_cost_(this_mv, &ms_params->mv_cost_params);
+ return mse;
+}
+
+// Refines MV in a small range
+
+// Macros to build bitmasks which help us avoid redundant computations
+//
+// To explain the idea here, imagine that on the first iteration of the
+// loop below, we step rightwards. Then, on the second iteration, the neighbors
+// to consider are:
+// . . .
+// 0 1 .
+// . . .
+// Where 0 is the initial search point, 1 is the best candidate found in the
+// first iteration, and the dots are the other neighbors of point 1.
+//
+// Naively, we would now need to scan all 8 neighbors of point 1 (point 0 and
+// the seven points marked with dots), and compare them to see where to move
+// next. However, we already evaluated 5 of those 8 neighbors in the last
+// iteration, and decided that they are worse than point 1. So we don't need
+// to re-consider these points. We only really need to consider the three
+// points which are adjacent to point 1 but *not* to point 0.
+//
+// As the algorithm goes on, there are other ways that redundant evaluations
+// can happen, if the search path curls back around on itself.
+//
+// To avoid all possible redundancies, we'd have to build a set containing
+// every point we have already checked, and this would be quite expensive.
+//
+// So instead, we apply a 95%-effective solution with a much lower overhead:
+// we prune out the points which were considered during the previous
+// iteration, but we don't worry about any prior iteration. This can be done
+// as follows:
+//
+// We build a static table, called neighbor_mask, which answers the question
+// "if we moved in direction X last time, which neighbors are new, and which
+// were scanned last iteration?"
+// Then we can query this table to quickly determine which points we need to
+// evaluate, and which we can skip.
+//
+// To query the table, the logic is simply:
+// neighbor_mask[i] & (1 << j) == "if we moved in direction i last iteration,
+// do we need to scan neighbor j this iteration?"
+#define NEIGHBOR_MASK_DIA(left, down, right, up) \
+ (left | (down << 1) | (right << 2) | (up << 3))
+
+#define NEIGHBOR_MASK_SQR(left, down, right, up, down_left, down_right, \
+ up_left, up_right) \
+ (left | (down << 1) | (right << 2) | (up << 3) | (down_left << 4) | \
+ (down_right << 5) | (up_left << 6) | (up_right << 7))
+
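+// Example (illustrative): for WARP_SEARCH_DIAMOND below, the neighbors are
+// ordered { left, down, right, up }. If the last accepted step was "right"
+// (index 2), then neighbor_mask[2] == NEIGHBOR_MASK_DIA(0, 1, 1, 1), so
+// (neighbor_mask[2] & (1 << 0)) == 0 and the "left" neighbor -- the point we
+// just came from -- is skipped on the next iteration.
+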
+static const warp_search_config warp_search_info[WARP_SEARCH_METHODS] = {
+ // WARP_SEARCH_DIAMOND
+ {
+ .num_neighbors = 4,
+ .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
+ .neighbor_mask = {
+ // If we stepped left last time, consider all points except right
+ NEIGHBOR_MASK_DIA(1, 1, 0, 1),
+ // If we stepped down last time, consider all points except up
+ NEIGHBOR_MASK_DIA(1, 1, 1, 0),
+ // Stepped right last time
+ NEIGHBOR_MASK_DIA(0, 1, 1, 1),
+ // Stepped up last time
+ NEIGHBOR_MASK_DIA(1, 0, 1, 1),
+ },
+ },
+ // WARP_SEARCH_SQUARE
+ {
+ .num_neighbors = 8,
+ .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 },
+ { 1, -1 }, { 1, 1 }, { -1, -1 }, { -1, 1 } },
+ .neighbor_mask = {
+ // If we stepped left last time, then we only need to consider 3 points:
+ // left, down+left, up+left
+ NEIGHBOR_MASK_SQR(1, 0, 0, 0, 1, 0, 1, 0),
+ // If we stepped down last time, then we only need to consider 3 points:
+ // down, down+left, down+right
+ NEIGHBOR_MASK_SQR(0, 1, 0, 0, 1, 1, 0, 0),
+ // Stepped right last time
+ NEIGHBOR_MASK_SQR(0, 0, 1, 0, 0, 1, 0, 1),
+ // Stepped up last time
+ NEIGHBOR_MASK_SQR(0, 0, 0, 1, 0, 0, 1, 1),
+
+ // If we stepped down+left last time, then we need to consider 5 points:
+ // left, down, down+left, down+right, up+left
+ NEIGHBOR_MASK_SQR(1, 1, 0, 0, 1, 1, 1, 0),
+ // Stepped down+right last time
+ NEIGHBOR_MASK_SQR(0, 1, 1, 0, 1, 1, 0, 1),
+ // Stepped up+left last time
+ NEIGHBOR_MASK_SQR(1, 0, 0, 1, 1, 0, 1, 1),
+ // Stepped up+right last time
+ NEIGHBOR_MASK_SQR(0, 0, 1, 1, 0, 1, 1, 1),
+ },
+ },
+};
+
+unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ BLOCK_SIZE bsize, const int *pts0,
+ const int *pts_inref0, int total_samples,
+ WARP_SEARCH_METHOD search_method,
+ int num_iterations) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+
+ const MV *neighbors = warp_search_info[search_method].neighbors;
+ const int num_neighbors = warp_search_info[search_method].num_neighbors;
+ const uint8_t *neighbor_mask = warp_search_info[search_method].neighbor_mask;
+
+ MV *best_mv = &mbmi->mv[0].as_mv;
+
+ WarpedMotionParams best_wm_params = mbmi->wm_params;
+ int best_num_proj_ref = mbmi->num_proj_ref;
+ unsigned int bestmse;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+ const int mv_shift = ms_params->allow_hp ? 0 : 1;
+
+ // Calculate the center position's error
+ assert(av1_is_subpelmv_in_range(mv_limits, *best_mv));
+ bestmse = compute_motion_cost(xd, cm, ms_params, bsize, best_mv);
+
+ // MV search
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // First step always scans all neighbors
+ uint8_t valid_neighbors = UINT8_MAX;
+
+ for (int ite = 0; ite < num_iterations; ++ite) {
+ int best_idx = -1;
+
+ for (int idx = 0; idx < num_neighbors; ++idx) {
+ if ((valid_neighbors & (1 << idx)) == 0) {
+ continue;
+ }
+
+ unsigned int thismse;
+
+ MV this_mv = { best_mv->row + neighbors[idx].row * (1 << mv_shift),
+ best_mv->col + neighbors[idx].col * (1 << mv_shift) };
+ if (av1_is_subpelmv_in_range(mv_limits, this_mv)) {
+ memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+ memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+ if (total_samples > 1) {
+ mbmi->num_proj_ref =
+ av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
+ }
+
+ if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+ this_mv.row, this_mv.col, &mbmi->wm_params,
+ mi_row, mi_col)) {
+ thismse = compute_motion_cost(xd, cm, ms_params, bsize, &this_mv);
+
+ if (thismse < bestmse) {
+ best_idx = idx;
+ best_wm_params = mbmi->wm_params;
+ best_num_proj_ref = mbmi->num_proj_ref;
+ bestmse = thismse;
+ }
+ }
+ }
+ }
+
+ if (best_idx == -1) break;
+
+ if (best_idx >= 0) {
+ best_mv->row += neighbors[best_idx].row * (1 << mv_shift);
+ best_mv->col += neighbors[best_idx].col * (1 << mv_shift);
+ valid_neighbors = neighbor_mask[best_idx];
+ }
+ }
+
+ mbmi->wm_params = best_wm_params;
+ mbmi->num_proj_ref = best_num_proj_ref;
+ return bestmse;
+}
+
+#endif // !CONFIG_REALTIME_ONLY
+// =============================================================================
+// Subpixel Motion Search: OBMC
+// =============================================================================
+// Estimates the variance of prediction residue
+static INLINE int estimate_obmc_pref_error(
+ const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ unsigned int *sse) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const int32_t *src = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+ const int ref_stride = ms_buffers->ref->stride;
+
+ const int subpel_x_q3 = get_subpel_part(this_mv->col);
+ const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+ return vfp->osvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, mask, sse);
+}
+
+// Calculates the variance of prediction residue
+static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
+ const MV *this_mv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ unsigned int *sse) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+ const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
+ const int w = var_params->w;
+ const int h = var_params->h;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const int32_t *wsrc = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+ const int ref_stride = ms_buffers->ref->stride;
+
+ const int subpel_x_q3 = get_subpel_part(this_mv->col);
+ const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ unsigned int besterr;
+ DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+ subpel_search_type);
+ besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
+ } else {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+ subpel_y_q3, ref, ref_stride, subpel_search_type);
+
+ besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+ }
+#else
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+ subpel_y_q3, ref, ref_stride, subpel_search_type);
+
+ besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+#endif
+ return besterr;
+}
+
+static unsigned int setup_obmc_center_error(
+ const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+ // TODO(chiyotsai@google.com): There might be a bug here where we didn't use
+ // get_buf_from_mv(ref, *this_mv).
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const int32_t *wsrc = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const uint8_t *ref = ms_buffers->ref->buf;
+ const int ref_stride = ms_buffers->ref->stride;
+ unsigned int besterr =
+ var_params->vfp->ovf(ref, ref_stride, wsrc, mask, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost_(this_mv, mv_cost_params);
+ return besterr;
+}
+
+static unsigned int upsampled_setup_obmc_center_error(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *this_mv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+ unsigned int besterr =
+ upsampled_obmc_pref_error(xd, cm, this_mv, var_params, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost_(this_mv, mv_cost_params);
+ return besterr;
+}
+
+// Estimates the mv rate cost for the obmc search.
+// TODO(chiyotsai@google.com): the cost does not match the cost in
+// mv_cost_. Investigate this later.
+static INLINE int estimate_obmc_mvcost(const MV *this_mv,
+ const MV_COST_PARAMS *mv_cost_params) {
+ const MV *ref_mv = mv_cost_params->ref_mv;
+ const int *mvjcost = mv_cost_params->mvjcost;
+ const int *const *mvcost = mv_cost_params->mvcost;
+ const int error_per_bit = mv_cost_params->error_per_bit;
+ const MV_COST_TYPE mv_cost_type = mv_cost_params->mv_cost_type;
+ const MV diff_mv = { GET_MV_SUBPEL(this_mv->row - ref_mv->row),
+ GET_MV_SUBPEL(this_mv->col - ref_mv->col) };
+
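+  // Note: "(... * error_per_bit + 4096) >> 13" below rounds
+  // rate * error_per_bit to the nearest integer with 13 fractional bits,
+  // i.e. ROUND_POWER_OF_TWO(mv_cost(...) * error_per_bit, 13).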
+ switch (mv_cost_type) {
+ case MV_COST_ENTROPY:
+ return (unsigned)((mv_cost(&diff_mv, mvjcost,
+ CONVERT_TO_CONST_MVCOST(mvcost)) *
+ error_per_bit +
+ 4096) >>
+ 13);
+ case MV_COST_NONE: return 0;
+ default:
+ assert(0 && "L1 norm is not tuned for estimated obmc mvcost");
+ return 0;
+ }
+}
+
+// Estimates whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account.
+static INLINE unsigned int obmc_check_better_fast(
+ const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int *has_better_mv) {
+ unsigned int cost;
+ if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+ unsigned int sse;
+ const int thismse = estimate_obmc_pref_error(this_mv, var_params, &sse);
+
+ cost = estimate_obmc_mvcost(this_mv, mv_cost_params);
+ cost += thismse;
+
+ if (cost < *besterr) {
+ *besterr = cost;
+ *best_mv = *this_mv;
+ *distortion = thismse;
+ *sse1 = sse;
+ *has_better_mv |= 1;
+ }
+ } else {
+ cost = INT_MAX;
+ }
+ return cost;
+}
+
+// Checks whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account.
+static INLINE unsigned int obmc_check_better(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+ const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int *has_better_mv) {
+ unsigned int cost;
+ if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+ unsigned int sse;
+ const int thismse =
+ upsampled_obmc_pref_error(xd, cm, this_mv, var_params, &sse);
+ cost = mv_err_cost_(this_mv, mv_cost_params);
+
+ cost += thismse;
+
+ if (cost < *besterr) {
+ *besterr = cost;
+ *best_mv = *this_mv;
+ *distortion = thismse;
+ *sse1 = sse;
+ *has_better_mv |= 1;
+ }
+ } else {
+ cost = INT_MAX;
+ }
+ return cost;
+}
+
+static AOM_FORCE_INLINE MV obmc_first_level_check(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv,
+ const int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion) {
+ int dummy = 0;
+ const MV left_mv = { this_mv.row, this_mv.col - hstep };
+ const MV right_mv = { this_mv.row, this_mv.col + hstep };
+ const MV top_mv = { this_mv.row - hstep, this_mv.col };
+ const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+
+ if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+ const unsigned int left =
+ obmc_check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int right =
+ obmc_check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int up =
+ obmc_check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int down =
+ obmc_check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+
+ const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+ const MV diag_mv = { this_mv.row + diag_step.row,
+ this_mv.col + diag_step.col };
+
+ // Check the diagonal direction with the best mv
+ obmc_check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+
+ return diag_step;
+ } else {
+ const unsigned int left = obmc_check_better_fast(
+ &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1,
+ distortion, &dummy);
+ const unsigned int right = obmc_check_better_fast(
+ &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+ sse1, distortion, &dummy);
+
+ const unsigned int up = obmc_check_better_fast(
+ &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1,
+ distortion, &dummy);
+
+ const unsigned int down = obmc_check_better_fast(
+ &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+ sse1, distortion, &dummy);
+
+ const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+ const MV diag_mv = { this_mv.row + diag_step.row,
+ this_mv.col + diag_step.col };
+
+ // Check the diagonal direction with the best mv
+ obmc_check_better_fast(&diag_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+
+ return diag_step;
+ }
+}
+
+// A newer version of second level check for obmc that gives better quality.
+static AOM_FORCE_INLINE void obmc_second_level_check_v2(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step,
+ MV *best_mv, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion) {
+ assert(best_mv->row == this_mv.row + diag_step.row ||
+ best_mv->col == this_mv.col + diag_step.col);
+ if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
+ return;
+ } else if (this_mv.row == best_mv->row) {
+ // Search away from diagonal step since diagonal search did not provide any
+ // improvement
+ diag_step.row *= -1;
+ } else if (this_mv.col == best_mv->col) {
+ diag_step.col *= -1;
+ }
+
+ const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col };
+ const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col };
+ const MV diag_bias_mv = { best_mv->row + diag_step.row,
+ best_mv->col + diag_step.col };
+ int has_better_mv = 0;
+
+ if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+ obmc_check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+ obmc_check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+
+ // Do an additional search if the second iteration gives a better mv
+ if (has_better_mv) {
+ obmc_check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+ }
+ } else {
+ obmc_check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+ obmc_check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+
+ // Do an additional search if the second iteration gives a better mv
+ if (has_better_mv) {
+ obmc_check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+ }
+ }
+}
+
+int av1_find_best_obmc_sub_pixel_tree_up(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+ unsigned int *sse1, int_mv *last_mv_search_list) {
+ (void)last_mv_search_list;
+ (void)start_mv_stats;
+ const int allow_hp = ms_params->allow_hp;
+ const int forced_stop = ms_params->forced_stop;
+ const int iters_per_step = ms_params->iters_per_step;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+ const SUBPEL_SEARCH_TYPE subpel_search_type =
+ ms_params->var_params.subpel_search_type;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+ int hstep = INIT_SUBPEL_STEP_SIZE;
+ const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
+
+ unsigned int besterr = INT_MAX;
+ *bestmv = start_mv;
+
+ if (subpel_search_type != USE_2_TAPS_ORIG)
+ besterr = upsampled_setup_obmc_center_error(
+ xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion);
+ else
+ besterr = setup_obmc_center_error(bestmv, var_params, mv_cost_params, sse1,
+ distortion);
+
+ for (int iter = 0; iter < round; ++iter) {
+ MV iter_center_mv = *bestmv;
+ MV diag_step = obmc_first_level_check(xd, cm, iter_center_mv, bestmv, hstep,
+ mv_limits, var_params, mv_cost_params,
+ &besterr, sse1, distortion);
+
+ if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
+ obmc_second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
+ mv_limits, var_params, mv_cost_params,
+ &besterr, sse1, distortion);
+ }
+ hstep >>= 1;
+ }
+
+ return besterr;
+}
+
+// =============================================================================
+// Public cost function: mv_cost + pred error
+// =============================================================================
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src, const struct buf_2d *pre) {
+ const MV mv = get_mv_from_fullmv(&best_mv);
+ unsigned int sse, var;
+
+ var = vfp->vf(src->buf, src->stride, get_buf_from_fullmv(pre, &best_mv),
+ pre->stride, &sse);
+ (void)var;
+
+ return sse + mv_err_cost_(&mv, mv_cost_params);
+}
+
+static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const uint8_t *second_pred,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src,
+ const struct buf_2d *pre) {
+ const MV mv = get_mv_from_fullmv(&best_mv);
+ unsigned int unused;
+
+ return vfp->svaf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0,
+ src->buf, src->stride, &unused, second_pred) +
+ mv_err_cost_(&mv, mv_cost_params);
+}
+
+static INLINE int get_mvpred_mask_var(
+ const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv,
+ const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
+ int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src,
+ const struct buf_2d *pre) {
+ const MV mv = get_mv_from_fullmv(&best_mv);
+ unsigned int unused;
+
+ return vfp->msvf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0,
+ src->buf, src->stride, second_pred, mask, mask_stride,
+ invert_mask, &unused) +
+ mv_err_cost_(&mv, mv_cost_params);
+}
+
+int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src,
+ const struct buf_2d *pre) {
+ if (mask) {
+ return get_mvpred_mask_var(mv_cost_params, best_mv, second_pred, mask,
+ mask_stride, invert_mask, vfp, src, pre);
+ } else {
+ return get_mvpred_av_var(mv_cost_params, best_mv, second_pred, vfp, src,
+ pre);
+ }
+}
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
new file mode 100644
index 0000000000..87b9309b61
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MCOMP_H_
+#define AOM_AV1_ENCODER_MCOMP_H_
+
+#include "av1/common/mv.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/rd.h"
+
+#include "aom_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct SPEED_FEATURES;
+
+// =============================================================================
+// Cost functions
+// =============================================================================
+
+enum {
+ MV_COST_ENTROPY, // Use the entropy rate of the mv as the cost
+ MV_COST_L1_LOWRES, // Use the l1 norm of the mv as the cost (<480p)
+ MV_COST_L1_MIDRES, // Use the l1 norm of the mv as the cost (>=480p)
+ MV_COST_L1_HDRES, // Use the l1 norm of the mv as the cost (>=720p)
+  MV_COST_NONE        // Use 0 as the cost irrespective of the current mv
+} UENUM1BYTE(MV_COST_TYPE);
+
+typedef struct {
+ // The reference mv used to compute the mv cost
+ const MV *ref_mv;
+ FULLPEL_MV full_ref_mv;
+ MV_COST_TYPE mv_cost_type;
+ const int *mvjcost;
+ const int *mvcost[2];
+ int error_per_bit;
+ // A multiplier used to convert rate to sad cost
+ int sad_per_bit;
+} MV_COST_PARAMS;
+
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
+ int *const mvcost[2], int weight);
+
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src, const struct buf_2d *pre);
+int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src,
+ const struct buf_2d *pre);
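+
+// Illustrative usage (hypothetical caller code, not from this file): after a
+// full-pel search has produced best_mv, the cost used for comparisons
+// combines the prediction error with the mv rate, e.g.
+//   const int cost = av1_get_mvpred_sse(&ms_params->mv_cost_params, best_mv,
+//                                       ms_params->vfp,
+//                                       ms_params->ms_buffers.src,
+//                                       ms_params->ms_buffers.ref);
+// where ms_params is a FULLPEL_MOTION_SEARCH_PARAMS (declared below).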
+
+// =============================================================================
+// Motion Search
+// =============================================================================
+typedef struct {
+ // The reference buffer
+ const struct buf_2d *ref;
+
+ // The source and predictors/mask used by translational search
+ const struct buf_2d *src;
+ const uint8_t *second_pred;
+ const uint8_t *mask;
+ int mask_stride;
+ int inv_mask;
+
+ // The weighted source and mask used by OBMC
+ const int32_t *wsrc;
+ const int32_t *obmc_mask;
+} MSBuffers;
+
+static INLINE void av1_set_ms_compound_refs(MSBuffers *ms_buffers,
+ const uint8_t *second_pred,
+ const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ ms_buffers->second_pred = second_pred;
+ ms_buffers->mask = mask;
+ ms_buffers->mask_stride = mask_stride;
+ ms_buffers->inv_mask = invert_mask;
+}
+
+// =============================================================================
+// Fullpixel Motion Search
+// =============================================================================
+// This struct holds the fullpixel motion search parameters that should remain
+// constant during the search.
+typedef struct {
+ BLOCK_SIZE bsize;
+ // A function pointer to the simd function for fast computation
+ const aom_variance_fn_ptr_t *vfp;
+
+ MSBuffers ms_buffers;
+
+  // WARNING: search_method should be regarded as a private variable and
+  // should not be modified directly, so that it stays in sync with
+  // search_sites. To modify it, use av1_set_mv_search_method.
+ SEARCH_METHODS search_method;
+ const search_site_config *search_sites;
+ FullMvLimits mv_limits;
+
+  int run_mesh_search;    // Enables mesh search unless it is pruned by
+                          // prune_mesh_search.
+  int prune_mesh_search;  // Disables mesh search if the best_mv after a normal
+                          // search is close to the start_mv.
+  int mesh_search_mv_diff_threshold;  // mv diff threshold to enable
+                                      // prune_mesh_search
+ int force_mesh_thresh; // Forces mesh search if the residue variance is
+ // higher than the threshold.
+ const struct MESH_PATTERN *mesh_patterns[2];
+
+ // Use maximum search interval of 4 if true. This helps motion search to find
+ // the best motion vector for screen content types.
+ int fine_search_interval;
+
+ int is_intra_mode;
+
+ int fast_obmc_search;
+
+ // For calculating mv cost
+ MV_COST_PARAMS mv_cost_params;
+
+  // Stores the function used to compute the sad. This can be different from
+  // the sdf in vfp (e.g. downsampled sad instead of sad) to allow a speed-up.
+ aom_sad_fn_t sdf;
+ aom_sad_multi_d_fn_t sdx4df;
+ aom_sad_multi_d_fn_t sdx3df;
+} FULLPEL_MOTION_SEARCH_PARAMS;
+
+typedef struct {
+ int err_cost;
+ unsigned int distortion;
+ unsigned int sse;
+} FULLPEL_MV_STATS;
+
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
+
+void av1_make_default_fullpel_ms_params(
+ FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
+ const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+ SEARCH_METHODS search_method, int fine_search_interval);
+
+/*! Sets the \ref FULLPEL_MOTION_SEARCH_PARAMS to intra mode. */
+void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const IntraBCMVCosts *dv_costs);
+
+// Sets up configs for fullpixel DIAMOND / CLAMPED_DIAMOND search method.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for firstpass motion search.
+void av1_init_motion_fpf(search_site_config *cfg, int stride);
+// Sets up configs for NSTEP / NSTEP_8PT motion search method.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for BIGDIA / FAST_DIAMOND / FAST_BIGDIA
+// motion search method.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for HEX or FAST_HEX motion search method.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for SQUARE motion search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+ int level);
+
+/*! Function pointer type for the search site config initialization functions
+ * of the different search methods. */
+typedef void (*av1_init_search_site_config)(search_site_config *cfg, int stride,
+ int level);
+
+/*! Array of function pointers used to set the motion search config. */
+extern const av1_init_search_site_config
+ av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS];
+
+// Lookup table mapping each search method to the method whose search-site
+// candidates it shares; the mapped methods differ only in the number of
+// search steps.
+static const SEARCH_METHODS search_method_lookup[NUM_SEARCH_METHODS] = {
+ DIAMOND, // DIAMOND
+ NSTEP, // NSTEP
+ NSTEP_8PT, // NSTEP_8PT
+ CLAMPED_DIAMOND, // CLAMPED_DIAMOND
+ HEX, // HEX
+ BIGDIA, // BIGDIA
+ SQUARE, // SQUARE
+ HEX, // FAST_HEX
+ BIGDIA, // FAST_DIAMOND
+ BIGDIA, // FAST_BIGDIA
+ BIGDIA // VFAST_DIAMOND
+};
+
+// Reinitialize the search site config.
+static AOM_INLINE void av1_refresh_search_site_config(
+ search_site_config *ss_cfg_buf, SEARCH_METHODS search_method,
+ const int ref_stride) {
+ const int level =
+ search_method == NSTEP_8PT || search_method == CLAMPED_DIAMOND;
+ search_method = search_method_lookup[search_method];
+ av1_init_motion_compensation[search_method](&ss_cfg_buf[search_method],
+ ref_stride, level);
+}
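+
+// For example (illustrative): NSTEP_8PT reuses the NSTEP site layout, so
+// av1_refresh_search_site_config(cfg, NSTEP_8PT, stride) initializes
+// cfg[NSTEP] via av1_init_motion_compensation[NSTEP] with level = 1.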
+
+// Sets the mv search method and keeps search_sites consistent with it.
+static INLINE void av1_set_mv_search_method(
+ FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+ SEARCH_METHODS search_method) {
+ ms_params->search_method = search_method;
+ ms_params->search_sites =
+ &search_sites[search_method_lookup[ms_params->search_method]];
+}
+
+// Set up limit values for MV components.
+// Mv beyond the range do not produce new/different prediction block.
+static INLINE void av1_set_mv_row_limits(
+ const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+ int mi_row, int mi_height, int border) {
+ const int min1 = -(mi_row * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+ const int min2 = -(((mi_row + mi_height) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+ mv_limits->row_min = AOMMAX(min1, min2);
+ const int max1 = (mi_params->mi_rows - mi_row - mi_height) * MI_SIZE +
+ border - 2 * AOM_INTERP_EXTEND;
+ const int max2 =
+ (mi_params->mi_rows - mi_row) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+ mv_limits->row_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_col_limits(
+ const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+ int mi_col, int mi_width, int border) {
+ const int min1 = -(mi_col * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+ const int min2 = -(((mi_col + mi_width) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+ mv_limits->col_min = AOMMAX(min1, min2);
+ const int max1 = (mi_params->mi_cols - mi_col - mi_width) * MI_SIZE + border -
+ 2 * AOM_INTERP_EXTEND;
+ const int max2 =
+ (mi_params->mi_cols - mi_col) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+ mv_limits->col_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_limits(
+ const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+ int mi_row, int mi_col, int mi_height, int mi_width, int border) {
+ av1_set_mv_row_limits(mi_params, mv_limits, mi_row, mi_height, border);
+ av1_set_mv_col_limits(mi_params, mv_limits, mi_col, mi_width, border);
+}
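+
+// Worked example (illustrative, assuming MI_SIZE == 4 and
+// AOM_INTERP_EXTEND == 4): for mi_row = 0, mi_height = 4 (a 16x16 block),
+// border = 128 and mi_params->mi_rows = 135, av1_set_mv_row_limits() yields
+//   row_min = AOMMAX(-(0 * 4 + 128 - 8), -(4 * 4 + 8))         = -24
+//   row_max = AOMMIN((135 - 0 - 4) * 4 + 128 - 8, 135 * 4 + 8) = 548
+// Here min2 and max2 bind, so the mv range keeps the interpolation-extended
+// block overlapping the visible frame.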
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
+
+int av1_init_search_range(int size);
+
+unsigned int av1_int_pro_motion_estimation(
+ const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, const MV *ref_mv, unsigned int *y_sad_zero,
+ int me_search_size_col, int me_search_size_row);
+
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const FULLPEL_MV start_mv, FULLPEL_MV *best_mv);
+
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, int *cost_list,
+ FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv);
+
+int av1_intrabc_hash_search(const struct AV1_COMP *cpi, const MACROBLOCKD *xd,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ IntraBCHashInfo *intrabc_hash_info,
+ FULLPEL_MV *best_mv);
+
+int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, FULLPEL_MV *best_mv);
+
+static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits,
+ FULLPEL_MV mv) {
+ return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) &&
+ (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
+}
+// =============================================================================
+// Subpixel Motion Search
+// =============================================================================
+enum {
+ EIGHTH_PEL,
+ QUARTER_PEL,
+ HALF_PEL,
+ FULL_PEL
+} UENUM1BYTE(SUBPEL_FORCE_STOP);
+
+typedef struct {
+ const aom_variance_fn_ptr_t *vfp;
+ SUBPEL_SEARCH_TYPE subpel_search_type;
+ // Source and reference buffers
+ MSBuffers ms_buffers;
+ int w, h;
+} SUBPEL_SEARCH_VAR_PARAMS;
+
+// This struct holds the subpixel motion search parameters that should remain
+// constant during the search.
+typedef struct {
+ // High level motion search settings
+ int allow_hp;
+ const int *cost_list;
+ SUBPEL_FORCE_STOP forced_stop;
+ int iters_per_step;
+ SubpelMvLimits mv_limits;
+
+ // For calculating mv cost
+ MV_COST_PARAMS mv_cost_params;
+
+ // Distortion calculation params
+ SUBPEL_SEARCH_VAR_PARAMS var_params;
+} SUBPEL_MOTION_SEARCH_PARAMS;
+
+void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct AV1_COMP *cpi,
+ const MACROBLOCK *x, BLOCK_SIZE bsize,
+ const MV *ref_mv, const int *cost_list);
+
+typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion,
+ unsigned int *sse1,
+ int_mv *last_mv_search_list);
+
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
+extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
+extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up;
+
+unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ BLOCK_SIZE bsize, const int *pts0,
+ const int *pts_inref0, int total_samples,
+ WARP_SEARCH_METHOD search_method,
+ int num_iterations);
+
+static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) {
+ for (int z = 0; z < 3; z++) {
+ fractional_best_mv[z].as_int = INVALID_MV;
+ }
+}
+
+static INLINE void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits,
+ const FullMvLimits *mv_limits,
+ const MV *ref_mv) {
+ const int max_mv = GET_MV_SUBPEL(MAX_FULL_PEL_VAL);
+ int minc = AOMMAX(GET_MV_SUBPEL(mv_limits->col_min), ref_mv->col - max_mv);
+ int maxc = AOMMIN(GET_MV_SUBPEL(mv_limits->col_max), ref_mv->col + max_mv);
+ int minr = AOMMAX(GET_MV_SUBPEL(mv_limits->row_min), ref_mv->row - max_mv);
+ int maxr = AOMMIN(GET_MV_SUBPEL(mv_limits->row_max), ref_mv->row + max_mv);
+
+ maxc = AOMMAX(minc, maxc);
+ maxr = AOMMAX(minr, maxr);
+
+ subpel_limits->col_min = AOMMAX(MV_LOW + 1, minc);
+ subpel_limits->col_max = AOMMIN(MV_UPP - 1, maxc);
+ subpel_limits->row_min = AOMMAX(MV_LOW + 1, minr);
+ subpel_limits->row_max = AOMMIN(MV_UPP - 1, maxr);
+}
+
+static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits,
+ MV mv) {
+ return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) &&
+ (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
+}
+
+static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) {
+ return mv->row * stride + mv->col;
+}
+
+static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf,
+ const FULLPEL_MV *mv) {
+ return &buf->buf[get_offset_from_fullmv(mv, buf->stride)];
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/mcomp_structs.h b/third_party/aom/av1/encoder/mcomp_structs.h
new file mode 100644
index 0000000000..06660cf4a6
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp_structs.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MCOMP_STRUCTS_H_
+#define AOM_AV1_ENCODER_MCOMP_STRUCTS_H_
+
+#include "av1/common/mv.h"
+
+// The maximum number of steps in a step search given the largest
+// allowed initial step
+#define MAX_MVSEARCH_STEPS 11
+// Max full-pel mv, specified in units of full pixels.
+// Enables the use of motion vectors in the range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
+// Maximum size of the first step in full pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
+// Maximum number of neighbors to scan per iteration during
+// WARPED_CAUSAL refinement
+// Note: The elements of warp_search_config.neighbor_mask must be at least
+// MAX_WARP_SEARCH_NEIGHBORS bits wide, so the type may need to be widened if
+// this value is increased.
+#define MAX_WARP_SEARCH_NEIGHBORS 8
+
+#define SEARCH_RANGE_8P 3
+#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1)
+#define SEARCH_GRID_CENTER_8P \
+ (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P)
+
+typedef struct {
+ FULLPEL_MV coord;
+ int coord_offset;
+} search_neighbors;
+// Motion search site.
+typedef struct search_site {
+ FULLPEL_MV mv;
+ int offset;
+} search_site;
+
+typedef struct search_site_config {
+ search_site site[MAX_MVSEARCH_STEPS * 2][16 + 1];
+ // Number of search steps.
+ int num_search_steps;
+ int searches_per_step[MAX_MVSEARCH_STEPS * 2];
+ int radius[MAX_MVSEARCH_STEPS * 2];
+ int stride;
+} search_site_config;
+
+enum {
+  // Search 8 points in the radius grid around center, up to 11 search stages.
+  DIAMOND = 0,
+  // Search 12 points in the radius/tan_radius grid around center,
+  // up to 15 search stages.
+  NSTEP = 1,
+  // Search 8 points in the radius grid around center, up to 16 search stages.
+  NSTEP_8PT = 2,
+  // Search 8 points in the radius grid around center, up to 11 search stages,
+  // with clamping of the search radius.
+  CLAMPED_DIAMOND = 3,
+  // Search up to 8 points in the radius grid around center, up to 11 search
+  // stages. The first stage consists of 8 search points and the rest use 6
+  // search points each, in a hex shape.
+  HEX = 4,
+  // Search up to 8 points in the radius grid around center, up to 11 search
+  // stages. The first stage consists of 4 search points and the rest use 8
+  // search points each.
+  BIGDIA = 5,
+  // Search 8 points in the square grid around center, up to 11 search stages.
+  SQUARE = 6,
+  // HEX search with up to 2 stages.
+  FAST_HEX = 7,
+  // BIGDIA search with up to 2 stages.
+  FAST_DIAMOND = 8,
+  // BIGDIA search with up to 3 stages.
+  FAST_BIGDIA = 9,
+  // BIGDIA search with up to 1 stage.
+  VFAST_DIAMOND = 10,
+  // Total number of search methods.
+  NUM_SEARCH_METHODS,
+  // Number of distinct search methods.
+  NUM_DISTINCT_SEARCH_METHODS = SQUARE + 1,
+} UENUM1BYTE(SEARCH_METHODS);
+
+typedef struct warp_search_config {
+ int num_neighbors;
+ MV neighbors[MAX_WARP_SEARCH_NEIGHBORS];
+ // Bitmask which is used to prune the search neighbors at one iteration
+ // based on which direction we chose in the previous iteration.
+ // See comments in av1_refine_warped_mv for details.
+ uint8_t neighbor_mask[MAX_WARP_SEARCH_NEIGHBORS];
+} warp_search_config;
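+
+// Illustrative reading (see av1_refine_warped_mv in mcomp.c for the
+// authoritative semantics): if neighbor i was chosen in the previous
+// iteration, only neighbors j with bit ((neighbor_mask[i] >> j) & 1) set are
+// evaluated in the next iteration, skipping directions that point back toward
+// positions that have already been searched.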
+
+// Methods for refining WARPED_CAUSAL motion vectors
+enum {
+ // Search 4 adjacent points in a diamond shape at each iteration
+ WARP_SEARCH_DIAMOND,
+ // Search 8 adjacent points in a square at each iteration
+ WARP_SEARCH_SQUARE,
+ WARP_SEARCH_METHODS
+} UENUM1BYTE(WARP_SEARCH_METHOD);
+
+#endif // AOM_AV1_ENCODER_MCOMP_STRUCTS_H_
diff --git a/third_party/aom/av1/encoder/misc_model_weights.h b/third_party/aom/av1/encoder/misc_model_weights.h
new file mode 100644
index 0000000000..f00aeabcf6
--- /dev/null
+++ b/third_party/aom/av1/encoder/misc_model_weights.h
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define MV_PREC_FEATURE_SIZE 18
+
+#define NUM_DNN_LAYERS 1
+#define NUM_DNN_FEATURES MV_PREC_FEATURE_SIZE
+#define MV_PREC_LAYER_SIZE_0 32
+#define NUM_LOGITS 1
+
+const float av1_mv_prec_mean[MV_PREC_FEATURE_SIZE] = { 143.67358891063745f,
+ 141.6251917346238f,
+ 0.36313633945679064f,
+ 0.0028162791958822085f,
+ 0.000484820537626698f,
+ 0.002769969388939025f,
+ 0.0f,
+ 0.00031274626720947577f,
+ 0.00020578555375160075f,
+ 0.0007075246732697733f,
+ 0.000539641029909925f,
+ 0.0013939401375906984f,
+ 4.985394760423499f,
+ 4.985394760423499f,
+ 4.9992148717283085f,
+ 5.143739822380163f,
+ 5.518483124004564f,
+ 87.63597847427077f };
+
+const float av1_mv_prec_std[MV_PREC_FEATURE_SIZE] = { 66.86256140247244f,
+ 68.04472572607503f,
+ 13.23247674430399f,
+ 0.0029123438396921955f,
+ 0.0015331406169374737f,
+ 0.0029149813096313775f,
+ 1.0f,
+ 0.00047501102871357813f,
+ 0.00030025962993117947f,
+ 0.0009861163580391207f,
+ 0.0012157593528004055f,
+ 0.002004954948490521f,
+ 6.539447500484038f,
+ 6.539447500484038f,
+ 6.396589058279465f,
+ 3.4870155874262516f,
+ 3.8911353973740535f,
+ 112.07985259573601f };
+
+const float av1_mv_prec_nn_weights_layer_0[] = { -0.13008492159557145f,
+ -0.1483527373474774f,
+ 0.08112076098858864f,
+ -0.9582568679627453f,
+ -0.34794757171071206f,
+ 0.6465225723304947f,
+ 0.0f,
+ 0.06754171885839604f,
+ 0.27156803620541214f,
+ 0.10635231245664407f,
+ -0.031183926995968583f,
+ 0.048122572260291f,
+ -0.19498534230045128f,
+ -0.2614116319273316f,
+ -0.3223762845136331f,
+ -1.2063368350609205f,
+ -0.523333556911706f,
+ 1.075632260890728f,
+ 0.48989726814387946f,
+ -0.34816466111070477f,
+ 0.41668357610256473f,
+ -1.0973562848791671f,
+ 0.04183921854389494f,
+ -0.9123815389260476f,
+ 0.0f,
+ 0.859965047744027f,
+ 0.1962095804679813f,
+ 0.2606564339077058f,
+ 0.26695868715184895f,
+ 0.5319308568326692f,
+ -0.23717505799723165f,
+ -0.43127224481782567f,
+ -0.3214545776203726f,
+ 0.5850852241402176f,
+ -0.26705531612587813f,
+ -0.5786016766610093f,
+ 0.9360519909983003f,
+ 0.20771329289016555f,
+ -0.027614159544811823f,
+ -1.175022807046164f,
+ -0.07578967497693835f,
+ 0.6890172485324256f,
+ 0.0f,
+ -0.008008338164988263f,
+ -0.08064800010158935f,
+ -0.22606910981666667f,
+ 0.4541586669210879f,
+ 0.07731527661370792f,
+ -0.6744475941247964f,
+ -0.2625842448396184f,
+ 1.7018613444303785f,
+ -0.08622229073162656f,
+ 0.041858142814941275f,
+ -0.24575964090386415f,
+ -0.046626044730994964f,
+ 0.7608713064175202f,
+ -0.23330119070907146f,
+ -0.10115510984500826f,
+ 0.9722537349192069f,
+ 0.11718554254290829f,
+ 0.0f,
+ 0.2075123446014759f,
+ 0.09465167310768637f,
+ 0.7609896851963016f,
+ 0.4441038581385328f,
+ 0.26064144727430955f,
+ -0.14678625366485035f,
+ -0.03597014452200524f,
+ 0.3128680867196166f,
+ 1.102496797385966f,
+ 0.06642253233084111f,
+ -1.2665494483407629f,
+ 0.09049412632000911f,
+ -1.1160621999565095f,
+ 0.043420275255913035f,
+ -0.8811412259978966f,
+ 0.21076234632287777f,
+ 0.16571534463543866f,
+ 0.0f,
+ -0.7324075176473275f,
+ -0.3677622514459495f,
+ 0.3273532243056415f,
+ 0.22922161936797775f,
+ 0.8204766691058087f,
+ 0.02982161033720488f,
+ 0.5266419954188112f,
+ -1.0032154963302191f,
+ 0.7007602969763729f,
+ 0.37196355167990885f,
+ -0.7608579453228548f,
+ 0.08568111584781847f,
+ 0.07011061059123677f,
+ 0.3233263598082507f,
+ -0.08249928295410253f,
+ 0.08220165761319252f,
+ 0.22148722752246794f,
+ 0.0f,
+ 0.6122392701743506f,
+ -0.26429838296378333f,
+ 0.31958081620005463f,
+ -0.006027177397853826f,
+ -0.3088310785887994f,
+ -0.5436192046707807f,
+ -0.011080356757423306f,
+ 0.12632650770008413f,
+ -0.45097913215234525f,
+ 1.8008072867127298f,
+ -0.7630029654575501f,
+ -0.4054774329826579f,
+ 0.40386074452544535f,
+ -0.18541426257453025f,
+ 0.2444879765079863f,
+ -0.6216724756115081f,
+ 0.27030299321302f,
+ 0.0f,
+ -0.6835848952967989f,
+ -0.7914184320964815f,
+ -0.6761595019582928f,
+ -1.009565565604081f,
+ -0.1904242439353305f,
+ 0.4463417126318631f,
+ 0.6025503823452971f,
+ 0.5149990860115566f,
+ 1.0242970663937634f,
+ 0.037947306826401385f,
+ 0.07039339786212848f,
+ 0.14273796789711987f,
+ 0.168103961425691f,
+ 1.6596066376811978f,
+ 0.19321092229384657f,
+ -0.3710750388148514f,
+ -0.01717015559410288f,
+ 0.0f,
+ 0.3005688477942597f,
+ 0.23877080653829577f,
+ 0.2718594552971173f,
+ 0.3885402571589898f,
+ 0.32999531945669247f,
+ -0.6134460954213243f,
+ -0.13972265462799183f,
+ -0.07180089575716991f,
+ -1.014572598188105f,
+ 0.0717207322809836f,
+ 0.34896157745155615f,
+ -0.27127687591403f,
+ -0.5058651212773623f,
+ -1.5442435628306925f,
+ -0.6399784724734707f,
+ 0.6274301429074947f,
+ -0.4645750072767051f,
+ 0.0f,
+ -0.2406726815244178f,
+ -0.06321214115916597f,
+ 0.312856714253404f,
+ 0.16459514124116134f,
+ 0.3993579604809623f,
+ -0.15232044351561913f,
+ -0.5613743948568469f,
+ 0.7219801372223262f,
+ 0.2936857469624009f,
+ 0.7823466656034087f,
+ -0.12416947814098349f,
+ -0.36413756654028345f,
+ -0.07992098796866462f,
+ -0.7395722879842416f,
+ 0.8639913543220514f,
+ -0.311931773757945f,
+ -1.7308240470400613f,
+ 0.0f,
+ 0.394499716712104f,
+ 0.6511462819539963f,
+ -0.0722425275974144f,
+ 0.13490818194661386f,
+ 0.055319135836378035f,
+ 0.15389577508097013f,
+ 0.28958598328870605f,
+ -0.14608429470539772f,
+ 0.09488817462478298f,
+ -0.17231294096622088f,
+ 0.6721115415911466f,
+ -0.05664621150536103f,
+ 0.03291799673669331f,
+ 0.02845382711057482f,
+ -0.9953563446999164f,
+ -0.17994298220605923f,
+ 0.6560824519337476f,
+ 0.0f,
+ -0.30990646375917935f,
+ 0.17215517202874f,
+ 0.2026816225170481f,
+ 0.22011958747715601f,
+ 0.3562520768889686f,
+ -0.18436559057189175f,
+ 0.1733377147302066f,
+ 0.02818276995640877f,
+ -0.29703005574859076f,
+ -0.3310652639215064f,
+ -1.6091173258529277f,
+ 0.45461585790028003f,
+ -0.5078643334592593f,
+ -0.338997374732338f,
+ 0.4688619590359733f,
+ 0.627099126828289f,
+ -0.5249801376494249f,
+ 0.0f,
+ 0.34465498218272883f,
+ 0.009891680630908135f,
+ -0.27244020967349f,
+ 0.05404589867626979f,
+ -0.06220329325739666f,
+ -0.13365376464759104f,
+ -0.13098573553512366f,
+ 0.11434198976289106f,
+ 0.6740951247574676f,
+ 1.3381727185724581f,
+ -1.4865773213251936f,
+ 0.05809898701966341f,
+ 0.25380780261023456f,
+ 1.2716367496512722f,
+ 0.1768290070780598f,
+ -0.07554828135356352f,
+ 0.8180570085344856f,
+ 0.0f,
+ 1.0788448980077463f,
+ 0.0651938742459459f,
+ 0.3807672030015587f,
+ 0.6144792680268445f,
+ 0.011660612214908059f,
+ -0.018306023765580288f,
+ 0.44140813809926516f,
+ -0.13411994195502386f,
+ 0.15920368955127778f,
+ -0.19382358417849888f,
+ -0.08802147969690055f,
+ -0.019731052733814477f,
+ 0.1104744229169665f,
+ -0.195834419735958f,
+ -0.5005295046454347f,
+ -0.17041241868229032f,
+ -0.471942117351489f,
+ 0.0f,
+ -0.3599073304761372f,
+ -0.2745532782968519f,
+ -0.8323064841106417f,
+ -0.88355885384943f,
+ -0.02826466859020679f,
+ 0.06977870308805256f,
+ 0.11926112095374196f,
+ 1.367382707959643f,
+ -0.06119843162964051f,
+ -0.5331395268889569f,
+ -1.2155531584240624f,
+ -0.01896651779524327f,
+ 0.10591845408571081f,
+ -0.010632842156504733f,
+ 0.6150787968629282f,
+ -0.4191690185896091f,
+ -0.9961718918346271f,
+ 0.0f,
+ 0.23370364516013867f,
+ 0.4156033072362998f,
+ 0.1261005546633433f,
+ 0.0812413884532226f,
+ -0.008894337353937203f,
+ 0.07984447025056046f,
+ -0.1258098052766725f,
+ -0.40245475467767916f,
+ 1.78188906675019f,
+ -1.1544387954232302f,
+ -0.41768781481273387f,
+ 0.6791211165341995f,
+ -0.4175127856183446f,
+ -0.07353219159767788f,
+ -0.2888813577574072f,
+ -0.7107767892597061f,
+ -1.0450031091195449f,
+ 0.0f,
+ -0.9221599545079143f,
+ -0.6747876356740621f,
+ 0.30241454354872105f,
+ 0.4924965303373908f,
+ -0.14042722740054084f,
+ 0.27744210409350445f,
+ -0.14788270997426836f,
+ -0.9081467469237995f,
+ -0.04513115674995093f,
+ -0.5254168669125793f,
+ -0.6999012037974789f,
+ 0.434661246306547f,
+ -0.7193303957246092f,
+ -0.9117952623409744f,
+ -1.5097267865916142f,
+ -0.20779888103770922f,
+ 0.4935562480901218f,
+ 0.0f,
+ 0.18303393908923593f,
+ 0.34753722677570037f,
+ 0.29291001533177663f,
+ 0.3832351878354224f,
+ 0.3295194956120599f,
+ -0.32398033003617527f,
+ -0.31570906736433746f,
+ 0.23657779050372962f,
+ 0.9510794465234161f,
+ -0.5122243902568278f,
+ 0.08652112725315658f,
+ 0.2246634353717998f,
+ -0.9032595595582497f,
+ -0.8936484034533545f,
+ 0.6012969720865752f,
+ -0.6454216646117924f,
+ -1.1753786049658332f,
+ 0.0f,
+ -0.4360545677728656f,
+ -0.6586237455328507f,
+ -0.34347301697886656f,
+ -0.8909724651992144f,
+ -0.24378721818350263f,
+ 0.6179733359297576f,
+ 0.0661661181742234f,
+ -0.14120142044993794f,
+ -0.07732699885498932f,
+ 1.0221355882357506f,
+ 0.44514798994115284f,
+ -0.7371569579959046f,
+ -0.7212499572378936f,
+ 0.7453626921081045f,
+ 0.5478757761345768f,
+ -0.39411232789985384f,
+ 0.7200542656743857f,
+ 0.0f,
+ -0.11790869453118827f,
+ -0.12317030713581928f,
+ -0.4207902738133338f,
+ 0.15895105878327986f,
+ 0.304261777102111f,
+ 0.11450744587017621f,
+ -0.11470709991317944f,
+ 0.5949222371739038f,
+ 0.6549518619412444f,
+ -0.24390606570422838f,
+ -0.4212796009440803f,
+ -0.6269666206320964f,
+ -0.5421193969807078f,
+ -0.12297772128652287f,
+ 0.021517257619930424f,
+ 0.25462855095544523f,
+ -0.22107798187348246f,
+ 0.0f,
+ 0.5204516300095662f,
+ 0.2837402841862462f,
+ 0.11310823283285916f,
+ 0.8944351685018025f,
+ 0.17487203235834015f,
+ -0.5271221928634433f,
+ -0.19516594503423199f,
+ 0.452456617580365f,
+ 1.2456272242706414f,
+ 0.24166615894862817f,
+ 0.09411429305204502f,
+ -0.2730072283327243f,
+ -0.8129383770918172f,
+ -0.24093254193486136f,
+ 0.5696499174142177f,
+ -0.11110805836073044f,
+ -0.3968204166235694f,
+ 0.0f,
+ -0.04388165369378549f,
+ -0.005631266017272595f,
+ -0.02574211858479705f,
+ 0.06230399626660669f,
+ 0.17677671232932785f,
+ 0.5172871274400965f,
+ 0.4919150085620063f,
+ -1.597656637582941f,
+ 0.02415185715719143f,
+ -0.17945446376668306f,
+ -0.39340600199798886f,
+ 0.25013205256886845f,
+ 0.05972330340308685f,
+ 0.1359911505596489f,
+ -0.02341033271820833f,
+ 0.15726074644063684f,
+ 0.47512625913020357f,
+ 0.0f,
+ 0.7327341664835779f,
+ -0.3689092312320013f,
+ 0.4571824787436036f,
+ 0.6215465537945456f,
+ 0.0944111296842023f,
+ -0.12571956176607574f,
+ -0.2507235674395462f,
+ -0.09579602654351593f,
+ 1.4463357293728496f,
+ 0.749153535856049f,
+ -0.5553955120807588f,
+ -0.09622771929369946f,
+ -0.2598697420394813f,
+ -0.964691815299676f,
+ -0.8289963178173902f,
+ 0.7112949291983329f,
+ -0.8667009730492162f,
+ 0.0f,
+ -0.48698304169042794f,
+ -0.18786095669893707f,
+ -0.11425249263203247f,
+ -0.3693391011684809f,
+ 0.09933145842585253f,
+ 0.2568559685298844f,
+ 0.7048512233651738f,
+ 0.6056238412407038f,
+ -0.4355558119826642f,
+ 0.17318931883915484f,
+ 0.6481333496429564f,
+ -0.45728823054344486f,
+ -0.006325004538589701f,
+ 0.45609864075494927f,
+ -0.6199385981116988f,
+ 0.035105808783046165f,
+ 0.1203147963894839f,
+ 0.0f,
+ 0.383402190836527f,
+ 0.048429009055370106f,
+ 0.5887186439275204f,
+ -0.20538767641607814f,
+ -0.031237879611002117f,
+ 0.3140759860883231f,
+ 0.24447070584999556f,
+ 0.7271263905705878f,
+ 0.8432799162434237f,
+ -0.11530577554199217f,
+ -0.7781023892314718f,
+ 0.05359488822710336f,
+ 0.5624870388700809f,
+ 0.5134656523208906f,
+ 0.18304041423438375f,
+ -0.04237421156328257f,
+ -0.20759809886942207f,
+ 0.0f,
+ -0.06249337454975615f,
+ 0.10081284533873777f,
+ 0.3894374350259183f,
+ 1.518217777528342f,
+ -0.9100037950171563f,
+ 0.17796906121831477f,
+ -0.2892167255357892f,
+ 0.6117902467884032f,
+ 0.13332120964959573f,
+ -0.3487155932849374f,
+ -0.32920583745734694f,
+ 0.08242631209809854f,
+ -0.24920225708110588f,
+ 0.8401757259392635f,
+ 0.11729108681358365f,
+ 0.11222925752499184f,
+ -0.027078490721459958f,
+ 0.0f,
+ 0.726132375517389f,
+ 0.72220359881096f,
+ 0.5721582611845177f,
+ 0.15139162075524315f,
+ 0.6676549461551197f,
+ -0.321449586554697f,
+ -0.10141104515219895f,
+ -0.09711123988777906f,
+ 0.9623356184776928f,
+ -0.7941822373167173f,
+ -0.9373923554119346f,
+ 0.4573241832354059f,
+ -0.42029139056126147f,
+ 0.2675223459380999f,
+ -0.5487300191551386f,
+ 0.2236621891916084f,
+ 0.11692039230044018f,
+ 0.0f,
+ 0.1758399202780961f,
+ 0.676447587678781f,
+ 0.5945412815881029f,
+ 0.5669863357359594f,
+ 0.8433565415303922f,
+ -0.30300550790708036f,
+ -0.43332881999693673f,
+ -0.4996522695731392f,
+ -0.2084930815451962f,
+ 0.27765278702463786f,
+ 1.0886848763946915f,
+ -0.0739433655813831f,
+ -0.4762801579229192f,
+ -0.2490825339320731f,
+ -1.8820479350439439f,
+ -0.4251592225775914f,
+ -0.3992922365484464f,
+ 0.0f,
+ 0.19598917760218867f,
+ 0.4860238022746914f,
+ 0.3364528828641281f,
+ 0.3350950865226741f,
+ 0.2773654548632006f,
+ -0.30547262140782566f,
+ 0.028649620490728344f,
+ -0.11763407628280315f,
+ 0.6237318502627169f,
+ -0.3958952632477945f,
+ 0.14797171297835243f,
+ 0.45821729624747465f,
+ -0.8687137170773626f,
+ 0.06989667196937126f,
+ -0.5752606929478727f,
+ 0.16986945686358412f,
+ 0.6925071596817824f,
+ 0.0f,
+ 0.4991250796183003f,
+ 0.03424654896322111f,
+ 0.6153698611882319f,
+ 0.5070872444849457f,
+ 0.43615747516328135f,
+ -0.7870352838659244f,
+ -0.6424101231965247f,
+ -0.7005774876651399f,
+ 0.79983115431488f,
+ 0.15720357955596242f,
+ -1.408372612176309f,
+ -0.039294695217213765f,
+ 0.6979415372962309f,
+ 0.27403316751965656f,
+ 1.2844596102619275f,
+ -0.2781534150257364f,
+ 0.3248437714908865f,
+ 0.0f,
+ 0.4364362371752831f,
+ -0.2548580911485434f,
+ -0.19578001373349452f,
+ -0.04597194387828005f,
+ -0.010035156855533233f,
+ 0.0415941475251266f,
+ 0.07929549739797387f,
+ -0.060629652912508866f,
+ 0.5977303008711333f,
+ -1.4404008068066554f,
+ 0.8555694790197376f,
+ -0.03693438534401856f,
+ 0.17761411164512408f,
+ -0.11858304304109235f,
+ -1.4241324353471327f,
+ 0.1533849765389186f,
+ 0.7650643783126995f,
+ 0.0f,
+ -0.0639949379280401f,
+ 0.4288617817939563f,
+ 0.4235508646885404f,
+ 0.3419843254383798f,
+ -0.015992360660098768f,
+ -0.773247697505441f,
+ -0.4908452922015917f,
+ 0.9868134897291486f,
+ -0.5078689994742608f,
+ 1.05632043744864f,
+ -0.38867419409275117f,
+ -0.0065547696858664194f,
+ -0.3056003173415037f,
+ -0.333762331930102f,
+ 0.4459671174011671f,
+ 0.08219092584580244f,
+ -0.08099158579518179f,
+ 0.0f,
+ -0.1568180656346373f,
+ -0.061962372393910135f,
+ 0.14065868174859464f,
+ -0.055925712798972765f,
+ 0.05136117465820622f,
+ 0.0907831030477633f,
+ 0.19518110495319604f,
+ -0.7470794578145956f,
+ 1.5945999734733545f,
+ -0.4351697502345834f,
+ -0.33253649399571805f };
+
+const float av1_mv_prec_nn_bias_layer_0[] = {
+ -0.651213833993862f, -1.1243309933417809f, -0.2123880023097051f,
+ 0.23095477452877616f, -0.6668057665893545f, 0.3082268148379634f,
+ -0.3344916753975844f, -0.20920185606857844f, 0.6057933917964854f,
+ 0.5031857662559803f, -1.5380096313468152f, -0.4457245344804041f,
+ 1.82368055812373f, 0.7973912064077963f, 0.25706500555622913f,
+ 0.1394695119825382f, 0.4508811973450553f, -0.5408959545111782f,
+ 1.064829233697863f, 0.3733268644246235f, 1.1173169029905483f,
+ -0.2012817466400134f, -0.16628447748302294f, 1.3086000088940826f,
+ 0.7267092979664235f, -0.9097857006590555f, -0.7564259343863077f,
+ -0.49844128036716173f, -0.4675729246975423f, -0.03626154526362181f,
+ -0.41957330902404616f, -0.9658160514319954f
+};
+
+const float av1_mv_prec_nn_weights_layer_1[] = {
+ 1.5017296484510276f, 1.044216918060133f, -1.066541411740906f,
+ -0.7762965171172661f, -0.9814396609661653f, 0.9334065847340715f,
+ 0.7117244268817873f, -0.7695942296628597f, 0.7892157680137047f,
+ -0.5786309358654476f, -2.4444494892027264f, 1.1666759262637185f,
+ -0.9699580532370483f, 0.5849682956422552f, -1.0372272986941953f,
+ -0.5005014627824439f, 1.1816204711740521f, -1.2204867615892114f,
+ 0.4510263977504913f, 0.35567865078585165f, -0.7811389330738839f,
+ -0.6643977800301099f, -0.6283287371705794f, 0.790873821018048f,
+ 0.8861643352684585f, 0.6438840651522237f, 0.6677191546466089f,
+ 0.9703715021995785f, 1.250893534236489f, 0.7733742028067933f,
+ -1.249673977776904f, -1.2890127265725608f
+};
+
+const float av1_mv_prec_nn_bias_layer_1[] = { -0.341771735378258f };
+
+static const NN_CONFIG av1_mv_prec_dnn_config = {
+ NUM_DNN_FEATURES,
+ NUM_LOGITS,
+ NUM_DNN_LAYERS,
+ { MV_PREC_LAYER_SIZE_0 },
+ {
+ av1_mv_prec_nn_weights_layer_0,
+ av1_mv_prec_nn_weights_layer_1,
+ },
+ {
+ av1_mv_prec_nn_bias_layer_0,
+ av1_mv_prec_nn_bias_layer_1,
+ },
+};
+#undef NUM_DNN_LAYERS
+#undef NUM_DNN_FEATURES
+#undef MV_PREC_LAYER_SIZE_0
+#undef NUM_LOGITS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c
new file mode 100644
index 0000000000..94cd56c5d1
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/ml.h"
+
+void av1_nn_output_prec_reduce(float *const output, int num_output) {
+ const int prec_bits = 9;
+ const int prec = 1 << prec_bits;
+ const float inv_prec = (float)(1.0 / prec);
+ for (int i = 0; i < num_output; i++) {
+ output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec;
+ }
+}
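+
+// For example, with prec_bits = 9 an output value of 0.30f becomes
+// (int)(0.30f * 512 + 0.5) / 512.0f = 154 / 512.0f = 0.30078125f, so the C
+// and SIMD implementations agree on the low-order bits.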
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_c(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ int num_input_nodes = nn_config->num_inputs;
+ int buf_index = 0;
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+
+ // Propagate hidden layers.
+ const int num_layers = nn_config->num_hidden_layers;
+ assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+ for (int layer = 0; layer < num_layers; ++layer) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ float *output_nodes = buf[buf_index];
+ const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+ assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
+ for (int node = 0; node < num_output_nodes; ++node) {
+ float val = layer_bias[node];
+ for (int i = 0; i < num_input_nodes; ++i)
+ val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+ // ReLU as activation function.
+ val = val > 0.0f ? val : 0.0f; // Could use AOMMAX().
+ output_nodes[node] = val;
+ }
+ num_input_nodes = num_output_nodes;
+ input_nodes = output_nodes;
+ buf_index = 1 - buf_index;
+ }
+
+ // Final output layer.
+ const float *layer_weights = nn_config->weights[num_layers];
+ const float *layer_bias = nn_config->bias[num_layers];
+ for (int node = 0; node < nn_config->num_outputs; ++node) {
+ float val = layer_bias[node];
+ for (int i = 0; i < num_input_nodes; ++i)
+ val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+ output[node] = val;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
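+
+// Illustrative sketch (not part of libaom, hence guarded out of compilation):
+// how a caller might evaluate a tiny 2-input, 3-hidden-node, 1-output network
+// with av1_nn_predict_c(). All weight values below are made up.
+#if 0
+static const float demo_weights_hidden[] = {
+  0.5f,  -0.5f,  // hidden node 0: one weight per input
+  1.0f,  1.0f,   // hidden node 1
+  -1.0f, 0.0f,   // hidden node 2
+};
+static const float demo_bias_hidden[] = { 0.0f, -0.5f, 0.25f };
+static const float demo_weights_out[] = { 1.0f, -1.0f, 0.5f };
+static const float demo_bias_out[] = { 0.1f };
+static const NN_CONFIG demo_nn_config = {
+  2,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  { 3 },
+  { demo_weights_hidden, demo_weights_out },
+  { demo_bias_hidden, demo_bias_out },
+};
+
+static float demo_predict(float a, float b) {
+  const float inputs[2] = { a, b };
+  float output[1];
+  av1_nn_predict_c(inputs, &demo_nn_config, /*reduce_prec=*/0, output);
+  return output[0];
+}
+#endif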
+
+#if CONFIG_NN_V2
+// Applies the ReLU activation to one fc layer:
+// output[i] = max(input[i], 0.0f)
+static float *nn_relu(const float *input, FC_LAYER *layer) {
+ for (int i = 0; i < layer->num_outputs; ++i) {
+ layer->output[i] = AOMMAX(input[i], 0.0f);
+ }
+
+ return layer->output;
+}
+
+// Applies the sigmoid activation to one fc layer:
+// output[i] = 1 / (1 + exp(-input[i]))
+static float *nn_sigmoid(const float *input, FC_LAYER *layer) {
+ for (int i = 0; i < layer->num_outputs; ++i) {
+ const float tmp = AOMMIN(AOMMAX(input[i], -10.0f), 10.0f);
+ layer->output[i] = 1.0f / (1.0f + expf(-tmp));
+ }
+
+ return layer->output;
+}
+
+// Forward prediction in one fc layer, used in function av1_nn_predict_V2
+static float *nn_fc_forward(const float *input, FC_LAYER *layer) {
+ const float *weights = layer->weights;
+ const float *bias = layer->bias;
+ assert(layer->num_outputs < NN_MAX_NODES_PER_LAYER);
+ // fc
+ for (int node = 0; node < layer->num_outputs; ++node) {
+ float val = bias[node];
+ for (int i = 0; i < layer->num_inputs; ++i) val += weights[i] * input[i];
+ layer->output[node] = val;
+ weights += layer->num_inputs;
+ }
+
+ // activation
+ switch (layer->activation) {
+ case NONE: return layer->output;
+ case RELU: return nn_relu(layer->output, layer);
+ case SIGMOID: return nn_sigmoid(layer->output, layer);
+    case SOFTSIGN:
+      assert(0 && "Softsign is not yet supported in NN.");  // TODO
+ return NULL;
+ default:
+ assert(0 && "Unknown activation"); // Unknown activation
+ return NULL;
+ }
+}
+
+void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 *nn_config,
+ int reduce_prec, float *output) {
+ const float *input_nodes = feature;
+
+ // Propagate the layers.
+ const int num_layers = nn_config->num_hidden_layers;
+ assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+ for (int i = 0; i < num_layers; ++i) {
+ input_nodes = nn_fc_forward(input_nodes, nn_config->layer + i);
+ assert(nn_config->layer[i + 1].num_inputs ==
+ nn_config->layer[i].num_outputs);
+ }
+
+ // Final layer
+ input_nodes = nn_fc_forward(input_nodes, nn_config->layer + num_layers);
+ assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits);
+ // Copy the final layer output
+ memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits);
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits);
+}
+#endif // CONFIG_NN_V2
+
+void av1_nn_softmax(const float *input, float *output, int n) {
+ // Softmax function is invariant to adding the same constant
+ // to all input values, so we subtract the maximum input to avoid
+ // possible overflow.
+ float max_input = input[0];
+ for (int i = 1; i < n; i++) max_input = AOMMAX(max_input, input[i]);
+ float sum_out = 0.0f;
+ for (int i = 0; i < n; i++) {
+ // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+ const float normalized_input = AOMMAX(input[i] - max_input, -10.0f);
+ output[i] = expf(normalized_input);
+ sum_out += output[i];
+ }
+ for (int i = 0; i < n; i++) output[i] /= sum_out;
+}
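+
+// For example, av1_nn_softmax applied to { 0.0f, 1.0f, 2.0f } subtracts the
+// maximum (2.0), computes exp(-2), exp(-1), exp(0) = 0.135, 0.368, 1.0, and
+// divides by their sum (1.503), giving approximately { 0.090, 0.245, 0.665 }.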
+
+void av1_nn_fast_softmax_16_c(const float *input, float *output) {
+ const int kNumClasses = 16;
+ float max_input = input[0];
+ for (int i = 1; i < kNumClasses; i++) max_input = AOMMAX(max_input, input[i]);
+ float sum_out = 0.0f;
+ for (int i = 0; i < kNumClasses; i++) {
+ // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+ const float normalized_input = AOMMAX(input[i] - max_input, -10.0f);
+ output[i] = approx_exp(normalized_input);
+ sum_out += output[i];
+ }
+ for (int i = 0; i < kNumClasses; i++) output[i] /= sum_out;
+}
diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h
new file mode 100644
index 0000000000..566f9271dd
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ML_H_
+#define AOM_AV1_ENCODER_ML_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/av1_rtcd.h"
+
+#define NN_MAX_HIDDEN_LAYERS 10
+#define NN_MAX_NODES_PER_LAYER 128
+
+struct NN_CONFIG {
+ int num_inputs; // Number of input nodes, i.e. features.
+ int num_outputs; // Number of output nodes.
+ int num_hidden_layers; // Number of hidden layers, maximum 10.
+ // Number of nodes for each hidden layer.
+ int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+ // Weight parameters, indexed by layer.
+ const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+ // Bias parameters, indexed by layer.
+ const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+};
+// Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs
+
+#if CONFIG_NN_V2
+// Fully-connected layer configuration
+struct FC_LAYER {
+ const int num_inputs; // Number of input nodes, i.e. features.
+ const int num_outputs; // Number of output nodes.
+
+ float *weights; // Weight parameters.
+ float *bias; // Bias parameters.
+ const ACTIVATION activation; // Activation function.
+
+ float *output; // The output array.
+  float *dY;      // Gradient of outputs.
+  float *dW;      // Gradient of weights.
+  float *db;      // Gradient of bias.
+};
+
+// NN configure structure V2
+struct NN_CONFIG_V2 {
+ const int num_hidden_layers; // Number of hidden layers, max = 10.
+ FC_LAYER layer[NN_MAX_HIDDEN_LAYERS + 1]; // The layer array
+ const int num_logits; // Number of output nodes.
+ float *logits; // Raw prediction (same as output of final layer)
+ const LOSS loss; // Loss function
+};
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config,
+ int reduce_prec, float *output);
+#endif // CONFIG_NN_V2
+
+// Applies the softmax normalization function to the input
+// to get a valid probability distribution in the output:
+// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
+void av1_nn_softmax(const float *input, float *output, int n);
+
+// A faster but less accurate version of av1_nn_softmax(input, output, 16)
+void av1_nn_fast_softmax_16_c(const float *input, float *output);
+
+// Applies a precision reduction to the output of av1_nn_predict to prevent
+// mismatches between the C and SIMD implementations.
+void av1_nn_output_prec_reduce(float *const output, int num_output);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ML_H_
diff --git a/third_party/aom/av1/encoder/mode_prune_model_weights.h b/third_party/aom/av1/encoder/mode_prune_model_weights.h
new file mode 100644
index 0000000000..98ec36808a
--- /dev/null
+++ b/third_party/aom/av1/encoder/mode_prune_model_weights.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NUM_HIDDEN_LAYERS_12 1
+#define NUM_FEATURES_12 6
+#define NUM_LAYER_0_UNITS_12 24
+#define NUM_LOGITS_12 2
+
+static const float av1_intrap_hiddenlayer_0_kernel_12[] = {
+ 7.28372f, -1.3333898f, -1.3180022f, -0.007156151f, -0.40799126f,
+ -0.57538104f, -31.81647f, 6.7057495f, 6.351472f, -0.029544508f,
+ 0.026801195f, 1.12863f, -0.70769817f, -0.24183524f, 0.0649113f,
+ -0.7189517f, 0.21791299f, 0.12840256f, -0.56424767f, 0.16924907f,
+ 0.4605501f, -0.170895f, -0.60358995f, -0.15383226f, -4.0523643f,
+ 0.6961917f, 1.3100256f, -0.4189354f, 0.37264112f, -0.14555685f,
+ 10.628014f, 8.184437f, 8.941916f, -0.011731001f, -0.45127156f,
+ 0.42704004f, 36.84277f, 8.988796f, 8.844238f, 0.00030091056f,
+ -0.022038324f, 1.3566176f, -8.863219f, -0.84811693f, -1.0908632f,
+ 0.00023130262f, -1.0698471f, -6.755927f, 7.1711984f, 4.7216063f,
+ 3.5099216f, -0.6650184f, 0.5935173f, -0.6696286f, 11.8595295f,
+ 0.3001874f, 0.29822728f, 0.04319222f, -1.203178f, 1.1210147f,
+ 0.035045594f, -0.20559944f, -0.015388541f, -0.7857941f, -0.94100875f,
+ -0.1278549f, -19.22603f, 7.9466896f, 6.5048656f, -0.22195444f,
+ 0.19061874f, 1.3927288f, -8.896529f, -0.48146892f, -1.6098932f,
+ -0.0030235797f, -0.6533787f, -2.1333003f, -22.256454f, -4.934058f,
+ -4.4707212f, -0.015831878f, -0.4243649f, -2.776269f, -0.23762038f,
+ 0.1820098f, -0.51865315f, -1.1893421f, 0.34969202f, 0.10636194f,
+ 14.545696f, 1.3849198f, 2.6815193f, -0.5145498f, 0.45948258f,
+ -0.8842355f, -0.9111363f, -0.39652422f, 0.077266276f, -0.68084997f,
+ 0.4593515f, -0.28872707f, -6.936231f, 1.12253f, 1.7616503f,
+ -0.014069137f, -0.0052156276f, -4.5095444f, 6.2076726f, -0.058755957f,
+ -0.4675936f, -0.13039507f, 0.12094394f, -0.07285393f, 68.26125f,
+ 7.4893136f, 8.770954f, 0.020274093f, -0.027877754f, 1.6579602f,
+ -0.1825479f, 0.34832543f, 0.07472531f, -0.44812247f, -1.0941806f,
+ -0.16749863f, 1.1394324f, 0.47983396f, -0.99983627f, -0.00064249727f,
+ -1.3345739f, -0.057157427f, -18.14875f, 16.506035f, 15.539248f,
+ 0.013191509f, -0.021674965f, -25.006235f, 0.51220596f, 0.7334426f,
+ 0.81836903f, -1.0443225f, 0.4459505f, -1.2045046f
+};
+
+static const float av1_intrap_hiddenlayer_0_bias_12[] = {
+ -4.154915f, 14.33833f, 0.0f, 0.0f, 2.0440118f, 12.40922f,
+ -16.77514f, 0.5879813f, 3.2305415f, 0.8303539f, 0.0f, 14.488708f,
+ 2.94393f, 1.874383f, 0.0f, -0.53140444f, 0.0f, 1.8456234f,
+ -0.55427986f, -19.856262f, 0.0f, 0.17281002f, 48.31631f, 0.0f
+};
+
+static const float av1_intrap_logits_kernel_12[] = {
+ 0.26843873f, -0.09576241f, 0.34427166f, 0.09914787f, -0.10275399f,
+ 0.02999484f, -0.1467772f, 0.11594324f, 0.29200763f, 0.0067976206f,
+ 0.050393578f, -0.018694371f, 0.3333476f, 0.2127221f, 0.35128218f,
+ 0.19968672f, 0.08099991f, 0.084850654f, -0.16045967f, 0.30286232f,
+ 0.6164765f, -0.27140254f, 0.08210814f, 0.34852806f, 0.25028184f,
+ -0.12188078f, 0.16310331f, 0.31253803f, -0.10792341f, 0.065858394f,
+ -0.1349708f, 0.08948815f, 0.31905392f, 0.03680656f, -0.05040944f,
+ -0.051539157f, 0.3211852f, 0.2137136f, 0.45037416f, 0.22748767f,
+ -0.10978614f, 0.06475646f, -0.16954158f, 0.32831904f, 0.16479677f,
+ -0.30020145f, 0.066221856f, 0.37213042f
+};
+
+static const float av1_intrap_logits_bias_12[] = { 0.95783f, -0.95823103f };
+
+static const NN_CONFIG av1_intrap_nn_config = {
+ NUM_FEATURES_12,
+ NUM_LOGITS_12,
+ NUM_HIDDEN_LAYERS_12,
+ {
+ NUM_LAYER_0_UNITS_12,
+ },
+ {
+ av1_intrap_hiddenlayer_0_kernel_12,
+ av1_intrap_logits_kernel_12,
+ },
+ {
+ av1_intrap_hiddenlayer_0_bias_12,
+ av1_intrap_logits_bias_12,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_12
+#undef NUM_FEATURES_12
+#undef NUM_LAYER_0_UNITS_12
+#undef NUM_LOGITS_12
+
+#define NUM_HIDDEN_LAYERS_15 1
+#define NUM_FEATURES_15 6
+#define NUM_LAYER_0_UNITS_15 24
+#define NUM_LOGITS_15 2
+
+static const float av1_intraph_hiddenlayer_0_kernel_15[] = {
+ -0.77480125f, 0.3219551f, -0.015702145f, -0.5310235f, 0.5254026f,
+ -1.1522819f, 2.682016f, 0.08001052f, -0.2539285f, 0.04711023f,
+ -0.81296307f, 0.2675382f, 0.1952474f, -0.0664705f, 1.2989824f,
+ -0.3150117f, -0.8022715f, 0.045423955f, -27.584324f, -2.5608704f,
+ -3.2280366f, 0.05272543f, -0.47141576f, -0.07644298f, -53.77942f,
+ -22.393923f, -23.027853f, -0.00015186476f, -0.010696465f, 2.7064638f,
+ -22.776028f, 11.514891f, 11.138167f, -0.001243723f, -0.4802433f,
+ -8.758646f, 0.26398206f, -0.23485385f, 0.27586034f, -0.004954741f,
+ -0.4935232f, -0.017607696f, 69.56049f, -1.1756641f, -0.052366666f,
+ -0.38052833f, 0.32474658f, 0.04634263f, 0.8583235f, -0.528438f,
+ -0.7868907f, -0.4757781f, 0.4620985f, -0.70621157f, 231.40195f,
+ 6.805205f, 9.420295f, 0.02585775f, -0.03480937f, 1.3577378f,
+ 0.1758226f, 15.056758f, 14.437874f, -0.1305005f, 0.115103304f,
+ 0.21297209f, 55.821743f, -6.611156f, -6.8552365f, -0.011928095f,
+ -0.2042175f, 1.2557873f, -1.0722278f, -0.2683614f, 0.48318478f,
+ -0.73739994f, 0.54055226f, -0.03224738f, -0.06767959f, -0.21015017f,
+ 0.29171246f, -0.6937296f, -1.2342545f, -0.41278538f, -37.9365f,
+ 17.68424f, 16.263042f, -0.074828684f, 0.06607806f, -0.16763286f,
+ 13.594707f, 0.6152676f, -0.4371223f, -0.8365592f, 0.8273623f,
+ -1.2126317f, 0.1216157f, -1.3002136f, -0.18856938f, -0.2589358f,
+ -0.76897144f, 0.21777137f, -122.25033f, -0.23490006f, -3.1238277f,
+ -0.13916978f, 0.08576391f, -1.7391548f, -116.24812f, 14.906071f,
+ 13.468357f, 0.02332889f, -0.034617376f, -18.506111f, 0.7500542f,
+ -1.1882535f, 0.40848416f, -0.28434393f, -0.71471655f, -0.29188696f,
+ -0.46588746f, -0.17324813f, -0.62460244f, -1.1801276f, 0.28993344f,
+ -0.22072886f, 129.2688f, -0.33782578f, -0.34836572f, -0.034112718f,
+ -0.023666814f, -0.5865087f, -33.484146f, 1.1431375f, 0.56056374f,
+ -0.0049730353f, -0.24347587f, -1.3003352f, 0.88973033f, 0.8499571f,
+ -0.5678484f, -0.39009875f, -0.062105156f, -0.13965102f
+};
+
+static const float av1_intraph_hiddenlayer_0_bias_15[] = {
+ 0.0f, -0.2926711f, 0.0f, -1.0303509f, -27.459345f, 12.412848f,
+ 0.0f, -2.5971522f, -0.02733541f, -19.881912f, 14.391992f, -8.249469f,
+ 0.0f, 0.0f, 13.676118f, -0.6472994f, -0.07189449f, 1.1986839f,
+ 52.479107f, 0.0f, 0.0f, -3.0187025f, 1.4435643f, 0.0f
+};
+
+static const float av1_intraph_logits_kernel_15[] = {
+ 0.05390722f, -0.06859513f, 0.036842898f, 0.190772f, 0.13623567f,
+ 0.09321194f, 0.2314745f, -0.13958375f, -0.3058229f, -0.0104543045f,
+ 0.11336068f, -0.276115f, 0.00470723f, -0.49123898f, -0.15988174f,
+ 0.087681435f, 0.022517204f, 0.073877744f, 0.2968856f, -0.1401399f,
+ -0.38788354f, -0.26005393f, -0.39564916f, -0.16195515f, 0.2680102f,
+ -0.032179773f, -0.35758728f, 0.25819537f, 0.11468631f, 0.13573235f,
+ -0.2672175f, 0.016490124f, 0.048118807f, 0.020319486f, 0.07892215f,
+ -0.21821865f, 0.08434734f, 0.3129456f, -0.18215221f, 0.08884877f,
+ -0.35621428f, 0.11405768f, 0.27370325f, 0.14956686f, 0.01604587f,
+ -0.18334487f, -0.42385718f, -0.08033409f
+};
+
+static const float av1_intraph_logits_bias_15[] = { 0.83619016f, -0.8340626f };
+
+static const NN_CONFIG av1_intrap_hd_nn_config = {
+ NUM_FEATURES_15,
+ NUM_LOGITS_15,
+ NUM_HIDDEN_LAYERS_15,
+ {
+ NUM_LAYER_0_UNITS_15,
+ },
+ {
+ av1_intraph_hiddenlayer_0_kernel_15,
+ av1_intraph_logits_kernel_15,
+ },
+ {
+ av1_intraph_hiddenlayer_0_bias_15,
+ av1_intraph_logits_bias_15,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_15
+#undef NUM_FEATURES_15
+#undef NUM_LAYER_0_UNITS_15
+#undef NUM_LOGITS_15
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/model_rd.h b/third_party/aom/av1/encoder/model_rd.h
new file mode 100644
index 0000000000..f7e8b96b5b
--- /dev/null
+++ b/third_party/aom/av1/encoder/model_rd.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MODEL_RD_H_
+#define AOM_AV1_ENCODER_MODEL_RD_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "config/aom_dsp_rtcd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// 0: Legacy model
+// 1: Curve fit model
+// 2: Surface fit model
+// 3: DNN regression model
+// 4: Full rd model
+#define MODELRD_TYPE_INTERP_FILTER 1
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 1
+#define MODELRD_TYPE_MASKED_COMPOUND 1
+#define MODELRD_TYPE_INTERINTRA 1
+#define MODELRD_TYPE_INTRA 1
+#define MODELRD_TYPE_MOTION_MODE_RD 1
+
+typedef void (*model_rd_for_sb_type)(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+ uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+ int64_t *plane_sse, int64_t *plane_dist);
+typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+
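+// Computes the sum of squared error between the source and the prediction
+// for a bw x bh block. For high bitdepth input the result is renormalized to
+// the 8-bit scale: e.g. for 10-bit content, shift = 2, so the raw SSE is
+// divided by 2^(2 * 2) = 16.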
+static int64_t calculate_sse(MACROBLOCKD *const xd,
+ const struct macroblock_plane *p,
+ struct macroblockd_plane *pd, const int bw,
+ const int bh) {
+ int64_t sse = 0;
+ const int shift = xd->bd - 8;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ bw, bh);
+ } else {
+ sse =
+ aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
+ }
+#else
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
+#endif
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+ return sse;
+}
+
+static AOM_INLINE int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane, const BLOCK_SIZE bsize) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int bw, bh;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+ &bh);
+
+ int64_t sse = calculate_sse(xd, p, pd, bw, bh);
+
+ return sse;
+}
+
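+// Estimates rate and distortion for a plane directly from its SSE, using
+// either a cheap linear approximation (when the simple_model_rd_from_var
+// speed feature is on) or the Laplacian-PDF based model in
+// av1_model_rd_from_var_lapndz().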
+static AOM_INLINE void model_rd_from_sse(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples,
+ int *rate, int64_t *dist) {
+ (void)num_samples;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+
+ // Fast approximation of the modelling function.
+ if (cpi->sf.rd_sf.simple_model_rd_from_var) {
+ const int64_t square_error = sse;
+ int quantizer = p->dequant_QTX[1] >> dequant_shift;
+ if (quantizer < 120)
+ *rate = (int)AOMMIN(
+ (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
+ INT_MAX);
+ else
+ *rate = 0;
+ assert(*rate >= 0);
+ *dist = (square_error * quantizer) >> 8;
+ } else {
+ av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
+ p->dequant_QTX[1] >> dequant_shift, rate,
+ dist);
+ }
+ *dist <<= 4;
+}
+
+// Fits curves for rate and distortion using log2(sse_norm / qstep^2) as the
+// feature.
+static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples,
+ int *rate, int64_t *dist) {
+ (void)cpi;
+ (void)plane_bsize;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int qstep = AOMMAX(p->dequant_QTX[1] >> dequant_shift, 1);
+
+ if (sse == 0) {
+ if (rate) *rate = 0;
+ if (dist) *dist = 0;
+ return;
+ }
+ const double sse_norm = (double)sse / num_samples;
+ const double qstepsqr = (double)qstep * qstep;
+ const double xqr = log2(sse_norm / qstepsqr);
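+ // xqr == 0 when the mean per-sample error equals qstep^2; positive values
+ // mean the residual dominates the quantizer step, negative the reverse.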
+ double rate_f, dist_by_sse_norm_f;
+ av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f,
+ &dist_by_sse_norm_f);
+
+ const double dist_f = dist_by_sse_norm_f * sse_norm;
+ int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+
+ // Check if skip is better
+ if (rate_i == 0) {
+ dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
+ rate_i = 0;
+ dist_i = sse << 4;
+ }
+
+ if (rate) *rate = rate_i;
+ if (dist) *dist = dist_i;
+}
+
+static AOM_INLINE void model_rd_for_sb(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+ uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+ int64_t *plane_sse, int64_t *plane_dist) {
+ // Note our transform coeffs are 8 times an orthogonal transform. Hence the
+ // quantizer step is also 8 times. To get the effective quantizer we need
+ // to divide by 8 before sending it to the modeling function.
+ int plane;
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ int64_t sse;
+ int rate;
+ int64_t dist;
+
+ sse = calculate_sse(xd, p, pd, bw, bh);
+
+ model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ assert(rate_sum >= 0);
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ rate_sum = AOMMIN(rate_sum, INT_MAX);
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+static AOM_INLINE void model_rd_for_sb_with_curvfit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+ uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+ int64_t *plane_sse, int64_t *plane_dist) {
+ // Note our transform coeffs are 8 times an orthogonal transform. Hence the
+ // quantizer step is also 8 times. To get the effective quantizer we need
+ // to divide by 8 before sending it to the modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+ int bw, bh;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+
+ sse = calculate_sse(xd, p, pd, bw, bh);
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = rate_sum == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_TYPES } UENUM1BYTE(ModelRdType);
+
+static const model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
+ model_rd_for_sb, model_rd_for_sb_with_curvfit
+};
+
+static const model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
+ model_rd_from_sse, model_rd_with_curvfit
+};
+
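+// Illustrative usage (a sketch, not part of this header): callers pick a
+// model via the MODELRD_TYPE_* macros and dispatch through the tables, e.g.
+//   model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](cpi, bsize, x, xd, 0,
+//       num_planes - 1, &rate_sum, &dist_sum, &skip_txfm_sb, &skip_sse_sb,
+//       NULL, NULL, NULL);
+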
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_MODEL_RD_H_
diff --git a/third_party/aom/av1/encoder/motion_search_facade.c b/third_party/aom/av1/encoder/motion_search_facade.c
new file mode 100644
index 0000000000..e7eec29dc3
--- /dev/null
+++ b/third_party/aom/av1/encoder/motion_search_facade.c
@@ -0,0 +1,1071 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
+
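+// Divides an mv component by 8, rounding to nearest (ties away from zero).
+// Used below to treat full-pel candidate mvs that fall into the same 8-pel
+// bucket as duplicates.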
+#define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3)
+
+typedef struct {
+ int_mv fmv;
+ int weight;
+} cand_mv_t;
+
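+// qsort() comparator: orders tpl mv candidates by decreasing weight.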
+static int compare_weight(const void *a, const void *b) {
+ const int diff = ((cand_mv_t *)a)->weight - ((cand_mv_t *)b)->weight;
+ if (diff < 0)
+ return 1;
+ else if (diff > 0)
+ return -1;
+ return 0;
+}
+
+// Allow more mesh searches for screen content type on the ARF.
+static int use_fine_search_interval(const AV1_COMP *const cpi) {
+ return cpi->is_screen_content_type &&
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE &&
+ cpi->oxcf.speed <= 2;
+}
+
+// Iterate through the tpl data and collect the mvs to be used as candidates.
+static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi,
+ const MACROBLOCK *x,
+ BLOCK_SIZE bsize, int ref,
+ cand_mv_t *cand, int *cand_count,
+ int *total_cand_weight) {
+ const SuperBlockEnc *sb_enc = &x->sb_enc;
+ if (!sb_enc->tpl_data_count) {
+ return;
+ }
+
+ const AV1_COMMON *cm = &cpi->common;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ const BLOCK_SIZE tpl_bsize =
+ convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ const int tplw = mi_size_wide[tpl_bsize];
+ const int tplh = mi_size_high[tpl_bsize];
+ const int nw = mi_size_wide[bsize] / tplw;
+ const int nh = mi_size_high[bsize] / tplh;
+
+ if (nw >= 1 && nh >= 1) {
+ const int of_h = mi_row % mi_size_high[cm->seq_params->sb_size];
+ const int of_w = mi_col % mi_size_wide[cm->seq_params->sb_size];
+ const int start = of_h / tplh * sb_enc->tpl_stride + of_w / tplw;
+ int valid = 1;
+
+ // Assign large weight to start_mv, so it is always tested.
+ cand[0].weight = nw * nh;
+
+ for (int k = 0; k < nh; k++) {
+ for (int l = 0; l < nw; l++) {
+ const int_mv mv =
+ sb_enc
+ ->tpl_mv[start + k * sb_enc->tpl_stride + l][ref - LAST_FRAME];
+ if (mv.as_int == INVALID_MV) {
+ valid = 0;
+ break;
+ }
+
+ const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
+ GET_MV_RAWPEL(mv.as_mv.col) };
+ int unique = 1;
+ for (int m = 0; m < *cand_count; m++) {
+ if (RIGHT_SHIFT_MV(fmv.row) ==
+ RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.row) &&
+ RIGHT_SHIFT_MV(fmv.col) ==
+ RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.col)) {
+ unique = 0;
+ cand[m].weight++;
+ break;
+ }
+ }
+
+ if (unique) {
+ cand[*cand_count].fmv.as_fullmv = fmv;
+ cand[*cand_count].weight = 1;
+ (*cand_count)++;
+ }
+ }
+ if (!valid) break;
+ }
+
+ if (valid) {
+ *total_cand_weight = 2 * nh * nw;
+ if (*cand_count > 2)
+ qsort(cand, *cand_count, sizeof(cand[0]), &compare_weight);
+ }
+ }
+}
+
+void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
+ int search_range, inter_mode_info *mode_info,
+ int_mv *best_mv,
+ struct HandleInterModeArgs *const args) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const AV1_COMMON *cm = &cpi->common;
+ const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ int bestsme = INT_MAX;
+ const int ref = mbmi->ref_frame[ref_idx];
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const MvCosts *mv_costs = x->mv_costs;
+
+ if (scaled_ref_frame) {
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // full-pixel motion search code to be used without additional
+ // modifications.
+ for (int i = 0; i < num_planes; i++) {
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
+ }
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ // Work out the size of the first step in the mv step search.
+ // 0 here is the maximum-length first step; 1 halves that, and so on.
+ int step_param;
+ if (cpi->sf.mv_sf.auto_mv_step_size && cm->show_frame) {
+ // Take the weighted average of the step_params based on the last frame's
+ // max mv magnitude and that based on the best ref mvs of the current
+ // block for the given reference.
+ step_param = (av1_init_search_range(x->max_mv_context[ref]) +
+ mv_search_params->mv_step_param) /
+ 2;
+ } else {
+ step_param = mv_search_params->mv_step_param;
+ }
+
+ const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+ FULLPEL_MV start_mv;
+ if (mbmi->motion_mode != SIMPLE_TRANSLATION)
+ start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv);
+ else
+ start_mv = get_fullmv_from_mv(&ref_mv);
+
+ // cand stores start_mv and all possible MVs in a SB.
+ cand_mv_t cand[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB + 1];
+ av1_zero(cand);
+ cand[0].fmv.as_fullmv = start_mv;
+ int cnt = 1;
+ int total_weight = 0;
+
+ if (!cpi->sf.mv_sf.full_pixel_search_level &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION) {
+ get_mv_candidate_from_tpl(cpi, x, bsize, ref, cand, &cnt, &total_weight);
+ }
+
+ const int cand_cnt = AOMMIN(2, cnt);
+ // TODO(any): Test the speed feature for OBMC_CAUSAL mode.
+ if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION) {
+ const int stack_size = args->start_mv_cnt;
+ for (int cand_idx = 0; cand_idx < cand_cnt; cand_idx++) {
+ int_mv *fmv_cand = &cand[cand_idx].fmv;
+ int skip_cand_mv = 0;
+
+ // Check difference between mvs in the stack and candidate mv.
+ for (int stack_idx = 0; stack_idx < stack_size; stack_idx++) {
+ const uint8_t this_ref_mv_idx = args->ref_mv_idx_stack[stack_idx];
+ const FULLPEL_MV *fmv_stack = &args->start_mv_stack[stack_idx];
+ const int this_newmv_valid =
+ args->single_newmv_valid[this_ref_mv_idx][ref];
+ const int row_diff = abs(fmv_stack->row - fmv_cand->as_fullmv.row);
+ const int col_diff = abs(fmv_stack->col - fmv_cand->as_fullmv.col);
+
+ if (!this_newmv_valid) continue;
+
+ if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 2) {
+ // Prunes the current start_mv candidate, if the absolute mv
+ // difference of both row and column are <= 1.
+ if (row_diff <= 1 && col_diff <= 1) {
+ skip_cand_mv = 1;
+ break;
+ }
+ } else if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 1) {
+ // Prunes the current start_mv candidate, if the sum of the absolute
+ // mv difference of row and column is <= 1.
+ if (row_diff + col_diff <= 1) {
+ skip_cand_mv = 1;
+ break;
+ }
+ }
+ }
+ if (skip_cand_mv) {
+ // Ensure at least one full-pel motion search is not pruned.
+ assert(mbmi->ref_mv_idx != 0);
+ // Mark the candidate mv as invalid so that motion search gets skipped.
+ cand[cand_idx].fmv.as_int = INVALID_MV;
+ } else {
+ // Store start_mv candidate and corresponding ref_mv_idx of full-pel
+ // search in the mv stack (except last ref_mv_idx).
+ if (mbmi->ref_mv_idx != MAX_REF_MV_SEARCH - 1) {
+ assert(args->start_mv_cnt < (MAX_REF_MV_SEARCH - 1) * 2);
+ args->start_mv_stack[args->start_mv_cnt] = fmv_cand->as_fullmv;
+ args->ref_mv_idx_stack[args->start_mv_cnt] = mbmi->ref_mv_idx;
+ args->start_mv_cnt++;
+ }
+ }
+ }
+ }
+
+ // Hot fix for asan complaints when resize mode is on. When resize mode is
+ // on, the stride of the reference frame can differ from that indicated by
+ // MotionVectorSearchParams::search_site_cfg. When this happens, the stride
+ // needs to be readjusted.
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
+ const search_site_config *src_search_site_cfg =
+ av1_get_search_site_config(cpi, x, search_method);
+
+ // Further reduce the search range.
+ if (search_range < INT_MAX) {
+ const search_site_config *search_site_cfg =
+ &src_search_site_cfg[search_method_lookup[search_method]];
+ // Max step_param is search_site_cfg->num_search_steps.
+ if (search_range < 1) {
+ step_param = search_site_cfg->num_search_steps;
+ } else {
+ while (search_site_cfg->radius[search_site_cfg->num_search_steps -
+ step_param - 1] > (search_range << 1) &&
+ search_site_cfg->num_search_steps - step_param - 1 > 0)
+ step_param++;
+ }
+ }
+
+ int cost_list[5];
+ FULLPEL_MV_STATS best_mv_stats;
+ int_mv second_best_mv;
+ best_mv->as_int = second_best_mv.as_int = INVALID_MV;
+
+ // Allow more mesh searches for screen content type on the ARF.
+ const int fine_search_interval = use_fine_search_interval(cpi);
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+
+ switch (mbmi->motion_mode) {
+ case SIMPLE_TRANSLATION: {
+ // Perform a search with the top 2 candidates
+ int sum_weight = 0;
+ for (int m = 0; m < cand_cnt; m++) {
+ int_mv smv = cand[m].fmv;
+ FULLPEL_MV this_best_mv, this_second_best_mv;
+ FULLPEL_MV_STATS this_mv_stats;
+
+ if (smv.as_int == INVALID_MV) continue;
+
+ av1_make_default_fullpel_ms_params(
+ &full_ms_params, cpi, x, bsize, &ref_mv, smv.as_fullmv,
+ src_search_site_cfg, search_method, fine_search_interval);
+
+ const int thissme =
+ av1_full_pixel_search(smv.as_fullmv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list), &this_best_mv,
+ &this_mv_stats, &this_second_best_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ best_mv->as_fullmv = this_best_mv;
+ best_mv_stats = this_mv_stats;
+ second_best_mv.as_fullmv = this_second_best_mv;
+ }
+
+ sum_weight += cand[m].weight;
+ if (4 * sum_weight > 3 * total_weight) break;
+ }
+ } break;
+ case OBMC_CAUSAL:
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+ &ref_mv, start_mv, src_search_site_cfg,
+ search_method, fine_search_interval);
+
+ bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params,
+ step_param, &best_mv->as_fullmv);
+ break;
+ default: assert(0 && "Invalid motion mode!\n");
+ }
+ if (best_mv->as_int == INVALID_MV) return;
+
+ if (scaled_ref_frame) {
+ // Swap back the original buffers for subpel motion search.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[ref_idx] = backup_yv12[i];
+ }
+ }
+
+ // Terminate search with the current ref_idx based on fullpel mv, rate cost,
+ // and other known costs.
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 2 &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION &&
+ best_mv->as_int != INVALID_MV) {
+ int_mv this_mv;
+ this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ const int this_mv_rate =
+ av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int;
+ mode_info[ref_mv_idx].full_mv_rate = this_mv_rate;
+ mode_info[ref_mv_idx].full_mv_bestsme = bestsme;
+
+ for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+ // Check if the motion search result is the same as previous results.
+ if (this_mv.as_int == mode_info[prev_ref_idx].full_search_mv.as_int) {
+ // Compare the rate cost
+ const int prev_rate_cost = mode_info[prev_ref_idx].full_mv_rate +
+ mode_info[prev_ref_idx].drl_cost;
+ const int this_rate_cost =
+ this_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+ if (prev_rate_cost <= this_rate_cost) {
+ // If the current rate_cost is worse than the previous rate_cost, then
+ // we terminate the search. Since av1_single_motion_search is only
+ // called by handle_new_mv in SIMPLE_TRANSLATION mode, we set the
+ // best_mv to INVALID mv to signal that we wish to terminate search
+ // for the current mode.
+ best_mv->as_int = INVALID_MV;
+ return;
+ }
+ }
+
+ // Terminate the evaluation of the current ref_mv_idx based on bestsme and
+ // drl_cost.
+ const int psme = mode_info[prev_ref_idx].full_mv_bestsme;
+ if (psme == INT_MAX) continue;
+ const int thr =
+ cpi->sf.inter_sf.skip_newmv_in_drl == 3 ? (psme + (psme >> 2)) : psme;
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 3 &&
+ mode_info[ref_mv_idx].full_mv_bestsme > thr &&
+ mode_info[prev_ref_idx].drl_cost < mode_info[ref_mv_idx].drl_cost) {
+ best_mv->as_int = INVALID_MV;
+ return;
+ }
+ }
+ }
+
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ convert_fullmv_to_mv(best_mv);
+ }
+
+ const int use_fractional_mv =
+ bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0;
+ int best_mv_rate = 0;
+ int mv_rate_calculated = 0;
+ if (use_fractional_mv) {
+ int_mv fractional_ms_list[3];
+ av1_set_fractional_mv(fractional_ms_list);
+ int dis; /* TODO: use dis in distortion calculation later. */
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+ cost_list);
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+
+ switch (mbmi->motion_mode) {
+ case SIMPLE_TRANSLATION:
+ if (mv_sf->use_accurate_subpel_search) {
+ const int try_second = second_best_mv.as_int != INVALID_MV &&
+ second_best_mv.as_int != best_mv->as_int &&
+ (mv_sf->disable_second_mv <= 1);
+ const int best_mv_var = mv_search_params->find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv->as_mv, &dis, &x->pred_sse[ref], fractional_ms_list);
+
+ if (try_second) {
+ struct macroblockd_plane *p = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+ { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+ };
+ int64_t rd = INT64_MAX;
+ if (!mv_sf->disable_second_mv) {
+ // Calculate actual rd cost.
+ mbmi->mv[0].as_mv = best_mv->as_mv;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize, 0, 0);
+ av1_subtract_plane(x, bsize, 0);
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+ av1_estimate_txfm_yrd(cpi, x, &this_rd_stats, INT64_MAX, bsize,
+ max_txsize_rect_lookup[bsize]);
+ int this_mv_rate = av1_mv_bit_cost(
+ &best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ rd = RDCOST(x->rdmult, this_mv_rate + this_rd_stats.rate,
+ this_rd_stats.dist);
+ }
+
+ MV this_best_mv;
+ subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
+ if (av1_is_subpelmv_in_range(&ms_params.mv_limits,
+ subpel_start_mv)) {
+ unsigned int sse;
+ const int this_var = mv_search_params->find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv,
+ &dis, &sse, fractional_ms_list);
+
+ if (!mv_sf->disable_second_mv) {
+ // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost
+ // to choose the better MV.
+ mbmi->mv[0].as_mv = this_best_mv;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize, 0, 0);
+ av1_subtract_plane(x, bsize, 0);
+ RD_STATS tmp_rd_stats;
+ av1_init_rd_stats(&tmp_rd_stats);
+ av1_estimate_txfm_yrd(cpi, x, &tmp_rd_stats, INT64_MAX, bsize,
+ max_txsize_rect_lookup[bsize]);
+ int tmp_mv_rate = av1_mv_bit_cost(
+ &this_best_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ int64_t tmp_rd =
+ RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate,
+ tmp_rd_stats.dist);
+ if (tmp_rd < rd) {
+ best_mv->as_mv = this_best_mv;
+ x->pred_sse[ref] = sse;
+ }
+ } else {
+ // If cpi->sf.mv_sf.disable_second_mv = 1, use var to decide the
+ // best MV.
+ if (this_var < best_mv_var) {
+ best_mv->as_mv = this_best_mv;
+ x->pred_sse[ref] = sse;
+ }
+ }
+ }
+ }
+ } else {
+ mv_search_params->find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv->as_mv, &dis, &x->pred_sse[ref], NULL);
+ }
+ break;
+ case OBMC_CAUSAL:
+ av1_find_best_obmc_sub_pixel_tree_up(
+ xd, cm, &ms_params, subpel_start_mv, NULL, &best_mv->as_mv, &dis,
+ &x->pred_sse[ref], NULL);
+ break;
+ default: assert(0 && "Invalid motion mode!\n");
+ }
+
+ // Terminate search with the current ref_idx based on subpel mv and rate
+ // cost.
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 1 && args != NULL &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION &&
+ best_mv->as_int != INVALID_MV) {
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ best_mv_rate =
+ av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ mv_rate_calculated = 1;
+
+ for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+ if (!args->single_newmv_valid[prev_ref_idx][ref]) continue;
+ // Check if the motion vectors are the same.
+ if (best_mv->as_int == args->single_newmv[prev_ref_idx][ref].as_int) {
+ // Skip this evaluation if the previous one is skipped.
+ if (mode_info[prev_ref_idx].skip) {
+ mode_info[ref_mv_idx].skip = 1;
+ break;
+ }
+ // Compare the rate cost that we currently know.
+ const int prev_rate_cost =
+ args->single_newmv_rate[prev_ref_idx][ref] +
+ mode_info[prev_ref_idx].drl_cost;
+ const int this_rate_cost =
+ best_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+ if (prev_rate_cost <= this_rate_cost) {
+ // If the current rate_cost is worse than the previous rate_cost,
+ // then we terminate the search for this ref_mv_idx.
+ mode_info[ref_mv_idx].skip = 1;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if (mv_rate_calculated) {
+ *rate_mv = best_mv_rate;
+ } else {
+ *rate_mv =
+ av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+}
+
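+// Jointly refines the mv pair of a compound prediction: each iteration fixes
+// one reference's mv, builds its predictor, and re-searches the other; the
+// loop stops once a search fails to reduce the error.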
+int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv,
+ int allow_second_mv, int joint_me_num_refine_iter) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ const int plane = 0;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(mbmi));
+ const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
+ const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
+ const MvCosts *mv_costs = x->mv_costs;
+ int_mv ref_mv[2];
+ int ite, ref;
+
+ // Get the prediction block from the 'other' reference frame.
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ InterPredParams inter_pred_params;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Do joint motion search in compound mode to get more accurate mv.
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ int last_besterr[2] = { INT_MAX, INT_MAX };
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ av1_get_scaled_ref_frame(cpi, refs[0]),
+ av1_get_scaled_ref_frame(cpi, refs[1])
+ };
+
+ // Prediction buffer from the second frame, sized in bytes so it can hold
+ // either 8-bit or 16-bit samples.
+ DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
+ uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
+
+ int_mv best_mv, second_best_mv;
+
+ // Allow joint search multiple times iteratively for each reference frame,
+ // and break out of the search loop if no better mv can be found.
+ for (ite = 0; ite < (2 * joint_me_num_refine_iter); ite++) {
+ struct buf_2d ref_yv12[2];
+ int bestsme = INT_MAX;
+ int id = ite % 2; // Even iterations search in the first reference frame,
+ // odd iterations search in the second. The predictor
+ // found for the 'other' reference frame is factored in.
+ if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) {
+ if (cur_mv[id].as_int == init_mv[id].as_int) {
+ break;
+ } else {
+ int_mv cur_int_mv, init_int_mv;
+ cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
+ cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
+ if (cur_int_mv.as_int == init_int_mv.as_int) {
+ break;
+ }
+ }
+ }
+ for (ref = 0; ref < 2; ++ref) {
+ ref_mv[ref] = av1_get_ref_mv(x, ref);
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ if (scaled_ref_frame[ref]) {
+ int i;
+ for (i = 0; i < num_planes; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+ NULL, num_planes);
+ }
+ }
+
+ assert(IMPLIES(scaled_ref_frame[0] != NULL,
+ cm->width == scaled_ref_frame[0]->y_crop_width &&
+ cm->height == scaled_ref_frame[0]->y_crop_height));
+ assert(IMPLIES(scaled_ref_frame[1] != NULL,
+ cm->width == scaled_ref_frame[1]->y_crop_width &&
+ cm->height == scaled_ref_frame[1]->y_crop_height));
+
+ // Initialize based on (possibly scaled) prediction buffers.
+ ref_yv12[0] = xd->plane[plane].pre[0];
+ ref_yv12[1] = xd->plane[plane].pre[1];
+
+ av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE,
+ mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
+ &cm->sf_identity, &ref_yv12[!id], interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
+
+ // Since we have scaled the reference frames to match the size of the
+ // current frame we must use a unit scaling factor during mode selection.
+ av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv,
+ &inter_pred_params);
+
+ // Do full-pixel compound motion search on the current reference frame.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+
+ // Make motion search params
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ // Use the mv result from the single mode as mv predictor.
+ const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv);
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+ &ref_mv[id].as_mv, start_fullmv,
+ src_search_sites, search_method,
+ /*fine_search_interval=*/0);
+
+ av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
+ mask_stride, id);
+
+ // Small-range full-pixel motion search.
+ if (!mv_sf->disable_extensive_joint_motion_search &&
+ mbmi->interinter_comp.type != COMPOUND_WEDGE) {
+ bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+ &best_mv.as_fullmv, &best_mv_stats,
+ &second_best_mv.as_fullmv);
+ } else {
+ bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
+ &best_mv.as_fullmv);
+ second_best_mv = best_mv;
+ }
+
+ const int try_second = second_best_mv.as_int != INVALID_MV &&
+ second_best_mv.as_int != best_mv.as_int &&
+ allow_second_mv;
+
+ // Restore the pointer to the first (possibly scaled) prediction buffer.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // Swap back the original buffers for subpel motion search.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+ // Re-initialize based on unscaled prediction buffers.
+ ref_yv12[ref] = xd->plane[plane].pre[ref];
+ }
+ }
+
+ // Do sub-pixel compound motion search on the current reference frame.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ convert_fullmv_to_mv(&best_mv);
+ }
+ if (bestsme < INT_MAX &&
+ cpi->common.features.cur_frame_force_integer_mv == 0) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+ &ref_mv[id].as_mv, NULL);
+ av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred,
+ mask, mask_stride, id);
+ ms_params.forced_stop = EIGHTH_PEL;
+ MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
+ bestsme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, &sse, NULL);
+
+ if (try_second) {
+ MV this_best_mv;
+ MV subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
+ if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) {
+ const int thissme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, &dis,
+ &sse, NULL);
+ if (thissme < bestsme) {
+ best_mv.as_mv = this_best_mv;
+ bestsme = thissme;
+ }
+ }
+ }
+ }
+
+ // Restore the pointer to the first prediction buffer.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+ if (bestsme < last_besterr[id]) {
+ cur_mv[id] = best_mv;
+ last_besterr[id] = bestsme;
+ } else {
+ break;
+ }
+ }
+
+ *rate_mv = 0;
+
+ for (ref = 0; ref < 2; ++ref) {
+ const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
+ *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
+ mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+
+ return AOMMIN(last_besterr[0], last_besterr[1]);
+}
+
+// Search for the best mv for one component of a compound,
+// given that the other component is fixed.
+int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *this_mv,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int *rate_mv, int ref_idx) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int ref = mbmi->ref_frame[ref_idx];
+ const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const MvCosts *mv_costs = x->mv_costs;
+
+ struct buf_2d backup_yv12[MAX_MB_PLANE];
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+
+ // Check that this is either an interinter or an interintra block
+ assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi)));
+
+ // Store the first prediction buffer.
+ struct buf_2d orig_yv12;
+ if (ref_idx) {
+ orig_yv12 = pd->pre[0];
+ pd->pre[0] = pd->pre[ref_idx];
+ }
+
+ if (scaled_ref_frame) {
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // full-pixel motion search code to be used without additional
+ // modifications.
+ for (int i = 0; i < num_planes; i++) {
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
+ }
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ // The index below needs to be 0 instead of ref_idx since the 0th slot is
+ // used for subsequent searches. Note that the ref_idx reference buffer has
+ // been copied to the 0th slot in the code above. Now we need to swap in the
+ // scaled reference frame for the 0th slot.
+ av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ int bestsme = INT_MAX;
+ int_mv best_mv;
+
+ // Make motion search params
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ // Use the mv result from the single mode as mv predictor.
+ const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv);
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+ &ref_mv.as_mv, start_fullmv,
+ src_search_sites, search_method,
+ /*fine_search_interval=*/0);
+
+ av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
+ mask_stride, ref_idx);
+
+ // Small-range full-pixel motion search.
+ bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+ if (scaled_ref_frame) {
+ // Swap back the original buffers for subpel motion search for the 0th slot.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ }
+
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ convert_fullmv_to_mv(&best_mv);
+ }
+ const int use_fractional_mv =
+ bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0;
+ if (use_fractional_mv) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv,
+ NULL);
+ av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred,
+ mask, mask_stride, ref_idx);
+ ms_params.forced_stop = EIGHTH_PEL;
+ MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
+ bestsme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, start_mv, &best_mv_stats, &best_mv.as_mv, &dis,
+ &sse, NULL);
+ }
+
+ // Restore the pointer to the first unscaled prediction buffer.
+ if (ref_idx) pd->pre[0] = orig_yv12;
+
+ if (bestsme < INT_MAX) *this_mv = best_mv.as_mv;
+
+ *rate_mv = 0;
+
+ *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ return bestsme;
+}
+
+static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize,
+ const MV *other_mv, int ref_idx,
+ uint8_t *second_pred) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x);
+ const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y);
+
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(mbmi));
+
+ const int plane = 0;
+ struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx];
+
+ struct scale_factors sf;
+ av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height,
+ cm->width, cm->height);
+
+ InterPredParams inter_pred_params;
+
+ av1_init_inter_params(&inter_pred_params, pw, ph, p_row, p_col,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), 0, &sf, &ref_yv12,
+ mbmi->interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+
+ // Get the prediction block from the 'other' reference frame.
+ av1_enc_build_one_inter_predictor(second_pred, pw, other_mv,
+ &inter_pred_params);
+}
+
+// Wrapper for av1_compound_single_motion_search, for the common case
+// where the second prediction is also an inter mode.
+int av1_compound_single_motion_search_interinter(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(xd->mi[0]));
+
+ // Prediction buffer from the second frame.
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
+ uint8_t *second_pred;
+ if (is_cur_buf_hbd(xd))
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+ else
+ second_pred = (uint8_t *)second_pred_alloc_16;
+
+ MV *this_mv = &cur_mv[ref_idx].as_mv;
+ const MV *other_mv = &cur_mv[!ref_idx].as_mv;
+ build_second_inter_pred(cpi, x, bsize, other_mv, ref_idx, second_pred);
+ return av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred,
+ mask, mask_stride, rate_mv, ref_idx);
+}
+
+static AOM_INLINE void do_masked_motion_search_indexed(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
+ int_mv *tmp_mv, int *rate_mv, int which) {
+ // NOTE: 'which' selects the search: 0 - refine mv 0 only, 1 - refine mv 1
+ // only, 2 - joint search of both.
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ BLOCK_SIZE sb_type = mbmi->bsize;
+ const uint8_t *mask;
+ const int mask_stride = block_size_wide[bsize];
+
+ mask = av1_get_compound_type_mask(comp_data, sb_type);
+
+ tmp_mv[0].as_int = cur_mv[0].as_int;
+ tmp_mv[1].as_int = cur_mv[1].as_int;
+ if (which == 0 || which == 1) {
+ av1_compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask,
+ mask_stride, rate_mv, which);
+ } else if (which == 2) {
+ const int joint_me_num_refine_iter =
+ cpi->sf.inter_sf.enable_fast_compound_mode_search == 2
+ ? REDUCED_JOINT_ME_REFINE_ITER
+ : NUM_JOINT_ME_REFINE_ITER;
+ av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv,
+ !cpi->sf.mv_sf.disable_second_mv,
+ joint_me_num_refine_iter);
+ }
+}
+
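+// Refines the mv(s) of a compound mode: both mvs jointly for NEW_NEWMV, or
+// only the NEWMV component for the mixed NEAREST/NEAR + NEW modes.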
+int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize,
+ const PREDICTION_MODE this_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int_mv tmp_mv[2];
+ int tmp_rate_mv = 0;
+ // TODO(jingning): The average compound mode has proper SAD and variance
+ // functions implemented, and is triggered by setting the mask pointer to
+ // NULL. Those still need to be implemented for the frame distance weighted
+ // mode.
+ mbmi->interinter_comp.seg_mask =
+ mbmi->interinter_comp.type == COMPOUND_AVERAGE ? NULL : xd->seg_mask;
+ const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp;
+
+ if (this_mode == NEW_NEWMV) {
+ do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+ tmp_mv, &tmp_rate_mv, 2);
+ mbmi->mv[0].as_int = tmp_mv[0].as_int;
+ mbmi->mv[1].as_int = tmp_mv[1].as_int;
+ } else if (this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV) {
+ // which = 1 if this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV
+ // which = 0 if this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV
+ int which = (NEWMV == compound_ref1_mode(this_mode));
+ do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+ tmp_mv, &tmp_rate_mv, which);
+ mbmi->mv[which].as_int = tmp_mv[which].as_int;
+ }
+ return tmp_rate_mv;
+}
+
+int_mv av1_simple_motion_search_sse_var(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int ref,
+ FULLPEL_MV start_mv, int num_planes,
+ int use_subpixel, unsigned int *sse,
+ unsigned int *var) {
+ assert(num_planes == 1 &&
+ "Currently simple_motion_search only supports luma plane");
+ assert(!frame_is_intra_only(&cpi->common) &&
+ "Simple motion search only enabled for non-key frames");
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ mbmi->bsize = bsize;
+ mbmi->ref_frame[0] = ref;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+ struct buf_2d backup_yv12;
+ // ref_mv is used to calculate the cost of the motion vector
+ const MV ref_mv = kZeroMv;
+ const int step_param =
+ AOMMIN(cpi->mv_search_params.mv_step_param +
+ cpi->sf.part_sf.simple_motion_search_reduce_search_steps,
+ MAX_MVSEARCH_STEPS - 2);
+ int cost_list[5];
+ const int ref_idx = 0;
+ int bestsme;
+ int_mv best_mv;
+ FULLPEL_MV_STATS best_mv_stats;
+
+ av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
+ get_ref_scale_factors(cm, ref), num_planes);
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ if (scaled_ref_frame) {
+ backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ // Allow more mesh searches for screen content type on the ARF.
+ const int fine_search_interval = use_fine_search_interval(cpi);
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
+ start_mv, src_search_sites, search_method,
+ fine_search_interval);
+
+ bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+ const int use_subpel_search =
+ bestsme < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv &&
+ use_subpixel &&
+ (cpi->sf.mv_sf.simple_motion_subpel_force_stop != FULL_PEL);
+ if (scaled_ref_frame) {
+ xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+ }
+ if (use_subpel_search) {
+ int not_used = 0;
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+ cost_list);
+ // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params().
+ ms_params.forced_stop = mv_sf->simple_motion_subpel_force_stop;
+
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+
+ cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv.as_mv,
+ &not_used, &x->pred_sse[ref], NULL);
+
+ mbmi->mv[0] = best_mv;
+
+ // Get a copy of the prediction output
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ *var = cpi->ppi->fn_ptr[bsize].vf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, sse);
+ } else {
+ // Manually convert from full-pel units to 1/8-pel units if we are not
+ // doing a subpel search.
+ convert_fullmv_to_mv(&best_mv);
+ *var = best_mv_stats.distortion;
+ *sse = best_mv_stats.sse;
+ }
+
+ return best_mv;
+}
diff --git a/third_party/aom/av1/encoder/motion_search_facade.h b/third_party/aom/av1/encoder/motion_search_facade.h
new file mode 100644
index 0000000000..d1fa915bca
--- /dev/null
+++ b/third_party/aom/av1/encoder/motion_search_facade.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MOTION_SEARCH_H_
+#define AOM_AV1_ENCODER_MOTION_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NUM_JOINT_ME_REFINE_ITER 2
+#define REDUCED_JOINT_ME_REFINE_ITER 1
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_modes_info, which makes this terribly confusing.
+typedef struct {
+ int drl_cost;
+ int_mv full_search_mv;
+ int full_mv_rate;
+ int full_mv_bestsme;
+ int skip;
+} inter_mode_info;
+
+struct HandleInterModeArgs;
+void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
+ int search_range, inter_mode_info *mode_info,
+ int_mv *best_mv,
+ struct HandleInterModeArgs *const args);
+
+int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv,
+ int allow_second_mv, int joint_me_num_refine_iter);
+
+int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize,
+ const PREDICTION_MODE this_mode);
+
+int av1_compound_single_motion_search_interinter(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx);
+
+int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *this_mv,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int *rate_mv, int ref_idx);
+
+// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
+// ref and calculates the sse and var of the residue. Note that this sets the
+// offset of mbmi, so we will need to reset it after calling this function.
+int_mv av1_simple_motion_search_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int ref,
+ const FULLPEL_MV start_mv,
+ int num_planes, int use_subpixel,
+ unsigned int *sse, unsigned int *var);
+
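+// Returns a search site config whose stride matches that of the (possibly
+// scaled) reference buffer: the compressor-level caches are tried first,
+// with a fallback to (and, if needed, a refresh of) the per-thread config.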
+static AOM_INLINE const search_site_config *av1_get_search_site_config(
+ const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) {
+ const int ref_stride = x->e_mbd.plane[0].pre[0].stride;
+
+ // AV1_COMP::mv_search_params.search_site_config is a compressor level cache
+ // that's shared by multiple threads. In most cases where all frames have the
+ // same resolution, the cache contains the search site config that we need.
+ const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
+ if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_SRC]->stride) {
+ return mv_search_params->search_site_cfg[SS_CFG_SRC];
+ } else if (ref_stride ==
+ mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD]->stride) {
+ return mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD];
+ }
+
+ // If the cache does not contain the correct stride, then we will need to rely
+ // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the
+ // thread level config doesn't match, then we need to update it.
+ search_method = search_method_lookup[search_method];
+ assert(search_method_lookup[search_method] == search_method &&
+ "The search_method_lookup table should be idempotent.");
+ if (ref_stride != x->search_site_cfg_buf[search_method].stride) {
+ av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
+ ref_stride);
+ }
+
+ return x->search_site_cfg_buf;
+}
+
+static AOM_INLINE SEARCH_METHODS
+av1_get_faster_search_method(SEARCH_METHODS search_method) {
+ // Note on search methods' accuracy (most to least accurate):
+ // 1. NSTEP
+ // 2. DIAMOND
+ // 3. BIGDIA \approx SQUARE
+ // 4. HEX.
+ // 5. FAST_HEX \approx FAST_DIAMOND
+ switch (search_method) {
+ case NSTEP: return DIAMOND;
+ case NSTEP_8PT: return DIAMOND;
+ case DIAMOND: return BIGDIA;
+ case CLAMPED_DIAMOND: return BIGDIA;
+ case BIGDIA: return HEX;
+ case SQUARE: return HEX;
+ case HEX: return FAST_HEX;
+ case FAST_HEX: return FAST_HEX;
+ case FAST_DIAMOND: return VFAST_DIAMOND;
+ case FAST_BIGDIA: return FAST_BIGDIA;
+ case VFAST_DIAMOND: return VFAST_DIAMOND;
+ default: assert(0 && "Invalid search method!"); return DIAMOND;
+ }
+}
+
+static AOM_INLINE SEARCH_METHODS av1_get_default_mv_search_method(
+ const MACROBLOCK *x, const MV_SPEED_FEATURES *mv_sf, BLOCK_SIZE bsize) {
+ SEARCH_METHODS search_method = mv_sf->search_method;
+ const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method;
+ const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ const bool use_faster_search_method =
+ (sf_blk_search_method == 1 && min_dim >= 32) ||
+ (sf_blk_search_method >= 2 && min_dim >= 16 &&
+ x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3);
+
+ if (use_faster_search_method) {
+ search_method = av1_get_faster_search_method(search_method);
+ }
+ return search_method;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MOTION_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/mv_prec.c b/third_party/aom/av1/encoder/mv_prec.c
new file mode 100644
index 0000000000..b64f4dcd0e
--- /dev/null
+++ b/third_party/aom/av1/encoder/mv_prec.c
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "av1/encoder/encodemv.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/misc_model_weights.h"
+#endif // !CONFIG_REALTIME_ONLY
+#include "av1/encoder/mv_prec.h"
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE int_mv get_ref_mv_for_mv_stats(
+ const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame,
+ int ref_idx) {
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) {
+ assert(has_second_ref(mbmi));
+ ref_mv_idx += 1;
+ }
+
+ const MV_REFERENCE_FRAME *ref_frames = mbmi->ref_frame;
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+ const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack;
+
+ if (ref_frames[1] > INTRA_FRAME) {
+ assert(ref_idx == 0 || ref_idx == 1);
+ return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv
+ : curr_ref_mv_stack[ref_mv_idx].this_mv;
+ }
+
+ assert(ref_idx == 0);
+ return ref_mv_idx < mbmi_ext_frame->ref_mv_count
+ ? curr_ref_mv_stack[ref_mv_idx].this_mv
+ : mbmi_ext_frame->global_mvs[ref_frame_type];
+}
+
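+// Returns the cost of coding 'symbol' under 'cdf'. p15 is the symbol's
+// 15-bit probability, floored at EC_MIN_PROB, which av1_cost_symbol()
+// converts into a rate in units of 1/2^AV1_PROB_COST_SHIFT bits.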
+static AOM_INLINE int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) {
+ const aom_cdf_prob cur_cdf = AOM_ICDF(cdf[symbol]);
+ const aom_cdf_prob prev_cdf = symbol ? AOM_ICDF(cdf[symbol - 1]) : 0;
+ const aom_cdf_prob p15 = AOMMAX(cur_cdf - prev_cdf, EC_MIN_PROB);
+
+ return av1_cost_symbol(p15);
+}
+
+static AOM_INLINE int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val,
+ int comp_idx, const AV1_COMP *cpi,
+ int *rates) {
+ assert(comp_val != 0 && "mv component should not have zero value!");
+ const int sign = comp_val < 0;
+ const int mag = sign ? -comp_val : comp_val;
+ const int mag_minus_1 = mag - 1;
+ int offset;
+ const int mv_class = av1_get_mv_class(mag_minus_1, &offset);
+ const int int_part = offset >> 3; // int mv data
+ const int frac_part = (offset >> 1) & 3; // fractional mv data
+ const int high_part = offset & 1; // high precision mv data
+ const int use_hp = cpi->common.features.allow_high_precision_mv;
+ int r_idx = 0;
+
+ const MACROBLOCK *const x = &cpi->td.mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ nmv_component *mvcomp_ctx = nmvc->comps;
+ nmv_component *cur_mvcomp_ctx = &mvcomp_ctx[comp_idx];
+ aom_cdf_prob *sign_cdf = cur_mvcomp_ctx->sign_cdf;
+ aom_cdf_prob *class_cdf = cur_mvcomp_ctx->classes_cdf;
+ aom_cdf_prob *class0_cdf = cur_mvcomp_ctx->class0_cdf;
+ aom_cdf_prob(*bits_cdf)[3] = cur_mvcomp_ctx->bits_cdf;
+ aom_cdf_prob *frac_part_cdf = mv_class
+ ? (cur_mvcomp_ctx->fp_cdf)
+ : (cur_mvcomp_ctx->class0_fp_cdf[int_part]);
+ aom_cdf_prob *high_part_cdf =
+ mv_class ? (cur_mvcomp_ctx->hp_cdf) : (cur_mvcomp_ctx->class0_hp_cdf);
+
+ const int sign_rate = get_symbol_cost(sign_cdf, sign);
+ rates[r_idx++] = sign_rate;
+ update_cdf(sign_cdf, sign, 2);
+
+ const int class_rate = get_symbol_cost(class_cdf, mv_class);
+ rates[r_idx++] = class_rate;
+ update_cdf(class_cdf, mv_class, MV_CLASSES);
+
+ int int_bit_rate = 0;
+ if (mv_class == MV_CLASS_0) {
+ int_bit_rate = get_symbol_cost(class0_cdf, int_part);
+ update_cdf(class0_cdf, int_part, CLASS0_SIZE);
+ } else {
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ for (int i = 0; i < n; ++i) {
+ int_bit_rate += get_symbol_cost(bits_cdf[i], (int_part >> i) & 1);
+ update_cdf(bits_cdf[i], (int_part >> i) & 1, 2);
+ }
+ }
+ rates[r_idx++] = int_bit_rate;
+ const int frac_part_rate = get_symbol_cost(frac_part_cdf, frac_part);
+ rates[r_idx++] = frac_part_rate;
+ update_cdf(frac_part_cdf, frac_part, MV_FP_SIZE);
+ const int high_part_rate =
+ use_hp ? get_symbol_cost(high_part_cdf, high_part) : 0;
+ if (use_hp) {
+ update_cdf(high_part_cdf, high_part, 2);
+ }
+ rates[r_idx++] = high_part_rate;
+
+ mv_stats->last_bit_zero += !high_part;
+ mv_stats->last_bit_nonzero += high_part;
+ const int total_rate =
+ (sign_rate + class_rate + int_bit_rate + frac_part_rate + high_part_rate);
+ return total_rate;
+}
+
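+// Accumulates rate statistics for one motion vector: the joint symbol plus
+// both components are costed as actually coded, as if high precision were
+// used, and as if only low precision were used.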
+static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv,
+ const MV *cur_mv, const AV1_COMP *cpi) {
+ const MACROBLOCK *const x = &cpi->td.mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ aom_cdf_prob *joint_cdf = nmvc->joints_cdf;
+ const int use_hp = cpi->common.features.allow_high_precision_mv;
+
+ const MV diff = { cur_mv->row - ref_mv->row, cur_mv->col - ref_mv->col };
+ const int mv_joint = av1_get_mv_joint(&diff);
+ // TODO(chiyotsai@google.com): Estimate hp_diff when we are using lp
+ const MV hp_diff = diff;
+ const int hp_mv_joint = av1_get_mv_joint(&hp_diff);
+ const MV truncated_diff = { (diff.row / 2) * 2, (diff.col / 2) * 2 };
+ const MV lp_diff = use_hp ? truncated_diff : diff;
+ const int lp_mv_joint = av1_get_mv_joint(&lp_diff);
+
+ const int mv_joint_rate = get_symbol_cost(joint_cdf, mv_joint);
+ const int hp_mv_joint_rate = get_symbol_cost(joint_cdf, hp_mv_joint);
+ const int lp_mv_joint_rate = get_symbol_cost(joint_cdf, lp_mv_joint);
+
+ update_cdf(joint_cdf, mv_joint, MV_JOINTS);
+
+ mv_stats->total_mv_rate += mv_joint_rate;
+ mv_stats->hp_total_mv_rate += hp_mv_joint_rate;
+ mv_stats->lp_total_mv_rate += lp_mv_joint_rate;
+ mv_stats->mv_joint_count[mv_joint]++;
+
+ for (int comp_idx = 0; comp_idx < 2; comp_idx++) {
+ const int comp_val = comp_idx ? diff.col : diff.row;
+ const int hp_comp_val = comp_idx ? hp_diff.col : hp_diff.row;
+ const int lp_comp_val = comp_idx ? lp_diff.col : lp_diff.row;
+ int rates[5];
+ av1_zero_array(rates, 5);
+
+ const int comp_rate =
+ comp_val ? keep_one_comp_stat(mv_stats, comp_val, comp_idx, cpi, rates)
+ : 0;
+ // TODO(chiyotsai@google.com): Properly get hp rate when use_hp is false
+ const int hp_rate =
+ hp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] + rates[4] : 0;
+ const int lp_rate =
+ lp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] : 0;
+
+ mv_stats->total_mv_rate += comp_rate;
+ mv_stats->hp_total_mv_rate += hp_rate;
+ mv_stats->lp_total_mv_rate += lp_rate;
+ }
+}
+
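+// Collects MV statistics (rates, joint distribution, texture measures) for a
+// single coding block.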
+static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats,
+ const AV1_COMP *cpi, int mi_row,
+ int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) {
+ return;
+ }
+
+ const MB_MODE_INFO *mbmi =
+ mi_params->mi_grid_base[mi_row * mi_params->mi_stride + mi_col];
+ const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame =
+ cpi->mbmi_ext_info.frame_base +
+ get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize,
+ cpi->mbmi_ext_info.stride);
+
+ if (!is_inter_block(mbmi)) {
+ mv_stats->intra_count++;
+ return;
+ }
+ mv_stats->inter_count++;
+
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int is_compound = has_second_ref(mbmi);
+
+ if (mode == NEWMV || mode == NEW_NEWMV) {
+ // All mvs are new
+ for (int ref_idx = 0; ref_idx < 1 + is_compound; ++ref_idx) {
+ const MV ref_mv =
+ get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv;
+ const MV cur_mv = mbmi->mv[ref_idx].as_mv;
+ keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV ||
+ mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+ // has exactly one new_mv
+ mv_stats->default_mvs += 1;
+
+ const int ref_idx = (mode == NEAREST_NEWMV || mode == NEAR_NEWMV);
+ const MV ref_mv =
+ get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv;
+ const MV cur_mv = mbmi->mv[ref_idx].as_mv;
+
+ keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi);
+ } else {
+ // No new_mv
+ mv_stats->default_mvs += 1 + is_compound;
+ }
+
+ // Add texture information
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int num_rows = block_size_high[bsize];
+ const int num_cols = block_size_wide[bsize];
+ const int y_stride = cpi->source->y_stride;
+ const int px_row = 4 * mi_row, px_col = 4 * mi_col;
+ const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int bd = cm->seq_params->bit_depth;
+ if (buf_is_hbd) {
+ uint16_t *source_buf =
+ CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col;
+ for (int row = 0; row < num_rows - 1; row++) {
+ for (int col = 0; col < num_cols - 1; col++) {
+ const int offset = row * y_stride + col;
+ const int horz_diff =
+ abs(source_buf[offset + 1] - source_buf[offset]) >> (bd - 8);
+ const int vert_diff =
+ abs(source_buf[offset + y_stride] - source_buf[offset]) >> (bd - 8);
+ mv_stats->horz_text += horz_diff;
+ mv_stats->vert_text += vert_diff;
+ mv_stats->diag_text += horz_diff * vert_diff;
+ }
+ }
+ } else {
+ uint8_t *source_buf = cpi->source->y_buffer + px_row * y_stride + px_col;
+ for (int row = 0; row < num_rows - 1; row++) {
+ for (int col = 0; col < num_cols - 1; col++) {
+ const int offset = row * y_stride + col;
+ const int horz_diff = abs(source_buf[offset + 1] - source_buf[offset]);
+ const int vert_diff =
+ abs(source_buf[offset + y_stride] - source_buf[offset]);
+ mv_stats->horz_text += horz_diff;
+ mv_stats->vert_text += vert_diff;
+ mv_stats->diag_text += horz_diff * vert_diff;
+ }
+ }
+ }
+}
+
+// Recursively collects MV stats for every block in a superblock's partition
+// tree.
+static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats,
+ const AV1_COMP *cpi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const AV1_COMMON *cm = &cpi->common;
+
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+ return;
+
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int qbs = mi_size_wide[bsize] / 4;
+ switch (partition) {
+ case PARTITION_NONE:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT:
+ collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize);
+ collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs, subsize);
+ collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col, subsize);
+ collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col + hbs, subsize);
+ break;
+ case PARTITION_HORZ_A:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_HORZ_4:
+ for (int i = 0; i < 4; ++i) {
+ const int this_mi_row = mi_row + i * qbs;
+ collect_mv_stats_b(mv_stats, cpi, this_mi_row, mi_col);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (int i = 0; i < 4; ++i) {
+ const int this_mi_col = mi_col + i * qbs;
+ collect_mv_stats_b(mv_stats, cpi, mi_row, this_mi_col);
+ }
+ break;
+ default: assert(0);
+ }
+}
+
+static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats,
+ const AV1_COMP *cpi,
+ const TileInfo *tile_info) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int mi_row_start = tile_info->mi_row_start;
+ const int mi_row_end = tile_info->mi_row_end;
+ const int mi_col_start = tile_info->mi_col_start;
+ const int mi_col_end = tile_info->mi_col_end;
+ const int sb_size_mi = cm->seq_params->mib_size;
+ BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) {
+ for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) {
+ collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size);
+ }
+ }
+}
+
+void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) {
+ MV_STATS *mv_stats = &cpi->mv_stats;
+ const AV1_COMMON *cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileInfo tile_info;
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ av1_tile_set_col(&tile_info, cm, tile_col);
+ cpi->tile_data[tile_idx].tctx = *cm->fc;
+ cpi->td.mb.e_mbd.tile_ctx = &cpi->tile_data[tile_idx].tctx;
+ collect_mv_stats_tile(mv_stats, cpi, &tile_info);
+ }
+ }
+
+ mv_stats->q = current_q;
+ mv_stats->order = cpi->common.current_frame.order_hint;
+ mv_stats->valid = 1;
+}
+
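+// Decides whether to use high-precision MVs for the current frame by feeding
+// normalized statistics collected from the last coded frame into a small
+// neural network; a non-negative score selects high precision.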
+static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats,
+ int current_q) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int order_hint = cpi->common.current_frame.order_hint;
+ const int order_diff = order_hint - mv_stats->order;
+ const float area = (float)(cm->width * cm->height);
+ float features[MV_PREC_FEATURE_SIZE] = {
+ (float)current_q,
+ (float)mv_stats->q,
+ (float)order_diff,
+ mv_stats->inter_count / area,
+ mv_stats->intra_count / area,
+ mv_stats->default_mvs / area,
+ mv_stats->mv_joint_count[0] / area,
+ mv_stats->mv_joint_count[1] / area,
+ mv_stats->mv_joint_count[2] / area,
+ mv_stats->mv_joint_count[3] / area,
+ mv_stats->last_bit_zero / area,
+ mv_stats->last_bit_nonzero / area,
+ mv_stats->total_mv_rate / area,
+ mv_stats->hp_total_mv_rate / area,
+ mv_stats->lp_total_mv_rate / area,
+ mv_stats->horz_text / area,
+ mv_stats->vert_text / area,
+ mv_stats->diag_text / area,
+ };
+
+ for (int f_idx = 0; f_idx < MV_PREC_FEATURE_SIZE; f_idx++) {
+ features[f_idx] =
+ (features[f_idx] - av1_mv_prec_mean[f_idx]) / av1_mv_prec_std[f_idx];
+ }
+ float score = 0.0f;
+
+ av1_nn_predict(features, &av1_mv_prec_dnn_config, 1, &score);
+
+ const int use_high_hp = score >= 0.0f;
+ return use_high_hp;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) {
+ int use_hp = qindex < HIGH_PRECISION_MV_QTHRESH;
+#if !CONFIG_REALTIME_ONLY
+ MV_STATS *mv_stats = &cpi->mv_stats;
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (cpi->sf.hl_sf.high_precision_mv_usage == QTR_ONLY) {
+ use_hp = 0;
+ }
+#if !CONFIG_REALTIME_ONLY
+ else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
+ av1_frame_allows_smart_mv(cpi) && mv_stats->valid) {
+ use_hp = get_smart_mv_prec(cpi, mv_stats, qindex);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ av1_set_high_precision_mv(cpi, use_hp,
+ cpi->common.features.cur_frame_force_integer_mv);
+}
diff --git a/third_party/aom/av1/encoder/mv_prec.h b/third_party/aom/av1/encoder/mv_prec.h
new file mode 100644
index 0000000000..55108b6cdb
--- /dev/null
+++ b/third_party/aom/av1/encoder/mv_prec.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MV_PREC_H_
+#define AOM_AV1_ENCODER_MV_PREC_H_
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+
+// Q threshold for high precision mv.
+#define HIGH_PRECISION_MV_QTHRESH 128
+#if !CONFIG_REALTIME_ONLY
+void av1_collect_mv_stats(AV1_COMP *cpi, int current_q);
+
+static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) {
+ const int gf_group_index = cpi->gf_frame_index;
+ const int gf_update_type = cpi->ppi->gf_group.update_type[gf_group_index];
+ return !frame_is_intra_only(&cpi->common) &&
+ !(gf_update_type == INTNL_OVERLAY_UPDATE ||
+ gf_update_type == OVERLAY_UPDATE);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void av1_set_high_precision_mv(
+ AV1_COMP *cpi, int allow_high_precision_mv,
+ int cur_frame_force_integer_mv) {
+ MvCosts *const mv_costs = cpi->td.mb.mv_costs;
+ // Avoid accessing 'mv_costs' when it is not allocated.
+ if (mv_costs == NULL) return;
+
+ const int copy_hp = cpi->common.features.allow_high_precision_mv =
+ allow_high_precision_mv && !cur_frame_force_integer_mv;
+
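+  // Point the cost tables at the center of their allocations so they can be
+  // indexed by signed mv components in [-MV_MAX, MV_MAX].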
+ mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX];
+ mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX];
+ mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX];
+ mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX];
+ mv_costs->mv_cost_stack =
+ copy_hp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost;
+}
+
+void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex);
+
+#endif // AOM_AV1_ENCODER_MV_PREC_H_
diff --git a/third_party/aom/av1/encoder/nonrd_opt.c b/third_party/aom/av1/encoder/nonrd_opt.c
new file mode 100644
index 0000000000..651ca43a2e
--- /dev/null
+++ b/third_party/aom/av1/encoder/nonrd_opt.c
@@ -0,0 +1,933 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/rdopt.h"
+
+static const SCAN_ORDER av1_fast_idtx_scan_order_16x16 = {
+ av1_fast_idtx_scan_16x16, av1_fast_idtx_iscan_16x16
+};
+
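+// Scratch buffers sized for the largest transform block handled here (16x16).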
+#define DECLARE_BLOCK_YRD_BUFFERS() \
+ DECLARE_ALIGNED(64, tran_low_t, dqcoeff_buf[16 * 16]); \
+ DECLARE_ALIGNED(64, tran_low_t, qcoeff_buf[16 * 16]); \
+ DECLARE_ALIGNED(64, tran_low_t, coeff_buf[16 * 16]); \
+ uint16_t eob[1];
+
+#define DECLARE_BLOCK_YRD_VARS() \
+ /* When is_tx_8x8_dual_applicable is true, we compute the txfm for the \
+ * entire bsize and write macroblock_plane::coeff. So low_coeff is kept \
+ * as a non-const so we can reassign it to macroblock_plane::coeff. */ \
+ int16_t *low_coeff = (int16_t *)coeff_buf; \
+ int16_t *const low_qcoeff = (int16_t *)qcoeff_buf; \
+ int16_t *const low_dqcoeff = (int16_t *)dqcoeff_buf; \
+ const int diff_stride = bw;
+
+#define DECLARE_LOOP_VARS_BLOCK_YRD() \
+ const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
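+// Updates the per-transform-block skip flag and accumulates an estimated rate
+// (SATD of the quantized coefficients) and distortion (scaled block error)
+// for one transform block.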
+static AOM_FORCE_INLINE void update_yrd_loop_vars(
+ MACROBLOCK *x, int *skippable, int step, int ncoeffs,
+ int16_t *const low_coeff, int16_t *const low_qcoeff,
+ int16_t *const low_dqcoeff, RD_STATS *this_rdc, int *eob_cost,
+ int tx_blk_id) {
+ const int is_txfm_skip = (ncoeffs == 0);
+ *skippable &= is_txfm_skip;
+ x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
+ *eob_cost += get_msb(ncoeffs + 1);
+ if (ncoeffs == 1)
+ this_rdc->rate += (int)abs(low_qcoeff[0]);
+ else if (ncoeffs > 1)
+ this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
+
+ this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
+}
+
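+// Computes the low-precision Hadamard transform for the whole block, two
+// horizontally adjacent 8x8 sub-blocks per call, storing the coefficients in
+// macroblock_plane::coeff for later reuse.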
+static INLINE void aom_process_hadamard_lp_8x16(MACROBLOCK *x,
+ int max_blocks_high,
+ int max_blocks_wide,
+ int num_4x4_w, int step,
+ int block_step) {
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ const int bw = 4 * num_4x4_w;
+ const int num_4x4 = AOMMIN(num_4x4_w, max_blocks_wide);
+ int block = 0;
+
+ for (int r = 0; r < max_blocks_high; r += block_step) {
+ for (int c = 0; c < num_4x4; c += 2 * block_step) {
+ const int16_t *src_diff = &p->src_diff[(r * bw + c) << 2];
+ int16_t *low_coeff = (int16_t *)p->coeff + BLOCK_OFFSET(block);
+ aom_hadamard_lp_8x8_dual(src_diff, (ptrdiff_t)bw, low_coeff);
+ block += 2 * step;
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define DECLARE_BLOCK_YRD_HBD_VARS() \
+ tran_low_t *const coeff = coeff_buf; \
+ tran_low_t *const qcoeff = qcoeff_buf; \
+ tran_low_t *const dqcoeff = dqcoeff_buf;
+
+static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd(
+ MACROBLOCK *x, int *skippable, int step, int ncoeffs,
+ tran_low_t *const coeff, tran_low_t *const qcoeff,
+ tran_low_t *const dqcoeff, RD_STATS *this_rdc, int *eob_cost,
+ int tx_blk_id) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int is_txfm_skip = (ncoeffs == 0);
+ *skippable &= is_txfm_skip;
+ x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
+ *eob_cost += get_msb(ncoeffs + 1);
+
+ int64_t dummy;
+ if (ncoeffs == 1)
+ this_rdc->rate += (int)abs(qcoeff[0]);
+ else if (ncoeffs > 1)
+ this_rdc->rate += aom_satd(qcoeff, step << 4);
+ this_rdc->dist +=
+ av1_highbd_block_error(coeff, dqcoeff, step << 4, &dummy, xd->bd) >> 2;
+}
+#endif
+
+/*!\brief Calculates RD Cost using Hadamard transform.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD cost using the Hadamard transform. For low bit depth this
+ * function uses the low-precision (16-bit) set of functions, and 32-bit
+ * functions for high bit depth.
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] this_rdc Pointer to calculated RD Cost
+ * \param[in] skippable Pointer to a flag indicating possible tx skip
+ * \param[in] bsize Current block size
+ * \param[in] tx_size Transform size
+ *
+ * \remark Nothing is returned. Instead, the calculated RD cost is placed in
+ * \c this_rdc. The \c skippable flag is set if there are no non-zero
+ * quantized coefficients for the Hadamard transform.
+ */
+void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *pd = &xd->plane[AOM_PLANE_Y];
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int num_4x4_w = mi_size_wide[bsize];
+ const int num_4x4_h = mi_size_high[bsize];
+ const int step = 1 << (tx_size << 1);
+ const int block_step = (1 << tx_size);
+ const int row_step = step * num_4x4_w >> tx_size;
+ int block = 0;
+ const int max_blocks_wide =
+ num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
+ const int max_blocks_high =
+ num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
+ int eob_cost = 0;
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+ const int use_hbd = is_cur_buf_hbd(xd);
+ int num_blk_skip_w = num_4x4_w;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) {
+ aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+ p->src.stride, pd->dst.buf, pd->dst.stride);
+ } else {
+ aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+ }
+#else
+ aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+#endif
+
+ // Keep the intermediate value on the stack here. Writing directly to
+ // skippable causes speed regression due to load-and-store issues in
+ // update_yrd_loop_vars.
+ int temp_skippable = 1;
+ this_rdc->dist = 0;
+ this_rdc->rate = 0;
+ // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks
+ // can be done per function call. Hence the call of Hadamard txfm is
+ // abstracted here for the specified cases.
+ int is_tx_8x8_dual_applicable =
+ (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 &&
+ block_size_high[bsize] >= 8);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  // As of now, the dual implementation of the Hadamard txfm is only available
+  // for low bitdepth.
+ if (use_hbd) is_tx_8x8_dual_applicable = 0;
+#endif
+
+ if (is_tx_8x8_dual_applicable) {
+ aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w,
+ step, block_step);
+ }
+
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+ DECLARE_BLOCK_YRD_BUFFERS()
+ DECLARE_BLOCK_YRD_VARS()
+#if CONFIG_AV1_HIGHBITDEPTH
+ DECLARE_BLOCK_YRD_HBD_VARS()
+#else
+ (void)use_hbd;
+#endif
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (int r = 0; r < max_blocks_high; r += block_step) {
+ for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
+ DECLARE_LOOP_VARS_BLOCK_YRD()
+
+ switch (tx_size) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ case TX_16X16:
+ if (use_hbd) {
+ aom_hadamard_16x16(src_diff, diff_stride, coeff);
+ av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+ dqcoeff, p->dequant_QTX, eob,
+ // default_scan_fp_16x16_transpose and
+ // av1_default_iscan_fp_16x16_transpose have to be
+ // used together.
+ default_scan_fp_16x16_transpose,
+ av1_default_iscan_fp_16x16_transpose);
+ } else {
+ aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+ av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
+ p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+ p->dequant_QTX, eob,
+ // default_scan_lp_16x16_transpose and
+ // av1_default_iscan_lp_16x16_transpose have to be
+ // used together.
+ default_scan_lp_16x16_transpose,
+ av1_default_iscan_lp_16x16_transpose);
+ }
+ break;
+ case TX_8X8:
+ if (use_hbd) {
+ aom_hadamard_8x8(src_diff, diff_stride, coeff);
+ av1_quantize_fp(
+ coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+ p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
+ default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
+ } else {
+ if (is_tx_8x8_dual_applicable) {
+ // The coeffs are pre-computed for the whole block, so re-assign
+ // low_coeff to the appropriate location.
+ const int block_offset = BLOCK_OFFSET(block + s);
+ low_coeff = (int16_t *)p->coeff + block_offset;
+ } else {
+ aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+ }
+ av1_quantize_lp(
+ low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff,
+ low_dqcoeff, p->dequant_QTX, eob,
+ // default_scan_8x8_transpose and
+ // av1_default_iscan_8x8_transpose have to be used together.
+ default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
+ }
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ // In tx_size=4x4 case, aom_fdct4x4 and aom_fdct4x4_lp generate
+ // normal coefficients order, so we don't need to change the scan
+ // order here.
+ if (use_hbd) {
+ aom_fdct4x4(src_diff, coeff, diff_stride);
+ av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+ dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+ scan_order->iscan);
+ } else {
+ aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+ av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ scan_order->scan, scan_order->iscan);
+ }
+ break;
+#else
+ case TX_16X16:
+ aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+ av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ default_scan_lp_16x16_transpose,
+ av1_default_iscan_lp_16x16_transpose);
+ break;
+ case TX_8X8:
+ if (is_tx_8x8_dual_applicable) {
+ // The coeffs are pre-computed for the whole block, so re-assign
+ // low_coeff to the appropriate location.
+ const int block_offset = BLOCK_OFFSET(block + s);
+ low_coeff = (int16_t *)p->coeff + block_offset;
+ } else {
+ aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+ }
+ av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ default_scan_8x8_transpose,
+ av1_default_iscan_8x8_transpose);
+ break;
+ default:
+ aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+ av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+#endif
+ }
+ assert(*eob <= 1024);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd)
+ update_yrd_loop_vars_hbd(x, &temp_skippable, step, *eob, coeff, qcoeff,
+ dqcoeff, this_rdc, &eob_cost,
+ r * num_blk_skip_w + c);
+ else
+#endif
+ update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
+ low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
+ r * num_blk_skip_w + c);
+ }
+ block += row_step;
+ }
+
+ this_rdc->skip_txfm = *skippable = temp_skippable;
+ if (this_rdc->sse < INT64_MAX) {
+ this_rdc->sse = (this_rdc->sse << 6) >> 2;
+ if (temp_skippable) {
+      this_rdc->dist = this_rdc->sse;
+ return;
+ }
+ }
+
+ // If skippable is set, rate gets clobbered later.
+ this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
+ this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
+}
+
+// Explicitly enumerate the cases so the compiler can generate SIMD for the
+// function. According to the disassembler, gcc generates SSE codes for each of
+// the possible block sizes. The hottest case is tx_width 16, which takes up
+// about 8% of the self cycle of av1_nonrd_pick_inter_mode_sb. Since
+// av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the
+// potential room of improvement for writing AVX2 optimization is only 3% * 8% =
+// 0.24% of total encoding time.
+static AOM_INLINE void scale_square_buf_vals(int16_t *dst, int tx_width,
+ const int16_t *src,
+ int src_stride) {
+#define DO_SCALING \
+ do { \
+ for (int idy = 0; idy < tx_width; ++idy) { \
+ for (int idx = 0; idx < tx_width; ++idx) { \
+ dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \
+ } \
+ } \
+ } while (0)
+
+ if (tx_width == 4) {
+ DO_SCALING;
+ } else if (tx_width == 8) {
+ DO_SCALING;
+ } else if (tx_width == 16) {
+ DO_SCALING;
+ } else {
+ assert(0);
+ }
+
+#undef DO_SCALING
+}
+
+/*!\brief Calculates RD Cost when the block uses Identity transform.
+ * Note that this function is only for low bit depth encoding, since it is
+ * currently only called in real-time mode, which is built with high bit depth
+ * disabled (-DCONFIG_AV1_HIGHBITDEPTH=0).
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD cost using the low-precision (16-bit) set of functions.
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] pred_buf Pointer to the prediction buffer
+ * \param[in] pred_stride Stride for the prediction buffer
+ * \param[in] this_rdc Pointer to calculated RD Cost
+ * \param[in] skippable Pointer to a flag indicating possible tx skip
+ * \param[in] bsize Current block size
+ * \param[in] tx_size Transform size
+ *
+ * \remark Nothing is returned. Instead, the calculated RD cost is placed in
+ * \c this_rdc. The \c skippable flag is set if all coefficients are zero.
+ */
+void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf,
+ int pred_stride, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int num_4x4_w = mi_size_wide[bsize];
+ const int num_4x4_h = mi_size_high[bsize];
+ const int step = 1 << (tx_size << 1);
+ const int block_step = (1 << tx_size);
+ const int max_blocks_wide =
+ num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
+ const int max_blocks_high =
+ num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
+ int eob_cost = 0;
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+ const int num_blk_skip_w = num_4x4_w;
+ // Keep the intermediate value on the stack here. Writing directly to
+ // skippable causes speed regression due to load-and-store issues in
+ // update_yrd_loop_vars.
+ int temp_skippable = 1;
+ int tx_wd = 0;
+ const SCAN_ORDER *scan_order = NULL;
+ switch (tx_size) {
+ case TX_64X64:
+ assert(0); // Not implemented
+ break;
+ case TX_32X32:
+ assert(0); // Not used
+ break;
+ case TX_16X16:
+ scan_order = &av1_fast_idtx_scan_order_16x16;
+ tx_wd = 16;
+ break;
+ case TX_8X8:
+ scan_order = &av1_fast_idtx_scan_order_8x8;
+ tx_wd = 8;
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ scan_order = &av1_fast_idtx_scan_order_4x4;
+ tx_wd = 4;
+ break;
+ }
+ assert(scan_order != NULL);
+
+ this_rdc->dist = 0;
+ this_rdc->rate = 0;
+ aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pred_buf, pred_stride);
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ DECLARE_BLOCK_YRD_BUFFERS()
+ DECLARE_BLOCK_YRD_VARS()
+ for (int r = 0; r < max_blocks_high; r += block_step) {
+ for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
+ DECLARE_LOOP_VARS_BLOCK_YRD()
+ scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride);
+ av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
+ p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX,
+ eob, scan_order->scan, scan_order->iscan);
+ assert(*eob <= 1024);
+ update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
+ low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
+ r * num_blk_skip_w + c);
+ }
+ }
+ this_rdc->skip_txfm = *skippable = temp_skippable;
+ if (this_rdc->sse < INT64_MAX) {
+ this_rdc->sse = (this_rdc->sse << 6) >> 2;
+ if (temp_skippable) {
+      this_rdc->dist = this_rdc->sse;
+ return;
+ }
+ }
+ // If skippable is set, rate gets clobbered later.
+ this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
+ this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
+}
+
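+// Models the rate and distortion of the chroma planes from their variance and
+// SSE, without a transform search; returns the total SSE over the evaluated
+// planes.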
+int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ RD_STATS *this_rdc, int start_plane,
+ int stop_plane) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ int plane;
+ int64_t tot_sse = 0;
+
+ this_rdc->rate = 0;
+ this_rdc->dist = 0;
+ this_rdc->skip_txfm = 0;
+
+ for (plane = start_plane; plane <= stop_plane; ++plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const uint32_t dc_quant = p->dequant_QTX[0];
+ const uint32_t ac_quant = p->dequant_QTX[1];
+ const BLOCK_SIZE bs = plane_bsize;
+ unsigned int var;
+ if (!x->color_sensitivity[COLOR_SENS_IDX(plane)]) continue;
+
+ var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, &sse);
+ assert(sse >= var);
+ tot_sse += sse;
+
+ av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+ dc_quant >> 3, &rate, &dist);
+
+ this_rdc->rate += rate >> 1;
+ this_rdc->dist += dist << 3;
+
+ av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3,
+ &rate, &dist);
+
+ this_rdc->rate += rate;
+ this_rdc->dist += dist << 4;
+ }
+
+ if (this_rdc->rate == 0) {
+ this_rdc->skip_txfm = 1;
+ }
+
+ if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >=
+ RDCOST(x->rdmult, 0, tot_sse << 4)) {
+ this_rdc->rate = 0;
+ this_rdc->dist = tot_sse << 4;
+ this_rdc->skip_txfm = 1;
+ }
+
+ return tot_sse;
+}
+
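+// Generates the intra luma prediction for the whole block, one max-sized
+// transform block at a time.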
+static void compute_intra_yprediction(const AV1_COMMON *cm,
+ PREDICTION_MODE mode, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd) {
+ const SequenceHeader *seq_params = cm->seq_params;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ uint8_t *const src_buf_base = p->src.buf;
+ uint8_t *const dst_buf_base = pd->dst.buf;
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ int plane = 0;
+ int row, col;
+  // Block and transform sizes, in number of 4x4 blocks (log 2):
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
+  // The transform size varies per plane; look it up in a common way.
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ // If mb_to_right_edge is < 0 we are in a situation in which
+ // the current block size extends into the UMV and we won't
+ // visit the sub blocks that are wholly within the UMV.
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (row = 0; row < max_blocks_high; row += (1 << tx_size)) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
+ p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
+ pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0,
+ FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride,
+ 0, 0, plane);
+ }
+ }
+ p->src.buf = src_buf_base;
+ pd->dst.buf = dst_buf_base;
+}
+
+// Checks whether the intra mode needs to be pruned based on the
+// 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_src_sad'
+// speed features.
+static INLINE bool is_prune_intra_mode(
+ AV1_COMP *cpi, int mode_index, int force_intra_check, BLOCK_SIZE bsize,
+ uint8_t segment_id, SOURCE_SAD source_sad_nonrd,
+ uint8_t color_sensitivity[MAX_MB_PLANE - 1]) {
+ const PREDICTION_MODE this_mode = intra_mode_list[mode_index];
+ if (mode_index > 2 || force_intra_check == 0) {
+ if (!((1 << this_mode) & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]))
+ return true;
+
+ if (this_mode == DC_PRED) return false;
+
+ if (!cpi->sf.rt_sf.prune_hv_pred_modes_using_src_sad) return false;
+
+ const bool has_color_sensitivity =
+ color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] &&
+ color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)];
+ if (has_color_sensitivity &&
+ (cpi->rc.frame_source_sad > 1.1 * cpi->rc.avg_source_sad ||
+ cyclic_refresh_segment_id_boosted(segment_id) ||
+ source_sad_nonrd > kMedSad))
+ return false;
+
+ return true;
+ }
+ return false;
+}
+
+/*!\brief Estimation of RD cost of an intra mode for Non-RD optimized case.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost for an intra mode for a single TX block using Hadamard
+ * transform.
+ * \param[in] plane Color plane
+ * \param[in] block Index of a TX block in a prediction block
+ * \param[in] row Row of a current TX block
+ * \param[in] col Column of a current TX block
+ * \param[in] plane_bsize Block size of a current prediction block
+ * \param[in] tx_size Transform size
+ * \param[in] arg Pointer to a structure that holds parameters
+ * for intra mode search
+ *
+ * \remark Nothing is returned. Instead, the rate and distortion of this
+ * transform block are accumulated in \c args->rdc.
+ */
+void av1_estimate_block_intra(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct estimate_block_intra_args *const args = arg;
+ AV1_COMP *const cpi = args->cpi;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+ uint8_t *const src_buf_base = p->src.buf;
+ uint8_t *const dst_buf_base = pd->dst.buf;
+ const int64_t src_stride = p->src.stride;
+ const int64_t dst_stride = pd->dst.stride;
+
+ (void)block;
+
+ av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+
+ if (args->prune_mode_based_on_sad) {
+ unsigned int this_sad = cpi->ppi->fn_ptr[plane_bsize].sdf(
+ p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride);
+ const unsigned int sad_threshold =
+ args->best_sad != UINT_MAX ? args->best_sad + (args->best_sad >> 4)
+ : UINT_MAX;
+ // Skip the evaluation of current mode if its SAD is more than a threshold.
+ if (this_sad > sad_threshold) {
+ // For the current mode, set rate and distortion to maximum possible
+ // values and return.
+ // Note: args->rdc->rate is checked in av1_nonrd_pick_intra_mode() to skip
+ // the evaluation of the current mode.
+ args->rdc->rate = INT_MAX;
+ args->rdc->dist = INT64_MAX;
+ return;
+ }
+ if (this_sad < args->best_sad) {
+ args->best_sad = this_sad;
+ }
+ }
+
+ RD_STATS this_rdc;
+ av1_invalid_rd_stats(&this_rdc);
+
+ p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
+ pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
+
+ if (plane == 0) {
+ av1_block_yrd(x, &this_rdc, &args->skippable, bsize_tx,
+ AOMMIN(tx_size, TX_16X16));
+ } else {
+ av1_model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, plane, plane);
+ }
+
+ p->src.buf = src_buf_base;
+ pd->dst.buf = dst_buf_base;
+ assert(args->rdc->rate != INT_MAX && args->rdc->dist != INT64_MAX);
+ args->rdc->rate += this_rdc.rate;
+ args->rdc->dist += this_rdc.dist;
+}
+
+/*!\brief Estimates best intra mode for inter mode search
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ *
+ * Using heuristics based on the best inter mode, block size, and other
+ * factors, decides whether to check intra modes. If so, estimates and selects
+ * the best intra mode from a reduced set of intra modes (at most 4 intra
+ * modes are checked).
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] bsize Current block size
+ * \param[in] best_early_term Flag, indicating that TX for the
+ * best inter mode was skipped
+ * \param[in] ref_cost_intra Cost of signalling intra mode
+ * \param[in] reuse_prediction Flag, indicating prediction re-use
+ * \param[in] orig_dst Original destination buffer
+ * \param[in] tmp_buffers Pointer to a temporary buffers for
+ * prediction re-use
+ * \param[out] this_mode_pred Pointer to store prediction buffer
+ * for prediction re-use
+ * \param[in] best_rdc Pointer to RD cost for the best
+ * selected intra mode
+ * \param[in] best_pickmode Pointer to a structure containing
+ * best mode picked so far
+ * \param[in] ctx Pointer to structure holding coding
+ * contexts and modes for the block
+ *
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
+ * \c best_rdc and best selected mode is placed to \c best_pickmode
+ *
+ */
+void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int best_early_term, unsigned int ref_cost_intra,
+ int reuse_prediction, struct buf_2d *orig_dst,
+ PRED_BUFFER *tmp_buffers,
+ PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+ BEST_PICKMODE *best_pickmode,
+ PICK_MODE_CONTEXT *ctx) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const unsigned char segment_id = mi->segment_id;
+ const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+ const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
+ const bool is_screen_content =
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+
+ const CommonQuantParams *quant_params = &cm->quant_params;
+
+ RD_STATS this_rdc;
+
+ int intra_cost_penalty = av1_get_intra_cost_penalty(
+ quant_params->base_qindex, quant_params->y_dc_delta_q,
+ cm->seq_params->bit_depth);
+ int64_t inter_mode_thresh =
+ RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
+ int perform_intra_pred = rt_sf->check_intra_pred_nonrd;
+ int force_intra_check = 0;
+  // For a spatial enhancement layer: turn off intra prediction if the
+  // previous spatial layer (used as the golden reference) is not chosen as
+  // the best reference. Only do this for temporal enhancement layers and on
+  // non-key frames.
+ if (cpi->svc.spatial_layer_id > 0 &&
+ best_pickmode->best_ref_frame != GOLDEN_FRAME &&
+ cpi->svc.temporal_layer_id > 0 &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)
+ perform_intra_pred = 0;
+
+ int do_early_exit_rdthresh = 1;
+
+ uint32_t spatial_var_thresh = 50;
+ int motion_thresh = 32;
+ // Adjust thresholds to make intra mode likely tested if the other
+ // references (golden, alt) are skipped/not checked. For now always
+ // adjust for svc mode.
+ if (cpi->ppi->use_svc || (rt_sf->use_nonrd_altref_frame == 0 &&
+ rt_sf->nonrd_prune_ref_frame_search > 0)) {
+ spatial_var_thresh = 150;
+ motion_thresh = 0;
+ }
+
+ // Some adjustments to checking intra mode based on source variance.
+ if (x->source_variance < spatial_var_thresh) {
+ // If the best inter mode is large motion or non-LAST ref reduce intra cost
+ // penalty, so intra mode is more likely tested.
+ if (best_rdc->rdcost != INT64_MAX &&
+ (best_pickmode->best_ref_frame != LAST_FRAME ||
+ abs(mi->mv[0].as_mv.row) >= motion_thresh ||
+ abs(mi->mv[0].as_mv.col) >= motion_thresh)) {
+ intra_cost_penalty = intra_cost_penalty >> 2;
+ inter_mode_thresh =
+ RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
+ do_early_exit_rdthresh = 0;
+ }
+ if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
+ x->content_state_sb.source_sad_nonrd >= kHighSad) ||
+ (is_screen_content && x->source_variance < 50 &&
+ ((bsize >= BLOCK_32X32 &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad) ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)))
+ force_intra_check = 1;
+    // For big blocks, intra is worth checking (since only DC will be
+    // checked), even if best_early_term is set.
+ if (bsize >= BLOCK_32X32) best_early_term = 0;
+ } else if (rt_sf->source_metrics_sb_nonrd &&
+ x->content_state_sb.source_sad_nonrd <= kLowSad) {
+ perform_intra_pred = 0;
+ }
+
+ if (best_rdc->skip_txfm && best_pickmode->best_mode_initial_skip_flag) {
+ if (rt_sf->skip_intra_pred == 1 && best_pickmode->best_mode != NEWMV)
+ perform_intra_pred = 0;
+ else if (rt_sf->skip_intra_pred == 2)
+ perform_intra_pred = 0;
+ }
+
+ if (!(best_rdc->rdcost == INT64_MAX || force_intra_check ||
+ (perform_intra_pred && !best_early_term &&
+ bsize <= cpi->sf.part_sf.max_intra_bsize))) {
+ return;
+ }
+
+  // Early exit based on the RD cost calculated using the known rate. When
+  // is_screen_content is true, more bias is given to intra modes, so a more
+  // conservative threshold is used for this early exit.
+ const int64_t known_rd = is_screen_content
+ ? CALC_BIASED_RDCOST(inter_mode_thresh)
+ : inter_mode_thresh;
+ if (known_rd > best_rdc->rdcost) return;
+
+ struct estimate_block_intra_args args;
+ init_estimate_block_intra_args(&args, cpi, x);
+ TX_SIZE intra_tx_size = AOMMIN(
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+ TX_16X16);
+ if (is_screen_content && cpi->rc.high_source_sad &&
+ x->source_variance > spatial_var_thresh && bsize <= BLOCK_16X16)
+ intra_tx_size = TX_4X4;
+
+ PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ if (reuse_prediction && best_pred != NULL) {
+ const int bh = block_size_high[bsize];
+ const int bw = block_size_wide[bsize];
+ if (best_pred->data == orig_dst->buf) {
+ *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)];
+ aom_convolve_copy(best_pred->data, best_pred->stride,
+ (*this_mode_pred)->data, (*this_mode_pred)->stride, bw,
+ bh);
+ best_pickmode->best_pred = *this_mode_pred;
+ }
+ }
+ pd->dst = *orig_dst;
+
+ for (int midx = 0; midx < RTC_INTRA_MODES; ++midx) {
+ const PREDICTION_MODE this_mode = intra_mode_list[midx];
+ const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+ const int64_t mode_rd_thresh = rd_threshes[mode_index];
+
+ if (is_prune_intra_mode(cpi, midx, force_intra_check, bsize, segment_id,
+ x->content_state_sb.source_sad_nonrd,
+ x->color_sensitivity))
+ continue;
+
+ if (is_screen_content && rt_sf->source_metrics_sb_nonrd) {
+      // For spatially flat blocks with zero motion, only check DC mode.
+ if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
+ x->source_variance == 0 && this_mode != DC_PRED)
+ continue;
+ // Only test Intra for big blocks if spatial_variance is small.
+ else if (bsize > BLOCK_32X32 && x->source_variance > 50)
+ continue;
+ }
+
+ if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh,
+ rd_thresh_freq_fact[mode_index]) &&
+ (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
+ continue;
+ }
+ const BLOCK_SIZE uv_bsize =
+ get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+
+ mi->mode = this_mode;
+ mi->ref_frame[0] = INTRA_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+
+ av1_invalid_rd_stats(&this_rdc);
+ args.mode = this_mode;
+ args.skippable = 1;
+ args.rdc = &this_rdc;
+ mi->tx_size = intra_tx_size;
+ compute_intra_yprediction(cm, this_mode, bsize, x, xd);
+ // Look into selecting tx_size here, based on prediction residual.
+ av1_block_yrd(x, &this_rdc, &args.skippable, bsize, mi->tx_size);
+ // TODO(kyslov@) Need to account for skippable
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
+ av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_U,
+ av1_estimate_block_intra, &args);
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
+ av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_V,
+ av1_estimate_block_intra, &args);
+ }
+
+ int mode_cost = 0;
+ if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
+ mode_cost +=
+ x->mode_costs.angle_delta_cost[this_mode - V_PRED]
+ [MAX_ANGLE_DELTA +
+ mi->angle_delta[PLANE_TYPE_Y]];
+ }
+ if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+ mode_cost += x->mode_costs.filter_intra_cost[bsize][0];
+ }
+ this_rdc.rate += ref_cost_intra;
+ this_rdc.rate += intra_cost_penalty;
+ this_rdc.rate += mode_cost;
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+ if (is_screen_content && rt_sf->source_metrics_sb_nonrd) {
+ // For blocks with low spatial variance and color sad,
+ // favor the intra-modes, only on scene/slide change.
+ if (cpi->rc.high_source_sad && x->source_variance < 800 &&
+ (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]))
+ this_rdc.rdcost = CALC_BIASED_RDCOST(this_rdc.rdcost);
+ // Otherwise bias against intra for blocks with zero
+ // motion and no color, on non-scene/slide changes.
+ else if (!cpi->rc.high_source_sad && x->source_variance > 0 &&
+ x->content_state_sb.source_sad_nonrd == kZeroSad &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
+ this_rdc.rdcost = (3 * this_rdc.rdcost) >> 1;
+ }
+
+ if (this_rdc.rdcost < best_rdc->rdcost) {
+ *best_rdc = this_rdc;
+ best_pickmode->best_mode = this_mode;
+ best_pickmode->best_tx_size = mi->tx_size;
+ best_pickmode->best_ref_frame = INTRA_FRAME;
+ best_pickmode->best_second_ref_frame = NONE;
+ best_pickmode->best_mode_skip_txfm = this_rdc.skip_txfm;
+ mi->uv_mode = this_mode;
+ mi->mv[0].as_int = INVALID_MV;
+ mi->mv[1].as_int = INVALID_MV;
+ if (!this_rdc.skip_txfm)
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+ if (best_pickmode->best_ref_frame == INTRA_FRAME)
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ mi->tx_size = best_pickmode->best_tx_size;
+}
diff --git a/third_party/aom/av1/encoder/nonrd_opt.h b/third_party/aom/av1/encoder/nonrd_opt.h
new file mode 100644
index 0000000000..a53578ebad
--- /dev/null
+++ b/third_party/aom/av1/encoder/nonrd_opt.h
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_NONRD_OPT_H_
+#define AOM_AV1_ENCODER_NONRD_OPT_H_
+
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/rdopt.h"
+
+#define RTC_INTER_MODES (4)
+#define RTC_INTRA_MODES (4)
+#define RTC_MODES (AOMMAX(RTC_INTER_MODES, RTC_INTRA_MODES))
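+// Scales an RD cost by 7/8 to bias the decision toward the associated mode
+// (e.g., toward intra modes for screen content).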
+#define CALC_BIASED_RDCOST(rdcost) (7 * (rdcost) >> 3)
+#define NUM_COMP_INTER_MODES_RT (6)
+#define NUM_INTER_MODES 12
+#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \
+ (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false)
+#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16)
+#define FILTER_SEARCH_SIZE 2
+#if !CONFIG_REALTIME_ONLY
+#define MOTION_MODE_SEARCH_SIZE 2
+#endif
+
+extern int g_pick_inter_mode_cnt;
+/*!\cond */
+typedef struct {
+ uint8_t *data;
+ int stride;
+ int in_use;
+} PRED_BUFFER;
+
+typedef struct {
+ PRED_BUFFER *best_pred;
+ PREDICTION_MODE best_mode;
+ TX_SIZE best_tx_size;
+ TX_TYPE tx_type;
+ MV_REFERENCE_FRAME best_ref_frame;
+ MV_REFERENCE_FRAME best_second_ref_frame;
+ uint8_t best_mode_skip_txfm;
+ uint8_t best_mode_initial_skip_flag;
+ int_interpfilters best_pred_filter;
+ MOTION_MODE best_motion_mode;
+ WarpedMotionParams wm_params;
+ int num_proj_ref;
+ PALETTE_MODE_INFO pmi;
+ int64_t best_sse;
+} BEST_PICKMODE;
+
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame;
+ PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame[2];
+ PREDICTION_MODE pred_mode;
+} COMP_REF_MODE;
+
+struct estimate_block_intra_args {
+ AV1_COMP *cpi;
+ MACROBLOCK *x;
+ PREDICTION_MODE mode;
+ int skippable;
+ RD_STATS *rdc;
+ unsigned int best_sad;
+ bool prune_mode_based_on_sad;
+};
+/*!\endcond */
+
+/*!\brief Structure to store parameters and statistics used in non-rd inter mode
+ * evaluation.
+ */
+typedef struct {
+ //! Structure to hold best inter mode data
+ BEST_PICKMODE best_pickmode;
+  //! Structure to hold the RD cost of the current mode
+ RD_STATS this_rdc;
+ //! Pointer to the RD Cost for the best mode found so far
+ RD_STATS best_rdc;
+ //! Distortion of chroma planes for all modes and reference frames
+ int64_t uv_dist[RTC_INTER_MODES][REF_FRAMES];
+ //! Buffer to hold predicted block for all reference frames and planes
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ //! Array to hold variance of all modes and reference frames
+ unsigned int vars[RTC_INTER_MODES][REF_FRAMES];
+ //! Array to hold ref cost of single reference mode for all ref frames
+ unsigned int ref_costs_single[REF_FRAMES];
+ //! Array to hold motion vector for all modes and reference frames
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+ //! Array to hold best mv for all modes and reference frames
+ int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES];
+ //! Array to hold inter mode cost of single ref mode for all ref frames
+ int single_inter_mode_costs[RTC_INTER_MODES][REF_FRAMES];
+ //! Array to hold use reference frame mask for each reference frame
+ int use_ref_frame_mask[REF_FRAMES];
+ //! Array to hold flags of evaluated modes for each reference frame
+ uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
+ //! Array to hold flag indicating if scaled reference frame is used.
+ bool use_scaled_ref_frame[REF_FRAMES];
+} InterModeSearchStateNonrd;
+
+static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2,
+ 2, 2, 3, 3, 3, 4,
+ 4, 4, 5, 5 };
+static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1,
+ 2, 3, 2, 3, 4, 3,
+ 4, 5, 4, 5 };
+
+static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
+ SMOOTH_PRED };
+
+static const PREDICTION_MODE inter_mode_list[] = { NEARESTMV, NEARMV, GLOBALMV,
+ NEWMV };
+
+static const THR_MODES mode_idx[REF_FRAMES][RTC_MODES] = {
+ { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH },
+ { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV },
+ { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 },
+ { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 },
+ { THR_NEARESTG, THR_NEARG, THR_GLOBALG, THR_NEWG },
+ { THR_NEARESTB, THR_NEARB, THR_GLOBALB, THR_NEWB },
+ { THR_NEARESTA2, THR_NEARA2, THR_GLOBALA2, THR_NEWA2 },
+ { THR_NEARESTA, THR_NEARA, THR_GLOBALA, THR_NEWA },
+};
+
+// GLOBALMV in the set below is in fact ZEROMV as we don't do global ME in RT
+// mode
+static const REF_MODE ref_mode_set[NUM_INTER_MODES] = {
+ { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV },
+ { LAST_FRAME, GLOBALMV }, { LAST_FRAME, NEWMV },
+ { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV },
+ { GOLDEN_FRAME, GLOBALMV }, { GOLDEN_FRAME, NEWMV },
+ { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV },
+ { ALTREF_FRAME, GLOBALMV }, { ALTREF_FRAME, NEWMV },
+};
+
+static const COMP_REF_MODE comp_ref_mode_set[NUM_COMP_INTER_MODES_RT] = {
+ { { LAST_FRAME, GOLDEN_FRAME }, GLOBAL_GLOBALMV },
+ { { LAST_FRAME, GOLDEN_FRAME }, NEAREST_NEARESTMV },
+ { { LAST_FRAME, LAST2_FRAME }, GLOBAL_GLOBALMV },
+ { { LAST_FRAME, LAST2_FRAME }, NEAREST_NEARESTMV },
+ { { LAST_FRAME, ALTREF_FRAME }, GLOBAL_GLOBALMV },
+ { { LAST_FRAME, ALTREF_FRAME }, NEAREST_NEARESTMV },
+};
+
+static const int_interpfilters filters_ref_set[9] = {
+ [0].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR },
+ [1].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
+ [2].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH },
+ [3].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_REGULAR },
+ [4].as_filters = { MULTITAP_SHARP, MULTITAP_SHARP },
+ [5].as_filters = { EIGHTTAP_REGULAR, MULTITAP_SHARP },
+ [6].as_filters = { MULTITAP_SHARP, EIGHTTAP_REGULAR },
+ [7].as_filters = { EIGHTTAP_SMOOTH, MULTITAP_SHARP },
+ [8].as_filters = { MULTITAP_SHARP, EIGHTTAP_SMOOTH }
+};
+
+enum {
+ // INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+ INTER_NEAREST = (1 << NEARESTMV),
+ INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+ INTER_NEAREST_NEAR = (1 << NEARESTMV) | (1 << NEARMV),
+ INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV),
+};
+
+// The original scan order (default_scan_8x8) is modified according to the extra
+// transpose in hadamard c implementation, i.e., aom_hadamard_lp_8x8_c and
+// aom_hadamard_8x8_c.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8_transpose[64]) = {
+ 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
+};
+
+// The original scan order (av1_default_iscan_8x8) is modified to match the
+// hadamard AVX2 implementations, i.e., aom_hadamard_lp_8x8_avx2 and
+// aom_hadamard_8x8_avx2. Since the hadamard AVX2 implementation modifies the
+// order of the coefficients, the normal scan order is no longer guaranteed to
+// scan low coefficients first, so we modify the scan order accordingly.
+// Note that this one has to be used together with default_scan_8x8_transpose.
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_default_iscan_8x8_transpose[64]) = {
+ 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36,
+ 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49,
+ 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58,
+ 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63
+};
+
+// The original scan order (default_scan_16x16) is modified according to the
+// extra transpose in hadamard c implementation in lp case, i.e.,
+// aom_hadamard_lp_16x16_c.
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_lp_16x16_transpose[256]) = {
+ 0, 8, 2, 4, 10, 16, 24, 18, 12, 6, 64, 14, 20, 26, 32,
+ 40, 34, 28, 22, 72, 66, 68, 74, 80, 30, 36, 42, 48, 56, 50,
+ 44, 38, 88, 82, 76, 70, 128, 78, 84, 90, 96, 46, 52, 58, 1,
+ 9, 3, 60, 54, 104, 98, 92, 86, 136, 130, 132, 138, 144, 94, 100,
+ 106, 112, 62, 5, 11, 17, 25, 19, 13, 7, 120, 114, 108, 102, 152,
+ 146, 140, 134, 192, 142, 148, 154, 160, 110, 116, 122, 65, 15, 21, 27,
+ 33, 41, 35, 29, 23, 73, 67, 124, 118, 168, 162, 156, 150, 200, 194,
+ 196, 202, 208, 158, 164, 170, 176, 126, 69, 75, 81, 31, 37, 43, 49,
+ 57, 51, 45, 39, 89, 83, 77, 71, 184, 178, 172, 166, 216, 210, 204,
+ 198, 206, 212, 218, 224, 174, 180, 186, 129, 79, 85, 91, 97, 47, 53,
+ 59, 61, 55, 105, 99, 93, 87, 137, 131, 188, 182, 232, 226, 220, 214,
+ 222, 228, 234, 240, 190, 133, 139, 145, 95, 101, 107, 113, 63, 121, 115,
+ 109, 103, 153, 147, 141, 135, 248, 242, 236, 230, 238, 244, 250, 193, 143,
+ 149, 155, 161, 111, 117, 123, 125, 119, 169, 163, 157, 151, 201, 195, 252,
+ 246, 254, 197, 203, 209, 159, 165, 171, 177, 127, 185, 179, 173, 167, 217,
+ 211, 205, 199, 207, 213, 219, 225, 175, 181, 187, 189, 183, 233, 227, 221,
+ 215, 223, 229, 235, 241, 191, 249, 243, 237, 231, 239, 245, 251, 253, 247,
+ 255
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// The original scan order (default_scan_16x16) is modified according to the
+// extra shift in the full-precision (fp) Hadamard C implementation, i.e.,
+// aom_hadamard_16x16_c. Note that the 16x16 lp and fp Hadamard transforms
+// generate different outputs, so they are handled separately.
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_fp_16x16_transpose[256]) = {
+ 0, 4, 2, 8, 6, 16, 20, 18, 12, 10, 64, 14, 24, 22, 32,
+ 36, 34, 28, 26, 68, 66, 72, 70, 80, 30, 40, 38, 48, 52, 50,
+ 44, 42, 84, 82, 76, 74, 128, 78, 88, 86, 96, 46, 56, 54, 1,
+ 5, 3, 60, 58, 100, 98, 92, 90, 132, 130, 136, 134, 144, 94, 104,
+ 102, 112, 62, 9, 7, 17, 21, 19, 13, 11, 116, 114, 108, 106, 148,
+ 146, 140, 138, 192, 142, 152, 150, 160, 110, 120, 118, 65, 15, 25, 23,
+ 33, 37, 35, 29, 27, 69, 67, 124, 122, 164, 162, 156, 154, 196, 194,
+ 200, 198, 208, 158, 168, 166, 176, 126, 73, 71, 81, 31, 41, 39, 49,
+ 53, 51, 45, 43, 85, 83, 77, 75, 180, 178, 172, 170, 212, 210, 204,
+ 202, 206, 216, 214, 224, 174, 184, 182, 129, 79, 89, 87, 97, 47, 57,
+ 55, 61, 59, 101, 99, 93, 91, 133, 131, 188, 186, 228, 226, 220, 218,
+ 222, 232, 230, 240, 190, 137, 135, 145, 95, 105, 103, 113, 63, 117, 115,
+ 109, 107, 149, 147, 141, 139, 244, 242, 236, 234, 238, 248, 246, 193, 143,
+ 153, 151, 161, 111, 121, 119, 125, 123, 165, 163, 157, 155, 197, 195, 252,
+ 250, 254, 201, 199, 209, 159, 169, 167, 177, 127, 181, 179, 173, 171, 213,
+ 211, 205, 203, 207, 217, 215, 225, 175, 185, 183, 189, 187, 229, 227, 221,
+ 219, 223, 233, 231, 241, 191, 245, 243, 237, 235, 239, 249, 247, 253, 251,
+ 255
+};
+#endif
+
+// The original scan order (av1_default_iscan_16x16) is modified to match the
+// Hadamard AVX2 implementation, i.e., aom_hadamard_lp_16x16_avx2. The AVX2
+// implementation reorders the coefficients, so the normal scan order is no
+// longer guaranteed to scan low-frequency coefficients first; the scan order
+// is therefore modified to compensate. Note that this table has to be used
+// together with default_scan_lp_16x16_transpose.
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_default_iscan_lp_16x16_transpose[256]) = {
+ 0, 44, 2, 46, 3, 63, 9, 69, 1, 45, 4, 64, 8, 68, 11,
+ 87, 5, 65, 7, 67, 12, 88, 18, 94, 6, 66, 13, 89, 17, 93,
+ 24, 116, 14, 90, 16, 92, 25, 117, 31, 123, 15, 91, 26, 118, 30,
+ 122, 41, 148, 27, 119, 29, 121, 42, 149, 48, 152, 28, 120, 43, 150,
+ 47, 151, 62, 177, 10, 86, 20, 96, 21, 113, 35, 127, 19, 95, 22,
+ 114, 34, 126, 37, 144, 23, 115, 33, 125, 38, 145, 52, 156, 32, 124,
+ 39, 146, 51, 155, 58, 173, 40, 147, 50, 154, 59, 174, 73, 181, 49,
+ 153, 60, 175, 72, 180, 83, 198, 61, 176, 71, 179, 84, 199, 98, 202,
+ 70, 178, 85, 200, 97, 201, 112, 219, 36, 143, 54, 158, 55, 170, 77,
+ 185, 53, 157, 56, 171, 76, 184, 79, 194, 57, 172, 75, 183, 80, 195,
+ 102, 206, 74, 182, 81, 196, 101, 205, 108, 215, 82, 197, 100, 204, 109,
+ 216, 131, 223, 99, 203, 110, 217, 130, 222, 140, 232, 111, 218, 129, 221,
+ 141, 233, 160, 236, 128, 220, 142, 234, 159, 235, 169, 245, 78, 193, 104,
+ 208, 105, 212, 135, 227, 103, 207, 106, 213, 134, 226, 136, 228, 107, 214,
+ 133, 225, 137, 229, 164, 240, 132, 224, 138, 230, 163, 239, 165, 241, 139,
+ 231, 162, 238, 166, 242, 189, 249, 161, 237, 167, 243, 188, 248, 190, 250,
+ 168, 244, 187, 247, 191, 251, 210, 254, 186, 246, 192, 252, 209, 253, 211,
+ 255
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// The original scan order (av1_default_iscan_16x16) is modified to match the
+// Hadamard AVX2 implementation, i.e., aom_hadamard_16x16_avx2. The AVX2
+// implementation reorders the coefficients, so the normal scan order is no
+// longer guaranteed to scan low-frequency coefficients first; the scan order
+// is therefore modified to compensate. Note that this table has to be used
+// together with default_scan_fp_16x16_transpose.
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_default_iscan_fp_16x16_transpose[256]) = {
+ 0, 44, 2, 46, 1, 45, 4, 64, 3, 63, 9, 69, 8, 68, 11,
+ 87, 5, 65, 7, 67, 6, 66, 13, 89, 12, 88, 18, 94, 17, 93,
+ 24, 116, 14, 90, 16, 92, 15, 91, 26, 118, 25, 117, 31, 123, 30,
+ 122, 41, 148, 27, 119, 29, 121, 28, 120, 43, 150, 42, 149, 48, 152,
+ 47, 151, 62, 177, 10, 86, 20, 96, 19, 95, 22, 114, 21, 113, 35,
+ 127, 34, 126, 37, 144, 23, 115, 33, 125, 32, 124, 39, 146, 38, 145,
+ 52, 156, 51, 155, 58, 173, 40, 147, 50, 154, 49, 153, 60, 175, 59,
+ 174, 73, 181, 72, 180, 83, 198, 61, 176, 71, 179, 70, 178, 85, 200,
+ 84, 199, 98, 202, 97, 201, 112, 219, 36, 143, 54, 158, 53, 157, 56,
+ 171, 55, 170, 77, 185, 76, 184, 79, 194, 57, 172, 75, 183, 74, 182,
+ 81, 196, 80, 195, 102, 206, 101, 205, 108, 215, 82, 197, 100, 204, 99,
+ 203, 110, 217, 109, 216, 131, 223, 130, 222, 140, 232, 111, 218, 129, 221,
+ 128, 220, 142, 234, 141, 233, 160, 236, 159, 235, 169, 245, 78, 193, 104,
+ 208, 103, 207, 106, 213, 105, 212, 135, 227, 134, 226, 136, 228, 107, 214,
+ 133, 225, 132, 224, 138, 230, 137, 229, 164, 240, 163, 239, 165, 241, 139,
+ 231, 162, 238, 161, 237, 167, 243, 166, 242, 189, 249, 188, 248, 190, 250,
+ 168, 244, 187, 247, 186, 246, 192, 252, 191, 251, 210, 254, 209, 253, 211,
+ 255
+};
+#endif
+
+// For entropy coding, IDTX shares the scan orders of the other 2D transforms,
+// but the fastest way to calculate the IDTX transform (i.e., no transposes)
+// results in coefficients that are a transposition of the entropy coding
+// versions. These tables are used as substitutes for the scan order in the
+// faster version of IDTX.
+
+// Must be used together with av1_fast_idtx_iscan_4x4
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_fast_idtx_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6,
+ 9, 12, 13, 10, 7, 11, 14, 15 };
+
+// Must be used together with av1_fast_idtx_scan_4x4
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_fast_idtx_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12,
+ 3, 8, 11, 13, 9, 10, 14, 15 };
+
+static const SCAN_ORDER av1_fast_idtx_scan_order_4x4 = {
+ av1_fast_idtx_scan_4x4, av1_fast_idtx_iscan_4x4
+};
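+
+// The same inverse-permutation relation holds for each scan/iscan pair in
+// this file; for the 4x4 pair above it can be checked as follows (a sketch,
+// illustrative only):
+#if 0
+static void check_fast_idtx_pair_4x4(void) {
+  for (int k = 0; k < 16; ++k)
+    assert(av1_fast_idtx_iscan_4x4[av1_fast_idtx_scan_4x4[k]] == k);
+}
+#endif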
+
+// Must be used together with av1_fast_idtx_iscan_8x8
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_8x8[64]) = {
+ 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+// Must be used together with av1_fast_idtx_scan_8x8
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_8x8[64]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42,
+ 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53,
+ 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
+ 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63
+};
+
+static const SCAN_ORDER av1_fast_idtx_scan_order_8x8 = {
+ av1_fast_idtx_scan_8x8, av1_fast_idtx_iscan_8x8
+};
+
+// Must be used together with av1_fast_idtx_iscan_16x16
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_16x16[256]) = {
+ 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4,
+ 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22,
+ 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100,
+ 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27,
+ 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46,
+ 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242,
+ 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94,
+ 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185,
+ 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203,
+ 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+ 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254,
+ 255
+};
+
+// Must be used together with av1_fast_idtx_scan_16x16
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_16x16[256]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119,
+ 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118,
+ 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117,
+ 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116,
+ 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115,
+ 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114,
+ 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 97, 113,
+ 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112,
+ 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111,
+ 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110,
+ 129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55, 76, 80, 101, 109,
+ 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108,
+ 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107,
+ 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106,
+ 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105,
+ 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253,
+ 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254,
+ 255
+};
+
+// Indicates whether the RD model for the block should be based on special
+// logic.
+static INLINE int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int large_block = bsize >= BLOCK_32X32;
+ // Only enable for low bitdepth to mitigate issue: b/303023614.
+ return cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block &&
+ !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
+ cm->quant_params.base_qindex && !cpi->oxcf.use_highbitdepth;
+}
+/*!\brief Finds predicted motion vectors for a block.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds predicted motion vectors for a block from a certain reference frame.
+ * First, it fills the reference MV stack, then picks the best candidate from
+ * the stack and predicts the final MV for the block for each mode.
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] ref_frame Reference frame for which to find
+ * ref MVs
+ * \param[out] frame_mv Predicted MVs for a block
+ * \param[in] yv12_mb Buffer to hold predicted block
+ * \param[in] bsize Current block size
+ * \param[in] force_skip_low_temp_var Flag indicating possible mode search
+ * prune for low temporal variance block
+ * \param[in] skip_pred_mv Flag indicating to skip av1_mv_pred
+ * \param[out] use_scaled_ref_frame Flag to indicate if scaled reference
+ * frame is used.
+ *
+ * \remark Nothing is returned. Instead, predicted MVs are placed into
+ * \c frame_mv array, and use_scaled_ref_frame is set.
+ */
+static INLINE void find_predictors(
+ AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+ struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize,
+ int force_skip_low_temp_var, int skip_pred_mv, bool *use_scaled_ref_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, ref_frame);
+ const bool ref_is_scaled =
+ ref->y_crop_height != cm->height || ref->y_crop_width != cm->width;
+ const YV12_BUFFER_CONFIG *scaled_ref =
+ av1_get_scaled_ref_frame(cpi, ref_frame);
+ const YV12_BUFFER_CONFIG *yv12 =
+ ref_is_scaled && scaled_ref ? scaled_ref : ref;
+ const int num_planes = av1_num_planes(cm);
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ x->pred_mv0_sad[ref_frame] = INT_MAX;
+ x->pred_mv1_sad[ref_frame] = INT_MAX;
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+  // TODO(kyslov): This needs various further optimizations; to be continued.
+ assert(yv12 != NULL);
+ if (yv12 != NULL) {
+ struct scale_factors *const sf =
+ scaled_ref ? NULL : get_ref_scale_factors(cm, ref_frame);
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+ av1_find_best_ref_mvs_from_stack(
+ cm->features.allow_high_precision_mv, mbmi_ext, ref_frame,
+ &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0);
+ frame_mv[GLOBALMV][ref_frame] = mbmi_ext->global_mvs[ref_frame];
+    // Skip the av1_mv_pred call below for non-LAST frames when
+    // force_skip_low_temp_var is set.
+ if (!ref_is_scaled && bsize >= BLOCK_8X8 && !skip_pred_mv &&
+ !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) {
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+ bsize);
+ }
+ }
+ if (cm->features.switchable_motion_mode) {
+ av1_count_overlappable_neighbors(cm, xd);
+ }
+ mbmi->num_proj_ref = 1;
+ *use_scaled_ref_frame = ref_is_scaled && scaled_ref;
+}
+
+static INLINE void init_mbmi_nonrd(MB_MODE_INFO *mbmi,
+ PREDICTION_MODE pred_mode,
+ MV_REFERENCE_FRAME ref_frame0,
+ MV_REFERENCE_FRAME ref_frame1,
+ const AV1_COMMON *cm) {
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ mbmi->ref_mv_idx = 0;
+ mbmi->mode = pred_mode;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = ref_frame0;
+ mbmi->ref_frame[1] = ref_frame1;
+ pmi->palette_size[PLANE_TYPE_Y] = 0;
+ pmi->palette_size[PLANE_TYPE_UV] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->num_proj_ref = 1;
+ mbmi->interintra_mode = 0;
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+}
+
+static INLINE void init_estimate_block_intra_args(
+ struct estimate_block_intra_args *args, AV1_COMP *cpi, MACROBLOCK *x) {
+ args->cpi = cpi;
+ args->x = x;
+ args->mode = DC_PRED;
+ args->skippable = 1;
+ args->rdc = 0;
+ args->best_sad = UINT_MAX;
+ args->prune_mode_based_on_sad = false;
+}
+
+static INLINE int get_pred_buffer(PRED_BUFFER *p, int len) {
+ for (int buf_idx = 0; buf_idx < len; buf_idx++) {
+ if (!p[buf_idx].in_use) {
+ p[buf_idx].in_use = 1;
+ return buf_idx;
+ }
+ }
+ return -1;
+}
+
+static INLINE void free_pred_buffer(PRED_BUFFER *p) {
+ if (p != NULL) p->in_use = 0;
+}
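+
+// Typical usage pattern for the small prediction-buffer pool above (a
+// sketch; tmp_buffers and its length are hypothetical here):
+//   const int buf_idx = get_pred_buffer(tmp_buffers, 3);  // acquire
+//   if (buf_idx != -1) {
+//     /* ... use tmp_buffers[buf_idx] for a trial prediction ... */
+//     free_pred_buffer(&tmp_buffers[buf_idx]);  // release when done
+//   }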
+
+#if CONFIG_INTERNAL_STATS
+static INLINE void store_coding_context_nonrd(MACROBLOCK *x,
+ PICK_MODE_CONTEXT *ctx,
+ int mode_index) {
+#else
+static INLINE void store_coding_context_nonrd(MACROBLOCK *x,
+ PICK_MODE_CONTEXT *ctx) {
+#endif // CONFIG_INTERNAL_STATS
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ // Take a snapshot of the coding context so it can be
+ // restored if we decide to encode this way
+ ctx->rd_stats.skip_txfm = txfm_info->skip_txfm;
+
+ ctx->skippable = txfm_info->skip_txfm;
+#if CONFIG_INTERNAL_STATS
+ ctx->best_mode_index = mode_index;
+#endif // CONFIG_INTERNAL_STATS
+ ctx->mic = *xd->mi[0];
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+}
+
+void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf,
+ int pred_stride, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ RD_STATS *this_rdc, int start_plane,
+ int stop_plane);
+
+void av1_estimate_block_intra(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg);
+
+void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int best_early_term, unsigned int ref_cost_intra,
+ int reuse_prediction, struct buf_2d *orig_dst,
+ PRED_BUFFER *tmp_buffers,
+ PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+ BEST_PICKMODE *best_pickmode,
+ PICK_MODE_CONTEXT *ctx);
+
+#endif // AOM_AV1_ENCODER_NONRD_OPT_H_
diff --git a/third_party/aom/av1/encoder/nonrd_pickmode.c b/third_party/aom/av1/encoder/nonrd_pickmode.c
new file mode 100644
index 0000000000..f939b6d1fa
--- /dev/null
+++ b/third_party/aom/av1/encoder/nonrd_pickmode.c
@@ -0,0 +1,3537 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
+
+static INLINE int early_term_inter_search_with_sse(int early_term_idx,
+ BLOCK_SIZE bsize,
+ int64_t this_sse,
+ int64_t best_sse,
+ PREDICTION_MODE this_mode) {
+ // Aggressiveness to terminate inter mode search early is adjusted based on
+ // speed and block size.
+ static const double early_term_thresh[4][4] = { { 0.65, 0.65, 0.65, 0.7 },
+ { 0.6, 0.65, 0.85, 0.9 },
+ { 0.5, 0.5, 0.55, 0.6 },
+ { 0.6, 0.75, 0.85, 0.85 } };
+ static const double early_term_thresh_newmv_nearestmv[4] = { 0.3, 0.3, 0.3,
+ 0.3 };
+
+ const int size_group = size_group_lookup[bsize];
+ assert(size_group < 4);
+ assert((early_term_idx > 0) && (early_term_idx < EARLY_TERM_INDICES));
+ const double threshold =
+ ((early_term_idx == EARLY_TERM_IDX_4) &&
+ (this_mode == NEWMV || this_mode == NEARESTMV))
+ ? early_term_thresh_newmv_nearestmv[size_group]
+ : early_term_thresh[early_term_idx - 1][size_group];
+
+ // Terminate inter mode search early based on best sse so far.
+ if ((early_term_idx > 0) && (threshold * this_sse > best_sse)) {
+ return 1;
+ }
+ return 0;
+}
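+
+// Worked example of the thresholds above: with early_term_idx 1 and
+// size_group 0 the threshold is 0.65, so the search terminates once
+// 0.65 * this_sse > best_sse, i.e. once this mode's SSE exceeds roughly
+// 1.5x the best SSE seen so far.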
+
+static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
+ bp->best_sse = INT64_MAX;
+ bp->best_mode = NEARESTMV;
+ bp->best_ref_frame = LAST_FRAME;
+ bp->best_second_ref_frame = NONE_FRAME;
+ bp->best_tx_size = TX_8X8;
+ bp->tx_type = DCT_DCT;
+ bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ bp->best_mode_skip_txfm = 0;
+ bp->best_mode_initial_skip_flag = 0;
+ bp->best_pred = NULL;
+ bp->best_motion_mode = SIMPLE_TRANSLATION;
+ bp->num_proj_ref = 0;
+ av1_zero(bp->wm_params);
+ av1_zero(bp->pmi);
+}
+
+// Copy best inter mode parameters to best_pickmode
+static INLINE void update_search_state_nonrd(
+ InterModeSearchStateNonrd *search_state, MB_MODE_INFO *const mi,
+ TxfmSearchInfo *txfm_info, RD_STATS *nonskip_rdc, PICK_MODE_CONTEXT *ctx,
+ PREDICTION_MODE this_best_mode, const int64_t sse_y) {
+ BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode;
+
+ best_pickmode->best_sse = sse_y;
+ best_pickmode->best_mode = this_best_mode;
+ best_pickmode->best_motion_mode = mi->motion_mode;
+ best_pickmode->wm_params = mi->wm_params;
+ best_pickmode->num_proj_ref = mi->num_proj_ref;
+ best_pickmode->best_pred_filter = mi->interp_filters;
+ best_pickmode->best_tx_size = mi->tx_size;
+ best_pickmode->best_ref_frame = mi->ref_frame[0];
+ best_pickmode->best_second_ref_frame = mi->ref_frame[1];
+ best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm;
+ best_pickmode->best_mode_initial_skip_flag =
+ (nonskip_rdc->rate == INT_MAX && search_state->this_rdc.skip_txfm);
+ if (!best_pickmode->best_mode_skip_txfm) {
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+}
+
+static INLINE int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int_mv *mv, MV ref_mv, FULLPEL_MV start_mv,
+ bool fullpel_performed_well) {
+ const int frame_lowmotion = cpi->rc.avg_frame_low_motion;
+ const int reduce_mv_pel_precision_highmotion =
+ cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion;
+
+ // Reduce MV precision for higher int MV value & frame-level motion
+ if (reduce_mv_pel_precision_highmotion >= 3) {
+    int mv_thresh;
+ const int is_low_resoln =
+ (cpi->common.width * cpi->common.height <= 320 * 240);
+ mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6;
+ if (frame_lowmotion > 0 && frame_lowmotion < 40) mv_thresh = 12;
+ mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh;
+ if (abs(mv->as_fullmv.row) >= mv_thresh ||
+ abs(mv->as_fullmv.col) >= mv_thresh)
+ return HALF_PEL;
+ } else if (reduce_mv_pel_precision_highmotion >= 1) {
+ int mv_thresh;
+ const int th_vals[2][3] = { { 4, 8, 10 }, { 4, 6, 8 } };
+ const int th_idx = reduce_mv_pel_precision_highmotion - 1;
+ assert(th_idx >= 0 && th_idx < 2);
+ if (frame_lowmotion > 0 && frame_lowmotion < 40)
+ mv_thresh = 12;
+ else
+ mv_thresh = (bsize >= BLOCK_32X32) ? th_vals[th_idx][0]
+ : (bsize >= BLOCK_16X16) ? th_vals[th_idx][1]
+ : th_vals[th_idx][2];
+ if (abs(mv->as_fullmv.row) >= (mv_thresh << 1) ||
+ abs(mv->as_fullmv.col) >= (mv_thresh << 1))
+ return FULL_PEL;
+ else if (abs(mv->as_fullmv.row) >= mv_thresh ||
+ abs(mv->as_fullmv.col) >= mv_thresh)
+ return HALF_PEL;
+ }
+  // Reduce MV precision for relatively static (e.g., background),
+  // low-complexity large areas.
+ if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 2) {
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ assert(qband < 4);
+ if (x->content_state_sb.source_sad_nonrd <= kVeryLowSad &&
+ bsize > BLOCK_16X16 && qband != 0) {
+ if (x->source_variance < 500)
+ return FULL_PEL;
+ else if (x->source_variance < 5000)
+ return HALF_PEL;
+ }
+ } else if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 1) {
+ if (fullpel_performed_well && ref_mv.row == 0 && ref_mv.col == 0 &&
+ start_mv.row == 0 && start_mv.col == 0)
+ return HALF_PEL;
+ }
+ return cpi->sf.mv_sf.subpel_force_stop;
+}
+
+static bool use_aggressive_subpel_search_method(MACROBLOCK *x,
+ bool use_adaptive_subpel_search,
+ bool fullpel_performed_well) {
+ if (!use_adaptive_subpel_search) return false;
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ assert(qband < 4);
+ if ((qband > 0) && (fullpel_performed_well ||
+ (x->content_state_sb.source_sad_nonrd <= kLowSad) ||
+ (x->source_variance < 100)))
+ return true;
+ return false;
+}
+
+/*!\brief Runs Motion Estimation for a specific block and specific ref frame.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by running Motion Estimation for a specific
+ * block and a specific reference frame. Exits early if the RD cost of the
+ * full-pel part exceeds the best RD cost found so far.
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] bsize Current block size
+ * \param[out] tmp_mv Pointer to best found New MV
+ * \param[out] rate_mv Pointer to Rate of the best new MV
+ * \param[in] best_rd_sofar RD Cost of the best mode found so far
+ * \param[in] use_base_mv Flag, indicating that tmp_mv holds
+ * specific MV to start the search with
+ *
+ * \return Returns 0 if ME was terminated after the full-pel search because
+ * the RD cost was too high; otherwise returns 1. The best new MV is placed
+ * into \c tmp_mv, and the rate estimate for this vector is placed into
+ * \c rate_mv.
+ */
+static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *tmp_mv,
+ int *rate_mv, int64_t best_rd_sofar,
+ int use_base_mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const AV1_COMMON *cm = &cpi->common;
+ const SPEED_FEATURES *sf = &cpi->sf;
+ MB_MODE_INFO *mi = xd->mi[0];
+ int step_param = (sf->rt_sf.fullpel_search_step_param)
+ ? sf->rt_sf.fullpel_search_step_param
+ : cpi->mv_search_params.mv_step_param;
+ FULLPEL_MV start_mv;
+ const int ref = mi->ref_frame[0];
+ const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv;
+ MV center_mv;
+ int dis;
+ int rv = 0;
+ int cost_list[5];
+ int search_subpel = 1;
+
+ start_mv = get_fullmv_from_mv(&ref_mv);
+
+ if (!use_base_mv)
+ center_mv = ref_mv;
+ else
+ center_mv = tmp_mv->as_mv;
+
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
+ start_mv, src_search_sites, search_method,
+ /*fine_search_interval=*/0);
+
+ const unsigned int full_var_rd = av1_full_pixel_search(
+ start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
+ &tmp_mv->as_fullmv, &best_mv_stats, NULL);
+
+  // Calculate the bit cost of the motion vector.
+ MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv);
+
+ *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+
+ // TODO(kyslov) Account for Rate Mode!
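+  // Proceed to subpel refinement only if the MV rate alone does not already
+  // push the RD cost above the best RD cost found so far.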
+ rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar);
+
+ if (rv && search_subpel) {
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+ cost_list);
+ const bool fullpel_performed_well =
+ (bsize == BLOCK_64X64 && full_var_rd * 40 < 62267 * 7) ||
+ (bsize == BLOCK_32X32 && full_var_rd * 8 < 42380) ||
+ (bsize == BLOCK_16X16 && full_var_rd * 8 < 10127);
+ if (sf->rt_sf.reduce_mv_pel_precision_highmotion ||
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex)
+ ms_params.forced_stop = subpel_select(cpi, x, bsize, tmp_mv, ref_mv,
+ start_mv, fullpel_performed_well);
+
+ MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+    // Adaptively downgrade the subpel search method based on block
+    // properties.
+ if (use_aggressive_subpel_search_method(
+ x, sf->rt_sf.use_adaptive_subpel_search, fullpel_performed_well))
+ av1_find_best_sub_pixel_tree_pruned_more(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv,
+ &dis, &x->pred_sse[ref], NULL);
+ else
+ cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv,
+ &dis, &x->pred_sse[ref], NULL);
+ *rate_mv =
+ av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+  // The final MV cannot be equal to the reference MV, as this would trigger
+  // an assert later. This can happen if both NEAREST and NEAR modes were
+  // skipped.
+ rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row);
+ return rv;
+}
+
+/*!\brief Searches for the best New Motion Vector.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by doing Motion Estimation. Uses reduced
+ * complexity ME for non-LAST frames or calls \c combined_motion_search
+ * for LAST reference frame
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] frame_mv Array that holds MVs for all modes
+ * and ref frames
+ * \param[in] ref_frame Reference frame for which to find
+ * the best New MVs
+ * \param[in] gf_temporal_ref Flag, indicating temporal reference
+ * for GOLDEN frame
+ * \param[in] bsize Current block size
+ * \param[in] mi_row Row index in 4x4 units
+ * \param[in] mi_col Column index in 4x4 units
+ * \param[out] rate_mv Pointer to Rate of the best new MV
+ * \param[in] best_rdc Pointer to the RD Cost for the best
+ * mode found so far
+ *
+ * \return Returns -1 if the search was not done, otherwise returns 0.
+ * The best new MV is placed into the \c frame_mv array, and the rate
+ * estimate for this vector is placed into \c rate_mv.
+ */
+static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
+ int_mv frame_mv[][REF_FRAMES],
+ MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_mv,
+ RD_STATS *best_rdc) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ AV1_COMMON *cm = &cpi->common;
+ int_mv *this_ref_frm_newmv = &frame_mv[NEWMV][ref_frame];
+ unsigned int y_sad_zero;
+ if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ gf_temporal_ref) {
+ int tmp_sad;
+ int dis;
+
+ if (bsize < BLOCK_16X16) return -1;
+
+ int me_search_size_col = block_size_wide[bsize] >> 1;
+ int me_search_size_row = block_size_high[bsize] >> 1;
+ tmp_sad = av1_int_pro_motion_estimation(
+ cpi, x, bsize, mi_row, mi_col,
+ &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv, &y_sad_zero,
+ me_search_size_col, me_search_size_row);
+
+ if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
+
+ this_ref_frm_newmv->as_int = mi->mv[0].as_int;
+ int_mv best_mv = mi->mv[0];
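+    // MVs are stored in 1/8-pel units; shift down to full-pel precision to
+    // form the subpel search starting point.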
+ best_mv.as_mv.row >>= 3;
+ best_mv.as_mv.col >>= 3;
+ MV ref_mv = av1_get_ref_mv(x, 0).as_mv;
+ this_ref_frm_newmv->as_mv.row >>= 3;
+ this_ref_frm_newmv->as_mv.col >>= 3;
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, NULL);
+ if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion ||
+ cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex) {
+ FULLPEL_MV start_mv = { .row = 0, .col = 0 };
+ ms_params.forced_stop =
+ subpel_select(cpi, x, bsize, &best_mv, ref_mv, start_mv, false);
+ }
+ MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
+ cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis,
+ &x->pred_sse[ref_frame], NULL);
+ this_ref_frm_newmv->as_int = best_mv.as_int;
+
+    // When NEWMV is the same as the ref_mv from the DRL, it is preferable to
+    // code the MV as NEARESTMV or NEARMV. In this case, NEWMV needs to be
+    // skipped to avoid an assert failure at a later stage. The scenario can
+    // occur if NEARESTMV was not evaluated for ALTREF.
+ if (this_ref_frm_newmv->as_mv.col == ref_mv.col &&
+ this_ref_frm_newmv->as_mv.row == ref_mv.row)
+ return -1;
+
+ *rate_mv = av1_mv_bit_cost(&this_ref_frm_newmv->as_mv, &ref_mv,
+ x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ } else if (!combined_motion_search(cpi, x, bsize, &frame_mv[NEWMV][ref_frame],
+ rate_mv, best_rdc->rdcost, 0)) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static void estimate_single_ref_frame_costs(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd,
+ const ModeCosts *mode_costs,
+ int segment_id, BLOCK_SIZE bsize,
+ unsigned int *ref_costs_single) {
+ int seg_ref_active =
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ if (seg_ref_active) {
+ memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
+ } else {
+ int intra_inter_ctx = av1_get_intra_inter_context(xd);
+ ref_costs_single[INTRA_FRAME] =
+ mode_costs->intra_inter_cost[intra_inter_ctx][0];
+ unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+ is_comp_ref_allowed(bsize)) {
+ const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+ base_cost += mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
+ }
+ ref_costs_single[LAST_FRAME] = base_cost;
+ ref_costs_single[GOLDEN_FRAME] = base_cost;
+ ref_costs_single[ALTREF_FRAME] = base_cost;
+ // add cost for last, golden, altref
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[0][0][0];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][0][1];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][1][0];
+ ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][0][1];
+ ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][2][0];
+ }
+}
+
+static INLINE void set_force_skip_flag(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, unsigned int sse,
+ int *force_skip) {
+  if (x->txfm_search_params.tx_mode_search_type == TX_MODE_SELECT &&
+      cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
+ const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (x->e_mbd.bd - 5);
+ const unsigned int qstep_sq = qstep * qstep;
+ // If the sse is low for low source variance blocks, mark those as
+ // transform skip.
+ // Note: Though qstep_sq is based on ac qstep, the threshold is kept
+ // low so that reliable early estimate of tx skip can be obtained
+ // through its comparison with sse.
+ if (sse < qstep_sq && x->source_variance < qstep_sq &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
+ *force_skip = 1;
+ }
+}
+
+#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \
+  ((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32)
+#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16)
+
+static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *const x, unsigned int var,
+ unsigned int sse, int *force_skip) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TX_SIZE tx_size;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) {
+ int multiplier = 8;
+ unsigned int var_thresh = 0;
+ unsigned int is_high_var = 1;
+ // Use quantizer based thresholds to determine transform size.
+ if (cpi->sf.rt_sf.tx_size_level_based_on_qstep) {
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ const int mult[4] = { 8, 7, 6, 5 };
+ assert(qband < 4);
+ multiplier = mult[qband];
+ const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (xd->bd - 5);
+ const unsigned int qstep_sq = qstep * qstep;
+ var_thresh = qstep_sq * 2;
+ if (cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
+ // If the sse is low for low source variance blocks, mark those as
+ // transform skip.
+ // Note: Though qstep_sq is based on ac qstep, the threshold is kept
+ // low so that reliable early estimate of tx skip can be obtained
+ // through its comparison with sse.
+ if (sse < qstep_sq && x->source_variance < qstep_sq &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
+ *force_skip = 1;
+ // Further lower transform size based on aq mode only if residual
+ // variance is high.
+ is_high_var = (var >= var_thresh);
+ }
+ }
+ // Choose larger transform size for blocks where dc component is dominant or
+ // the ac component is low.
+ if (sse > ((var * multiplier) >> 2) || (var < var_thresh))
+ tx_size =
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+ else
+ tx_size = TX_8X8;
+
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && is_high_var)
+ tx_size = TX_8X8;
+ else if (tx_size > TX_16X16)
+ tx_size = TX_16X16;
+ } else {
+ tx_size =
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+ }
+
+ if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize))
+ tx_size = TX_SIZE_FOR_BSIZE_GT32;
+
+ return AOMMIN(tx_size, TX_16X16);
+}
+
+static void block_variance(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int w, int h,
+ unsigned int *sse, int *sum, int block_size,
+ uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
+ int k = 0;
+ *sse = 0;
+ *sum = 0;
+
+  // This function is called for block sizes >= BLOCK_32x32. As per the
+  // design, aom_get_var_sse_sum_8x8_quad() processes four 8x8 blocks (an
+  // 8x32 region, i.e. 8 rows by 32 columns) per call. Hence the width and
+  // height of the block need to be at least 32 and 8 samples respectively.
+ assert(w >= 32);
+ assert(h >= 8);
+ for (int row = 0; row < h; row += block_size) {
+ for (int col = 0; col < w; col += 32) {
+ aom_get_var_sse_sum_8x8_quad(src + src_stride * row + col, src_stride,
+ ref + ref_stride * row + col, ref_stride,
+ &sse8x8[k], &sum8x8[k], sse, sum,
+ &var8x8[k]);
+ k += 4;
+ }
+ }
+}
+
+static void block_variance_16x16_dual(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int w,
+ int h, unsigned int *sse, int *sum,
+ int block_size, uint32_t *sse16x16,
+ uint32_t *var16x16) {
+ int k = 0;
+ *sse = 0;
+ *sum = 0;
+  // This function is called for block sizes >= BLOCK_32x32. As per the
+  // design, aom_get_var_sse_sum_16x16_dual() processes two 16x16 blocks (a
+  // 16x32 region, i.e. 16 rows by 32 columns) per call. Hence the width and
+  // height of the block need to be at least 32 and 16 samples respectively.
+ assert(w >= 32);
+ assert(h >= 16);
+ for (int row = 0; row < h; row += block_size) {
+ for (int col = 0; col < w; col += 32) {
+ aom_get_var_sse_sum_16x16_dual(src + src_stride * row + col, src_stride,
+ ref + ref_stride * row + col, ref_stride,
+ &sse16x16[k], sse, sum, &var16x16[k]);
+ k += 2;
+ }
+ }
+}
+
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+ unsigned int *sse_i, int *sum_i,
+ unsigned int *var_o, unsigned int *sse_o,
+ int *sum_o) {
+ const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
+ const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
+ const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
+ int row, col, k = 0;
+
+ for (row = 0; row < nh; row += 2) {
+ for (col = 0; col < nw; col += 2) {
+ sse_o[k] = sse_i[row * nw + col] + sse_i[row * nw + col + 1] +
+ sse_i[(row + 1) * nw + col] + sse_i[(row + 1) * nw + col + 1];
+ sum_o[k] = sum_i[row * nw + col] + sum_i[row * nw + col + 1] +
+ sum_i[(row + 1) * nw + col] + sum_i[(row + 1) * nw + col + 1];
+ var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
+ (b_width_log2_lookup[unit_size] +
+ b_height_log2_lookup[unit_size] + 6));
+ k++;
+ }
+ }
+}
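+
+// The var_o computation above is the unnormalized form of the identity
+// var = E[x^2] - E[x]^2: the merged unit covers
+// n = 2^(b_width_log2 + b_height_log2 + 6) pixels (a 2x2 group of unit_size
+// blocks of (4 << log2) samples per side), so sum_o^2 >> log2(n) is
+// sum_o^2 / n. For TX_8X8 units, for example, the merged block is 16x16 and
+// the shift is 1 + 1 + 6 = 8.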
+
+// Returns a factor used to scale ac_thr based on speed, width, height and
+// the normalized sum.
+static int ac_thr_factor(int speed, int width, int height, int norm_sum) {
+ if (speed >= 8 && norm_sum < 5) {
+ if (width <= 640 && height <= 480)
+ return 4;
+ else
+ return 2;
+ }
+ return 1;
+}
+
+// Sets early_term flag based on chroma planes prediction
+static INLINE void set_early_term_based_on_uv_plane(
+ AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MACROBLOCKD *xd, int mi_row,
+ int mi_col, int *early_term, int num_blk, const unsigned int *sse_tx,
+ const unsigned int *var_tx, int sum, unsigned int var, unsigned int sse) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ const uint32_t dc_quant = p->dequant_QTX[0];
+ const uint32_t ac_quant = p->dequant_QTX[1];
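+  // Transform coefficients in this pipeline are scaled up by 8 relative to
+  // an orthogonal transform (see the note in the model_skip functions below),
+  // so the effective quantizer step is dequant / 8 and its square is
+  // dequant^2 / 64, hence the >> 6 in the thresholds below.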
+ int64_t dc_thr = dc_quant * dc_quant >> 6;
+ int64_t ac_thr = ac_quant * ac_quant >> 6;
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ int ac_test = 1;
+ int dc_test = 1;
+ const int norm_sum = abs(sum) >> (bw + bh);
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->oxcf.speed > 5)
+ ac_thr = av1_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level,
+ norm_sum, cpi->svc.temporal_layer_id);
+ else
+ ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum);
+#else
+ ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum);
+#endif
+
+ if (cpi->sf.rt_sf.increase_source_sad_thresh) {
+ dc_thr = dc_thr << 1;
+ ac_thr = ac_thr << 2;
+ }
+
+ for (int k = 0; k < num_blk; k++) {
+ // Check if all ac coefficients can be quantized to zero.
+ if (!(var_tx[k] < ac_thr || var == 0)) {
+ ac_test = 0;
+ break;
+ }
+ // Check if dc coefficient can be quantized to zero.
+ if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+ dc_test = 0;
+ break;
+ }
+ }
+
+ // Check if chroma can be skipped based on ac and dc test flags.
+ if (ac_test && dc_test) {
+ int skip_uv[2] = { 0 };
+ unsigned int var_uv[2];
+ unsigned int sse_uv[2];
+ // Transform skipping test in UV planes.
+ for (int plane = AOM_PLANE_U; plane <= AOM_PLANE_V; plane++) {
+ int j = plane - 1;
+ skip_uv[j] = 1;
+ if (x->color_sensitivity[COLOR_SENS_IDX(plane)]) {
+ skip_uv[j] = 0;
+ struct macroblock_plane *const puv = &x->plane[plane];
+ struct macroblockd_plane *const puvd = &xd->plane[plane];
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(
+ bsize, puvd->subsampling_x, puvd->subsampling_y);
+ // Adjust these thresholds for UV.
+ const int shift_ac = cpi->sf.rt_sf.increase_source_sad_thresh ? 5 : 3;
+ const int shift_dc = cpi->sf.rt_sf.increase_source_sad_thresh ? 4 : 3;
+ const int64_t uv_dc_thr =
+ (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> shift_dc;
+ const int64_t uv_ac_thr =
+ (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> shift_ac;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ plane, plane);
+ var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride,
+ puvd->dst.buf,
+ puvd->dst.stride, &sse_uv[j]);
+ if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+ (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+ skip_uv[j] = 1;
+ else
+ break;
+ }
+ }
+ if (skip_uv[0] & skip_uv[1]) {
+ *early_term = 1;
+ }
+ }
+}
+
+static INLINE void calc_rate_dist_block_param(AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ int calculate_rd, int *early_term,
+ BLOCK_SIZE bsize,
+ unsigned int sse) {
+ if (calculate_rd) {
+ if (!*early_term) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+
+ model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, rd_stats->sse, bw * bh,
+ &rd_stats->rate, &rd_stats->dist);
+ }
+
+ if (*early_term) {
+ rd_stats->rate = 0;
+ rd_stats->dist = sse << 4;
+ }
+ }
+}
+
+static void model_skip_for_sb_y_large_64(AV1_COMP *cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MACROBLOCK *x,
+ MACROBLOCKD *xd, RD_STATS *rd_stats,
+ int *early_term, int calculate_rd,
+ int64_t best_sse,
+ unsigned int *var_output,
+ unsigned int var_prune_threshold) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ int test_skip = 1;
+ unsigned int var;
+ int sum;
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ unsigned int sse16x16[64] = { 0 };
+ unsigned int var16x16[64] = { 0 };
+ assert(xd->mi[0]->tx_size == TX_16X16);
+ assert(bsize > BLOCK_32X32);
+
+ // Calculate variance for whole partition, and also save 16x16 blocks'
+ // variance to be used in following transform skipping test.
+ block_variance_16x16_dual(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, 4 << bw, 4 << bh, &sse, &sum, 16,
+ sse16x16, var16x16);
+
+ var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
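+  // In the line above, (bw + bh + 4) is log2 of the pixel count: the
+  // partition is (4 << bw) x (4 << bh) samples, so the subtracted term is
+  // sum^2 / n.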
+ if (var_output) {
+ *var_output = var;
+ if (*var_output > var_prune_threshold) {
+ return;
+ }
+ }
+
+ rd_stats->sse = sse;
+ // Skipping test
+ *early_term = 0;
+ set_force_skip_flag(cpi, x, sse, early_term);
+  // The skip-flag logic below assumes a transform size of at least 8x8; that
+  // holds here because the transform size is fixed at 16x16 (asserted above).
+ MB_MODE_INFO *const mi = xd->mi[0];
+ if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(
+ cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse,
+ mi->mode))
+ test_skip = 0;
+
+ if (*early_term) test_skip = 0;
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ if (test_skip) {
+ const unsigned int *sse_tx = sse16x16;
+ const unsigned int *var_tx = var16x16;
+ const unsigned int num_block = (1 << (bw + bh - 2)) >> 2;
+ set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col,
+ early_term, num_block, sse_tx, var_tx, sum,
+ var, sse);
+ }
+ calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize,
+ sse);
+}
+
+static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MACROBLOCK *x,
+ MACROBLOCKD *xd, RD_STATS *rd_stats,
+ int *early_term, int calculate_rd,
+ int64_t best_sse,
+ unsigned int *var_output,
+ unsigned int var_prune_threshold) {
+ if (x->force_zeromv_skip_for_blk) {
+ *early_term = 1;
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ return;
+ }
+
+ // For block sizes greater than 32x32, the transform size is always 16x16.
+ // This function avoids calling calculate_variance() for tx_size 16x16 cases
+ // by directly populating variance at tx_size level from
+ // block_variance_16x16_dual() function.
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize)) {
+ xd->mi[0]->tx_size = TX_SIZE_FOR_BSIZE_GT32;
+ model_skip_for_sb_y_large_64(cpi, bsize, mi_row, mi_col, x, xd, rd_stats,
+ early_term, calculate_rd, best_sse, var_output,
+ var_prune_threshold);
+ return;
+ }
+
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ int test_skip = 1;
+ unsigned int var;
+ int sum;
+
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ unsigned int sse8x8[256] = { 0 };
+ int sum8x8[256] = { 0 };
+ unsigned int var8x8[256] = { 0 };
+ TX_SIZE tx_size;
+
+ // Calculate variance for whole partition, and also save 8x8 blocks' variance
+ // to be used in following transform skipping test.
+ block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
+ var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+ if (var_output) {
+ *var_output = var;
+ if (*var_output > var_prune_threshold) {
+ return;
+ }
+ }
+
+ rd_stats->sse = sse;
+ // Skipping test
+ *early_term = 0;
+ tx_size = calculate_tx_size(cpi, bsize, x, var, sse, early_term);
+ assert(tx_size <= TX_16X16);
+ // The code below for setting skip flag assumes transform size of at least
+ // 8x8, so force this lower limit on transform.
+ if (tx_size < TX_8X8) tx_size = TX_8X8;
+ xd->mi[0]->tx_size = tx_size;
+
+ MB_MODE_INFO *const mi = xd->mi[0];
+ if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(
+ cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse,
+ mi->mode))
+ test_skip = 0;
+
+ if (*early_term) test_skip = 0;
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ if (test_skip) {
+ unsigned int sse16x16[64] = { 0 };
+ int sum16x16[64] = { 0 };
+ unsigned int var16x16[64] = { 0 };
+ const unsigned int *sse_tx = sse8x8;
+ const unsigned int *var_tx = var8x8;
+ unsigned int num_blks = 1 << (bw + bh - 2);
+
+ if (tx_size >= TX_16X16) {
+ calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+ sum16x16);
+ sse_tx = sse16x16;
+ var_tx = var16x16;
+ num_blks = num_blks >> 2;
+ }
+ set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col,
+ early_term, num_blks, sse_tx, var_tx, sum,
+ var, sse);
+ }
+ calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize,
+ sse);
+}
+
+static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ RD_STATS *rd_stats, unsigned int *var_out,
+ int calculate_rd, int *early_term) {
+ if (x->force_zeromv_skip_for_blk && early_term != NULL) {
+ *early_term = 1;
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ }
+
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+
+ unsigned int var = cpi->ppi->fn_ptr[bsize].vf(
+ p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse);
+ int force_skip = 0;
+ xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse, &force_skip);
+ if (var_out) {
+ *var_out = var;
+ }
+
+ if (calculate_rd && (!force_skip || ref == INTRA_FRAME)) {
+ const int bwide = block_size_wide[bsize];
+ const int bhigh = block_size_high[bsize];
+ model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate,
+ &dist);
+ } else {
+ rate = INT_MAX; // this will be overwritten later with av1_block_yrd
+ dist = INT_MAX;
+ }
+ rd_stats->sse = sse;
+ x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ if (force_skip && ref > INTRA_FRAME) {
+ rate = 0;
+ dist = (int64_t)sse << 4;
+ }
+
+ assert(rate >= 0);
+
+ rd_stats->skip_txfm = (rate == 0);
+ rate = AOMMIN(rate, INT_MAX);
+ rd_stats->rate = rate;
+ rd_stats->dist = dist;
+}
+
+static INLINE int get_drl_cost(PREDICTION_MODE this_mode, int ref_mv_idx,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ const int (*const drl_mode_cost0)[2],
+ int8_t ref_frame_type) {
+ int cost = 0;
+ if (this_mode == NEWMV || this_mode == NEW_NEWMV) {
+ for (int idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ cost += drl_mode_cost0[drl_ctx][ref_mv_idx != idx];
+ if (ref_mv_idx == idx) return cost;
+ }
+ }
+ return cost;
+ }
+
+ if (have_nearmv_in_inter_mode(this_mode)) {
+ for (int idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ cost += drl_mode_cost0[drl_ctx][ref_mv_idx != (idx - 1)];
+ if (ref_mv_idx == (idx - 1)) return cost;
+ }
+ }
+ return cost;
+ }
+ return cost;
+}
+
+static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
+ int16_t mode_context) {
+ if (is_inter_compound_mode(mode)) {
+ return mode_costs
+ ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
+ }
+
+ int mode_cost = 0;
+ int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+
+ assert(is_inter_mode(mode));
+
+ if (mode == NEWMV) {
+ mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+
+ if (mode == GLOBALMV) {
+ mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+ return mode_cost;
+ }
+ }
+}
+
+static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode,
+ RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row,
+ int mv_col, int speed, uint32_t spatial_variance,
+ CONTENT_STATE_SB content_state_sb) {
+ // Bias against MVs associated with NEWMV mode that are very different from
+ // top/left neighbors.
+ if (this_mode == NEWMV) {
+ int al_mv_average_row;
+ int al_mv_average_col;
+ int row_diff, col_diff;
+ int above_mv_valid = 0;
+ int left_mv_valid = 0;
+ int above_row = INVALID_MV_ROW_COL, above_col = INVALID_MV_ROW_COL;
+ int left_row = INVALID_MV_ROW_COL, left_col = INVALID_MV_ROW_COL;
+ if (bsize >= BLOCK_64X64 && content_state_sb.source_sad_nonrd != kHighSad &&
+ spatial_variance < 300 &&
+ (mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16)) {
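+      // Quadruple the RD cost (<< 2) to strongly bias against large NEWMVs
+      // on big, low-variance blocks.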
+ this_rdc->rdcost = this_rdc->rdcost << 2;
+ return;
+ }
+ if (xd->above_mbmi) {
+ above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV;
+ above_row = xd->above_mbmi->mv[0].as_mv.row;
+ above_col = xd->above_mbmi->mv[0].as_mv.col;
+ }
+ if (xd->left_mbmi) {
+ left_mv_valid = xd->left_mbmi->mv[0].as_int != INVALID_MV;
+ left_row = xd->left_mbmi->mv[0].as_mv.row;
+ left_col = xd->left_mbmi->mv[0].as_mv.col;
+ }
+ if (above_mv_valid && left_mv_valid) {
+ al_mv_average_row = (above_row + left_row + 1) >> 1;
+ al_mv_average_col = (above_col + left_col + 1) >> 1;
+ } else if (above_mv_valid) {
+ al_mv_average_row = above_row;
+ al_mv_average_col = above_col;
+ } else if (left_mv_valid) {
+ al_mv_average_row = left_row;
+ al_mv_average_col = left_col;
+ } else {
+ al_mv_average_row = al_mv_average_col = 0;
+ }
+ row_diff = al_mv_average_row - mv_row;
+ col_diff = al_mv_average_col - mv_col;
+ if (row_diff > 80 || row_diff < -80 || col_diff > 80 || col_diff < -80) {
+ if (bsize >= BLOCK_32X32)
+ this_rdc->rdcost = this_rdc->rdcost << 1;
+ else
+ this_rdc->rdcost = 5 * this_rdc->rdcost >> 2;
+ }
+ } else {
+    // For speed >= 8, bias against large MVs when spatial variance is low.
+ if (speed >= 8 && spatial_variance < 150 &&
+ (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64))
+ this_rdc->rdcost = 5 * this_rdc->rdcost >> 2;
+ }
+}
+
+static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ MV_REFERENCE_FRAME ref_frame,
+ THR_MODES best_mode_idx,
+ PREDICTION_MODE mode) {
+ const THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+ const BLOCK_SIZE min_size = AOMMAX(bsize - 3, BLOCK_4X4);
+ const BLOCK_SIZE max_size = AOMMIN(bsize + 6, BLOCK_128X128);
+ for (BLOCK_SIZE bs = min_size; bs <= max_size; bs += 3) {
+ int *freq_fact = &x->thresh_freq_fact[bs][thr_mode_idx];
+ if (thr_mode_idx == best_mode_idx) {
+ *freq_fact -= (*freq_fact >> 4);
+ } else {
+ *freq_fact =
+ AOMMIN(*freq_fact + RD_THRESH_INC,
+ cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+ }
+ }
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static void av1_pickmode_ctx_den_update(
+ AV1_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig,
+ unsigned int ref_frame_cost[REF_FRAMES],
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int reuse_inter_pred,
+ BEST_PICKMODE *bp) {
+ ctx_den->zero_last_cost_orig = zero_last_cost_orig;
+ ctx_den->ref_frame_cost = ref_frame_cost;
+ ctx_den->frame_mv = frame_mv;
+ ctx_den->reuse_inter_pred = reuse_inter_pred;
+ ctx_den->best_tx_size = bp->best_tx_size;
+ ctx_den->best_mode = bp->best_mode;
+ ctx_den->best_ref_frame = bp->best_ref_frame;
+ ctx_den->best_pred_filter = bp->best_pred_filter;
+ ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm;
+}
+
+static void recheck_zeromv_after_denoising(
+ AV1_COMP *cpi, MB_MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd,
+ AV1_DENOISER_DECISION decision, AV1_PICKMODE_CTX_DEN *ctx_den,
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_STATS *best_rdc,
+ BEST_PICKMODE *best_pickmode, BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on
+ // denoised result. Only do this under noise conditions, and if rdcost of
+ // ZEROMV on original source is not significantly higher than rdcost of best
+ // mode.
+ if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow &&
+ ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) &&
+ ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
+ (ctx_den->best_ref_frame == GOLDEN_FRAME &&
+ cpi->svc.number_spatial_layers == 1 &&
+ decision == FILTER_ZEROMV_BLOCK))) {
+ // Check if we should pick ZEROMV on denoised signal.
+ AV1_COMMON *const cm = &cpi->common;
+ RD_STATS this_rdc;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+
+ mi->mode = GLOBALMV;
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
+ mi->mv[0].as_int = 0;
+ mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[LAST_FRAME][AOM_PLANE_Y];
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ unsigned int var;
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1, NULL);
+
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
+ this_rdc.rate += cost_mv_ref(mode_costs, GLOBALMV, mode_ctx);
+
+ this_rdc.rate += ctx_den->ref_frame_cost[LAST_FRAME];
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ txfm_info->skip_txfm = this_rdc.skip_txfm;
+ // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source
+ // is higher than best_ref mode (on original source).
+ if (this_rdc.rdcost > best_rdc->rdcost) {
+ this_rdc = *best_rdc;
+ mi->mode = best_pickmode->best_mode;
+ mi->ref_frame[0] = best_pickmode->best_ref_frame;
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
+ mi->interp_filters = best_pickmode->best_pred_filter;
+ if (best_pickmode->best_ref_frame == INTRA_FRAME) {
+ mi->mv[0].as_int = INVALID_MV;
+ } else {
+ mi->mv[0].as_int = ctx_den
+ ->frame_mv[best_pickmode->best_mode]
+ [best_pickmode->best_ref_frame]
+ .as_int;
+ if (ctx_den->reuse_inter_pred) {
+ xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[GOLDEN_FRAME][AOM_PLANE_Y];
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ }
+ }
+ mi->tx_size = best_pickmode->best_tx_size;
+ txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm;
+ } else {
+ ctx_den->best_ref_frame = LAST_FRAME;
+ *best_rdc = this_rdc;
+ }
+ }
+}
+#endif // CONFIG_AV1_TEMPORAL_DENOISING
+
+/*!\brief Searches for the best interpolation filter
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Iterates through a subset of the possible interpolation filters
+ * (EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH, MULTITAP_SHARP, depending on
+ * FILTER_SEARCH_SIZE) and selects the one that gives the lowest RD cost.
+ * The RD cost is calculated using the curvfit model. Dual filters (different
+ * filters in the x & y directions) are supported if
+ * sf.interp_sf.disable_dual_filter = 0.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] this_rdc Pointer to calculated RD Cost
+ * \param[in] inter_pred_params_sr Pointer to structure holding parameters of
+ inter prediction for single reference
+ * \param[in] mi_row Row index in 4x4 units
+ * \param[in] mi_col Column index in 4x4 units
+ * \param[in] tmp_buffer Pointer to a temporary buffer for
+ * prediction re-use
+ * \param[in] bsize Current block size
+ * \param[in] reuse_inter_pred Flag, indicating prediction re-use
+ * \param[out] this_mode_pred Pointer to store prediction buffer
+ * for prediction re-use
+ * \param[out] this_early_term Flag, indicating that transform can be
+ * skipped
+ * \param[out] var The residue variance of the current
+ * predictor.
+ * \param[in] use_model_yrd_large Flag, indicating special logic to handle
+ * large blocks
+ * \param[in] best_sse Best sse so far.
+ * \param[in] is_single_pred Flag, indicating single mode.
+ *
+ * \remark Nothing is returned. Instead, the calculated RD cost is placed in
+ * \c this_rdc and the best filter is placed in \c mi->interp_filters. In case
+ * the \c reuse_inter_pred flag is set, this function also outputs
+ * \c this_mode_pred. Also \c this_early_term is set if the transform can be
+ * skipped.
+ */
+static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+ InterPredParams *inter_pred_params_sr, int mi_row,
+ int mi_col, PRED_BUFFER *tmp_buffer,
+ BLOCK_SIZE bsize, int reuse_inter_pred,
+ PRED_BUFFER **this_mode_pred,
+ int *this_early_term, unsigned int *var,
+ int use_model_yrd_large, int64_t best_sse,
+ int is_single_pred) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const int bw = block_size_wide[bsize];
+ int dim_factor =
+ (cpi->sf.interp_sf.disable_dual_filter == 0) ? FILTER_SEARCH_SIZE : 1;
+ RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
+ TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
+ PRED_BUFFER *current_pred = *this_mode_pred;
+ int best_skip = 0;
+ int best_early_term = 0;
+ int64_t best_cost = INT64_MAX;
+ int best_filter_index = -1;
+
+ SubpelParams subpel_params;
+ // Initialize inter prediction params at mode level for single reference
+ // mode.
+ if (is_single_pred)
+ init_inter_mode_params(&mi->mv[0].as_mv, inter_pred_params_sr,
+ &subpel_params, xd->block_ref_scale_factors[0],
+ pd->pre->width, pd->pre->height);
+ for (int filter_idx = 0; filter_idx < FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE;
+ ++filter_idx) {
+ int64_t cost;
+ if (cpi->sf.interp_sf.disable_dual_filter &&
+ filters_ref_set[filter_idx].as_filters.x_filter !=
+ filters_ref_set[filter_idx].as_filters.y_filter)
+ continue;
+
+ mi->interp_filters.as_int = filters_ref_set[filter_idx].as_int;
+ if (is_single_pred)
+ av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
+ &subpel_params);
+ else
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ unsigned int curr_var = UINT_MAX;
+ if (use_model_yrd_large)
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &pf_rd_stats[filter_idx], this_early_term, 1,
+ best_sse, &curr_var, UINT_MAX);
+ else
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[filter_idx], &curr_var,
+ 1, NULL);
+ pf_rd_stats[filter_idx].rate += av1_get_switchable_rate(
+ x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter);
+ cost = RDCOST(x->rdmult, pf_rd_stats[filter_idx].rate,
+ pf_rd_stats[filter_idx].dist);
+ pf_tx_size[filter_idx] = mi->tx_size;
+ if (cost < best_cost) {
+ *var = curr_var;
+ best_filter_index = filter_idx;
+ best_cost = cost;
+ best_skip = pf_rd_stats[filter_idx].skip_txfm;
+ best_early_term = *this_early_term;
+ if (reuse_inter_pred) {
+ if (*this_mode_pred != current_pred) {
+ free_pred_buffer(*this_mode_pred);
+ *this_mode_pred = current_pred;
+ }
+ current_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ pd->dst.buf = current_pred->data;
+ pd->dst.stride = bw;
+ }
+ }
+ }
+ assert(best_filter_index >= 0 &&
+ best_filter_index < dim_factor * FILTER_SEARCH_SIZE);
+ if (reuse_inter_pred && *this_mode_pred != current_pred)
+ free_pred_buffer(current_pred);
+
+ mi->interp_filters.as_int = filters_ref_set[best_filter_index].as_int;
+ mi->tx_size = pf_tx_size[best_filter_index];
+ this_rdc->rate = pf_rd_stats[best_filter_index].rate;
+ this_rdc->dist = pf_rd_stats[best_filter_index].dist;
+ this_rdc->sse = pf_rd_stats[best_filter_index].sse;
+ this_rdc->skip_txfm = (best_skip || best_early_term);
+ *this_early_term = best_early_term;
+ if (reuse_inter_pred) {
+ pd->dst.buf = (*this_mode_pred)->data;
+ pd->dst.stride = (*this_mode_pred)->stride;
+ } else if (best_filter_index < dim_factor * FILTER_SEARCH_SIZE - 1) {
+ if (is_single_pred)
+ av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
+ &subpel_params);
+ else
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+}
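+
+// search_filter_ref() evaluates candidate (x_filter, y_filter) pairs from
+// filters_ref_set. When sf.interp_sf.disable_dual_filter is set, pairs with
+// differing x/y filters are skipped via the `continue` above, so only the
+// "equal" combinations are costed; the winner's rate includes the
+// switchable-filter signaling cost from av1_get_switchable_rate().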
+#if !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE int is_warped_mode_allowed(const AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ const MB_MODE_INFO *mbmi) {
+ const FeatureFlags *const features = &cpi->common.features;
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ if (cpi->sf.inter_sf.extra_prune_warped) return 0;
+ if (has_second_ref(mbmi)) return 0;
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+
+ if (features->switchable_motion_mode) {
+ // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+ // is allowed.
+ last_motion_mode_allowed = motion_mode_allowed(
+ xd->global_motion, xd, mbmi, features->allow_warped_motion);
+ }
+
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ return 1;
+ }
+
+ return 0;
+}
+
+static void calc_num_proj_ref(AV1_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const FeatureFlags *const features = &cm->features;
+
+ mi->num_proj_ref = 1;
+ WARP_SAMPLE_INFO *const warp_sample_info =
+ &x->warp_sample_info[mi->ref_frame[0]];
+ int *pts0 = warp_sample_info->pts;
+ int *pts_inref0 = warp_sample_info->pts_inref;
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+
+ if (features->switchable_motion_mode) {
+ // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+ // is allowed.
+ last_motion_mode_allowed = motion_mode_allowed(
+ xd->global_motion, xd, mi, features->allow_warped_motion);
+ }
+
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ if (warp_sample_info->num < 0) {
+ warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
+ }
+ mi->num_proj_ref = warp_sample_info->num;
+ }
+}
+
+static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *this_early_term, int use_model_yrd_large,
+ int *rate_mv, int64_t best_sse) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const FeatureFlags *const features = &cm->features;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ RD_STATS pf_rd_stats[MOTION_MODE_SEARCH_SIZE] = { 0 };
+ int best_skip = 0;
+ int best_early_term = 0;
+ int64_t best_cost = INT64_MAX;
+ int best_mode_index = -1;
+ const int interp_filter = features->interp_filter;
+
+ const MOTION_MODE motion_modes[MOTION_MODE_SEARCH_SIZE] = {
+ SIMPLE_TRANSLATION, WARPED_CAUSAL
+ };
+ int mode_search_size = is_warped_mode_allowed(cpi, x, mi) ? 2 : 1;
+
+ WARP_SAMPLE_INFO *const warp_sample_info =
+ &x->warp_sample_info[mi->ref_frame[0]];
+ int *pts0 = warp_sample_info->pts;
+ int *pts_inref0 = warp_sample_info->pts_inref;
+
+ const int total_samples = mi->num_proj_ref;
+ if (total_samples == 0) {
+ // Do not search WARPED_CAUSAL if there are no samples to use to determine
+ // warped parameters.
+ mode_search_size = 1;
+ }
+
+ const MB_MODE_INFO base_mbmi = *mi;
+ MB_MODE_INFO best_mbmi;
+
+ for (int mode_index = 0; mode_index < mode_search_size; ++mode_index) {
+ int64_t cost = INT64_MAX;
+ MOTION_MODE motion_mode = motion_modes[mode_index];
+ *mi = base_mbmi;
+ mi->motion_mode = motion_mode;
+ if (motion_mode == SIMPLE_TRANSLATION) {
+ mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ if (use_model_yrd_large)
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &pf_rd_stats[mode_index], this_early_term, 1,
+ best_sse, NULL, UINT_MAX);
+ else
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL, 1,
+ NULL);
+ pf_rd_stats[mode_index].rate +=
+ av1_get_switchable_rate(x, xd, cm->features.interp_filter,
+ cm->seq_params->enable_dual_filter);
+ cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate,
+ pf_rd_stats[mode_index].dist);
+ } else if (motion_mode == WARPED_CAUSAL) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ mi->wm_params.wmtype = DEFAULT_WMTYPE;
+ mi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+
+ memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+ memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+ // Select the samples according to motion vector difference
+ if (mi->num_proj_ref > 1) {
+ mi->num_proj_ref = av1_selectSamples(&mi->mv[0].as_mv, pts, pts_inref,
+ mi->num_proj_ref, bsize);
+ }
+
+ // Compute the warped motion parameters with a least squares fit
+ // using the collected samples
+ if (!av1_find_projection(mi->num_proj_ref, pts, pts_inref, bsize,
+ mi->mv[0].as_mv.row, mi->mv[0].as_mv.col,
+ &mi->wm_params, mi_row, mi_col)) {
+ if (mi->mode == NEWMV) {
+ const int_mv mv0 = mi->mv[0];
+ const WarpedMotionParams wm_params0 = mi->wm_params;
+ const int num_proj_ref0 = mi->num_proj_ref;
+
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+ &ref_mv.as_mv, NULL);
+
+ // Refine MV in a small range.
+ av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
+ total_samples, cpi->sf.mv_sf.warp_search_method,
+ cpi->sf.mv_sf.warp_search_iters);
+ if (mi->mv[0].as_int == ref_mv.as_int) {
+ continue;
+ }
+
+ if (mv0.as_int != mi->mv[0].as_int) {
+ // Keep the refined MV and WM parameters.
+ int tmp_rate_mv = av1_mv_bit_cost(
+ &mi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ *rate_mv = tmp_rate_mv;
+ } else {
+ // Restore the old MV and WM parameters.
+ mi->mv[0] = mv0;
+ mi->wm_params = wm_params0;
+ mi->num_proj_ref = num_proj_ref0;
+ }
+ }
+ // Build the warped predictor
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, av1_num_planes(cm) - 1);
+ if (use_model_yrd_large)
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &pf_rd_stats[mode_index], this_early_term,
+ 1, best_sse, NULL, UINT_MAX);
+ else
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL,
+ 1, NULL);
+
+ pf_rd_stats[mode_index].rate +=
+ mode_costs->motion_mode_cost[bsize][mi->motion_mode];
+ cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate,
+ pf_rd_stats[mode_index].dist);
+ } else {
+ cost = INT64_MAX;
+ }
+ }
+ if (cost < best_cost) {
+ best_mode_index = mode_index;
+ best_cost = cost;
+ best_skip = pf_rd_stats[mode_index].skip_txfm;
+ best_early_term = *this_early_term;
+ best_mbmi = *mi;
+ }
+ }
+  assert(best_mode_index >= 0 && best_mode_index < MOTION_MODE_SEARCH_SIZE);
+
+ *mi = best_mbmi;
+ this_rdc->rate = pf_rd_stats[best_mode_index].rate;
+ this_rdc->dist = pf_rd_stats[best_mode_index].dist;
+ this_rdc->sse = pf_rd_stats[best_mode_index].sse;
+ this_rdc->skip_txfm = (best_skip || best_early_term);
+ *this_early_term = best_early_term;
+  if (best_mode_index < MOTION_MODE_SEARCH_SIZE - 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+}
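+
+// search_motion_mode() costs SIMPLE_TRANSLATION first and, when warped motion
+// is allowed and projection samples exist, WARPED_CAUSAL: warp parameters are
+// fitted to the collected samples with a least squares projection, the NEWMV
+// case is optionally refined in a small range, and the refined MV is kept
+// only if it differs from the starting MV; otherwise the pre-refinement MV
+// and warp parameters are restored.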
+#endif // !CONFIG_REALTIME_ONLY
+
+#define COLLECT_NON_SQR_STAT 0
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+
+static AOM_INLINE void print_stage_time(const char *stage_name,
+ int64_t stage_time,
+ int64_t total_time) {
+ printf(" %s: %ld (%f%%)\n", stage_name, stage_time,
+ 100 * stage_time / (float)total_time);
+}
+
+static void print_time(const mode_search_stat_nonrd *const ms_stat,
+ BLOCK_SIZE bsize, int mi_rows, int mi_cols, int mi_row,
+ int mi_col) {
+ if ((mi_row + mi_size_high[bsize] >= mi_rows) &&
+ (mi_col + mi_size_wide[bsize] >= mi_cols)) {
+ int64_t total_time = 0l;
+ int32_t total_blocks = 0;
+ for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
+ total_time += ms_stat->total_block_times[bs];
+ total_blocks += ms_stat->num_blocks[bs];
+ }
+
+ printf("\n");
+ for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
+ if (ms_stat->num_blocks[bs] == 0) {
+ continue;
+ }
+ if (!COLLECT_NON_SQR_STAT && block_size_wide[bs] != block_size_high[bs]) {
+ continue;
+ }
+
+ printf("BLOCK_%dX%d Num %d, Time: %ld (%f%%), Avg_time %f:\n",
+ block_size_wide[bs], block_size_high[bs], ms_stat->num_blocks[bs],
+ ms_stat->total_block_times[bs],
+ 100 * ms_stat->total_block_times[bs] / (float)total_time,
+ (float)ms_stat->total_block_times[bs] / ms_stat->num_blocks[bs]);
+ for (int j = 0; j < MB_MODE_COUNT; j++) {
+ if (ms_stat->nonskipped_search_times[bs][j] == 0) {
+ continue;
+ }
+
+ int64_t total_mode_time = ms_stat->nonskipped_search_times[bs][j];
+ printf(" Mode %d, %d/%d tps %f\n", j,
+ ms_stat->num_nonskipped_searches[bs][j],
+ ms_stat->num_searches[bs][j],
+ ms_stat->num_nonskipped_searches[bs][j] > 0
+ ? (float)ms_stat->nonskipped_search_times[bs][j] /
+ ms_stat->num_nonskipped_searches[bs][j]
+ : 0l);
+ if (j >= INTER_MODE_START) {
+ total_mode_time = ms_stat->ms_time[bs][j] + ms_stat->ifs_time[bs][j] +
+ ms_stat->model_rd_time[bs][j] +
+ ms_stat->txfm_time[bs][j];
+ print_stage_time("Motion Search Time", ms_stat->ms_time[bs][j],
+ total_time);
+ print_stage_time("Filter Search Time", ms_stat->ifs_time[bs][j],
+ total_time);
+ print_stage_time("Model RD Time", ms_stat->model_rd_time[bs][j],
+ total_time);
+          print_stage_time("Transform Search Time", ms_stat->txfm_time[bs][j],
+ total_time);
+ }
+ print_stage_time("Total Mode Time", total_mode_time, total_time);
+ }
+ printf("\n");
+ }
+ printf("Total time = %ld. Total blocks = %d\n", total_time, total_blocks);
+ }
+}
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+
+static bool should_prune_intra_modes_using_neighbors(
+ const MACROBLOCKD *xd, bool enable_intra_mode_pruning_using_neighbors,
+ PREDICTION_MODE this_mode, PREDICTION_MODE above_mode,
+ PREDICTION_MODE left_mode) {
+ if (!enable_intra_mode_pruning_using_neighbors) return false;
+
+ // Avoid pruning of DC_PRED as it is the most probable mode to win as per the
+ // statistics generated for nonrd intra mode evaluations.
+ if (this_mode == DC_PRED) return false;
+
+ // Enable the pruning for current mode only if it is not the winner mode of
+ // both the neighboring blocks (left/top).
+ return xd->up_available && this_mode != above_mode && xd->left_available &&
+ this_mode != left_mode;
+}
+
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ RD_STATS this_rdc, best_rdc;
+ struct estimate_block_intra_args args;
+ init_estimate_block_intra_args(&args, cpi, x);
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ mi->tx_size =
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+ assert(IMPLIES(xd->lossless[mi->segment_id], mi->tx_size == TX_4X4));
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[mi->tx_size];
+
+ // If the current block size is the same as the transform block size, enable
+ // mode pruning based on the best SAD so far.
+ if (cpi->sf.rt_sf.prune_intra_mode_using_best_sad_so_far && bsize == tx_bsize)
+ args.prune_mode_based_on_sad = true;
+
+ int *bmode_costs;
+ PREDICTION_MODE best_mode = DC_PRED;
+ const MB_MODE_INFO *above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *left_mi = xd->left_mbmi;
+ const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[A];
+ const int left_ctx = intra_mode_context[L];
+ const unsigned int source_variance = x->source_variance;
+ bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
+
+ av1_invalid_rd_stats(&best_rdc);
+ av1_invalid_rd_stats(&this_rdc);
+
+ init_mbmi_nonrd(mi, DC_PRED, INTRA_FRAME, NONE_FRAME, cm);
+ mi->mv[0].as_int = mi->mv[1].as_int = INVALID_MV;
+
+ // Change the limit of this loop to add other intra prediction
+ // mode tests.
+ for (int mode_index = 0; mode_index < RTC_INTRA_MODES; ++mode_index) {
+ PREDICTION_MODE this_mode = intra_mode_list[mode_index];
+
+ // As per the statistics generated for intra mode evaluation in the nonrd
+ // path, it is found that the probability of H_PRED mode being the winner is
+ // very low when the best mode so far is V_PRED (out of DC_PRED and V_PRED).
+ // If V_PRED is the winner mode out of DC_PRED and V_PRED, it could imply
+ // the presence of a vertically dominant pattern. Hence, H_PRED mode is not
+ // evaluated.
+ if (cpi->sf.rt_sf.prune_h_pred_using_best_mode_so_far &&
+ this_mode == H_PRED && best_mode == V_PRED)
+ continue;
+
+ if (should_prune_intra_modes_using_neighbors(
+ xd, cpi->sf.rt_sf.enable_intra_mode_pruning_using_neighbors,
+ this_mode, A, L)) {
+ // Prune V_PRED and H_PRED if source variance of the block is less than
+ // or equal to 50. The source variance threshold is obtained empirically.
+ if ((this_mode == V_PRED || this_mode == H_PRED) && source_variance <= 50)
+ continue;
+
+ // As per the statistics, probability of SMOOTH_PRED being the winner is
+ // low when best mode so far is DC_PRED (out of DC_PRED, V_PRED and
+ // H_PRED). Hence, SMOOTH_PRED mode is not evaluated.
+ if (best_mode == DC_PRED && this_mode == SMOOTH_PRED) continue;
+ }
+
+ this_rdc.dist = this_rdc.rate = 0;
+ args.mode = this_mode;
+ args.skippable = 1;
+ args.rdc = &this_rdc;
+ mi->mode = this_mode;
+ av1_foreach_transformed_block_in_plane(xd, bsize, AOM_PLANE_Y,
+ av1_estimate_block_intra, &args);
+
+ if (this_rdc.rate == INT_MAX) continue;
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ if (args.skippable) {
+ this_rdc.rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ } else {
+ this_rdc.rate += x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ }
+ this_rdc.rate += bmode_costs[this_mode];
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ best_mode = this_mode;
+ if (!this_rdc.skip_txfm) {
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+ }
+
+ mi->mode = best_mode;
+ // Keep DC for UV since mode test is based on Y channel only.
+ mi->uv_mode = UV_DC_PRED;
+ *rd_cost = best_rdc;
+
+ // For lossless: always force the skip flags off.
+ // Even though the blk_skip is set to 0 above in the rdcost comparison,
+ // do it here again in case the above logic changes.
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ x->txfm_search_info.skip_txfm = 0;
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ }
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context_nonrd(x, ctx, mi->mode);
+#else
+ store_coding_context_nonrd(x, ctx);
+#endif // CONFIG_INTERNAL_STATS
+}
+
+static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) {
+ struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
+ struct scale_factors *const sf_golden =
+ get_ref_scale_factors(cm, GOLDEN_FRAME);
+ return ((sf_last->x_scale_fp == sf_golden->x_scale_fp) &&
+ (sf_last->y_scale_fp == sf_golden->y_scale_fp));
+}
+
+static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
+ MB_MODE_INFO *mi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int gf_temporal_ref,
+ int use_ref_frame[],
+ int *force_skip_low_temp_var) {
+ AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+
+ // When the ref_frame_config is used to set the reference frame structure
+ // then the usage of alt_ref is determined by the ref_frame_flags
+ // (and not the speed feature use_nonrd_altref_frame).
+ int use_alt_ref_frame = cpi->ppi->rtc_ref.set_ref_frame_config ||
+ cpi->sf.rt_sf.use_nonrd_altref_frame;
+
+ int use_golden_ref_frame = 1;
+ int use_last_ref_frame = 1;
+
+  // When the ref_frame_config is used to set the reference frame structure:
+  // check if LAST is used as a reference, and only remove golden and altref
+  // references below if it is.
+ if (cpi->ppi->rtc_ref.set_ref_frame_config)
+ use_last_ref_frame =
+ cpi->ref_frame_flags & AOM_LAST_FLAG ? use_last_ref_frame : 0;
+
+  // frames_since_golden is not used when the user sets the reference
+  // structure.
+ if (!cpi->ppi->rtc_ref.set_ref_frame_config && use_last_ref_frame &&
+ cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
+ use_golden_ref_frame = 0;
+ }
+
+ if (use_last_ref_frame && cpi->sf.rt_sf.short_circuit_low_temp_var &&
+ x->nonrd_prune_ref_frame_search) {
+ if (is_small_sb)
+ *force_skip_low_temp_var = av1_get_force_skip_low_temp_var_small_sb(
+ &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+ else
+ *force_skip_low_temp_var = av1_get_force_skip_low_temp_var(
+ &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+ // If force_skip_low_temp_var is set, skip golden reference.
+ if (*force_skip_low_temp_var) {
+ use_golden_ref_frame = 0;
+ use_alt_ref_frame = 0;
+ }
+ }
+
+ if (use_last_ref_frame &&
+ (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip_for_blk ||
+ (x->nonrd_prune_ref_frame_search > 1 && bsize > BLOCK_64X64))) {
+ use_golden_ref_frame = 0;
+ use_alt_ref_frame = 0;
+ }
+
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+ use_golden_ref_frame = 1;
+ use_alt_ref_frame = 0;
+ }
+
+ // Skip golden/altref reference if color is set, on flat blocks with motion.
+ // For screen: always skip golden/alt (if color_sensitivity_sb_g/alt is set)
+ // except when x->nonrd_prune_ref_frame_search = 0. This latter flag
+ // may be set in the variance partition when golden is a much better
+ // reference than last, in which case it may not be worth skipping
+ // golden/altref completely.
+ // Condition on use_last_ref to make sure there remains at least one
+ // reference.
+ if (use_last_ref_frame &&
+ ((cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ x->nonrd_prune_ref_frame_search != 0) ||
+ (x->source_variance < 200 &&
+ x->content_state_sb.source_sad_nonrd >= kLowSad))) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ use_golden_ref_frame = 0;
+ if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ use_alt_ref_frame = 0;
+ }
+
+  // For non-screen: if golden and altref are not being selected as references
+  // (use_golden_ref_frame/use_alt_ref_frame = 0), check whether to allow
+  // golden back based on the sad of the nearest/nearmv of the LAST ref. If
+  // this block's sad is large, keep golden as a reference. Only do this for
+  // the aggressive pruning mode and avoid it when color is set for the golden
+  // reference.
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ (cpi->ref_frame_flags & AOM_LAST_FLAG) && !use_golden_ref_frame &&
+ !use_alt_ref_frame && x->pred_mv_sad[LAST_FRAME] != INT_MAX &&
+ x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) {
+ int thr = (cm->width * cm->height > RESOLUTION_288P) ? 100 : 150;
+ int pred = x->pred_mv_sad[LAST_FRAME] >>
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (pred > thr) use_golden_ref_frame = 1;
+ }
+
+ use_alt_ref_frame =
+ cpi->ref_frame_flags & AOM_ALT_FLAG ? use_alt_ref_frame : 0;
+ use_golden_ref_frame =
+ cpi->ref_frame_flags & AOM_GOLD_FLAG ? use_golden_ref_frame : 0;
+
+ // For spatial layers: enable golden ref if it is set by user and
+ // corresponds to the lower spatial layer.
+ if (cpi->svc.spatial_layer_id > 0 && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
+ x->content_state_sb.source_sad_nonrd < kHighSad) {
+ const int buffslot_golden =
+ cpi->ppi->rtc_ref.ref_idx[GOLDEN_FRAME - LAST_FRAME];
+ if (cpi->ppi->rtc_ref.buffer_time_index[buffslot_golden] ==
+ cpi->svc.current_superframe)
+ use_golden_ref_frame = 1;
+ }
+
+ use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
+ use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
+ use_ref_frame[LAST_FRAME] = use_last_ref_frame;
+ // Keep this assert on, as only 3 references are used in nonrd_pickmode
+ // (LAST, GOLDEN, ALTREF), and if all 3 are not set by user then this
+ // frame must be an intra-only frame and hence should never enter the
+ // pickmode here for inter frames.
+ assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame);
+}
+
+static AOM_INLINE int is_filter_search_enabled_blk(
+ AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int segment_id, int cb_pred_filter_search, InterpFilter *filt_select) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // filt search disabled
+ if (!cpi->sf.rt_sf.use_nonrd_filter_search) return 0;
+ // filt search purely based on mode properties
+ if (!cb_pred_filter_search) return 1;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int enable_interp_search = 0;
+ if (!(xd->left_mbmi && xd->above_mbmi)) {
+ // neighbors info unavailable
+ enable_interp_search = 2;
+ } else if (!(is_inter_block(xd->left_mbmi) &&
+ is_inter_block(xd->above_mbmi))) {
+ // neighbor is INTRA
+ enable_interp_search = 2;
+ } else if (xd->left_mbmi->interp_filters.as_int !=
+ xd->above_mbmi->interp_filters.as_int) {
+ // filters are different
+ enable_interp_search = 2;
+ } else if ((cb_pred_filter_search == 1) &&
+ (xd->left_mbmi->interp_filters.as_filters.x_filter !=
+ EIGHTTAP_REGULAR)) {
+ // not regular
+ enable_interp_search = 2;
+ } else {
+ // enable prediction based on chessboard pattern
+ if (xd->left_mbmi->interp_filters.as_filters.x_filter == EIGHTTAP_SMOOTH)
+ *filt_select = EIGHTTAP_SMOOTH;
+ const int bsl = mi_size_wide_log2[bsize];
+ enable_interp_search =
+ (bool)((((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_frame.frame_number)) &
+ 0x1);
+ if (cyclic_refresh_segment_id_boosted(segment_id)) enable_interp_search = 1;
+ }
+ return enable_interp_search;
+}
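+
+// A sketch of the chessboard gating above, assuming a 64x64 block so that
+// bsl = mi_size_wide_log2[bsize] = 4: the enable flag becomes
+// (((mi_row + mi_col) >> 4) + chessboard_index) & 1, which alternates along
+// block diagonals and flips with the frame number, so roughly half of the
+// blocks run the full filter search on any given frame. Boosted cyclic
+// refresh segments always enable the search.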
+
+static AOM_INLINE int skip_mode_by_threshold(
+ PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, int_mv mv,
+ int frames_since_golden, const int *const rd_threshes,
+ const int *const rd_thresh_freq_fact, int64_t best_cost, int best_skip,
+ int extra_shift) {
+ int skip_this_mode = 0;
+ const THR_MODES mode_index = mode_idx[ref_frame][INTER_OFFSET(mode)];
+ int64_t mode_rd_thresh =
+ best_skip ? ((int64_t)rd_threshes[mode_index]) << (extra_shift + 1)
+ : ((int64_t)rd_threshes[mode_index]) << extra_shift;
+
+ // Increase mode_rd_thresh value for non-LAST for improved encoding
+ // speed
+ if (ref_frame != LAST_FRAME) {
+ mode_rd_thresh = mode_rd_thresh << 1;
+ if (ref_frame == GOLDEN_FRAME && frames_since_golden > 4)
+ mode_rd_thresh = mode_rd_thresh << (extra_shift + 1);
+ }
+
+ if (rd_less_than_thresh(best_cost, mode_rd_thresh,
+ rd_thresh_freq_fact[mode_index]))
+ if (mv.as_int != 0) skip_this_mode = 1;
+
+ return skip_this_mode;
+}
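+
+// A worked example of the threshold scaling above with extra_shift = 0: with
+// best_skip set the base threshold is doubled; a non-LAST reference doubles
+// it again, and GOLDEN with frames_since_golden > 4 doubles it once more, so
+// a base rd_threshes[] value of T can grow to 8T before
+// rd_less_than_thresh() is consulted. Only modes with a non-zero MV are ever
+// skipped here.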
+
+static AOM_INLINE int skip_mode_by_low_temp(
+ PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+ CONTENT_STATE_SB content_state_sb, int_mv mv, int force_skip_low_temp_var) {
+ // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var
+ // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
+ // later.
+ if (force_skip_low_temp_var && ref_frame != LAST_FRAME && mv.as_int != 0) {
+ return 1;
+ }
+
+ if (content_state_sb.source_sad_nonrd != kHighSad && bsize >= BLOCK_64X64 &&
+ force_skip_low_temp_var && mode == NEWMV) {
+ return 1;
+ }
+ return 0;
+}
+
+static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
+ PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+ int extra_prune, unsigned int sse_zeromv_norm, int more_prune) {
+ const unsigned int thresh_skip_golden = 500;
+
+ if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden &&
+ mode == NEWMV)
+ return 1;
+
+ if (bsize == BLOCK_128X128 && mode == NEWMV) return 1;
+
+ // Skip testing non-LAST if this flag is set.
+ if (extra_prune) {
+ if (extra_prune > 1 && ref_frame != LAST_FRAME &&
+ (bsize > BLOCK_16X16 && mode == NEWMV))
+ return 1;
+
+ if (ref_frame != LAST_FRAME && mode == NEARMV) return 1;
+
+ if (more_prune && bsize >= BLOCK_32X32 && mode == NEARMV) return 1;
+ }
+ return 0;
+}
+
+static void set_block_source_sad(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ struct buf_2d *yv12_mb) {
+ struct macroblock_plane *const p = &x->plane[0];
+ const int y_sad = cpi->ppi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride,
+ yv12_mb->buf, yv12_mb->stride);
+ if (y_sad == 0) x->block_is_zero_sad = 1;
+}
+
+static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int y_sad,
+ unsigned int source_variance,
+ struct buf_2d yv12_mb[MAX_MB_PLANE]) {
+ const int subsampling_x = cpi->common.seq_params->subsampling_x;
+ const int subsampling_y = cpi->common.seq_params->subsampling_y;
+ const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ const int high_res = cpi->common.width * cpi->common.height >= 640 * 360;
+ if (bsize == cpi->common.seq_params->sb_size) {
+    // At the superblock level color_sensitivity is already set to 0, 1, or 2.
+    // Level 2 is the middle/uncertain level. To avoid additional sad
+    // computations when bsize = sb_size, force level 2 to 1 (certain color)
+    // for motion areas and to 0 otherwise.
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 2) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] =
+ source_sad_nonrd >= kMedSad ? 1 : 0;
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 2) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] =
+ source_sad_nonrd >= kMedSad ? 1 : 0;
+ }
+ return;
+ }
+ int shift = 3;
+ unsigned int source_var_thr = 50;
+ int uv_sad_thr = 100;
+ if (source_sad_nonrd >= kMedSad && x->source_variance > 0 && high_res)
+ shift = 4;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ if (cpi->rc.high_source_sad) shift = 6;
+ if (source_sad_nonrd > kMedSad) {
+ source_var_thr = 1200;
+ uv_sad_thr = 10;
+ }
+ }
+ NOISE_LEVEL noise_level = kLow;
+ int norm_sad =
+ y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ unsigned int thresh_spatial = (cpi->common.width > 1920) ? 5000 : 1000;
+  // If the spatial source variance is high and the normalized y_sad
+  // is low, then the y channel is likely good enough for mode estimation,
+  // so keep color_sensitivity off. For now this is restricted to low-noise
+  // content, since there is some bdrate regression on noisy color clips.
+ if (cpi->noise_estimate.enabled)
+ noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+ if (noise_level == kLow && source_variance > thresh_spatial &&
+ cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && norm_sad < 50) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 0;
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 0;
+ return;
+ }
+ const int num_planes = av1_num_planes(&cpi->common);
+
+ for (int plane = AOM_PLANE_U; plane < num_planes; ++plane) {
+    // Always check if level = 2. If level = 0, check again for
+    // motion areas at higher resolutions, where color artifacts
+    // are more noticeable.
+ if (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 2 ||
+ (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 0 &&
+ source_sad_nonrd >= kMedSad && high_res)) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+
+ const int uv_sad = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride);
+
+ const int norm_uv_sad =
+ uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]);
+ x->color_sensitivity[COLOR_SENS_IDX(plane)] =
+ uv_sad > (y_sad >> shift) && norm_uv_sad > 40;
+ if (source_variance < source_var_thr && norm_uv_sad > uv_sad_thr)
+ x->color_sensitivity[COLOR_SENS_IDX(plane)] = 1;
+ }
+ }
+}
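+
+// Note on the normalization above: b_width_log2_lookup/b_height_log2_lookup
+// give the block dimensions in units of 4 samples, so `sad >> (bwl + bhl)`
+// is the SAD per 4x4 sub-block. For example, for BLOCK_16X16 the shift is
+// 2 + 2 = 4, dividing the SAD by the 16 4x4 sub-blocks it covers.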
+
+static void setup_compound_prediction(const AV1_COMMON *cm, MACROBLOCK *x,
+ struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+ const int *use_ref_frame_mask,
+ const MV_REFERENCE_FRAME *rf,
+ int *ref_mv_idx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ MV_REFERENCE_FRAME ref_frame_comp;
+ if (!use_ref_frame_mask[rf[1]]) {
+    // Need to set up pred_block if it hasn't been done in find_predictors.
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, rf[1]);
+ const int num_planes = av1_num_planes(cm);
+ if (yv12 != NULL) {
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, rf[1]);
+ av1_setup_pred_block(xd, yv12_mb[rf[1]], yv12, sf, sf, num_planes);
+ }
+ }
+ ref_frame_comp = av1_ref_frame_type(rf);
+ mbmi_ext->mode_context[ref_frame_comp] = 0;
+ mbmi_ext->ref_mv_count[ref_frame_comp] = UINT8_MAX;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame_comp, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_comp);
+ *ref_mv_idx = mbmi->ref_mv_idx + 1;
+}
+
+static void set_compound_mode(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME ref_frame2, int ref_mv_idx,
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+ PREDICTION_MODE this_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ mi->ref_frame[0] = ref_frame;
+ mi->ref_frame[1] = ref_frame2;
+ mi->compound_idx = 1;
+ mi->comp_group_idx = 0;
+ mi->interinter_comp.type = COMPOUND_AVERAGE;
+ MV_REFERENCE_FRAME ref_frame_comp = av1_ref_frame_type(mi->ref_frame);
+ if (this_mode == GLOBAL_GLOBALMV) {
+ frame_mv[this_mode][ref_frame].as_int = 0;
+ frame_mv[this_mode][ref_frame2].as_int = 0;
+ } else if (this_mode == NEAREST_NEARESTMV) {
+ frame_mv[this_mode][ref_frame].as_int =
+ xd->ref_mv_stack[ref_frame_comp][0].this_mv.as_int;
+ frame_mv[this_mode][ref_frame2].as_int =
+ xd->ref_mv_stack[ref_frame_comp][0].comp_mv.as_int;
+ } else if (this_mode == NEAR_NEARMV) {
+ frame_mv[this_mode][ref_frame].as_int =
+ xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].this_mv.as_int;
+ frame_mv[this_mode][ref_frame2].as_int =
+ xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].comp_mv.as_int;
+ }
+}
+
+// Prune compound mode if the single mode variance is lower than a fixed
+// percentage of the median value.
+static bool skip_comp_based_on_var(
+ const unsigned int (*single_vars)[REF_FRAMES], BLOCK_SIZE bsize) {
+ unsigned int best_var = UINT_MAX;
+ for (int cur_mode_idx = 0; cur_mode_idx < RTC_INTER_MODES; cur_mode_idx++) {
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ best_var = AOMMIN(best_var, single_vars[cur_mode_idx][ref_idx]);
+ }
+ }
+ const unsigned int thresh_64 = (unsigned int)(0.57356805f * 8659);
+ const unsigned int thresh_32 = (unsigned int)(0.23964763f * 4281);
+
+  // Currently, the thresholds for 128 and 16 are not well-tuned. We are using
+  // the results from 64 and 32 as a heuristic.
+ switch (bsize) {
+ case BLOCK_128X128: return best_var < 4 * thresh_64;
+ case BLOCK_64X64: return best_var < thresh_64;
+ case BLOCK_32X32: return best_var < thresh_32;
+ case BLOCK_16X16: return best_var < thresh_32 / 4;
+ default: return false;
+ }
+}
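+
+// The float products above bake empirically tuned values into integer
+// thresholds of roughly 4966 (thresh_64) and 1025 (thresh_32); the
+// BLOCK_128X128 and BLOCK_16X16 cases extrapolate them by the 4x area ratio
+// between adjacent square block sizes.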
+
+static AOM_FORCE_INLINE void fill_single_inter_mode_costs(
+ int (*single_inter_mode_costs)[REF_FRAMES], int num_inter_modes,
+ const REF_MODE *reference_mode_set, const ModeCosts *mode_costs,
+ const int16_t *mode_context) {
+ bool ref_frame_used[REF_FRAMES] = { false };
+ for (int idx = 0; idx < num_inter_modes; idx++) {
+ ref_frame_used[reference_mode_set[idx].ref_frame] = true;
+ }
+
+ for (int this_ref_frame = LAST_FRAME; this_ref_frame < REF_FRAMES;
+ this_ref_frame++) {
+ if (!ref_frame_used[this_ref_frame]) {
+ continue;
+ }
+
+ const MV_REFERENCE_FRAME rf[2] = { this_ref_frame, NONE_FRAME };
+ const int16_t mode_ctx = av1_mode_context_analyzer(mode_context, rf);
+ for (PREDICTION_MODE this_mode = NEARESTMV; this_mode <= NEWMV;
+ this_mode++) {
+ single_inter_mode_costs[INTER_OFFSET(this_mode)][this_ref_frame] =
+ cost_mv_ref(mode_costs, this_mode, mode_ctx);
+ }
+ }
+}
+
+static AOM_INLINE bool is_globalmv_better(
+ PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, int rate_mv,
+ const ModeCosts *mode_costs,
+ const int (*single_inter_mode_costs)[REF_FRAMES],
+ const MB_MODE_INFO_EXT *mbmi_ext) {
+ const int globalmv_mode_cost =
+ single_inter_mode_costs[INTER_OFFSET(GLOBALMV)][ref_frame];
+ int this_mode_cost =
+ rate_mv + single_inter_mode_costs[INTER_OFFSET(this_mode)][ref_frame];
+ if (this_mode == NEWMV || this_mode == NEARMV) {
+ const MV_REFERENCE_FRAME rf[2] = { ref_frame, NONE_FRAME };
+ this_mode_cost += get_drl_cost(
+ NEWMV, 0, mbmi_ext, mode_costs->drl_mode_cost0, av1_ref_frame_type(rf));
+ }
+ return this_mode_cost > globalmv_mode_cost;
+}
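+
+// is_globalmv_better() compares signaling cost only: the candidate's mode
+// context cost plus MV rate (and, for NEWMV/NEARMV, the DRL index cost)
+// against the GLOBALMV mode cost. Distortion is not considered, which makes
+// this a cheap tie-break toward GLOBALMV.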
+
+// Set up the mv/ref_frames etc based on the comp_index. Returns 1 if it
+// succeeds, 0 if it fails.
+static AOM_INLINE int setup_compound_params_from_comp_idx(
+ const AV1_COMP *cpi, MACROBLOCK *x, struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+ PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *ref_frame,
+ MV_REFERENCE_FRAME *ref_frame2, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+ const int *use_ref_frame_mask, int comp_index,
+ bool comp_use_zero_zeromv_only, MV_REFERENCE_FRAME *last_comp_ref_frame,
+ BLOCK_SIZE bsize) {
+ const MV_REFERENCE_FRAME *rf = comp_ref_mode_set[comp_index].ref_frame;
+ int skip_gf = 0;
+ int skip_alt = 0;
+ *this_mode = comp_ref_mode_set[comp_index].pred_mode;
+ *ref_frame = rf[0];
+ *ref_frame2 = rf[1];
+ assert(*ref_frame == LAST_FRAME);
+ assert(*this_mode == GLOBAL_GLOBALMV || *this_mode == NEAREST_NEARESTMV);
+ if (x->source_variance < 50 && bsize > BLOCK_16X16) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ skip_gf = 1;
+ if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ skip_alt = 1;
+ }
+ if (comp_use_zero_zeromv_only && *this_mode != GLOBAL_GLOBALMV) {
+ return 0;
+ }
+ if (*ref_frame2 == GOLDEN_FRAME &&
+ (cpi->sf.rt_sf.ref_frame_comp_nonrd[0] == 0 || skip_gf ||
+ !(cpi->ref_frame_flags & AOM_GOLD_FLAG))) {
+ return 0;
+ } else if (*ref_frame2 == LAST2_FRAME &&
+ (cpi->sf.rt_sf.ref_frame_comp_nonrd[1] == 0 ||
+ !(cpi->ref_frame_flags & AOM_LAST2_FLAG))) {
+ return 0;
+ } else if (*ref_frame2 == ALTREF_FRAME &&
+ (cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 0 || skip_alt ||
+ !(cpi->ref_frame_flags & AOM_ALT_FLAG))) {
+ return 0;
+ }
+ int ref_mv_idx = 0;
+ if (*last_comp_ref_frame != rf[1]) {
+ // Only needs to be done once per reference pair.
+ setup_compound_prediction(&cpi->common, x, yv12_mb, use_ref_frame_mask, rf,
+ &ref_mv_idx);
+ *last_comp_ref_frame = rf[1];
+ }
+ set_compound_mode(x, *ref_frame, *ref_frame2, ref_mv_idx, frame_mv,
+ *this_mode);
+ if (*this_mode != GLOBAL_GLOBALMV &&
+ frame_mv[*this_mode][*ref_frame].as_int == 0 &&
+ frame_mv[*this_mode][*ref_frame2].as_int == 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+static AOM_INLINE bool previous_mode_performed_poorly(
+ PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame,
+ const unsigned int (*vars)[REF_FRAMES],
+ const int64_t (*uv_dist)[REF_FRAMES]) {
+ unsigned int best_var = UINT_MAX;
+ int64_t best_uv_dist = INT64_MAX;
+ for (int midx = 0; midx < RTC_INTER_MODES; midx++) {
+ best_var = AOMMIN(best_var, vars[midx][ref_frame]);
+ best_uv_dist = AOMMIN(best_uv_dist, uv_dist[midx][ref_frame]);
+ }
+ assert(best_var != UINT_MAX && "Invalid variance data.");
+ const float mult = 1.125f;
+ bool var_bad = mult * best_var < vars[INTER_OFFSET(mode)][ref_frame];
+ if (uv_dist[INTER_OFFSET(mode)][ref_frame] < INT64_MAX &&
+ best_uv_dist != uv_dist[INTER_OFFSET(mode)][ref_frame]) {
+ // If we have chroma info, then take it into account
+ var_bad &= mult * best_uv_dist < uv_dist[INTER_OFFSET(mode)][ref_frame];
+ }
+ return var_bad;
+}
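+
+// With mult = 1.125f, a mode counts as having performed poorly when its
+// variance (and its chroma distortion, when available) exceeds the best
+// value observed for the same reference frame by more than 12.5%.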
+
+static AOM_INLINE bool prune_compoundmode_with_singlemode_var(
+ PREDICTION_MODE compound_mode, MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME ref_frame2, const int_mv (*frame_mv)[REF_FRAMES],
+ const uint8_t (*mode_checked)[REF_FRAMES],
+ const unsigned int (*vars)[REF_FRAMES],
+ const int64_t (*uv_dist)[REF_FRAMES]) {
+ const PREDICTION_MODE single_mode0 = compound_ref0_mode(compound_mode);
+ const PREDICTION_MODE single_mode1 = compound_ref1_mode(compound_mode);
+
+ bool first_ref_valid = false, second_ref_valid = false;
+ bool first_ref_bad = false, second_ref_bad = false;
+ if (mode_checked[single_mode0][ref_frame] &&
+ frame_mv[single_mode0][ref_frame].as_int ==
+ frame_mv[compound_mode][ref_frame].as_int &&
+ vars[INTER_OFFSET(single_mode0)][ref_frame] < UINT_MAX) {
+ first_ref_valid = true;
+ first_ref_bad =
+ previous_mode_performed_poorly(single_mode0, ref_frame, vars, uv_dist);
+ }
+ if (mode_checked[single_mode1][ref_frame2] &&
+ frame_mv[single_mode1][ref_frame2].as_int ==
+ frame_mv[compound_mode][ref_frame2].as_int &&
+ vars[INTER_OFFSET(single_mode1)][ref_frame2] < UINT_MAX) {
+ second_ref_valid = true;
+ second_ref_bad =
+ previous_mode_performed_poorly(single_mode1, ref_frame2, vars, uv_dist);
+ }
+ if (first_ref_valid && second_ref_valid) {
+ return first_ref_bad && second_ref_bad;
+ } else if (first_ref_valid || second_ref_valid) {
+ return first_ref_bad || second_ref_bad;
+ }
+ return false;
+}
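+
+// The pruning above only trusts a single-reference observation when that
+// mode was actually checked with the same MV as the compound candidate and
+// has valid variance data. If both constituents were observed, both must
+// have performed poorly to prune; if only one was observed, that one alone
+// is decisive.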
+
+// Function to set up parameters used for inter mode evaluation in non-rd.
+static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ RD_STATS *rd_cost, int *force_skip_low_temp_var, int mi_row, int mi_col,
+ int gf_temporal_ref, unsigned char segment_id, BLOCK_SIZE bsize
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ ,
+ PICK_MODE_CONTEXT *ctx, int denoise_svc_pickmode
+#endif
+) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ int skip_pred_mv = 0;
+
+ // Initialize variance and distortion (chroma) for all modes and reference
+ // frames
+ for (int idx = 0; idx < RTC_INTER_MODES; idx++) {
+ for (int ref = 0; ref < REF_FRAMES; ref++) {
+ search_state->vars[idx][ref] = UINT_MAX;
+ search_state->uv_dist[idx][ref] = INT64_MAX;
+ }
+ }
+
+ // Initialize values of color sensitivity with sb level color sensitivity
+ av1_copy(x->color_sensitivity, x->color_sensitivity_sb);
+
+ init_best_pickmode(&search_state->best_pickmode);
+
+ // Estimate cost for single reference frames
+ estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id, bsize,
+ search_state->ref_costs_single);
+
+  // Reset the flags indicating which modes have been evaluated.
+ av1_zero(search_state->mode_checked);
+
+ txfm_info->skip_txfm = 0;
+
+ // Initialize mode decisions
+ av1_invalid_rd_stats(&search_state->best_rdc);
+ av1_invalid_rd_stats(&search_state->this_rdc);
+ av1_invalid_rd_stats(rd_cost);
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) {
+ x->warp_sample_info[ref_idx].num = -1;
+ }
+
+ mi->bsize = bsize;
+ mi->ref_frame[0] = NONE_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ // if (cpi->ppi->use_svc) denoise_svc_pickmode =
+ // av1_denoise_svc_non_key(cpi);
+ if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
+ av1_denoiser_reset_frame_stats(ctx);
+ }
+#endif
+
+  // Populate predicted motion vectors for LAST_FRAME.
+ if (cpi->ref_frame_flags & AOM_LAST_FLAG) {
+ find_predictors(cpi, x, LAST_FRAME, search_state->frame_mv,
+ search_state->yv12_mb, bsize, *force_skip_low_temp_var,
+ x->force_zeromv_skip_for_blk,
+ &search_state->use_scaled_ref_frame[LAST_FRAME]);
+ }
+  // Update the mask to use all reference frames.
+ get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
+ search_state->use_ref_frame_mask,
+ force_skip_low_temp_var);
+
+ skip_pred_mv = x->force_zeromv_skip_for_blk ||
+ (x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
+
+  // Populate predicted motion vectors for the other single reference frames,
+  // starting at LAST_FRAME + 1.
+ for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME + 1;
+ ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) {
+ if (search_state->use_ref_frame_mask[ref_frame_iter]) {
+ find_predictors(cpi, x, ref_frame_iter, search_state->frame_mv,
+ search_state->yv12_mb, bsize, *force_skip_low_temp_var,
+ skip_pred_mv,
+ &search_state->use_scaled_ref_frame[ref_frame_iter]);
+ }
+ }
+}
+
+// Function to check whether the inter mode can be skipped based on mode
+// statistics and speed feature settings.
+static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ int64_t *thresh_sad_pred, int *force_mv_inter_layer, int *is_single_pred,
+ PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *last_comp_ref_frame,
+ MV_REFERENCE_FRAME *ref_frame, MV_REFERENCE_FRAME *ref_frame2, int idx,
+ int_mv svc_mv, int force_skip_low_temp_var, unsigned int sse_zeromv_norm,
+ int num_inter_modes, unsigned char segment_id, BLOCK_SIZE bsize,
+ bool comp_use_zero_zeromv_only, bool check_globalmv) {
+ AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const SVC *const svc = &cpi->svc;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+
+  // Skip compound modes based on the reference frame mask and the mode type;
+  // for allowed compound modes, set up the ref mv stack and reference frames.
+ if (idx >= num_inter_modes) {
+ const int comp_index = idx - num_inter_modes;
+ if (!setup_compound_params_from_comp_idx(
+ cpi, x, search_state->yv12_mb, this_mode, ref_frame, ref_frame2,
+ search_state->frame_mv, search_state->use_ref_frame_mask,
+ comp_index, comp_use_zero_zeromv_only, last_comp_ref_frame,
+ bsize)) {
+ return true;
+ }
+ *is_single_pred = 0;
+ } else {
+ *this_mode = ref_mode_set[idx].pred_mode;
+ *ref_frame = ref_mode_set[idx].ref_frame;
+ *ref_frame2 = NONE_FRAME;
+ }
+
+ if (x->sb_me_block && *ref_frame == LAST_FRAME) {
+ // We want to make sure to test the superblock MV:
+ // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they
+ // have this sb MV. And don't skip NEWMV_LAST: this will be set to
+ // sb MV in handle_inter_mode_nonrd(), in case NEAREST or NEAR don't
+ // have it.
+ if (*this_mode == NEARESTMV &&
+ search_state->frame_mv[NEARESTMV][LAST_FRAME].as_int ==
+ x->sb_me_mv.as_int) {
+ return false;
+ }
+ if (*this_mode == NEARMV &&
+ search_state->frame_mv[NEARMV][LAST_FRAME].as_int ==
+ x->sb_me_mv.as_int) {
+ return false;
+ }
+ if (*this_mode == NEWMV) {
+ return false;
+ }
+ }
+
+  // Skip the single reference mode if its mode-checked flag is already set.
+ if (*is_single_pred && search_state->mode_checked[*this_mode][*ref_frame]) {
+ return true;
+ }
+
+ // Skip GLOBALMV mode if check_globalmv flag is not enabled.
+ if (!check_globalmv && *this_mode == GLOBALMV) {
+ return true;
+ }
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.num_searches[bsize][*this_mode]++;
+#endif
+ mi->mode = *this_mode;
+ mi->ref_frame[0] = *ref_frame;
+ mi->ref_frame[1] = *ref_frame2;
+
+  // Skip the mode if the reference frame's entry in the use mask is not set.
+ if (!search_state->use_ref_frame_mask[*ref_frame]) return true;
+
+  // Skip some modes and reference frames when the force_zeromv_skip_for_blk
+  // flag is true.
+ if (x->force_zeromv_skip_for_blk &&
+ ((!(*this_mode == NEARESTMV &&
+ search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) &&
+ *this_mode != GLOBALMV) ||
+ *ref_frame != LAST_FRAME))
+ return true;
+
+ // Skip compound mode based on variance of previously evaluated single
+ // reference modes.
+ if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred &&
+ prune_compoundmode_with_singlemode_var(
+ *this_mode, *ref_frame, *ref_frame2, search_state->frame_mv,
+ search_state->mode_checked, search_state->vars,
+ search_state->uv_dist)) {
+ return true;
+ }
+
+ *force_mv_inter_layer = 0;
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ ((*ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
+ (*ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf) ||
+ (*ref_frame == ALTREF_FRAME && svc->skip_mvsearch_altref))) {
+ // Only test mode if NEARESTMV/NEARMV is (svc_mv.mv.col, svc_mv.mv.row),
+ // otherwise set NEWMV to (svc_mv.mv.col, svc_mv.mv.row).
+ // Skip newmv and filter search.
+ *force_mv_inter_layer = 1;
+ if (*this_mode == NEWMV) {
+ search_state->frame_mv[*this_mode][*ref_frame] = svc_mv;
+ } else if (search_state->frame_mv[*this_mode][*ref_frame].as_int !=
+ svc_mv.as_int) {
+ return true;
+ }
+ }
+
+  // If the segment reference frame feature is enabled, skip the mode when the
+  // current ref frame is not allowed.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)(*ref_frame))
+ return true;
+
+ // For screen content: skip mode testing based on source_sad.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+    // If source_sad is computed: skip modes with non-zero motion for
+    // stationary (super)blocks. Otherwise, if the superblock has motion,
+    // skip modes with zero motion on the last reference for flat blocks
+    // where color is not set.
+    // The latter condition should also apply to newmv when its MV turns out
+    // to be (0, 0), so it is repeated below after search_new_mv.
+ if (rt_sf->source_metrics_sb_nonrd) {
+ if ((search_state->frame_mv[*this_mode][*ref_frame].as_int != 0 &&
+ x->content_state_sb.source_sad_nonrd == kZeroSad) ||
+ (search_state->frame_mv[*this_mode][*ref_frame].as_int == 0 &&
+ x->block_is_zero_sad == 0 && *ref_frame == LAST_FRAME &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ cpi->rc.high_source_sad) &&
+ x->source_variance == 0))
+ return true;
+ }
+ // Skip NEWMV search for flat blocks.
+ if (*this_mode == NEWMV && x->source_variance < 100) return true;
+ // Skip non-LAST for color on flat blocks.
+ if (*ref_frame > LAST_FRAME && x->source_variance == 0 &&
+ (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1))
+ return true;
+ }
+
+  // Skip the mode based on block size, reference frame, mode type and other
+  // block properties.
+ if (skip_mode_by_bsize_and_ref_frame(
+ *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search,
+ sse_zeromv_norm, rt_sf->nonrd_aggressive_skip))
+ return true;
+
+  // Skip the mode based on low temporal variance and source sad.
+ if (skip_mode_by_low_temp(*this_mode, *ref_frame, bsize, x->content_state_sb,
+ search_state->frame_mv[*this_mode][*ref_frame],
+ force_skip_low_temp_var))
+ return true;
+
+ // Disable this drop out case if the ref frame segment level feature is
+ // enabled for this segment. This is to prevent the possibility that we
+ // end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    // Check for skipping GOLDEN and ALTREF based on pred_mv_sad.
+ if (rt_sf->nonrd_prune_ref_frame_search > 0 &&
+ x->pred_mv_sad[*ref_frame] != INT_MAX && *ref_frame != LAST_FRAME) {
+ if ((int64_t)(x->pred_mv_sad[*ref_frame]) > *thresh_sad_pred) return true;
+ }
+ }
+
+ // Check for skipping NEARMV based on pred_mv_sad.
+ if (*this_mode == NEARMV && x->pred_mv1_sad[*ref_frame] != INT_MAX &&
+ x->pred_mv1_sad[*ref_frame] > (x->pred_mv0_sad[*ref_frame] << 1))
+ return true;
+
+ // Skip single reference mode based on rd threshold.
+ if (*is_single_pred) {
+ if (skip_mode_by_threshold(
+ *this_mode, *ref_frame,
+ search_state->frame_mv[*this_mode][*ref_frame],
+ cpi->rc.frames_since_golden, cpi->rd.threshes[segment_id][bsize],
+ x->thresh_freq_fact[bsize], search_state->best_rdc.rdcost,
+ search_state->best_pickmode.best_mode_skip_txfm,
+ (rt_sf->nonrd_aggressive_skip ? 1 : 0)))
+ return true;
+ }
+ return false;
+}
+
+// Function to perform inter mode evaluation for non-rd
+static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ PICK_MODE_CONTEXT *ctx, PRED_BUFFER **this_mode_pred,
+ PRED_BUFFER *tmp_buffer, InterPredParams inter_pred_params_sr,
+ int *best_early_term, unsigned int *sse_zeromv_norm, bool *check_globalmv,
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ int64_t *zero_last_cost_orig, int denoise_svc_pickmode,
+#endif
+ int idx, int force_mv_inter_layer, int is_single_pred, int gf_temporal_ref,
+ int use_model_yrd_large, int filter_search_enabled_blk, BLOCK_SIZE bsize,
+ PREDICTION_MODE this_mode, InterpFilter filt_select,
+ int cb_pred_filter_search, int reuse_inter_pred,
+ int *sb_me_has_been_tested) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ const int bw = block_size_wide[bsize];
+ const InterpFilter filter_ref = cm->features.interp_filter;
+ const InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode;
+
+ MV_REFERENCE_FRAME ref_frame = mi->ref_frame[0];
+ MV_REFERENCE_FRAME ref_frame2 = mi->ref_frame[1];
+ int_mv *const this_mv = &search_state->frame_mv[this_mode][ref_frame];
+ unsigned int var = UINT_MAX;
+ int this_early_term = 0;
+ int rate_mv = 0;
+ int is_skippable;
+ int skip_this_mv = 0;
+ unsigned int var_threshold = UINT_MAX;
+ PREDICTION_MODE this_best_mode;
+ RD_STATS nonskip_rdc;
+ av1_invalid_rd_stats(&nonskip_rdc);
+
+ if (x->sb_me_block && this_mode == NEWMV && ref_frame == LAST_FRAME) {
+ // Set the NEWMV_LAST to the sb MV.
+ search_state->frame_mv[NEWMV][LAST_FRAME].as_int = x->sb_me_mv.as_int;
+ } else if (this_mode == NEWMV && !force_mv_inter_layer) {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ // Find the best motion vector for single/compound mode.
+ const bool skip_newmv = search_new_mv(
+ cpi, x, search_state->frame_mv, ref_frame, gf_temporal_ref, bsize,
+ mi_row, mi_col, &rate_mv, &search_state->best_rdc);
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.ms_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+    // Skip NEWMV mode:
+    // (i) for bsize smaller than 16X16,
+    // (ii) based on the SAD of the predicted MV w.r.t. LAST_FRAME,
+    // (iii) when the motion vector is the same as the reference MV.
+ if (skip_newmv) {
+ return true;
+ }
+ }
+
+  // Check whether the current motion vector is the same as one of the
+  // previously evaluated motion vectors.
+ for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV;
+ inter_mv_mode++) {
+ if (inter_mv_mode == this_mode) continue;
+ if (is_single_pred &&
+ search_state->mode_checked[inter_mv_mode][ref_frame] &&
+ this_mv->as_int ==
+ search_state->frame_mv[inter_mv_mode][ref_frame].as_int) {
+ skip_this_mv = 1;
+ break;
+ }
+ }
+
+  // Skip single reference mode if the current motion vector is the same as
+  // that of a previously evaluated motion vector.
+ if (skip_this_mv && is_single_pred) return true;
+
+  // For screen: for spatially flat blocks with non-zero motion,
+  // skip NEWMV if its motion vector is (0, 0) on LAST_FRAME and color
+  // sensitivity is not set.
+ if (this_mode == NEWMV && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->svc.spatial_layer_id == 0 && rt_sf->source_metrics_sb_nonrd) {
+ if (this_mv->as_int == 0 && ref_frame == LAST_FRAME &&
+ x->block_is_zero_sad == 0 &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ cpi->rc.high_source_sad) &&
+ x->source_variance == 0)
+ return true;
+ }
+
+ mi->mode = this_mode;
+ mi->mv[0].as_int = this_mv->as_int;
+ mi->mv[1].as_int = 0;
+ if (!is_single_pred)
+ mi->mv[1].as_int = search_state->frame_mv[this_mode][ref_frame2].as_int;
+
+ // Set buffers to store predicted samples for reuse
+ if (reuse_inter_pred) {
+ if (!*this_mode_pred) {
+ *this_mode_pred = &tmp_buffer[3];
+ } else {
+ *this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ pd->dst.buf = (*this_mode_pred)->data;
+ pd->dst.stride = bw;
+ }
+ }
+
+ mi->motion_mode = SIMPLE_TRANSLATION;
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) {
+ calc_num_proj_ref(cpi, x, mi);
+ }
+#endif
+ // set variance threshold for compound mode pruning
+ if (rt_sf->prune_compoundmode_with_singlecompound_var && !is_single_pred &&
+ use_model_yrd_large) {
+ const PREDICTION_MODE single_mode0 = compound_ref0_mode(this_mode);
+ const PREDICTION_MODE single_mode1 = compound_ref1_mode(this_mode);
+ var_threshold =
+ AOMMIN(var_threshold,
+ search_state->vars[INTER_OFFSET(single_mode0)][ref_frame]);
+ var_threshold =
+ AOMMIN(var_threshold,
+ search_state->vars[INTER_OFFSET(single_mode1)][ref_frame2]);
+ }
+
+ // decide interpolation filter, build prediction signal, get sse
+ const bool is_mv_subpel =
+ (mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07);
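+  // Note: motion vectors are stored in 1/8-pel units, so a nonzero value in
+  // the low three bits (mask 0x07) indicates a fractional-pel position. For
+  // example, a row component of 12 corresponds to 12 / 8 = 1.5 pixels, where
+  // the choice of interpolation filter matters.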
+ const bool enable_filt_search_this_mode =
+ (filter_search_enabled_blk == 2)
+ ? true
+ : (filter_search_enabled_blk && !force_mv_inter_layer &&
+ is_single_pred &&
+ (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search));
+ if (is_mv_subpel && enable_filt_search_this_mode) {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ search_filter_ref(
+ cpi, x, &search_state->this_rdc, &inter_pred_params_sr, mi_row, mi_col,
+ tmp_buffer, bsize, reuse_inter_pred, this_mode_pred, &this_early_term,
+ &var, use_model_yrd_large, best_pickmode->best_sse, is_single_pred);
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.ifs_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+#if !CONFIG_REALTIME_ONLY
+ } else if (cpi->oxcf.motion_mode_cfg.allow_warped_motion &&
+ this_mode == NEWMV) {
+ // Find the best motion mode when current mode is NEWMV
+ search_motion_mode(cpi, x, &search_state->this_rdc, mi_row, mi_col, bsize,
+ &this_early_term, use_model_yrd_large, &rate_mv,
+ best_pickmode->best_sse);
+ if (this_mode == NEWMV) {
+ this_mv[0] = mi->mv[0];
+ }
+#endif
+ } else {
+ mi->interp_filters =
+ (filter_ref == SWITCHABLE)
+ ? av1_broadcast_interp_filter(default_interp_filter)
+ : av1_broadcast_interp_filter(filter_ref);
+ if (force_mv_inter_layer)
+ mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ // If it is sub-pel motion and cb_pred_filter_search is enabled, select
+ // the pre-decided filter
+ if (is_mv_subpel && cb_pred_filter_search)
+ mi->interp_filters = av1_broadcast_interp_filter(filt_select);
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ if (is_single_pred) {
+ SubpelParams subpel_params;
+ // Initialize inter mode level params for single reference mode.
+ init_inter_mode_params(&mi->mv[0].as_mv, &inter_pred_params_sr,
+ &subpel_params, xd->block_ref_scale_factors[0],
+ pd->pre->width, pd->pre->height);
+ av1_enc_build_inter_predictor_y_nonrd(xd, &inter_pred_params_sr,
+ &subpel_params);
+ } else {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+
+ if (use_model_yrd_large) {
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &search_state->this_rdc, &this_early_term, 0,
+ best_pickmode->best_sse, &var, var_threshold);
+ } else {
+ model_rd_for_sb_y(cpi, bsize, x, xd, &search_state->this_rdc, &var, 0,
+ &this_early_term);
+ }
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.model_rd_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+ }
+
+ // update variance for single mode
+ if (is_single_pred) {
+ search_state->vars[INTER_OFFSET(this_mode)][ref_frame] = var;
+ if (this_mv->as_int == 0) {
+ search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
+ }
+ }
+ // prune compound mode based on single mode var threshold
+ if (!is_single_pred && var > var_threshold) {
+ if (reuse_inter_pred) free_pred_buffer(*this_mode_pred);
+ return true;
+ }
+
+ if (ref_frame == LAST_FRAME && this_mv->as_int == 0) {
+ *sse_zeromv_norm = (unsigned int)(search_state->this_rdc.sse >>
+ (b_width_log2_lookup[bsize] +
+ b_height_log2_lookup[bsize]));
+ }
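+  // A worked example of the normalization above (assuming the lookups give
+  // log2 of the block dimensions in units of 4 samples): for BLOCK_64X64 the
+  // shift is 4 + 4 = 8, so sse_zeromv_norm is the SSE averaged over the
+  // block's 256 4x4 sub-blocks, making thresholds block-size independent.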
+
+ // Perform early termination based on sse.
+ if (rt_sf->sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(rt_sf->sse_early_term_inter_search,
+ bsize, search_state->this_rdc.sse,
+ best_pickmode->best_sse, this_mode)) {
+ if (reuse_inter_pred) free_pred_buffer(*this_mode_pred);
+ return true;
+ }
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ x->ms_stat_nonrd.num_nonskipped_searches[bsize][this_mode]++;
+#endif
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][1];
+ const int no_skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][0];
+ const int64_t sse_y = search_state->this_rdc.sse;
+
+ if (this_early_term) {
+ search_state->this_rdc.skip_txfm = 1;
+ search_state->this_rdc.rate = skip_txfm_cost;
+ search_state->this_rdc.dist = search_state->this_rdc.sse << 4;
+ } else {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ // Calculates RD Cost using Hadamard transform.
+ av1_block_yrd(x, &search_state->this_rdc, &is_skippable, bsize,
+ mi->tx_size);
+ if (search_state->this_rdc.skip_txfm ||
+ RDCOST(x->rdmult, search_state->this_rdc.rate,
+ search_state->this_rdc.dist) >=
+ RDCOST(x->rdmult, 0, search_state->this_rdc.sse)) {
+ if (!search_state->this_rdc.skip_txfm) {
+ // Need to store "real" rdc for possible future use if UV rdc
+ // disallows tx skip
+ nonskip_rdc = search_state->this_rdc;
+ nonskip_rdc.rate += no_skip_txfm_cost;
+ }
+ search_state->this_rdc.rate = skip_txfm_cost;
+ search_state->this_rdc.skip_txfm = 1;
+ search_state->this_rdc.dist = search_state->this_rdc.sse;
+ } else {
+ search_state->this_rdc.rate += no_skip_txfm_cost;
+ }
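+    // The comparison above implements the forced-skip rule: ignoring
+    // RDCOST's fixed-point scaling, coding the residual is kept only if
+    //   rdmult * rate + dist < rdmult * 0 + sse,
+    // where the right-hand side is the cost of skipping coefficients
+    // entirely (zero coefficient rate, distortion equal to the prediction
+    // SSE). Otherwise rate is replaced by the skip-flag cost and dist by
+    // sse.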
+
+ // Populate predicted sample for chroma planes based on color sensitivity.
+ if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) {
+ RD_STATS rdc_uv;
+ const BLOCK_SIZE uv_bsize =
+ get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_U, AOM_PLANE_U);
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_V, AOM_PLANE_V);
+ }
+ // Compute sse for chroma planes.
+ const int64_t sse_uv = av1_model_rd_for_sb_uv(
+ cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, AOM_PLANE_V);
+ if (rdc_uv.dist < x->min_dist_inter_uv)
+ x->min_dist_inter_uv = rdc_uv.dist;
+ search_state->this_rdc.sse += sse_uv;
+ // Restore Y rdc if UV rdc disallows txfm skip
+ if (search_state->this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
+ nonskip_rdc.rate != INT_MAX)
+ search_state->this_rdc = nonskip_rdc;
+ if (is_single_pred) {
+ search_state->uv_dist[INTER_OFFSET(this_mode)][ref_frame] = rdc_uv.dist;
+ }
+ search_state->this_rdc.rate += rdc_uv.rate;
+ search_state->this_rdc.dist += rdc_uv.dist;
+ search_state->this_rdc.skip_txfm =
+ search_state->this_rdc.skip_txfm && rdc_uv.skip_txfm;
+ }
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.txfm_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+ }
+
+ this_best_mode = this_mode;
+ // TODO(kyslov) account for UV prediction cost
+ search_state->this_rdc.rate += rate_mv;
+ if (!is_single_pred) {
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
+ search_state->this_rdc.rate += cost_mv_ref(mode_costs, this_mode, mode_ctx);
+ } else {
+ // If the current mode has zeromv but is not GLOBALMV, compare the rate
+ // cost. If GLOBALMV is cheaper, use GLOBALMV instead.
+ if (this_mode != GLOBALMV &&
+ this_mv->as_int == search_state->frame_mv[GLOBALMV][ref_frame].as_int) {
+ if (is_globalmv_better(this_mode, ref_frame, rate_mv, mode_costs,
+ search_state->single_inter_mode_costs, mbmi_ext)) {
+ this_best_mode = GLOBALMV;
+ }
+ }
+
+ search_state->this_rdc.rate +=
+ search_state
+ ->single_inter_mode_costs[INTER_OFFSET(this_best_mode)][ref_frame];
+ }
+
+ if (is_single_pred && this_mv->as_int == 0 && var < UINT_MAX) {
+ search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
+ }
+
+ search_state->this_rdc.rate += search_state->ref_costs_single[ref_frame];
+
+ search_state->this_rdc.rdcost = RDCOST(x->rdmult, search_state->this_rdc.rate,
+ search_state->this_rdc.dist);
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR && is_single_pred) {
+ newmv_diff_bias(xd, this_best_mode, &search_state->this_rdc, bsize,
+ search_state->frame_mv[this_best_mode][ref_frame].as_mv.row,
+ search_state->frame_mv[this_best_mode][ref_frame].as_mv.col,
+ cpi->speed, x->source_variance, x->content_state_sb);
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode &&
+ cpi->denoiser.denoising_level > kDenLowLow) {
+ av1_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx);
+ // Keep track of zero_last cost.
+ if (ref_frame == LAST_FRAME && this_mv->as_int == 0)
+ *zero_last_cost_orig = search_state->this_rdc.rdcost;
+ }
+#else
+ (void)(sse_y);
+#endif
+
+ search_state->mode_checked[this_mode][ref_frame] = 1;
+ search_state->mode_checked[this_best_mode][ref_frame] = 1;
+
+ if (*check_globalmv) {
+ int32_t abs_mv =
+ abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.row) +
+ abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.col);
+ // Early exit check: if the magnitude of this_best_mode's mv is small
+ // enough, we skip GLOBALMV check in the next loop iteration.
+ if (abs_mv < 2) {
+ *check_globalmv = false;
+ }
+ }
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.nonskipped_search_times[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1);
+#endif
+
+ if (x->sb_me_block && ref_frame == LAST_FRAME &&
+ search_state->frame_mv[this_best_mode][ref_frame].as_int ==
+ x->sb_me_mv.as_int)
+ *sb_me_has_been_tested = 1;
+
+ // Copy best mode params to search state
+ if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) {
+ search_state->best_rdc = search_state->this_rdc;
+ *best_early_term = this_early_term;
+ update_search_state_nonrd(search_state, mi, txfm_info, &nonskip_rdc, ctx,
+ this_best_mode, sse_y);
+
+ // This is needed for the compound modes.
+ search_state->frame_mv_best[this_best_mode][ref_frame].as_int =
+ search_state->frame_mv[this_best_mode][ref_frame].as_int;
+ if (ref_frame2 > NONE_FRAME) {
+ search_state->frame_mv_best[this_best_mode][ref_frame2].as_int =
+ search_state->frame_mv[this_best_mode][ref_frame2].as_int;
+ }
+
+ if (reuse_inter_pred) {
+ free_pred_buffer(best_pickmode->best_pred);
+ best_pickmode->best_pred = *this_mode_pred;
+ }
+ } else {
+ if (reuse_inter_pred) free_pred_buffer(*this_mode_pred);
+ }
+
+ if (*best_early_term && (idx > 0 || rt_sf->nonrd_aggressive_skip)) {
+ txfm_info->skip_txfm = 1;
+ if (!x->sb_me_block || *sb_me_has_been_tested) return false;
+ }
+ return true;
+}
+
+// Function to perform screen content mode evaluation for non-rd
+static AOM_FORCE_INLINE void handle_screen_content_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ PRED_BUFFER *this_mode_pred, PICK_MODE_CONTEXT *ctx,
+ PRED_BUFFER *tmp_buffer, struct buf_2d *orig_dst, int skip_idtx_palette,
+ int try_palette, BLOCK_SIZE bsize, int reuse_inter_pred, int mi_col,
+ int mi_row) {
+ AV1_COMMON *const cm = &cpi->common;
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode;
+
+ // TODO(marpan): Only allow for 8 bit-depth for now, re-enable for 10/12 bit
+ // when issue 3359 is fixed.
+ if (cm->seq_params->bit_depth == 8 &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !skip_idtx_palette &&
+ !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk &&
+ is_inter_mode(best_pickmode->best_mode) &&
+ best_pickmode->best_pred != NULL &&
+ (!rt_sf->prune_idtx_nonrd ||
+ (rt_sf->prune_idtx_nonrd && bsize <= BLOCK_32X32 &&
+ best_pickmode->best_mode_skip_txfm != 1 && x->source_variance > 200))) {
+ RD_STATS idtx_rdc;
+ av1_init_rd_stats(&idtx_rdc);
+ int is_skippable;
+ this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = bw;
+ const PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ av1_block_yrd_idtx(x, best_pred->data, best_pred->stride, &idtx_rdc,
+ &is_skippable, bsize, mi->tx_size);
+ int64_t idx_rdcost_y = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist);
+ int allow_idtx = 1;
+ // Incorporate color into rd cost.
+ if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) {
+ RD_STATS rdc_uv;
+ const BLOCK_SIZE uv_bsize =
+ get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_U, AOM_PLANE_U);
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_V, AOM_PLANE_V);
+ }
+ av1_model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U,
+ AOM_PLANE_V);
+ if (rdc_uv.dist < x->min_dist_inter_uv)
+ x->min_dist_inter_uv = rdc_uv.dist;
+ idtx_rdc.rate += rdc_uv.rate;
+ idtx_rdc.dist += rdc_uv.dist;
+ idtx_rdc.skip_txfm = idtx_rdc.skip_txfm && rdc_uv.skip_txfm;
+ if (idx_rdcost_y == 0 && rdc_uv.dist > 0 && x->source_variance < 3000 &&
+ x->content_state_sb.source_sad_nonrd > kMedSad)
+ allow_idtx = 0;
+ }
+ int64_t idx_rdcost = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist);
+ if (allow_idtx && idx_rdcost < search_state->best_rdc.rdcost) {
+ best_pickmode->tx_type = IDTX;
+ search_state->best_rdc.rdcost = idx_rdcost;
+ best_pickmode->best_mode_skip_txfm = idtx_rdc.skip_txfm;
+ if (!idtx_rdc.skip_txfm) {
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ xd->tx_type_map[0] = best_pickmode->tx_type;
+ memset(ctx->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk);
+ memset(xd->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk);
+ }
+ pd->dst = *orig_dst;
+ }
+
+ if (!try_palette) return;
+ const unsigned int intra_ref_frame_cost =
+ search_state->ref_costs_single[INTRA_FRAME];
+
+ if (!is_mode_intra(best_pickmode->best_mode)) {
+ PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ if (reuse_inter_pred && best_pred != NULL) {
+ if (best_pred->data == orig_dst->buf) {
+ this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ aom_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride, bw, bh);
+ best_pickmode->best_pred = this_mode_pred;
+ }
+ }
+ pd->dst = *orig_dst;
+ }
+ // Search palette mode for Luma plane in inter frame.
+ av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx,
+ &search_state->this_rdc,
+ search_state->best_rdc.rdcost);
+ // Update best mode data in search_state
+ if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) {
+ best_pickmode->pmi = mi->palette_mode_info;
+ best_pickmode->best_mode = DC_PRED;
+ mi->mv[0].as_int = INVALID_MV;
+ mi->mv[1].as_int = INVALID_MV;
+ best_pickmode->best_ref_frame = INTRA_FRAME;
+ best_pickmode->best_second_ref_frame = NONE;
+ search_state->best_rdc.rate = search_state->this_rdc.rate;
+ search_state->best_rdc.dist = search_state->this_rdc.dist;
+ search_state->best_rdc.rdcost = search_state->this_rdc.rdcost;
+ best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm;
+ // Keep the skip_txfm off if the color_sensitivity is set.
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])
+ search_state->this_rdc.skip_txfm = 0;
+ if (!search_state->this_rdc.skip_txfm) {
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ if (xd->tx_type_map[0] != DCT_DCT)
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+}
+
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This function will loop over a subset of inter modes and select the best one
+ * based on calculated modelled RD cost. While making decisions which modes to
+ * check, this function applies heuristics based on previously checked modes,
+ * block residual variance, block size, and other factors to prune certain
+ * modes and reference frames. Currently only single reference frame modes
+ * are checked. Additional heuristics are applied to decide if intra modes
+ * need to be checked.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in]    tile_data       Pointer to struct holding adaptive
+ *                               data/contexts/models for the tile during
+ *                               encoding
+ * \param[in]    x               Pointer to structure holding all the data for
+ *                               the current macroblock
+ * \param[in]    rd_cost         Struct to keep track of the RD information
+ * \param[in]    bsize           Current block size
+ * \param[in]    ctx             Structure to hold snapshot of coding context
+ *                               during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ AV1_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ MV_REFERENCE_FRAME ref_frame, ref_frame2;
+ const unsigned char segment_id = mi->segment_id;
+ int best_early_term = 0;
+ int force_skip_low_temp_var = 0;
+ unsigned int sse_zeromv_norm = UINT_MAX;
+ const int num_inter_modes = NUM_INTER_MODES;
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ bool check_globalmv = rt_sf->check_globalmv_on_single_ref;
+ PRED_BUFFER tmp_buffer[4];
+ DECLARE_ALIGNED(16, uint8_t, pred_buf[MAX_MB_PLANE * MAX_SB_SQUARE]);
+ PRED_BUFFER *this_mode_pred = NULL;
+ const int reuse_inter_pred =
+ rt_sf->reuse_inter_pred_nonrd && cm->seq_params->bit_depth == AOM_BITS_8;
+ InterModeSearchStateNonrd search_state;
+ av1_zero(search_state.use_ref_frame_mask);
+ av1_zero(search_state.use_scaled_ref_frame);
+ BEST_PICKMODE *const best_pickmode = &search_state.best_pickmode;
+ (void)tile_data;
+
+ const int bh = block_size_high[bsize];
+ const int bw = block_size_wide[bsize];
+ const int pixels_in_block = bh * bw;
+ struct buf_2d orig_dst = pd->dst;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+#if COLLECT_NONRD_PICK_MODE_STAT
+ // Mode statistics can be collected only when num_workers is 1
+ assert(cpi->mt_info.num_workers <= 1);
+ aom_usec_timer_start(&x->ms_stat_nonrd.bsize_timer);
+#endif
+ int64_t thresh_sad_pred = INT64_MAX;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int_mv svc_mv = { .as_int = 0 };
+ int force_mv_inter_layer = 0;
+ bool comp_use_zero_zeromv_only = 0;
+ int tot_num_comp_modes = NUM_COMP_INTER_MODES_RT;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ const int denoise_recheck_zeromv = 1;
+ AV1_PICKMODE_CTX_DEN ctx_den;
+ int64_t zero_last_cost_orig = INT64_MAX;
+ int denoise_svc_pickmode = 1;
+ const int resize_pending = is_frame_resize_pending(cpi);
+#endif
+ const ModeCosts *mode_costs = &x->mode_costs;
+ struct scale_factors sf_no_scale;
+ av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+ cm->width, cm->height);
+ if (reuse_inter_pred) {
+ for (int buf_idx = 0; buf_idx < 3; buf_idx++) {
+ tmp_buffer[buf_idx].data = &pred_buf[pixels_in_block * buf_idx];
+ tmp_buffer[buf_idx].stride = bw;
+ tmp_buffer[buf_idx].in_use = 0;
+ }
+ tmp_buffer[3].data = pd->dst.buf;
+ tmp_buffer[3].stride = pd->dst.stride;
+ tmp_buffer[3].in_use = 0;
+ }
+
+ const int gf_temporal_ref = is_same_gf_and_last_scale(cm);
+
+  // If the lower spatial layer uses an averaging filter for downsampling
+  // (phase = 8), the target decimated pixel is shifted by (1/2, 1/2) relative
+  // to the source, so use a subpel motion vector to compensate. The nonzero
+  // motion is half a pixel to the left and top, i.e. (-4, -4). This has more
+  // effect on higher resolutions, so condition it on that for now.
+  // Exclude quality layers, which have the same resolution and hence no shift.
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ !svc->has_lower_quality_layer &&
+ svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
+ cm->width * cm->height > 640 * 480) {
+ svc_mv.as_mv.row = -4;
+ svc_mv.as_mv.col = -4;
+ }
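+  // Sanity check on the units: motion vectors are in 1/8-pel, so (-4, -4)
+  // is a (-0.5, -0.5) pixel shift, exactly cancelling the (1/2, 1/2) phase
+  // offset introduced by the averaging downsampler.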
+
+ // Setup parameters used for inter mode evaluation.
+ set_params_nonrd_pick_inter_mode(cpi, x, &search_state, rd_cost,
+ &force_skip_low_temp_var, mi_row, mi_col,
+ gf_temporal_ref, segment_id, bsize
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ ,
+ ctx, denoise_svc_pickmode
+#endif
+ );
+
+ if (rt_sf->use_comp_ref_nonrd && is_comp_ref_allowed(bsize)) {
+    // Only search compound if bsize > BLOCK_16X16.
+ if (bsize > BLOCK_16X16) {
+ comp_use_zero_zeromv_only = rt_sf->check_only_zero_zeromv_on_large_blocks;
+ } else {
+ tot_num_comp_modes = 0;
+ }
+ } else {
+ tot_num_comp_modes = 0;
+ }
+
+ if (x->pred_mv_sad[LAST_FRAME] != INT_MAX) {
+ thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1;
+ // Increase threshold for less aggressive pruning.
+ if (rt_sf->nonrd_prune_ref_frame_search == 1)
+ thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2);
+ }
+
+ const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize);
+
+  // Decide block-level interp filter search flags:
+  // filter_search_enabled_blk:
+  //   0: disabled
+  //   1: filter search depends on mode properties
+  //   2: filter search forced since prediction is unreliable
+  // cb_pred_filter_search:
+  //   0: cb prediction filter search disabled
+ InterpFilter filt_select = EIGHTTAP_REGULAR;
+ const int cb_pred_filter_search =
+ x->content_state_sb.source_sad_nonrd > kVeryLowSad
+ ? cpi->sf.interp_sf.cb_pred_filter_search
+ : 0;
+ const int filter_search_enabled_blk =
+ is_filter_search_enabled_blk(cpi, x, mi_row, mi_col, bsize, segment_id,
+ cb_pred_filter_search, &filt_select);
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ x->ms_stat_nonrd.num_blocks[bsize]++;
+#endif
+ init_mbmi_nonrd(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm);
+ mi->tx_size = AOMMIN(
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+ TX_16X16);
+
+ fill_single_inter_mode_costs(search_state.single_inter_mode_costs,
+ num_inter_modes, ref_mode_set, mode_costs,
+ mbmi_ext->mode_context);
+
+ MV_REFERENCE_FRAME last_comp_ref_frame = NONE_FRAME;
+
+ // Initialize inter prediction params at block level for single reference
+ // mode.
+ InterPredParams inter_pred_params_sr;
+ init_inter_block_params(&inter_pred_params_sr, pd->width, pd->height,
+ mi_row * MI_SIZE, mi_col * MI_SIZE, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd),
+ /*is_intrabc=*/0);
+ inter_pred_params_sr.conv_params =
+ get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd);
+
+ x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ !x->force_zeromv_skip_for_blk &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ x->source_variance == 0 && bsize < cm->seq_params->sb_size &&
+ search_state.yv12_mb[LAST_FRAME][0].width == cm->width &&
+ search_state.yv12_mb[LAST_FRAME][0].height == cm->height) {
+ set_block_source_sad(cpi, x, bsize, &search_state.yv12_mb[LAST_FRAME][0]);
+ }
+
+ int sb_me_has_been_tested = 0;
+ x->sb_me_block = x->sb_me_partition;
+ // Only use this feature (force testing of superblock motion) if coding
+ // block size is large.
+ if (x->sb_me_block) {
+ if (cm->seq_params->sb_size == BLOCK_128X128 && bsize < BLOCK_64X64)
+ x->sb_me_block = 0;
+ else if (cm->seq_params->sb_size == BLOCK_64X64 && bsize < BLOCK_32X32)
+ x->sb_me_block = 0;
+ }
+
+ x->min_dist_inter_uv = INT64_MAX;
+ for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
+ // If we are at the first compound mode, and the single modes already
+ // perform well, then end the search.
+ if (rt_sf->skip_compound_based_on_var && idx == num_inter_modes &&
+ skip_comp_based_on_var(search_state.vars, bsize)) {
+ break;
+ }
+
+ int is_single_pred = 1;
+ PREDICTION_MODE this_mode;
+
+ if (idx == 0 && !x->force_zeromv_skip_for_blk) {
+ // Set color sensitivity on first tested mode only.
+ // Use y-sad already computed in find_predictors: take the sad with motion
+ // vector closest to 0; the uv-sad computed below in set_color_sensitivity
+ // is for zeromv.
+ // For screen: first check if golden reference is being used, if so,
+ // force color_sensitivity on (=1) if the color sensitivity for sb_g is 1.
+ // The check in set_color_sensitivity() will then follow and check for
+ // setting the flag if the level is still 2 or 0.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ search_state.use_ref_frame_mask[GOLDEN_FRAME]) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1)
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 1;
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 1;
+ }
+ if (search_state.use_ref_frame_mask[LAST_FRAME] &&
+ x->pred_mv0_sad[LAST_FRAME] != INT_MAX) {
+ int y_sad = x->pred_mv0_sad[LAST_FRAME];
+ if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
+ (abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
+ (abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
+ y_sad = x->pred_mv1_sad[LAST_FRAME];
+ set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
+ search_state.yv12_mb[LAST_FRAME]);
+ }
+ }
+
+ // Check the inter mode can be skipped based on mode statistics and speed
+ // features settings.
+ if (skip_inter_mode_nonrd(cpi, x, &search_state, &thresh_sad_pred,
+ &force_mv_inter_layer, &is_single_pred,
+ &this_mode, &last_comp_ref_frame, &ref_frame,
+ &ref_frame2, idx, svc_mv, force_skip_low_temp_var,
+ sse_zeromv_norm, num_inter_modes, segment_id,
+ bsize, comp_use_zero_zeromv_only, check_globalmv))
+ continue;
+
+ // Select prediction reference frames.
+ for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+ xd->plane[plane].pre[0] = search_state.yv12_mb[ref_frame][plane];
+ if (!is_single_pred)
+ xd->plane[plane].pre[1] = search_state.yv12_mb[ref_frame2][plane];
+ }
+
+ mi->ref_frame[0] = ref_frame;
+ mi->ref_frame[1] = ref_frame2;
+ set_ref_ptrs(cm, xd, ref_frame, ref_frame2);
+
+ // Check if the scaled reference frame should be used. This is set in the
+ // find_predictors() for each usable reference. If so, set the
+ // block_ref_scale_factors[] to no reference scaling.
+ if (search_state.use_scaled_ref_frame[ref_frame]) {
+ xd->block_ref_scale_factors[0] = &sf_no_scale;
+ }
+ if (!is_single_pred && search_state.use_scaled_ref_frame[ref_frame2]) {
+ xd->block_ref_scale_factors[1] = &sf_no_scale;
+ }
+
+ // Perform inter mode evaluation for non-rd
+ if (!handle_inter_mode_nonrd(
+ cpi, x, &search_state, ctx, &this_mode_pred, tmp_buffer,
+ inter_pred_params_sr, &best_early_term, &sse_zeromv_norm,
+ &check_globalmv,
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ &zero_last_cost_orig, denoise_svc_pickmode,
+#endif
+ idx, force_mv_inter_layer, is_single_pred, gf_temporal_ref,
+ use_model_yrd_large, filter_search_enabled_blk, bsize, this_mode,
+ filt_select, cb_pred_filter_search, reuse_inter_pred,
+ &sb_me_has_been_tested)) {
+ break;
+ }
+ }
+
+ // Restore mode data of best inter mode
+ mi->mode = best_pickmode->best_mode;
+ mi->motion_mode = best_pickmode->best_motion_mode;
+ mi->wm_params = best_pickmode->wm_params;
+ mi->num_proj_ref = best_pickmode->num_proj_ref;
+ mi->interp_filters = best_pickmode->best_pred_filter;
+ mi->tx_size = best_pickmode->best_tx_size;
+ memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size));
+ mi->ref_frame[0] = best_pickmode->best_ref_frame;
+ mi->mv[0].as_int = search_state
+ .frame_mv_best[best_pickmode->best_mode]
+ [best_pickmode->best_ref_frame]
+ .as_int;
+ mi->mv[1].as_int = 0;
+ if (best_pickmode->best_second_ref_frame > INTRA_FRAME) {
+ mi->ref_frame[1] = best_pickmode->best_second_ref_frame;
+ mi->mv[1].as_int = search_state
+ .frame_mv_best[best_pickmode->best_mode]
+ [best_pickmode->best_second_ref_frame]
+ .as_int;
+ }
+ // Perform intra prediction search, if the best SAD is above a certain
+ // threshold.
+ mi->angle_delta[PLANE_TYPE_Y] = 0;
+ mi->angle_delta[PLANE_TYPE_UV] = 0;
+ mi->filter_intra_mode_info.use_filter_intra = 0;
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.num_searches[bsize][DC_PRED]++;
+ x->ms_stat_nonrd.num_nonskipped_searches[bsize][DC_PRED]++;
+#endif
+
+ int force_palette_test = 0;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ bsize <= BLOCK_16X16) {
+ unsigned int thresh_sse = cpi->rc.high_source_sad ? 15000 : 200000;
+ unsigned int thresh_source_var = cpi->rc.high_source_sad ? 50 : 200;
+ unsigned int best_sse_inter_motion =
+ (unsigned int)(search_state.best_rdc.sse >>
+ (b_width_log2_lookup[bsize] +
+ b_height_log2_lookup[bsize]));
+ if (best_sse_inter_motion > thresh_sse &&
+ x->source_variance > thresh_source_var)
+ force_palette_test = 1;
+ }
+
+ // Evaluate Intra modes in inter frame
+ if (!x->force_zeromv_skip_for_blk)
+ av1_estimate_intra_mode(cpi, x, bsize, best_early_term,
+ search_state.ref_costs_single[INTRA_FRAME],
+ reuse_inter_pred, &orig_dst, tmp_buffer,
+ &this_mode_pred, &search_state.best_rdc,
+ best_pickmode, ctx);
+
+ int skip_idtx_palette = (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ !cpi->rc.high_source_sad;
+
+ int try_palette =
+ !skip_idtx_palette && cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mi->bsize);
+ try_palette =
+ try_palette &&
+ (is_mode_intra(best_pickmode->best_mode) || force_palette_test) &&
+ x->source_variance > 0 && !x->force_zeromv_skip_for_blk &&
+ (cpi->rc.high_source_sad || x->source_variance > 300);
+
+ if (rt_sf->prune_palette_nonrd && bsize > BLOCK_16X16) try_palette = 0;
+
+ // Perform screen content mode evaluation for non-rd
+ handle_screen_content_mode_nonrd(
+ cpi, x, &search_state, this_mode_pred, ctx, tmp_buffer, &orig_dst,
+ skip_idtx_palette, try_palette, bsize, reuse_inter_pred, mi_col, mi_row);
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.nonskipped_search_times[bsize][DC_PRED] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1);
+#endif
+
+ pd->dst = orig_dst;
+ // Best mode is finalized. Restore the mode data to mbmi
+ if (try_palette) mi->palette_mode_info = best_pickmode->pmi;
+ mi->mode = best_pickmode->best_mode;
+ mi->ref_frame[0] = best_pickmode->best_ref_frame;
+ mi->ref_frame[1] = best_pickmode->best_second_ref_frame;
+ // For lossless: always force the skip flags off.
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ txfm_info->skip_txfm = 0;
+ memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk);
+ } else {
+ txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm;
+ }
+ if (has_second_ref(mi)) {
+ mi->comp_group_idx = 0;
+ mi->compound_idx = 1;
+ mi->interinter_comp.type = COMPOUND_AVERAGE;
+ }
+
+ if (!is_inter_block(mi)) {
+ mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
+ } else {
+ // If inter mode is selected and ref_frame was one that uses the
+ // scaled reference frame, then we can't use reuse_inter_pred.
+ if (search_state.use_scaled_ref_frame[best_pickmode->best_ref_frame] ||
+ (has_second_ref(mi) &&
+ search_state
+ .use_scaled_ref_frame[best_pickmode->best_second_ref_frame]))
+ x->reuse_inter_pred = 0;
+ }
+
+ // Restore the predicted samples of best mode to final buffer
+ if (reuse_inter_pred && best_pickmode->best_pred != NULL) {
+ PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
+ aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ }
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && resize_pending == 0 &&
+ denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow &&
+ cpi->denoiser.reset == 0) {
+ AV1_DENOISER_DECISION decision = COPY_BLOCK;
+ ctx->sb_skip_denoising = 0;
+ av1_pickmode_ctx_den_update(
+ &ctx_den, zero_last_cost_orig, search_state.ref_costs_single,
+ search_state.frame_mv, reuse_inter_pred, best_pickmode);
+ av1_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision,
+ gf_temporal_ref);
+ if (denoise_recheck_zeromv)
+ recheck_zeromv_after_denoising(
+ cpi, mi, x, xd, decision, &ctx_den, search_state.yv12_mb,
+ &search_state.best_rdc, best_pickmode, bsize, mi_row, mi_col);
+ best_pickmode->best_ref_frame = ctx_den.best_ref_frame;
+ }
+#endif
+
+ // Update the factors used for RD thresholding for all modes.
+ if (cpi->sf.inter_sf.adaptive_rd_thresh && !has_second_ref(mi)) {
+ THR_MODES best_mode_idx =
+ mode_idx[best_pickmode->best_ref_frame][mode_offset(mi->mode)];
+ if (best_pickmode->best_ref_frame == INTRA_FRAME) {
+ // Only consider the modes that are included in the intra_mode_list.
+ int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
+ for (int mode_index = 0; mode_index < intra_modes; mode_index++) {
+ update_thresh_freq_fact(cpi, x, bsize, INTRA_FRAME, best_mode_idx,
+ intra_mode_list[mode_index]);
+ }
+ } else {
+ PREDICTION_MODE this_mode;
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ update_thresh_freq_fact(cpi, x, bsize, best_pickmode->best_ref_frame,
+ best_mode_idx, this_mode);
+ }
+ }
+ }
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context_nonrd(x, ctx, mi->mode);
+#else
+ store_coding_context_nonrd(x, ctx);
+#endif // CONFIG_INTERNAL_STATS
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.bsize_timer);
+ x->ms_stat_nonrd.total_block_times[bsize] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.bsize_timer);
+ print_time(&x->ms_stat_nonrd, bsize, cm->mi_params.mi_rows,
+ cm->mi_params.mi_cols, mi_row, mi_col);
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+
+ *rd_cost = search_state.best_rdc;
+
+  // Reset the xd->block_ref_scale_factors[i], as they may have been set to
+  // point at &sf_no_scale, which becomes invalid after this function
+  // returns.
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+}
diff --git a/third_party/aom/av1/encoder/optical_flow.c b/third_party/aom/av1/encoder/optical_flow.c
new file mode 100644
index 0000000000..dc168e7aee
--- /dev/null
+++ b/third_party/aom/av1/encoder/optical_flow.c
@@ -0,0 +1,1113 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/mathutils.h"
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/optical_flow.h"
+#include "av1/encoder/sparse_linear_solver.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+void av1_init_opfl_params(OPFL_PARAMS *opfl_params) {
+ opfl_params->pyramid_levels = OPFL_PYRAMID_LEVELS;
+ opfl_params->warping_steps = OPFL_WARPING_STEPS;
+ opfl_params->lk_params = NULL;
+}
+
+void av1_init_lk_params(LK_PARAMS *lk_params) {
+ lk_params->window_size = OPFL_WINDOW_SIZE;
+}
+
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+ return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
+
+// Helper function to determine whether optical flow method is sparse.
+static INLINE int is_sparse(const OPFL_PARAMS *opfl_params) {
+ return (opfl_params->flags & OPFL_FLAG_SPARSE) ? 1 : 0;
+}
+
+static void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const double x_coord, const double y_coord,
+ const int window_size, const int bit_depth,
+ double *ix, double *iy, double *it,
+ LOCALMV *mv);
+
+// coefficients for bilinear interpolation on unit square
+static int pixel_interp(const double x, const double y, const double b00,
+ const double b01, const double b10, const double b11) {
+ const int xint = (int)x;
+ const int yint = (int)y;
+ const double xdec = x - xint;
+ const double ydec = y - yint;
+ const double a = (1 - xdec) * (1 - ydec);
+ const double b = xdec * (1 - ydec);
+ const double c = (1 - xdec) * ydec;
+ const double d = xdec * ydec;
+  // if x, y are already integers, this reduces to b00
+ int interp = (int)round(a * b00 + b * b01 + c * b10 + d * b11);
+ return interp;
+}
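+
+// A worked example of the bilinear weights above: for (x, y) = (0.25, 0.5)
+// with corner samples b00 = 0, b01 = 8, b10 = 4, b11 = 12, the weights are
+// a = 0.375, b = 0.125, c = 0.375, d = 0.125, so
+// interp = round(0.125 * 8 + 0.375 * 4 + 0.125 * 12) = 4.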
+
+// Scharr filter to compute spatial gradient
+static void spatial_gradient(const YV12_BUFFER_CONFIG *frame, const int x_coord,
+ const int y_coord, const int direction,
+ double *derivative) {
+ double *filter;
+ // Scharr filters
+ double gx[9] = { -3, 0, 3, -10, 0, 10, -3, 0, 3 };
+ double gy[9] = { -3, -10, -3, 0, 0, 0, 3, 10, 3 };
+ if (direction == 0) { // x direction
+ filter = gx;
+ } else { // y direction
+ filter = gy;
+ }
+ int idx = 0;
+ double d = 0;
+ for (int yy = -1; yy <= 1; yy++) {
+ for (int xx = -1; xx <= 1; xx++) {
+ d += filter[idx] *
+ frame->y_buffer[(y_coord + yy) * frame->y_stride + (x_coord + xx)];
+ idx++;
+ }
+ }
+ // normalization scaling factor for scharr
+ *derivative = d / 32.0;
+}
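+
+// Sanity check on the 1/32 normalization above: on a horizontal unit ramp
+// (pixel value equal to its x coordinate), the gx taps contribute
+// 2 * (3 + 10 + 3) = 32, so the returned derivative is exactly 1.0 per
+// pixel, as expected for a unit slope.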
+
+// Determine the spatial gradient at subpixel locations
+// For example, when reducing images for pyramidal LK,
+// corners found in original image may be at subpixel locations.
+static void gradient_interp(double *fullpel_deriv, const double x_coord,
+ const double y_coord, const int w, const int h,
+ double *derivative) {
+ const int xint = (int)x_coord;
+ const int yint = (int)y_coord;
+ double interp;
+ if (xint + 1 > w - 1 || yint + 1 > h - 1) {
+ interp = fullpel_deriv[yint * w + xint];
+ } else {
+ interp = pixel_interp(x_coord, y_coord, fullpel_deriv[yint * w + xint],
+ fullpel_deriv[yint * w + (xint + 1)],
+ fullpel_deriv[(yint + 1) * w + xint],
+ fullpel_deriv[(yint + 1) * w + (xint + 1)]);
+ }
+
+ *derivative = interp;
+}
+
+static void temporal_gradient(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *frame2,
+ const double x_coord, const double y_coord,
+ const int bit_depth, double *derivative,
+ LOCALMV *mv) {
+ const int w = 2;
+ const int h = 2;
+ uint8_t pred1[4];
+ uint8_t pred2[4];
+
+ const int y = (int)y_coord;
+ const int x = (int)x_coord;
+ const double ydec = y_coord - y;
+ const double xdec = x_coord - x;
+ const int is_intrabc = 0; // Is intra-copied?
+ const int is_high_bitdepth = is_frame_high_bitdepth(frame2);
+ const int subsampling_x = 0, subsampling_y = 0; // for y-buffer
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(MULTITAP_SHARP);
+ const int plane = 0; // y-plane
+ const struct buf_2d ref_buf2 = { NULL, frame2->y_buffer, frame2->y_crop_width,
+ frame2->y_crop_height, frame2->y_stride };
+ struct scale_factors scale;
+ av1_setup_scale_factors_for_frame(&scale, frame->y_crop_width,
+ frame->y_crop_height, frame->y_crop_width,
+ frame->y_crop_height);
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth, is_intrabc,
+ &scale, &ref_buf2, interp_filters);
+ inter_pred_params.interp_filter_params[0] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+ inter_pred_params.interp_filter_params[1] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ MV newmv = { .row = (int16_t)round((mv->row + xdec) * 8),
+ .col = (int16_t)round((mv->col + ydec) * 8) };
+ av1_enc_build_one_inter_predictor(pred2, w, &newmv, &inter_pred_params);
+ const struct buf_2d ref_buf1 = { NULL, frame->y_buffer, frame->y_crop_width,
+ frame->y_crop_height, frame->y_stride };
+ av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth, is_intrabc,
+ &scale, &ref_buf1, interp_filters);
+ inter_pred_params.interp_filter_params[0] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+ inter_pred_params.interp_filter_params[1] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ MV zeroMV = { .row = (int16_t)round(xdec * 8),
+ .col = (int16_t)round(ydec * 8) };
+ av1_enc_build_one_inter_predictor(pred1, w, &zeroMV, &inter_pred_params);
+
+ *derivative = pred2[0] - pred1[0];
+}
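+
+// Note on the temporal derivative: it is computed as pred2[0] - pred1[0],
+// i.e. frame2 sampled at (x, y) displaced by mv, minus frame sampled at
+// (x, y), with both predictions built at the same subpel phase through the
+// encoder's MULTITAP_SHARP interpolator, keeping the interpolation bias
+// consistent between the two samples.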
+
+// Numerically differentiate over a window_size x window_size window
+// surrounding the (x, y) location. Alters ix, iy, it to contain the
+// numerical partial derivatives.
+static void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const double x_coord, const double y_coord,
+ const int window_size, const int bit_depth,
+ double *ix, double *iy, double *it,
+ LOCALMV *mv) {
+ const double left = x_coord - window_size / 2.0;
+ const double top = y_coord - window_size / 2.0;
+ // gradient operators need pixel before and after (start at 1)
+ const double x_start = AOMMAX(1, left);
+ const double y_start = AOMMAX(1, top);
+ const int frame_height = frame->y_crop_height;
+ const int frame_width = frame->y_crop_width;
+ double deriv_x;
+ double deriv_y;
+ double deriv_t;
+
+ const double x_end = AOMMIN(x_coord + window_size / 2.0, frame_width - 2);
+ const double y_end = AOMMIN(y_coord + window_size / 2.0, frame_height - 2);
+ const int xs = (int)AOMMAX(1, x_start - 1);
+ const int ys = (int)AOMMAX(1, y_start - 1);
+ const int xe = (int)AOMMIN(x_end + 2, frame_width - 2);
+ const int ye = (int)AOMMIN(y_end + 2, frame_height - 2);
+ // with normalization, gradients may be double values
+ double *fullpel_dx = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_x));
+ double *fullpel_dy = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_y));
+ if (!fullpel_dx || !fullpel_dy) {
+ aom_free(fullpel_dx);
+ aom_free(fullpel_dy);
+ return;
+ }
+
+  // TODO(any): This could be more efficient in the case that x_coord
+  // and y_coord are integers, but it may look messier.
+
+ // calculate spatial gradients at full pixel locations
+ for (int j = ys; j < ye; j++) {
+ for (int i = xs; i < xe; i++) {
+ spatial_gradient(frame, i, j, 0, &deriv_x);
+ spatial_gradient(frame, i, j, 1, &deriv_y);
+ int idx = (j - ys) * (xe - xs) + (i - xs);
+ fullpel_dx[idx] = deriv_x;
+ fullpel_dy[idx] = deriv_y;
+ }
+ }
+ // compute numerical differentiation for every pixel in window
+ // (this potentially includes subpixels)
+ for (double j = y_start; j < y_end; j++) {
+ for (double i = x_start; i < x_end; i++) {
+ temporal_gradient(frame, ref_frame, i, j, bit_depth, &deriv_t, mv);
+ gradient_interp(fullpel_dx, i - xs, j - ys, xe - xs, ye - ys, &deriv_x);
+ gradient_interp(fullpel_dy, i - xs, j - ys, xe - xs, ye - ys, &deriv_y);
+ int idx = (int)(j - top) * window_size + (int)(i - left);
+ ix[idx] = deriv_x;
+ iy[idx] = deriv_y;
+ it[idx] = deriv_t;
+ }
+ }
+ // TODO(any): to avoid setting deriv arrays to zero for every iteration,
+ // could instead pass these two values back through function call
+ // int first_idx = (int)(y_start - top) * window_size + (int)(x_start - left);
+ // int width = window_size - ((int)(x_start - left) + (int)(left + window_size
+ // - x_end));
+
+ aom_free(fullpel_dx);
+ aom_free(fullpel_dy);
+}
+
+// To compute eigenvalues of 2x2 matrix: Solve for lambda where
+// Determinant(matrix - lambda*identity) == 0
+static void eigenvalues_2x2(const double *matrix, double *eig) {
+ const double a = 1;
+ const double b = -1 * matrix[0] - matrix[3];
+ const double c = -1 * matrix[1] * matrix[2] + matrix[0] * matrix[3];
+ // quadratic formula
+ const double discriminant = b * b - 4 * a * c;
+ eig[0] = (-b - sqrt(discriminant)) / (2.0 * a);
+ eig[1] = (-b + sqrt(discriminant)) / (2.0 * a);
+  // ensure the eigenvalues are ordered by increasing magnitude
+ if (fabs(eig[0]) > fabs(eig[1])) {
+ double tmp = eig[0];
+ eig[0] = eig[1];
+ eig[1] = tmp;
+ }
+}
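+
+// A worked example of the quadratic above: for the row-major matrix
+// {2, 0, 0, 3}, b = -(2 + 3) = -5 and c = 2 * 3 - 0 = 6, so the
+// discriminant is 25 - 24 = 1 and eig = {(5 - 1) / 2, (5 + 1) / 2} = {2, 3},
+// already ordered by magnitude.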
+
+// Shi-Tomasi corner detection criteria
+static double corner_score(const YV12_BUFFER_CONFIG *frame_to_filter,
+ const YV12_BUFFER_CONFIG *ref_frame, const int x,
+ const int y, double *i_x, double *i_y, double *i_t,
+ const int n, const int bit_depth) {
+ double eig[2];
+ LOCALMV mv = { .row = 0, .col = 0 };
+  // TODO(any): Technically, ref_frame and i_t are not used by the corner
+  // score, so these could be replaced by dummy variables, or this could be
+  // changed to a spatial-gradient-only function over the window.
+ gradients_over_window(frame_to_filter, ref_frame, x, y, n, bit_depth, i_x,
+ i_y, i_t, &mv);
+ double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 };
+ multiply_mat(i_x, i_x, Mres1, 1, n * n, 1);
+ multiply_mat(i_x, i_y, Mres2, 1, n * n, 1);
+ multiply_mat(i_y, i_y, Mres3, 1, n * n, 1);
+ double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] };
+ eigenvalues_2x2(M, eig);
+ return fabs(eig[0]);
+}
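+
+// Note: M above is the 2x2 structure tensor of the window,
+//   M = [ sum(i_x * i_x)  sum(i_x * i_y) ]
+//       [ sum(i_x * i_y)  sum(i_y * i_y) ],
+// and the Shi-Tomasi score is the magnitude of its smaller eigenvalue: a
+// large minimum eigenvalue means strong gradients in two independent
+// directions, i.e. a corner.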
+
+// Finds corners in frame_to_filter
+// For less strict requirements (i.e. more corners), decrease threshold
+static int detect_corners(const YV12_BUFFER_CONFIG *frame_to_filter,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const int maxcorners, int *ref_corners,
+ const int bit_depth) {
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+  // TODO(any): Currently, if maxcorners is decreased, it only means corners
+  // will be omitted from the bottom-right of the image. If maxcorners is
+  // actually used, then this algorithm would need to re-iterate and choose a
+  // threshold based on that.
+ assert(maxcorners == frame_height * frame_width);
+ int countcorners = 0;
+ const double threshold = 0.1;
+ double score;
+ const int n = 3;
+ double i_x[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ double i_y[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ double i_t[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ const int fromedge = n;
+ double max_score = corner_score(frame_to_filter, ref_frame, fromedge,
+ fromedge, i_x, i_y, i_t, n, bit_depth);
+ // rough estimate of max corner score in image
+ for (int x = fromedge; x < frame_width - fromedge; x += 1) {
+ for (int y = fromedge; y < frame_height - fromedge; y += frame_height / 5) {
+ for (int i = 0; i < n * n; i++) {
+ i_x[i] = 0;
+ i_y[i] = 0;
+ i_t[i] = 0;
+ }
+ score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+ bit_depth);
+ if (score > max_score) {
+ max_score = score;
+ }
+ }
+ }
+ // score all the points and choose corners over threshold
+ for (int x = fromedge; x < frame_width - fromedge; x += 1) {
+ for (int y = fromedge;
+ (y < frame_height - fromedge) && countcorners < maxcorners; y += 1) {
+ for (int i = 0; i < n * n; i++) {
+ i_x[i] = 0;
+ i_y[i] = 0;
+ i_t[i] = 0;
+ }
+ score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+ bit_depth);
+ if (score > threshold * max_score) {
+ ref_corners[countcorners * 2] = x;
+ ref_corners[countcorners * 2 + 1] = y;
+ countcorners++;
+ }
+ }
+ }
+ return countcorners;
+}
+
+// weights is an nxn matrix filled with a Gaussian function of the distance
+// from the center point, optionally normalized to sum to 1.
+static void gaussian(const double sigma, const int n, const int normalize,
+ double *weights) {
+ double total_weight = 0;
+ for (int j = 0; j < n; j++) {
+ for (int i = 0; i < n; i++) {
+ double distance = sqrt(pow(n / 2 - i, 2) + pow(n / 2 - j, 2));
+ double weight = exp(-0.5 * pow(distance / sigma, 2));
+ weights[j * n + i] = weight;
+ total_weight += weight;
+ }
+ }
+ if (normalize == 1) {
+    // Normalize over all n * n entries of the weight matrix.
+    for (int j = 0; j < n * n; j++) {
+ weights[j] = weights[j] / total_weight;
+ }
+ }
+}
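+
+// An unnormalized example: for n = 5 and sigma = 1.0, the center sample
+// (distance 0) gets weight exp(0) = 1.0, while a corner sample at distance
+// sqrt(8) gets exp(-0.5 * 8) = exp(-4), roughly 0.018, so samples near the
+// window center dominate any weighted fit that uses these weights.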
+
+static double convolve(const double *filter, const int *img, const int size) {
+ double result = 0;
+ for (int i = 0; i < size; i++) {
+ result += filter[i] * img[i];
+ }
+ return result;
+}
+
+// Applies a Gaussian low-pass smoothing filter to produce
+// a corresponding lower resolution image with halved dimensions
+static void reduce(uint8_t *img, int height, int width, int stride,
+ uint8_t *reduced_img) {
+ const int new_width = width / 2;
+ const int window_size = 5;
+ const double gaussian_filter[25] = {
+ 1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16,
+ 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32,
+ 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256,
+ 1. / 64, 3. / 128, 1. / 64, 1. / 256
+ };
+ // filter is 5x5 so need prev and forward 2 pixels
+ int img_section[25];
+ for (int y = 0; y < height - 1; y += 2) {
+ for (int x = 0; x < width - 1; x += 2) {
+ int i = 0;
+ for (int yy = y - window_size / 2; yy <= y + window_size / 2; yy++) {
+ for (int xx = x - window_size / 2; xx <= x + window_size / 2; xx++) {
+ int yvalue = yy;
+ int xvalue = xx;
+          // Replicate edge pixels for coordinates outside the boundary.
+ if (yvalue < 0) yvalue = 0;
+ if (xvalue < 0) xvalue = 0;
+ if (yvalue >= height) yvalue = height - 1;
+ if (xvalue >= width) xvalue = width - 1;
+ img_section[i++] = img[yvalue * stride + xvalue];
+ }
+ }
+ reduced_img[(y / 2) * new_width + (x / 2)] = (uint8_t)convolve(
+ gaussian_filter, img_section, window_size * window_size);
+ }
+ }
+}
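+
+// Note: the 5x5 kernel above is the outer product of the binomial filter
+// [1, 4, 6, 4, 1] / 16 with itself (entries 1/256 through 9/64), the usual
+// Gaussian-pyramid reduction kernel. Each call halves both dimensions, so
+// repeated calls build the pyramid used for coarse-to-fine Lucas-Kanade.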
+
+static int cmpfunc(const void *a, const void *b) {
+ return (*(int *)a - *(int *)b);
+}
+static void filter_mvs(const MV_FILTER_TYPE mv_filter, const int frame_height,
+ const int frame_width, LOCALMV *localmvs, MV *mvs) {
+ const int n = 5; // window size
+ // for smoothing filter
+ const double gaussian_filter[25] = {
+ 1. / 256, 1. / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16,
+ 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32,
+ 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256,
+ 1. / 64, 3. / 128, 1. / 64, 1. / 256
+ };
+ // for median filter
+ int mvrows[25];
+ int mvcols[25];
+ if (mv_filter != MV_FILTER_NONE) {
+ for (int y = 0; y < frame_height; y++) {
+ for (int x = 0; x < frame_width; x++) {
+ int center_idx = y * frame_width + x;
+ int i = 0;
+ double filtered_row = 0;
+ double filtered_col = 0;
+ for (int yy = y - n / 2; yy <= y + n / 2; yy++) {
+ for (int xx = x - n / 2; xx <= x + n / 2; xx++) {
+ int yvalue = yy;
+ int xvalue = xx;
+            // Replicate edge pixels for coordinates outside the boundary.
+ if (yvalue < 0) yvalue = 0;
+ if (xvalue < 0) xvalue = 0;
+ if (yvalue >= frame_height) yvalue = frame_height - 1;
+ if (xvalue >= frame_width) xvalue = frame_width - 1;
+ int index = yvalue * frame_width + xvalue;
+ if (mv_filter == MV_FILTER_SMOOTH) {
+ filtered_row += mvs[index].row * gaussian_filter[i];
+ filtered_col += mvs[index].col * gaussian_filter[i];
+ } else if (mv_filter == MV_FILTER_MEDIAN) {
+ mvrows[i] = mvs[index].row;
+ mvcols[i] = mvs[index].col;
+ }
+ i++;
+ }
+ }
+
+ MV mv = mvs[center_idx];
+ if (mv_filter == MV_FILTER_SMOOTH) {
+ mv.row = (int16_t)filtered_row;
+ mv.col = (int16_t)filtered_col;
+ } else if (mv_filter == MV_FILTER_MEDIAN) {
+ qsort(mvrows, 25, sizeof(mv.row), cmpfunc);
+ qsort(mvcols, 25, sizeof(mv.col), cmpfunc);
+ mv.row = mvrows[25 / 2];
+ mv.col = mvcols[25 / 2];
+ }
+        LOCALMV localmv = { .row = ((double)mv.row) / 8,
+                            .col = ((double)mv.col) / 8 };
+ localmvs[y * frame_width + x] = localmv;
+ // if mvs array is immediately updated here, then the result may
+ // propagate to other pixels.
+ }
+ }
+ for (int i = 0; i < frame_height * frame_width; i++) {
+ MV mv = { .row = (int16_t)round(8 * localmvs[i].row),
+ .col = (int16_t)round(8 * localmvs[i].col) };
+ mvs[i] = mv;
+ }
+ }
+}
+
+// Computes optical flow at a single pyramid level
+// using the Lucas-Kanade algorithm.
+// Modifies the mvs array.
+static void lucas_kanade(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame, const int level,
+ const LK_PARAMS *lk_params, const int num_ref_corners,
+ int *ref_corners, const int mv_stride,
+ const int bit_depth, LOCALMV *mvs) {
+ assert(lk_params->window_size > 0 && lk_params->window_size % 2 == 0);
+ const int n = lk_params->window_size;
+ // algorithm is sensitive to window size
+ double *i_x = (double *)aom_malloc(n * n * sizeof(*i_x));
+ double *i_y = (double *)aom_malloc(n * n * sizeof(*i_y));
+ double *i_t = (double *)aom_malloc(n * n * sizeof(*i_t));
+ double *weights = (double *)aom_malloc(n * n * sizeof(*weights));
+ if (!i_x || !i_y || !i_t || !weights) goto free_lk_buf;
+
+ const int expand_multiplier = (int)pow(2, level);
+ double sigma = 0.2 * n;
+  // Normalizing doesn't affect the solution, since it scales every
+  // component of M and b equally.
+ gaussian(sigma, n, 0, weights);
+ for (int i = 0; i < num_ref_corners; i++) {
+ const double x_coord = 1.0 * ref_corners[i * 2] / expand_multiplier;
+ const double y_coord = 1.0 * ref_corners[i * 2 + 1] / expand_multiplier;
+ int highres_x = ref_corners[i * 2];
+ int highres_y = ref_corners[i * 2 + 1];
+ int mv_idx = highres_y * (mv_stride) + highres_x;
+ LOCALMV mv_old = mvs[mv_idx];
+ mv_old.row = mv_old.row / expand_multiplier;
+ mv_old.col = mv_old.col / expand_multiplier;
+    // Zero-initialize with a loop instead of memset, since all-bits-zero is
+    // not guaranteed to represent 0.0 for doubles.
+ for (int j = 0; j < n * n; j++) {
+ i_x[j] = 0;
+ i_y[j] = 0;
+ i_t[j] = 0;
+ }
+ gradients_over_window(from_frame, to_frame, x_coord, y_coord, n, bit_depth,
+ i_x, i_y, i_t, &mv_old);
+    double m11 = 0, m12 = 0, m22 = 0;
+    double b1 = 0, b2 = 0;
+    for (int j = 0; j < n * n; j++) {
+      m11 += weights[j] * i_x[j] * i_x[j];
+      m12 += weights[j] * i_x[j] * i_y[j];
+      m22 += weights[j] * i_y[j] * i_y[j];
+      b1 += weights[j] * i_x[j] * i_t[j];
+      b2 += weights[j] * i_y[j] * i_t[j];
+    }
+    // Structure tensor M and right-hand side b of the LK normal equations.
+    double M[4] = { m11, m12, m12, m22 };
+    double b[2] = { -b1, -b2 };
+ double eig[2] = { 1, 1 };
+ eigenvalues_2x2(M, eig);
+    const double threshold = 0.1;
+    if (fabs(eig[0]) > threshold) {
+      // M is well-conditioned enough to invert; otherwise the displacement
+      // defaults to zero.
+ double u[2] = { 0, 0 };
+ linsolve(2, M, 2, b, u);
+ int mult = 1;
+ if (level != 0)
+ mult = expand_multiplier; // mv doubles when resolution doubles
+ LOCALMV mv = { .row = (mult * (u[0] + mv_old.row)),
+ .col = (mult * (u[1] + mv_old.col)) };
+      mvs[mv_idx] = mv;
+ }
+ }
+free_lk_buf:
+ aom_free(weights);
+ aom_free(i_t);
+ aom_free(i_x);
+ aom_free(i_y);
+}
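+
+// For the 2x2 system solved above, linsolve() is equivalent to the closed
+// form below (Cramer's rule). A minimal sketch, assuming det(M) != 0, which
+// the eigenvalue threshold above is meant to ensure:
+#if 0
+static void solve_2x2_example(const double M[4], const double b[2],
+                              double u[2]) {
+  const double det = M[0] * M[3] - M[1] * M[2];
+  u[0] = (b[0] * M[3] - M[1] * b[1]) / det;
+  u[1] = (M[0] * b[1] - M[2] * b[0]) / det;
+}
+#endif  // 0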
+
+// Warp src_frame into warped_frame according to mvs; each mv points from a
+// pixel location in warped_frame to its source location in src_frame.
+static void warp_back_frame(YV12_BUFFER_CONFIG *warped_frame,
+ const YV12_BUFFER_CONFIG *src_frame,
+ const LOCALMV *mvs, int mv_stride) {
+ int w, h;
+ const int fw = src_frame->y_crop_width;
+ const int fh = src_frame->y_crop_height;
+ const int src_fs = src_frame->y_stride, warped_fs = warped_frame->y_stride;
+ const uint8_t *src_buf = src_frame->y_buffer;
+ uint8_t *warped_buf = warped_frame->y_buffer;
+ double temp;
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ double cord_x = (double)w + mvs[h * mv_stride + w].col;
+ double cord_y = (double)h + mvs[h * mv_stride + w].row;
+ cord_x = fclamp(cord_x, 0, (double)(fw - 1));
+ cord_y = fclamp(cord_y, 0, (double)(fh - 1));
+ const int floorx = (int)floor(cord_x);
+ const int floory = (int)floor(cord_y);
+ const double fracx = cord_x - (double)floorx;
+ const double fracy = cord_y - (double)floory;
+
+ temp = 0;
+ for (int hh = 0; hh < 2; hh++) {
+ const double weighth = hh ? (fracy) : (1 - fracy);
+ for (int ww = 0; ww < 2; ww++) {
+ const double weightw = ww ? (fracx) : (1 - fracx);
+ int y = floory + hh;
+ int x = floorx + ww;
+ y = clamp(y, 0, fh - 1);
+ x = clamp(x, 0, fw - 1);
+ temp += (double)src_buf[y * src_fs + x] * weightw * weighth;
+ }
+ }
+ warped_buf[h * warped_fs + w] = (uint8_t)round(temp);
+ }
+ }
+}
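+
+// Note on warp_back_frame(): the four taps above implement plain bilinear
+// interpolation. The weights (1 - fracx) * (1 - fracy), fracx * (1 - fracy),
+// (1 - fracx) * fracy and fracx * fracy always sum to 1, so flat regions are
+// preserved exactly.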
+
+// Same as warp_back_frame, but using a better interpolation filter.
+static void warp_back_frame_intp(YV12_BUFFER_CONFIG *warped_frame,
+ const YV12_BUFFER_CONFIG *src_frame,
+ const LOCALMV *mvs, int mv_stride) {
+ int w, h;
+ const int fw = src_frame->y_crop_width;
+ const int fh = src_frame->y_crop_height;
+ const int warped_fs = warped_frame->y_stride;
+ uint8_t *warped_buf = warped_frame->y_buffer;
+ const int blk = 2;
+ uint8_t temp_blk[4];
+
+ const int is_intrabc = 0; // Is intra-copied?
+ const int is_high_bitdepth = is_frame_high_bitdepth(src_frame);
+ const int subsampling_x = 0, subsampling_y = 0; // for y-buffer
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(MULTITAP_SHARP2);
+ const int plane = 0; // y-plane
+ const struct buf_2d ref_buf2 = { NULL, src_frame->y_buffer,
+ src_frame->y_crop_width,
+ src_frame->y_crop_height,
+ src_frame->y_stride };
+ const int bit_depth = src_frame->bit_depth;
+ struct scale_factors scale;
+ av1_setup_scale_factors_for_frame(
+ &scale, src_frame->y_crop_width, src_frame->y_crop_height,
+ src_frame->y_crop_width, src_frame->y_crop_height);
+
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, blk, blk, h, w, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth,
+ is_intrabc, &scale, &ref_buf2, interp_filters);
+ inter_pred_params.interp_filter_params[0] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+ inter_pred_params.interp_filter_params[1] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ MV newmv = { .row = (int16_t)round((mvs[h * mv_stride + w].row) * 8),
+ .col = (int16_t)round((mvs[h * mv_stride + w].col) * 8) };
+ av1_enc_build_one_inter_predictor(temp_blk, blk, &newmv,
+ &inter_pred_params);
+ warped_buf[h * warped_fs + w] = temp_blk[0];
+ }
+ }
+}
+
+#define DERIVATIVE_FILTER_LENGTH 7
+static const double filter[DERIVATIVE_FILTER_LENGTH] = {
+  -1.0 / 60, 9.0 / 60, -45.0 / 60, 0, 45.0 / 60, -9.0 / 60, 1.0 / 60
+};
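+
+// This is the standard 7-tap central-difference kernel for the first
+// derivative, sampled at offsets -3..3. As a sanity check, the first-moment
+// condition sum(k * filter[k + 3]) over k = -3..3 evaluates to
+// (3 - 18 + 45 + 0 + 45 - 18 + 3) / 60 = 1, so a linear ramp of slope s
+// yields a gradient of exactly s.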
+
+// Get gradient of the whole frame
+static void get_frame_gradients(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame, double *ix,
+ double *iy, double *it, int grad_stride) {
+ int w, h, k, idx;
+ const int fw = from_frame->y_crop_width;
+ const int fh = from_frame->y_crop_height;
+ const int from_fs = from_frame->y_stride, to_fs = to_frame->y_stride;
+ const uint8_t *from_buf = from_frame->y_buffer;
+ const uint8_t *to_buf = to_frame->y_buffer;
+
+ const int lh = DERIVATIVE_FILTER_LENGTH;
+ const int hleft = (lh - 1) / 2;
+
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ // x
+ ix[h * grad_stride + w] = 0;
+ for (k = 0; k < lh; k++) {
+        // If we want to make this block-dependent, we need to extend the
+        // boundaries using other initializations.
+ idx = w + k - hleft;
+ idx = clamp(idx, 0, fw - 1);
+ ix[h * grad_stride + w] += filter[k] * 0.5 *
+ ((double)from_buf[h * from_fs + idx] +
+ (double)to_buf[h * to_fs + idx]);
+ }
+ // y
+ iy[h * grad_stride + w] = 0;
+ for (k = 0; k < lh; k++) {
+        // If we want to make this block-dependent, we need to extend the
+        // boundaries using other initializations.
+ idx = h + k - hleft;
+ idx = clamp(idx, 0, fh - 1);
+ iy[h * grad_stride + w] += filter[k] * 0.5 *
+ ((double)from_buf[idx * from_fs + w] +
+ (double)to_buf[idx * to_fs + w]);
+ }
+ // t
+ it[h * grad_stride + w] =
+ (double)to_buf[h * to_fs + w] - (double)from_buf[h * from_fs + w];
+ }
+ }
+}
+
+// Solve the linear equations given by the H-S method
+static void solve_horn_schunck(const double *ix, const double *iy,
+ const double *it, int grad_stride, int width,
+ int height, const LOCALMV *init_mvs,
+ int init_mv_stride, LOCALMV *mvs,
+ int mv_stride) {
+ // TODO(bohanli): May just need to allocate the buffers once per optical flow
+ // calculation
+ int *row_pos = aom_calloc(width * height * 28, sizeof(*row_pos));
+ int *col_pos = aom_calloc(width * height * 28, sizeof(*col_pos));
+ double *values = aom_calloc(width * height * 28, sizeof(*values));
+ double *mv_vec = aom_calloc(width * height * 2, sizeof(*mv_vec));
+ double *mv_init_vec = aom_calloc(width * height * 2, sizeof(*mv_init_vec));
+ double *temp_b = aom_calloc(width * height * 2, sizeof(*temp_b));
+ double *b = aom_calloc(width * height * 2, sizeof(*b));
+ if (!row_pos || !col_pos || !values || !mv_vec || !mv_init_vec || !temp_b ||
+ !b) {
+ goto free_hs_solver_buf;
+ }
+
+  // Location offsets of the neighboring pixels; k < 4 are the 4 direct
+  // neighbors.
+ const int check_locs_y[12] = { 0, 0, -1, 1, -1, -1, 1, 1, 0, 0, -2, 2 };
+ const int check_locs_x[12] = { -1, 1, 0, 0, -1, 1, -1, 1, -2, 2, 0, 0 };
+
+ int h, w, checkh, checkw, k, ret;
+ const int offset = height * width;
+ SPARSE_MTX A;
+ int c = 0;
+ const double lambda = 100;
+
+ for (w = 0; w < width; w++) {
+ for (h = 0; h < height; h++) {
+ mv_init_vec[w * height + h] = init_mvs[h * init_mv_stride + w].col;
+ mv_init_vec[w * height + h + offset] =
+ init_mvs[h * init_mv_stride + w].row;
+ }
+ }
+
+ // get matrix A
+ for (w = 0; w < width; w++) {
+ for (h = 0; h < height; h++) {
+ int center_num_direct = 4;
+ const int center_idx = w * height + h;
+ if (w == 0 || w == width - 1) center_num_direct--;
+ if (h == 0 || h == height - 1) center_num_direct--;
+ // diagonal entry for this row from the center pixel
+ double cor_w = center_num_direct * center_num_direct + center_num_direct;
+ row_pos[c] = center_idx;
+ col_pos[c] = center_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = center_idx + offset;
+ values[c] = lambda * cor_w;
+ c++;
+ // other entries from direct neighbors
+ for (k = 0; k < 4; k++) {
+ checkh = h + check_locs_y[k];
+ checkw = w + check_locs_x[k];
+ if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+ continue;
+ }
+ int this_idx = checkw * height + checkh;
+ int this_num_direct = 4;
+ if (checkw == 0 || checkw == width - 1) this_num_direct--;
+ if (checkh == 0 || checkh == height - 1) this_num_direct--;
+ cor_w = -center_num_direct - this_num_direct;
+ row_pos[c] = center_idx;
+ col_pos[c] = this_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = this_idx + offset;
+ values[c] = lambda * cor_w;
+ c++;
+ }
+ // entries from neighbors on the diagonal corners
+ for (k = 4; k < 8; k++) {
+ checkh = h + check_locs_y[k];
+ checkw = w + check_locs_x[k];
+ if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+ continue;
+ }
+ int this_idx = checkw * height + checkh;
+ cor_w = 2;
+ row_pos[c] = center_idx;
+ col_pos[c] = this_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = this_idx + offset;
+ values[c] = lambda * cor_w;
+ c++;
+ }
+ // entries from neighbors with dist of 2
+ for (k = 8; k < 12; k++) {
+ checkh = h + check_locs_y[k];
+ checkw = w + check_locs_x[k];
+ if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+ continue;
+ }
+ int this_idx = checkw * height + checkh;
+ cor_w = 1;
+ row_pos[c] = center_idx;
+ col_pos[c] = this_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = this_idx + offset;
+ values[c] = lambda * cor_w;
+ c++;
+ }
+ }
+ }
+ ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
+ 2 * width * height, &A);
+ if (ret < 0) goto free_hs_solver_buf;
+ // subtract init mv part from b
+ av1_mtx_vect_multi_left(&A, mv_init_vec, temp_b, 2 * width * height);
+ for (int i = 0; i < 2 * width * height; i++) {
+ b[i] = -temp_b[i];
+ }
+ av1_free_sparse_mtx_elems(&A);
+
+ // add cross terms to A and modify b with ExEt / EyEt
+ for (w = 0; w < width; w++) {
+ for (h = 0; h < height; h++) {
+ int curidx = w * height + h;
+ // modify b
+ b[curidx] += -ix[h * grad_stride + w] * it[h * grad_stride + w];
+ b[curidx + offset] += -iy[h * grad_stride + w] * it[h * grad_stride + w];
+ // add cross terms to A
+ row_pos[c] = curidx;
+ col_pos[c] = curidx + offset;
+ values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w];
+ c++;
+ row_pos[c] = curidx + offset;
+ col_pos[c] = curidx;
+ values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w];
+ c++;
+ }
+ }
+ // Add diagonal terms to A
+ for (int i = 0; i < c; i++) {
+ if (row_pos[i] == col_pos[i]) {
+ if (row_pos[i] < offset) {
+ w = row_pos[i] / height;
+ h = row_pos[i] % height;
+ values[i] += pow(ix[h * grad_stride + w], 2);
+ } else {
+ w = (row_pos[i] - offset) / height;
+ h = (row_pos[i] - offset) % height;
+ values[i] += pow(iy[h * grad_stride + w], 2);
+ }
+ }
+ }
+
+ ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
+ 2 * width * height, &A);
+ if (ret < 0) goto free_hs_solver_buf;
+
+ // solve for the mvs
+ ret = av1_conjugate_gradient_sparse(&A, b, 2 * width * height, mv_vec);
+ if (ret < 0) goto free_hs_solver_buf;
+
+ // copy mvs
+ for (w = 0; w < width; w++) {
+ for (h = 0; h < height; h++) {
+ mvs[h * mv_stride + w].col = mv_vec[w * height + h];
+ mvs[h * mv_stride + w].row = mv_vec[w * height + h + offset];
+ }
+ }
+free_hs_solver_buf:
+ aom_free(row_pos);
+ aom_free(col_pos);
+ aom_free(values);
+ aom_free(mv_vec);
+ aom_free(mv_init_vec);
+ aom_free(b);
+ aom_free(temp_b);
+ av1_free_sparse_mtx_elems(&A);
+}
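+
+// For reference, solve_horn_schunck() above assembles and solves the normal
+// equations of a Horn-Schunck style objective: a data term
+//   sum_p (Ix * u + Iy * v + It)^2
+// plus a lambda-weighted quadratic smoothness penalty on the motion field.
+// The lambda-weighted entries assembled above come from the smoothness term,
+// while the Ix/Iy products form the data term.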
+
+// Calculate optical flow from from_frame to to_frame using the H-S method.
+static void horn_schunck(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame, const int level,
+ const int mv_stride, const int mv_height,
+ const int mv_width, const OPFL_PARAMS *opfl_params,
+ LOCALMV *mvs) {
+  // mvs are always on level 0; here we define two new mv arrays that are
+  // sized for this level.
+ const int fw = from_frame->y_crop_width;
+ const int fh = from_frame->y_crop_height;
+ const int factor = (int)pow(2, level);
+ int w, h, k, init_mv_stride;
+ LOCALMV *init_mvs = NULL, *refine_mvs = NULL;
+ double *ix = NULL, *iy = NULL, *it = NULL;
+ YV12_BUFFER_CONFIG temp_frame;
+ temp_frame.y_buffer = NULL;
+ if (level == 0) {
+ init_mvs = mvs;
+ init_mv_stride = mv_stride;
+ } else {
+ init_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+ if (!init_mvs) goto free_hs_buf;
+ init_mv_stride = fw;
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ init_mvs[h * init_mv_stride + w].row =
+ mvs[h * factor * mv_stride + w * factor].row / (double)factor;
+ init_mvs[h * init_mv_stride + w].col =
+ mvs[h * factor * mv_stride + w * factor].col / (double)factor;
+ }
+ }
+ }
+ refine_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+ if (!refine_mvs) goto free_hs_buf;
+ // temp frame for warping
+ temp_frame.y_buffer =
+ (uint8_t *)aom_calloc(fh * fw, sizeof(*temp_frame.y_buffer));
+ if (!temp_frame.y_buffer) goto free_hs_buf;
+ temp_frame.y_crop_height = fh;
+ temp_frame.y_crop_width = fw;
+ temp_frame.y_stride = fw;
+ // gradient buffers
+ ix = aom_calloc(fw * fh, sizeof(*ix));
+ iy = aom_calloc(fw * fh, sizeof(*iy));
+ it = aom_calloc(fw * fh, sizeof(*it));
+ if (!ix || !iy || !it) goto free_hs_buf;
+ // For each warping step
+ for (k = 0; k < opfl_params->warping_steps; k++) {
+ // warp from_frame with init_mv
+ if (level == 0) {
+ warp_back_frame_intp(&temp_frame, to_frame, init_mvs, init_mv_stride);
+ } else {
+ warp_back_frame(&temp_frame, to_frame, init_mvs, init_mv_stride);
+ }
+ // calculate frame gradients
+ get_frame_gradients(from_frame, &temp_frame, ix, iy, it, fw);
+ // form linear equations and solve mvs
+ solve_horn_schunck(ix, iy, it, fw, fw, fh, init_mvs, init_mv_stride,
+ refine_mvs, fw);
+ // update init_mvs
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ init_mvs[h * init_mv_stride + w].col += refine_mvs[h * fw + w].col;
+ init_mvs[h * init_mv_stride + w].row += refine_mvs[h * fw + w].row;
+ }
+ }
+ }
+ // copy back the mvs if needed
+ if (level != 0) {
+ for (h = 0; h < mv_height; h++) {
+ for (w = 0; w < mv_width; w++) {
+ mvs[h * mv_stride + w].row =
+ init_mvs[h / factor * init_mv_stride + w / factor].row *
+ (double)factor;
+ mvs[h * mv_stride + w].col =
+ init_mvs[h / factor * init_mv_stride + w / factor].col *
+ (double)factor;
+ }
+ }
+ }
+free_hs_buf:
+ if (level != 0) aom_free(init_mvs);
+ aom_free(refine_mvs);
+ aom_free(temp_frame.y_buffer);
+ aom_free(ix);
+ aom_free(iy);
+ aom_free(it);
+}
+
+// Apply optical flow iteratively at each pyramid level
+static void pyramid_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame,
+ const int bit_depth,
+ const OPFL_PARAMS *opfl_params,
+ const OPTFLOW_METHOD method, LOCALMV *mvs) {
+ assert(opfl_params->pyramid_levels > 0 &&
+ opfl_params->pyramid_levels <= MAX_PYRAMID_LEVELS);
+ int levels = opfl_params->pyramid_levels;
+ const int frame_height = from_frame->y_crop_height;
+ const int frame_width = from_frame->y_crop_width;
+  if ((frame_height / pow(2.0, levels - 1) < 50 ||
+       frame_width / pow(2.0, levels - 1) < 50) &&
+ levels > 1)
+ levels = levels - 1;
+ uint8_t *images1[MAX_PYRAMID_LEVELS] = { NULL };
+ uint8_t *images2[MAX_PYRAMID_LEVELS] = { NULL };
+ int *ref_corners = NULL;
+
+ images1[0] = from_frame->y_buffer;
+ images2[0] = to_frame->y_buffer;
+ YV12_BUFFER_CONFIG *buffers1 = aom_malloc(levels * sizeof(*buffers1));
+ YV12_BUFFER_CONFIG *buffers2 = aom_malloc(levels * sizeof(*buffers2));
+ if (!buffers1 || !buffers2) goto free_pyramid_buf;
+ buffers1[0] = *from_frame;
+ buffers2[0] = *to_frame;
+ int fw = frame_width;
+ int fh = frame_height;
+ for (int i = 1; i < levels; i++) {
+ // TODO(bohanli): may need to extend buffers for better interpolation SIMD
+ images1[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images1[i]));
+ images2[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images2[i]));
+ if (!images1[i] || !images2[i]) goto free_pyramid_buf;
+ int stride;
+ if (i == 1)
+ stride = from_frame->y_stride;
+ else
+ stride = fw;
+ reduce(images1[i - 1], fh, fw, stride, images1[i]);
+ reduce(images2[i - 1], fh, fw, stride, images2[i]);
+ fh /= 2;
+ fw /= 2;
+ YV12_BUFFER_CONFIG a = { .y_buffer = images1[i],
+ .y_crop_width = fw,
+ .y_crop_height = fh,
+ .y_stride = fw };
+ YV12_BUFFER_CONFIG b = { .y_buffer = images2[i],
+ .y_crop_width = fw,
+ .y_crop_height = fh,
+ .y_stride = fw };
+ buffers1[i] = a;
+ buffers2[i] = b;
+ }
+  // Compute corners for the source frame
+ int num_ref_corners = 0;
+ if (is_sparse(opfl_params)) {
+ int maxcorners = from_frame->y_crop_width * from_frame->y_crop_height;
+ ref_corners = aom_malloc(maxcorners * 2 * sizeof(*ref_corners));
+ if (!ref_corners) goto free_pyramid_buf;
+ num_ref_corners = detect_corners(from_frame, to_frame, maxcorners,
+ ref_corners, bit_depth);
+ }
+ const int stop_level = 0;
+ for (int i = levels - 1; i >= stop_level; i--) {
+ if (method == LUCAS_KANADE) {
+ assert(is_sparse(opfl_params));
+ lucas_kanade(&buffers1[i], &buffers2[i], i, opfl_params->lk_params,
+ num_ref_corners, ref_corners, buffers1[0].y_crop_width,
+ bit_depth, mvs);
+ } else if (method == HORN_SCHUNCK) {
+ assert(!is_sparse(opfl_params));
+ horn_schunck(&buffers1[i], &buffers2[i], i, buffers1[0].y_crop_width,
+ buffers1[0].y_crop_height, buffers1[0].y_crop_width,
+ opfl_params, mvs);
+ }
+ }
+free_pyramid_buf:
+ for (int i = 1; i < levels; i++) {
+ aom_free(images1[i]);
+ aom_free(images2[i]);
+ }
+ aom_free(ref_corners);
+ aom_free(buffers1);
+ aom_free(buffers2);
+}
+
+// Computes optical flow by applying the algorithm at multiple pyramid levels
+// (lower-resolution, smoothed images). This accounts for larger motions.
+// Inputs:
+//   from_frame: Frame buffer.
+//   to_frame: Frame buffer. MVs point from_frame -> to_frame.
+//   from_frame_idx: Index of from_frame.
+//   to_frame_idx: Index of to_frame. All-zero MVs are returned when the
+//                 indices are equal.
+//   bit_depth: Bit depth of the frames.
+//   opfl_params: Contains algorithm-specific parameters.
+//   mv_filter: MV_FILTER_NONE, MV_FILTER_SMOOTH, or MV_FILTER_MEDIAN.
+//   method: LUCAS_KANADE or HORN_SCHUNCK.
+//   mvs: Pointer to MVs. Contains the initialization and is modified based
+//        on optical flow. Must have dimensions
+//        from_frame->y_crop_width * from_frame->y_crop_height.
+void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame,
+ const int from_frame_idx, const int to_frame_idx,
+ const int bit_depth, const OPFL_PARAMS *opfl_params,
+ const MV_FILTER_TYPE mv_filter,
+ const OPTFLOW_METHOD method, MV *mvs) {
+ const int frame_height = from_frame->y_crop_height;
+ const int frame_width = from_frame->y_crop_width;
+ // TODO(any): deal with the case where frames are not of the same dimensions
+ assert(frame_height == to_frame->y_crop_height &&
+ frame_width == to_frame->y_crop_width);
+ if (from_frame_idx == to_frame_idx) {
+ // immediately return all zero mvs when frame indices are equal
+ for (int yy = 0; yy < frame_height; yy++) {
+ for (int xx = 0; xx < frame_width; xx++) {
+ MV mv = { .row = 0, .col = 0 };
+ mvs[yy * frame_width + xx] = mv;
+ }
+ }
+ return;
+ }
+
+ // Initialize double mvs based on input parameter mvs array
+ LOCALMV *localmvs =
+ aom_malloc(frame_height * frame_width * sizeof(*localmvs));
+ if (!localmvs) return;
+
+ filter_mvs(MV_FILTER_SMOOTH, frame_height, frame_width, localmvs, mvs);
+
+ for (int i = 0; i < frame_width * frame_height; i++) {
+ MV mv = mvs[i];
+ LOCALMV localmv = { .row = ((double)mv.row) / 8,
+ .col = ((double)mv.col) / 8 };
+ localmvs[i] = localmv;
+ }
+ // Apply optical flow algorithm
+ pyramid_optical_flow(from_frame, to_frame, bit_depth, opfl_params, method,
+ localmvs);
+
+ // Update original mvs array
+ for (int j = 0; j < frame_height; j++) {
+ for (int i = 0; i < frame_width; i++) {
+ int idx = j * frame_width + i;
+ if (j + localmvs[idx].row < 0 || j + localmvs[idx].row >= frame_height ||
+ i + localmvs[idx].col < 0 || i + localmvs[idx].col >= frame_width) {
+ continue;
+ }
+ MV mv = { .row = (int16_t)round(8 * localmvs[idx].row),
+ .col = (int16_t)round(8 * localmvs[idx].col) };
+ mvs[idx] = mv;
+ }
+ }
+
+ filter_mvs(mv_filter, frame_height, frame_width, localmvs, mvs);
+
+ aom_free(localmvs);
+}
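+
+// Illustrative usage sketch (not part of the encoder). It assumes
+// av1_init_opfl_params() fills in the defaults declared in optical_flow.h,
+// and that the caller owns a zero-initialized mvs array of
+// y_crop_width * y_crop_height entries:
+#if 0
+static void optical_flow_example(const YV12_BUFFER_CONFIG *a,
+                                 const YV12_BUFFER_CONFIG *b, int a_idx,
+                                 int b_idx, int bit_depth, MV *mvs) {
+  OPFL_PARAMS params;
+  av1_init_opfl_params(&params);
+  av1_optical_flow(a, b, a_idx, b_idx, bit_depth, &params, MV_FILTER_SMOOTH,
+                   HORN_SCHUNCK, mvs);
+}
+#endif  // 0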
+#endif
diff --git a/third_party/aom/av1/encoder/optical_flow.h b/third_party/aom/av1/encoder/optical_flow.h
new file mode 100644
index 0000000000..2fbe474d77
--- /dev/null
+++ b/third_party/aom/av1/encoder/optical_flow.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+#define AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OPTICAL_FLOW_API
+
+typedef enum { LUCAS_KANADE, HORN_SCHUNCK } OPTFLOW_METHOD;
+
+typedef enum {
+ MV_FILTER_NONE,
+ MV_FILTER_SMOOTH,
+ MV_FILTER_MEDIAN
+} MV_FILTER_TYPE;
+
+typedef struct LOCALMV {
+ double row;
+ double col;
+} LOCALMV;
+
+#define MAX_PYRAMID_LEVELS 5
+// default options for optical flow
+#define OPFL_WINDOW_SIZE 15
+#define OPFL_PYRAMID_LEVELS 3 // total levels
+#define OPFL_WARPING_STEPS 3
+
+// parameters specific to Lucas-Kanade
+typedef struct lk_params {
+ int window_size;
+} LK_PARAMS;
+
+// generic structure to contain parameters for all
+// optical flow algorithms
+typedef struct opfl_params {
+ int pyramid_levels;
+ int warping_steps;
+ LK_PARAMS *lk_params;
+ int flags;
+} OPFL_PARAMS;
+
+#define OPFL_FLAG_SPARSE 1
+
+void av1_init_opfl_params(OPFL_PARAMS *opfl_params);
+
+void av1_init_lk_params(LK_PARAMS *lk_params);
+
+void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame,
+ const int from_frame_idx, const int to_frame_idx,
+ const int bit_depth, const OPFL_PARAMS *opfl_params,
+ const MV_FILTER_TYPE mv_filter,
+ const OPTFLOW_METHOD method, MV *mvs);
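+
+// A sparse Lucas-Kanade configuration might be set up as sketched below
+// (illustrative; the init functions above are assumed to fill in the OPFL_*
+// defaults before av1_optical_flow() is called):
+//   OPFL_PARAMS params;
+//   LK_PARAMS lk;
+//   av1_init_opfl_params(&params);
+//   av1_init_lk_params(&lk);
+//   params.lk_params = &lk;
+//   params.flags |= OPFL_FLAG_SPARSE;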
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_OPTICAL_FLOW_H_
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
new file mode 100644
index 0000000000..7f79e9596e
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.c
@@ -0,0 +1,975 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "av1/common/pred_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/tx_search.h"
+
+#define AV1_K_MEANS_DIM 1
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+#define AV1_K_MEANS_DIM 2
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+
+static int int16_comparer(const void *a, const void *b) {
+ return (*(int16_t *)a - *(int16_t *)b);
+}
+
+int av1_remove_duplicates(int16_t *centroids, int num_centroids) {
+ int num_unique; // number of unique centroids
+ int i;
+ qsort(centroids, num_centroids, sizeof(*centroids), int16_comparer);
+ // Remove duplicates.
+ num_unique = 1;
+ for (i = 1; i < num_centroids; ++i) {
+ if (centroids[i] != centroids[i - 1]) { // found a new unique centroid
+ centroids[num_unique++] = centroids[i];
+ }
+ }
+ return num_unique;
+}
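+
+// Minimal sketch of av1_remove_duplicates() on illustrative values:
+#if 0
+static void remove_duplicates_example(void) {
+  int16_t centroids[5] = { 7, 3, 7, 12, 3 };
+  // Sorts to { 3, 3, 7, 7, 12 }, compacts to { 3, 7, 12 } and returns 3.
+  const int n_unique = av1_remove_duplicates(centroids, 5);
+  (void)n_unique;
+}
+#endif  // 0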
+
+static int delta_encode_cost(const int *colors, int num, int bit_depth,
+ int min_val) {
+ if (num <= 0) return 0;
+ int bits_cost = bit_depth;
+ if (num == 1) return bits_cost;
+ bits_cost += 2;
+ int max_delta = 0;
+ int deltas[PALETTE_MAX_SIZE];
+ const int min_bits = bit_depth - 3;
+ for (int i = 1; i < num; ++i) {
+ const int delta = colors[i] - colors[i - 1];
+ deltas[i - 1] = delta;
+ assert(delta >= min_val);
+ if (delta > max_delta) max_delta = delta;
+ }
+ int bits_per_delta = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits);
+ assert(bits_per_delta <= bit_depth);
+ int range = (1 << bit_depth) - colors[0] - min_val;
+ for (int i = 0; i < num - 1; ++i) {
+ bits_cost += bits_per_delta;
+ range -= deltas[i];
+ bits_per_delta = AOMMIN(bits_per_delta, av1_ceil_log2(range));
+ }
+ return bits_cost;
+}
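+
+// Worked example for delta_encode_cost(): colors = { 10, 18, 21 },
+// bit_depth = 8, min_val = 1. The header costs 8 + 2 bits. The deltas are
+// { 8, 3 }, so max_delta = 8 and bits_per_delta =
+// max(ceil_log2(8 + 1 - 1), 8 - 3) = 5. Both deltas then cost 5 bits each
+// (the remaining range never shrinks enough to lower bits_per_delta),
+// giving 8 + 2 + 5 + 5 = 20 bits in total.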
+
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+ const uint16_t *colors, int n_colors,
+ uint8_t *cache_color_found, int *out_cache_colors) {
+ if (n_cache <= 0) {
+ for (int i = 0; i < n_colors; ++i) out_cache_colors[i] = colors[i];
+ return n_colors;
+ }
+ memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found));
+ int n_in_cache = 0;
+ int in_cache_flags[PALETTE_MAX_SIZE];
+ memset(in_cache_flags, 0, sizeof(in_cache_flags));
+ for (int i = 0; i < n_cache && n_in_cache < n_colors; ++i) {
+ for (int j = 0; j < n_colors; ++j) {
+ if (colors[j] == color_cache[i]) {
+ in_cache_flags[j] = 1;
+ cache_color_found[i] = 1;
+ ++n_in_cache;
+ break;
+ }
+ }
+ }
+ int j = 0;
+ for (int i = 0; i < n_colors; ++i)
+ if (!in_cache_flags[i]) out_cache_colors[j++] = colors[i];
+ assert(j == n_colors - n_in_cache);
+ return j;
+}
+
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count,
+ int *min_bits) {
+ const int n = pmi->palette_size[1];
+ const int max_val = 1 << bit_depth;
+ int max_d = 0;
+ *min_bits = bit_depth - 4;
+ *zero_count = 0;
+ for (int i = 1; i < n; ++i) {
+ const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] -
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1];
+ const int v = abs(delta);
+ const int d = AOMMIN(v, max_val - v);
+ if (d > max_d) max_d = d;
+ if (d == 0) ++(*zero_count);
+ }
+ return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits);
+}
+
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+ const uint16_t *color_cache, int n_cache,
+ int bit_depth) {
+ const int n = pmi->palette_size[0];
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache =
+ av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n,
+ cache_color_found, out_cache_colors);
+ const int total_bits =
+ n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1);
+ return av1_cost_literal(total_bits);
+}
+
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ const uint16_t *color_cache, int n_cache,
+ int bit_depth) {
+ const int n = pmi->palette_size[1];
+ int total_bits = 0;
+ // U channel palette color cost.
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache = av1_index_color_cache(
+ color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, n,
+ cache_color_found, out_cache_colors);
+ total_bits +=
+ n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 0);
+
+ // V channel palette color cost.
+ int zero_count = 0, min_bits_v = 0;
+ const int bits_v =
+ av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+ const int bits_using_delta =
+ 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+ const int bits_using_raw = bit_depth * n;
+ total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw);
+ return av1_cost_literal(total_bits);
+}
+
+// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
+// new_height'. Extra rows and columns are filled in by copying last valid
+// row/column.
+static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
+ int orig_width, int orig_height,
+ int new_width, int new_height) {
+ int j;
+ assert(new_width >= orig_width);
+ assert(new_height >= orig_height);
+ if (new_width == orig_width && new_height == orig_height) return;
+
+ for (j = orig_height - 1; j >= 0; --j) {
+ memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
+ // Copy last column to extra columns.
+ memset(color_map + j * new_width + orig_width,
+ color_map[j * new_width + orig_width - 1], new_width - orig_width);
+ }
+ // Copy last row to extra rows.
+ for (j = orig_height; j < new_height; ++j) {
+ memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
+ new_width);
+ }
+}
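+
+// For example, extending the 2x2 map { 0, 1; 1, 0 } to 3x3 replicates the
+// last column and then the last row: { 0, 1, 1; 1, 0, 0; 1, 0, 0 }.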
+
+// Bias toward using colors in the cache.
+// TODO(huisu): Try other schemes to improve compression.
+static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
+ int n_cache, int n_colors,
+ int stride, int16_t *centroids,
+ int bit_depth) {
+ if (n_cache <= 0) return;
+ for (int i = 0; i < n_colors * stride; i += stride) {
+ int min_diff = abs((int)centroids[i] - (int)color_cache[0]);
+ int idx = 0;
+ for (int j = 1; j < n_cache; ++j) {
+ const int this_diff = abs((int)centroids[i] - (int)color_cache[j]);
+ if (this_diff < min_diff) {
+ min_diff = this_diff;
+ idx = j;
+ }
+ }
+ const int min_threshold = 4 << (bit_depth - 8);
+ if (min_diff <= min_threshold) centroids[i] = color_cache[idx];
+ }
+}
+
+/*!\brief Calculate the luma palette cost from a given color palette
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * Given the base colors as specified in centroids[], calculate the RD cost
+ * of palette mode.
+ */
+static AOM_INLINE void palette_rd_y(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int16_t *centroids,
+ int n, uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating,
+ MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+ int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable,
+ int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
+ uint8_t *tx_type_map, int *beat_best_palette_rd,
+ bool *do_header_rd_based_breakout, int discount_color_cost) {
+ if (do_header_rd_based_breakout != NULL) *do_header_rd_based_breakout = false;
+ optimize_palette_colors(color_cache, n_cache, n, 1, centroids,
+ cpi->common.seq_params->bit_depth);
+ const int num_unique_colors = av1_remove_duplicates(centroids, n);
+ if (num_unique_colors < PALETTE_MIN_SIZE) {
+    // Too few unique colors to create a palette, and DC_PRED will work well
+    // for that case anyway, so skip.
+ return;
+ }
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ if (cpi->common.seq_params->use_highbitdepth) {
+ for (int i = 0; i < num_unique_colors; ++i) {
+ pmi->palette_colors[i] = clip_pixel_highbd(
+ (int)centroids[i], cpi->common.seq_params->bit_depth);
+ }
+ } else {
+ for (int i = 0; i < num_unique_colors; ++i) {
+ pmi->palette_colors[i] = clip_pixel(centroids[i]);
+ }
+ }
+ pmi->palette_size[0] = num_unique_colors;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+ av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors,
+ 1);
+ extend_palette_color_map(color_map, cols, rows, block_width, block_height);
+
+ RD_STATS tokenonly_rd_stats;
+ int this_rate;
+
+ if (do_header_rd_based_gating) {
+ assert(do_header_rd_based_breakout != NULL);
+ const int palette_mode_rate = intra_mode_info_cost_y(
+ cpi, x, mbmi, bsize, dc_mode_cost, discount_color_cost);
+ const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+ // Less aggressive pruning when prune_luma_palette_size_search_level == 1.
+ const int header_rd_shift =
+ (cpi->sf.intra_sf.prune_luma_palette_size_search_level == 1) ? 1 : 0;
+ // Terminate further palette_size search, if the header cost corresponding
+ // to lower palette_size is more than *best_rd << header_rd_shift. This
+ // logic is implemented with a right shift in the LHS to prevent a possible
+ // overflow with the left shift in RHS.
+ if ((header_rd >> header_rd_shift) > *best_rd) {
+ *do_header_rd_based_breakout = true;
+ return;
+ }
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) return;
+ this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) return;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost,
+ discount_color_cost);
+ }
+
+ int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+ tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
+ this_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+    // Set the beat_best_rd flag because the current mode rd is better than
+    // best_rd. This flag needs to be updated only for palette evaluation in
+    // key frames.
+ if (beat_best_rd) *beat_best_rd = 1;
+ memcpy(best_palette_color_map, color_map,
+ block_width * block_height * sizeof(color_map[0]));
+ *best_mbmi = *mbmi;
+ memcpy(blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ if (rate) *rate = this_rate;
+ if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+ if (distortion) *distortion = tokenonly_rd_stats.dist;
+ if (skippable) *skippable = tokenonly_rd_stats.skip_txfm;
+ if (beat_best_palette_rd) *beat_best_palette_rd = 1;
+ }
+}
+
+static AOM_INLINE int is_iter_over(int curr_idx, int end_idx, int step_size) {
+ assert(step_size != 0);
+ return (step_size > 0) ? curr_idx >= end_idx : curr_idx <= end_idx;
+}
+
+// Performs count-based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last number searched in last_n_searched and
+// returns the best number of colors found.
+static AOM_INLINE int perform_top_color_palette_search(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data,
+ int16_t *top_colors, int start_n, int end_n, int step_size,
+ bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+ int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+ int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+ uint8_t *best_blk_skip, uint8_t *tx_type_map, int discount_color_cost) {
+ int16_t centroids[PALETTE_MAX_SIZE];
+ int n = start_n;
+ int top_color_winner = end_n;
+ /* clang-format off */
+ assert(IMPLIES(step_size < 0, start_n > end_n));
+ /* clang-format on */
+ assert(IMPLIES(step_size > 0, start_n < end_n));
+ while (!is_iter_over(n, end_n, step_size)) {
+ int beat_best_palette_rd = 0;
+ bool do_header_rd_based_breakout = false;
+ memcpy(centroids, top_colors, n * sizeof(top_colors[0]));
+ palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+ color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+ best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, &beat_best_palette_rd,
+ &do_header_rd_based_breakout, discount_color_cost);
+ *last_n_searched = n;
+ if (do_header_rd_based_breakout) {
+ // Terminate palette_size search by setting last_n_searched to end_n.
+ *last_n_searched = end_n;
+ break;
+ }
+ if (beat_best_palette_rd) {
+ top_color_winner = n;
+ } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+ // At search level 2, we return immediately if we don't see an improvement
+ return top_color_winner;
+ }
+ n += step_size;
+ }
+ return top_color_winner;
+}
+
+// Performs k-means based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last number searched in last_n_searched and
+// returns the best number of colors found.
+static AOM_INLINE int perform_k_means_palette_search(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int lower_bound,
+ int upper_bound, int start_n, int end_n, int step_size,
+ bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+ int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+ int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+ uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+ int data_points, int discount_color_cost) {
+ int16_t centroids[PALETTE_MAX_SIZE];
+ const int max_itr = 50;
+ int n = start_n;
+ int top_color_winner = end_n;
+ /* clang-format off */
+ assert(IMPLIES(step_size < 0, start_n > end_n));
+ /* clang-format on */
+ assert(IMPLIES(step_size > 0, start_n < end_n));
+ while (!is_iter_over(n, end_n, step_size)) {
+ int beat_best_palette_rd = 0;
+ bool do_header_rd_based_breakout = false;
+ for (int i = 0; i < n; ++i) {
+ centroids[i] =
+ lower_bound + (2 * i + 1) * (upper_bound - lower_bound) / n / 2;
+ }
+ av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+ palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+ color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+ best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, &beat_best_palette_rd,
+ &do_header_rd_based_breakout, discount_color_cost);
+ *last_n_searched = n;
+ if (do_header_rd_based_breakout) {
+ // Terminate palette_size search by setting last_n_searched to end_n.
+ *last_n_searched = end_n;
+ break;
+ }
+ if (beat_best_palette_rd) {
+ top_color_winner = n;
+ } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+ // At search level 2, we return immediately if we don't see an improvement
+ return top_color_winner;
+ }
+ n += step_size;
+ }
+ return top_color_winner;
+}
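+
+// The centroid initialization above places the n seeds at the midpoints of n
+// equal sub-intervals of [lower_bound, upper_bound]; e.g. with bounds 0 and
+// 255 and n = 4, the seeds (in integer arithmetic) are 31, 95, 159 and 223.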
+
+// Sets the parameters to search the current number of colors +- 1
+static AOM_INLINE void set_stage2_params(int *min_n, int *max_n, int *step_size,
+ int winner, int end_n) {
+ // Set min to winner - 1 unless we are already at the border, then we set it
+ // to winner + 1
+ *min_n = (winner == PALETTE_MIN_SIZE) ? (PALETTE_MIN_SIZE + 1)
+ : AOMMAX(winner - 1, PALETTE_MIN_SIZE);
+ // Set max to winner + 1 unless we are already at the border, then we set it
+ // to winner - 1
+ *max_n =
+ (winner == end_n) ? (winner - 1) : AOMMIN(winner + 1, PALETTE_MAX_SIZE);
+
+ // Set the step size to max_n - min_n so we only search those two values.
+ // If max_n == min_n, then set step_size to 1 to avoid infinite loop later.
+ *step_size = AOMMAX(1, *max_n - *min_n);
+}
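+
+// For example, with winner = 5 and end_n = 8, set_stage2_params() yields
+// min_n = 4, max_n = 6 and step_size = 2, so the stage-2 search evaluates
+// exactly the two neighbors 4 and 6 (callers pass max_n + 1 as the exclusive
+// end of the interval).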
+
+static AOM_INLINE void fill_data_and_get_bounds(const uint8_t *src,
+ const int src_stride,
+ const int rows, const int cols,
+ const int is_high_bitdepth,
+ int16_t *data, int *lower_bound,
+ int *upper_bound) {
+ if (is_high_bitdepth) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+ *lower_bound = *upper_bound = src_ptr[0];
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ const int val = src_ptr[c];
+ data[c] = (int16_t)val;
+ *lower_bound = AOMMIN(*lower_bound, val);
+ *upper_bound = AOMMAX(*upper_bound, val);
+ }
+ src_ptr += src_stride;
+ data += cols;
+ }
+ return;
+ }
+
+ // low bit depth
+ *lower_bound = *upper_bound = src[0];
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ const int val = src[c];
+ data[c] = (int16_t)val;
+ *lower_bound = AOMMIN(*lower_bound, val);
+ *upper_bound = AOMMAX(*upper_bound, val);
+ }
+ src += src_stride;
+ data += cols;
+ }
+}
+
+/*! \brief Colors are sorted by their count: the higher the better.
+ */
+struct ColorCount {
+ //! Color index in the histogram.
+ int index;
+ //! Histogram count.
+ int count;
+};
+
+int color_count_comp(const void *c1, const void *c2) {
+ const struct ColorCount *color_count1 = (const struct ColorCount *)c1;
+ const struct ColorCount *color_count2 = (const struct ColorCount *)c2;
+ if (color_count1->count > color_count2->count) return -1;
+ if (color_count1->count < color_count2->count) return 1;
+ if (color_count1->index < color_count2->index) return -1;
+ return 1;
+}
+
+static void find_top_colors(const int *const count_buf, int bit_depth,
+ int n_colors, int16_t *top_colors) {
+ // Top color array, serving as a priority queue if more than n_colors are
+ // found.
+ struct ColorCount top_color_counts[PALETTE_MAX_SIZE] = { { 0 } };
+ int n_color_count = 0;
+ for (int i = 0; i < (1 << bit_depth); ++i) {
+ if (count_buf[i] > 0) {
+ if (n_color_count < n_colors) {
+ // Keep adding to the top colors.
+ top_color_counts[n_color_count].index = i;
+ top_color_counts[n_color_count].count = count_buf[i];
+ ++n_color_count;
+ if (n_color_count == n_colors) {
+ qsort(top_color_counts, n_colors, sizeof(top_color_counts[0]),
+ color_count_comp);
+ }
+ } else {
+ // Check the worst in the sorted top.
+ if (count_buf[i] > top_color_counts[n_colors - 1].count) {
+ int j = n_colors - 1;
+ // Move up to the best one.
+ while (j >= 1 && count_buf[i] > top_color_counts[j - 1].count) --j;
+ memmove(top_color_counts + j + 1, top_color_counts + j,
+ (n_colors - j - 1) * sizeof(top_color_counts[0]));
+ top_color_counts[j].index = i;
+ top_color_counts[j].count = count_buf[i];
+ }
+ }
+ }
+ }
+ assert(n_color_count == n_colors);
+
+ for (int i = 0; i < n_colors; ++i) {
+ top_colors[i] = top_color_counts[i].index;
+ }
+}
+
+void av1_rd_pick_palette_intra_sby(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost,
+ MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+ int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable,
+ int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+ uint8_t *tx_type_map) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ bsize));
+ assert(PALETTE_MAX_SIZE == 8);
+ assert(PALETTE_MIN_SIZE == 2);
+
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *const src = x->plane[0].src.buf;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+ const SequenceHeader *const seq_params = cpi->common.seq_params;
+ const int is_hbd = seq_params->use_highbitdepth;
+ const int bit_depth = seq_params->bit_depth;
+ const int discount_color_cost = cpi->sf.rt_sf.use_nonrd_pick_mode;
+ int unused;
+
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ int colors, colors_threshold = 0;
+ if (is_hbd) {
+ int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path.
+ av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth, count_buf,
+ count_buf_8bit, &colors_threshold, &colors);
+ } else {
+ av1_count_colors(src, src_stride, rows, cols, count_buf, &colors);
+ colors_threshold = colors;
+ }
+
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ int color_thresh_palette = 64;
+ // Allow for larger color_threshold for palette search, based on color,
+ // scene_change, and block source variance.
+ // Since palette is Y based, only allow larger threshold if block
+ // color_dist is below threshold.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ cpi->sf.rt_sf.increase_color_thresh_palette && cpi->rc.high_source_sad &&
+ x->source_variance > 50) {
+ int64_t norm_color_dist = 0;
+ if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+ norm_color_dist = x->min_dist_inter_uv >>
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ if (x->color_sensitivity[0] && x->color_sensitivity[1])
+ norm_color_dist = norm_color_dist >> 1;
+ }
+ if (norm_color_dist < 8000) color_thresh_palette += 20;
+ }
+ if (colors_threshold > 1 && colors_threshold <= color_thresh_palette) {
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;
+ int16_t centroids[PALETTE_MAX_SIZE];
+ int lower_bound, upper_bound;
+ fill_data_and_get_bounds(src, src_stride, rows, cols, is_hbd, data,
+ &lower_bound, &upper_bound);
+
+ mbmi->mode = DC_PRED;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+ // Find the dominant colors, stored in top_colors[].
+ int16_t top_colors[PALETTE_MAX_SIZE] = { 0 };
+ find_top_colors(count_buf, bit_depth, AOMMIN(colors, PALETTE_MAX_SIZE),
+ top_colors);
+
+ // The following are the approaches used for header rdcost based gating
+ // for early termination for different values of prune_palette_search_level.
+ // 0: Pruning based on header rdcost for ascending order palette_size
+ // search.
+ // 1: When colors > PALETTE_MIN_SIZE, enabled only for coarse palette_size
+ // search and for finer search do_header_rd_based_gating parameter is
+ // explicitly passed as 'false'.
+ // 2: Enabled only for ascending order palette_size search and for
+ // descending order search do_header_rd_based_gating parameter is explicitly
+ // passed as 'false'.
+ const bool do_header_rd_based_gating =
+ cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
+
+ // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+ // where the dominant colors and the k-means results are similar.
+ if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
+ (colors > PALETTE_MIN_SIZE)) {
+ // Start index and step size below are chosen to evaluate unique
+ // candidates in neighbor search, in case a winner candidate is found in
+ // coarse search. Example,
+ // 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step
+ // size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8.
+ // If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2
+ // (3) and 8 (7).
+ // 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same
+ // as for 8 colors) then step size should also be 2, to cover all
+ // candidates. Coarse search will evaluate 2, 4 and 6. If winner is either
+ // 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3,
+ // coarse search will evaluate 3 and 6. For the winner, unique neighbors
+ // (3: 2,4 or 6: 5,7) would be evaluated.
+
+ // Start index for coarse palette search for dominant colors and k-means
+ const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+ 3, 3, 2,
+ 3, 3, 2 };
+ // Step size for coarse palette search for dominant colors and k-means
+ const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+ 3, 3, 3,
+ 3, 3, 3 };
+
+ // Choose the start index and step size for coarse search based on number
+ // of colors
+ const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE);
+ const int min_n = start_n_lookup_table[max_n];
+ const int step_size = step_size_lookup_table[max_n];
+ assert(min_n >= PALETTE_MIN_SIZE);
+ // Perform top color coarse palette search to find the winner candidate
+ const int top_color_winner = perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
+ step_size, do_header_rd_based_gating, &unused, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+ discount_color_cost);
+ // Evaluate neighbors for the winner color (if winner is found) in the
+ // above coarse search for dominant colors
+ if (top_color_winner <= max_n) {
+ int stage2_min_n, stage2_max_n, stage2_step_size;
+ set_stage2_params(&stage2_min_n, &stage2_max_n, &stage2_step_size,
+ top_color_winner, max_n);
+ // perform finer search for the winner candidate
+ perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n,
+ stage2_max_n + 1, stage2_step_size,
+ /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, discount_color_cost);
+ }
+ // K-means clustering.
+ // Perform k-means coarse palette search to find the winner candidate
+ const int k_means_winner = perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ min_n, max_n + 1, step_size, do_header_rd_based_gating, &unused,
+ color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+ rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, color_map, rows * cols,
+ discount_color_cost);
+ // Evaluate neighbors for the winner color (if winner is found) in the
+ // above coarse search for k-means
+ if (k_means_winner <= max_n) {
+ int start_n_stage2, end_n_stage2, step_size_stage2;
+ set_stage2_params(&start_n_stage2, &end_n_stage2, &step_size_stage2,
+ k_means_winner, max_n);
+ // perform finer search for the winner candidate
+ perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ start_n_stage2, end_n_stage2 + 1, step_size_stage2,
+ /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, color_map, rows * cols, discount_color_cost);
+ }
+ } else {
+ const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE),
+ min_n = PALETTE_MIN_SIZE;
+ // Perform top color palette search in ascending order
+ int last_n_searched = min_n;
+ perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
+ 1, do_header_rd_based_gating, &last_n_searched, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+ discount_color_cost);
+ if (last_n_searched < max_n) {
+ // Search in descending order until we get to the previous best
+ perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n,
+ last_n_searched, -1, /*do_header_rd_based_gating=*/false, &unused,
+ color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+ rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, discount_color_cost);
+ }
+ // K-means clustering.
+ if (colors == PALETTE_MIN_SIZE) {
+ // Special case: These colors automatically become the centroids.
+ assert(colors == 2);
+ centroids[0] = lower_bound;
+ centroids[1] = upper_bound;
+ palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
+ color_cache, n_cache, /*do_header_rd_based_gating=*/false,
+ best_mbmi, best_palette_color_map, best_rd, rate,
+ rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, NULL, NULL,
+ discount_color_cost);
+ } else {
+ // Perform k-means palette search in ascending order
+ last_n_searched = min_n;
+ perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ min_n, max_n + 1, 1, do_header_rd_based_gating, &last_n_searched,
+ color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+ rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, color_map, rows * cols,
+ discount_color_cost);
+ if (last_n_searched < max_n) {
+ // Search in descending order until we get to the previous best
+ perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ max_n, last_n_searched, -1, /*do_header_rd_based_gating=*/false,
+ &unused, color_cache, n_cache, best_mbmi, best_palette_color_map,
+ best_rd, rate, rate_tokenonly, distortion, skippable,
+ beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
+ rows * cols, discount_color_cost);
+ }
+ }
+ }
+ }
+
+ if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+ memcpy(color_map, best_palette_color_map,
+ block_width * block_height * sizeof(best_palette_color_map[0]));
+ // Gather the stats to determine whether to use screen content tools in
+ // function av1_determine_sc_tools_with_encoding().
+ x->palette_pixels += (block_width * block_height);
+ }
+ *mbmi = *best_mbmi;
+}
+
+void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x,
+ int dc_mode_cost,
+ uint8_t *best_palette_color_map,
+ MB_MODE_INFO *const best_mbmi,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mbmi->bsize));
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const SequenceHeader *const seq_params = cpi->common.seq_params;
+ int this_rate;
+ int64_t this_rd;
+ int colors_u, colors_v;
+ int colors_threshold_u = 0, colors_threshold_v = 0, colors_threshold = 0;
+ const int src_stride = x->plane[1].src.stride;
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ uint8_t *const color_map = xd->plane[1].color_index_map;
+ RD_STATS tokenonly_rd_stats;
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+
+ mbmi->uv_mode = UV_DC_PRED;
+ if (seq_params->use_highbitdepth) {
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path.
+ av1_count_colors_highbd(src_u, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf, count_buf_8bit,
+ &colors_threshold_u, &colors_u);
+ av1_count_colors_highbd(src_v, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf, count_buf_8bit,
+ &colors_threshold_v, &colors_v);
+ } else {
+ int count_buf[1 << 8];
+ av1_count_colors(src_u, src_stride, rows, cols, count_buf, &colors_u);
+ av1_count_colors(src_v, src_stride, rows, cols, count_buf, &colors_v);
+ colors_threshold_u = colors_u;
+ colors_threshold_v = colors_v;
+ }
+
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+
+ colors_threshold = colors_threshold_u > colors_threshold_v
+ ? colors_threshold_u
+ : colors_threshold_v;
+ if (colors_threshold > 1 && colors_threshold <= 64) {
+ int r, c, n, i, j;
+ const int max_itr = 50;
+ int lb_u, ub_u, val_u;
+ int lb_v, ub_v, val_v;
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;
+ int16_t centroids[2 * PALETTE_MAX_SIZE];
+
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+ uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+ if (seq_params->use_highbitdepth) {
+ lb_u = src_u16[0];
+ ub_u = src_u16[0];
+ lb_v = src_v16[0];
+ ub_v = src_v16[0];
+ } else {
+ lb_u = src_u[0];
+ ub_u = src_u[0];
+ lb_v = src_v[0];
+ ub_v = src_v[0];
+ }
+
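+    // Pack the chroma samples interleaved as (U, V) pairs so that each 2-D
+    // k-means data point carries both channels, while tracking per-channel
+    // value ranges for centroid initialization.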
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ if (seq_params->use_highbitdepth) {
+ val_u = src_u16[r * src_stride + c];
+ val_v = src_v16[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ } else {
+ val_u = src_u[r * src_stride + c];
+ val_v = src_v[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ }
+ if (val_u < lb_u)
+ lb_u = val_u;
+ else if (val_u > ub_u)
+ ub_u = val_u;
+ if (val_v < lb_v)
+ lb_v = val_v;
+ else if (val_v > ub_v)
+ ub_v = val_v;
+ }
+ }
+
+ const int colors = colors_u > colors_v ? colors_u : colors_v;
+ const int max_colors =
+ colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
+ for (n = PALETTE_MIN_SIZE; n <= max_colors; ++n) {
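+      // Seed the i-th centroid at the midpoint of the i-th of n equal
+      // sub-intervals of [lb, ub] in each channel, then refine with k-means.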
+ for (i = 0; i < n; ++i) {
+ centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+ centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+ }
+ av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+ optimize_palette_colors(color_cache, n_cache, n, 2, centroids,
+ cpi->common.seq_params->bit_depth);
+ // Sort the U channel colors in ascending order.
+ for (i = 0; i < 2 * (n - 1); i += 2) {
+ int min_idx = i;
+ int min_val = centroids[i];
+ for (j = i + 2; j < 2 * n; j += 2)
+ if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+ if (min_idx != i) {
+ int temp_u = centroids[i], temp_v = centroids[i + 1];
+ centroids[i] = centroids[min_idx];
+ centroids[i + 1] = centroids[min_idx + 1];
+ centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+ }
+ }
+ av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+ pmi->palette_size[1] = n;
+ for (i = 1; i < 3; ++i) {
+ for (j = 0; j < n; ++j) {
+ if (seq_params->use_highbitdepth)
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+ (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
+ else
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+ clip_pixel((int)centroids[j * 2 + i - 1]);
+ }
+ }
+
+ if (cpi->sf.intra_sf.early_term_chroma_palette_size_search) {
+ const int palette_mode_rate =
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+ const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+        // Terminate further palette_size search if the header cost at the
+        // current palette_size already exceeds *best_rd, since larger
+        // palette sizes can only increase the header cost.
+ if (header_rd >= *best_rd) break;
+ av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+ } else {
+ av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+ }
+
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_mbmi = *mbmi;
+ memcpy(best_palette_color_map, color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ *rate = this_rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *skippable = tokenonly_rd_stats.skip_txfm;
+ }
+ }
+ }
+ if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+ memcpy(color_map, best_palette_color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ }
+}
+
+void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ int src_stride = x->plane[1].src.stride;
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;
+ int16_t centroids[2 * PALETTE_MAX_SIZE];
+ uint8_t *const color_map = xd->plane[1].color_index_map;
+ int r, c;
+ const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+ const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ if (cpi->common.seq_params->use_highbitdepth) {
+ data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+ data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+ } else {
+ data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+ data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+ }
+ }
+ }
+
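+  // Rebuild the 2-D centroids from the stored palette: plane r (1 = U,
+  // 2 = V) supplies component r - 1 of each (U, V) centroid.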
+ for (r = 1; r < 3; ++r) {
+ for (c = 0; c < pmi->palette_size[1]; ++c) {
+ centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+ }
+ }
+
+ av1_calc_indices(data, centroids, color_map, rows * cols,
+ pmi->palette_size[1], 2);
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+}
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
new file mode 100644
index 0000000000..7da863a0cc
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares functions used in palette search.
+ */
+#ifndef AOM_AV1_ENCODER_PALETTE_H_
+#define AOM_AV1_ENCODER_PALETTE_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct PICK_MODE_CONTEXT;
+struct macroblock;
+
+/*!\cond */
+#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c
+
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k,
+ int max_itr);
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k,
+ int max_itr);
+/*!\endcond */
+
+/*!\brief Calculates the cluster to which each data point belongs.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] data The data points whose cluster indices are
+ * to be computed. The data layout is
+ * NUM_DATA_POINTS X DATA_DIM.
+ * \param[in] centroids Pointer to the centroids. The data layout
+ * is NUM_CENTROIDS X DATA_DIM.
+ * \param[in] indices Pointer to store the computed indices.
+ * \param[in] n Number of data points.
+ * \param[in] k Number of clusters.
+ * \param[in] dim Data dimension.
+ *
+ * \remark Returns nothing, but saves each data point's cluster index in
+ * \a indices.
+ */
+static INLINE void av1_calc_indices(const int16_t *data,
+ const int16_t *centroids, uint8_t *indices,
+ int n, int k, int dim) {
+ assert(n > 0);
+ assert(k > 0);
+ if (dim == 1) {
+ av1_calc_indices_dim1(data, centroids, indices, /*total_dist=*/NULL, n, k);
+ } else if (dim == 2) {
+ av1_calc_indices_dim2(data, centroids, indices, /*total_dist=*/NULL, n, k);
+ } else {
+ assert(0 && "Untemplated k means dimension");
+ }
+}
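+
+/* Usage sketch (illustrative only, not part of the library): assign four
+ * interleaved (U, V) points to two 2-D centroids, using the
+ * NUM_DATA_POINTS X DATA_DIM layout documented above. All values below are
+ * arbitrary.
+ *
+ *   const int16_t data[4 * 2] = { 10, 200, 12, 198, 90, 40, 92, 38 };
+ *   const int16_t centroids[2 * 2] = { 11, 199, 91, 39 };
+ *   uint8_t indices[4];
+ *   av1_calc_indices(data, centroids, indices, 4, 2, 2);  // dim = 2
+ *   // indices is now { 0, 0, 1, 1 }.
+ */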
+
+/*!\brief Performs k-means cluster on the data.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] data The data points to be clustered. The data
+ * layout is NUM_DATA_POINTS X DATA_DIM.
+ * \param[in] centroids Pointer to store the computed centroids.
+ * The data layout is
+ * NUM_CENTROIDS X DATA_DIM.
+ * \param[in]    indices            Pointer to store the computed indices, one
+ *                                  per training data point.
+ * \param[in] n Number of data points.
+ * \param[in] k Number of clusters.
+ * \param[in] dim Data dimension.
+ * \param[in] max_itr Maximum number of iterations to run.
+ *
+ * \remark Returns nothing, but saves each cluster's centroid in \a centroids
+ * and each data point's cluster index in \a indices.
+ *
+ * \attention The output centroids are rounded off to the nearest integers.
+ */
+static INLINE void av1_k_means(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k, int dim,
+ int max_itr) {
+ assert(n > 0);
+ assert(k > 0);
+ if (dim == 1) {
+ AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k, max_itr);
+ } else if (dim == 2) {
+ AV1_K_MEANS_RENAME(av1_k_means, 2)(data, centroids, indices, n, k, max_itr);
+ } else {
+ assert(0 && "Untemplated k means dimension");
+ }
+}
+
+/*!\brief Removes duplicated centroid indices.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    centroids          The list of centroids.
+ * \param[in] num_centroids Number of centroids.
+ *
+ * \return Returns the number of unique centroids and saves the unique
+ * centroids at the beginning of the centroids array.
+ *
+ * \attention The centroids should be rounded to integers before calling this
+ * method.
+ */
+int av1_remove_duplicates(int16_t *centroids, int num_centroids);
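+
+/* Usage sketch (illustrative only, not part of the library): k-means output
+ * centroids are already rounded to integers (see the \attention note above),
+ * so nearby clusters can collapse to the same value; a palette search
+ * compacts them before costing. The sample values and k are arbitrary.
+ *
+ *   int16_t data[8] = { 4, 5, 4, 60, 61, 60, 5, 61 };
+ *   int16_t centroids[4];
+ *   uint8_t indices[8];
+ *   for (int i = 0; i < 4; ++i)  // even spread over [4, 61], as in palette.c
+ *     centroids[i] = 4 + (2 * i + 1) * (61 - 4) / 4 / 2;
+ *   av1_k_means(data, centroids, indices, 8, 4, 1, 50);  // dim = 1
+ *   const int n_unique = av1_remove_duplicates(centroids, 4);
+ */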
+
+/*!\brief Checks what colors are in the color cache.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] color_cache A cache of colors.
+ * \param[in] n_cache Number of colors in the cache.
+ * \param[in] colors New base colors.
+ * \param[in] n_colors Number of new colors.
+ * \param[in]    cache_color_found  Stores whether each cached color is
+ *                                  present in colors.
+ * \param[in] out_cache_colors Stores what colors are not in the cache.
+ *
+ * \return Returns the number of colors that are not in the cache. In addition,
+ * records whether each cache color is present in colors in cache_color_found,
+ * and stores the out-of-cache colors in out_cache_colors.
+ */
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+ const uint16_t *colors, int n_colors,
+ uint8_t *cache_color_found, int *out_cache_colors);
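+
+/* Worked example of the contract above (illustrative only): with
+ * color_cache = { 10, 20 }, n_cache = 2, colors = { 10, 30 } and
+ * n_colors = 2, the call returns 1, sets cache_color_found to { 1, 0 }
+ * (only 10 was matched) and writes out_cache_colors = { 30 }.
+ */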
+
+/*!\brief Gets the rate cost for delta encoding each v palette color.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] pmi Struct that stores the palette mode info.
+ * \param[in] bit_depth Pixel bitdepth of the sequence.
+ * \param[in] zero_count Stores the number of zero deltas.
+ * \param[in]    min_bits           Minimum bits for the deltas. Set to
+ * bit_depth - 4.
+ *
+ * \return Returns the number of bits used to transmit each v palette color
+ * delta and sets zero_count to the number of zero deltas.
+ */
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count, int *min_bits);
+
+/*!\brief Gets the rate cost for transmitting luma palette color values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] pmi Struct that stores the palette mode info.
+ * \param[in]    color_cache        Color cache available at the decoder.
+ * \param[in] n_cache Number of colors in the cache.
+ * \param[in] bit_depth Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this does
+ * not include the cost of transmitting the color map.
+ */
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+ const uint16_t *color_cache, int n_cache,
+ int bit_depth);
+
+/*!\brief Gets the rate cost for transmitting chroma palette color values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] pmi Struct that stores the palette mode info.
+ * \param[in]    color_cache        Color cache available at the decoder.
+ * \param[in] n_cache Number of colors in the cache.
+ * \param[in] bit_depth Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this does
+ * not include the cost of transmitting the color map.
+ */
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ const uint16_t *color_cache, int n_cache,
+ int bit_depth);
+
+/*!\brief Search for the best palette in the luma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sby(
+ const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize,
+ int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+ int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx,
+ uint8_t *best_blk_skip, uint8_t *tx_type_map);
+
+/*!\brief Search for the best palette in the chroma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sbuv(const struct AV1_COMP *cpi,
+ struct macroblock *x, int dc_mode_cost,
+ uint8_t *best_palette_color_map,
+ MB_MODE_INFO *const best_mbmi,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable);
+
+/*!\brief Restores the palette color map for the chroma channels.
+ */
+void av1_restore_uv_color_map(const struct AV1_COMP *cpi, struct macroblock *x);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/third_party/aom/av1/encoder/partition_cnn_weights.h b/third_party/aom/av1/encoder/partition_cnn_weights.h
new file mode 100644
index 0000000000..504038c63a
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_cnn_weights.h
@@ -0,0 +1,2139 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/ml.h"
+
+#define CNN_BRANCH_0_OUT_CH 20
+#define CNN_BRANCH_1_OUT_CH 4
+#define CNN_BRANCH_2_OUT_CH 20
+#define CNN_BRANCH_3_OUT_CH 20
+#define CNN_TOT_OUT_CH \
+ (((CNN_BRANCH_0_OUT_CH) + (CNN_BRANCH_1_OUT_CH) + (CNN_BRANCH_2_OUT_CH) + \
+ (CNN_BRANCH_3_OUT_CH)))
+#define CNN_BRANCH_0_OUT_SIZE (CNN_BRANCH_0_OUT_CH)
+#define CNN_BRANCH_1_OUT_SIZE ((CNN_BRANCH_1_OUT_CH)*2 * 2)
+#define CNN_BRANCH_2_OUT_SIZE ((CNN_BRANCH_2_OUT_CH)*4 * 4)
+#define CNN_BRANCH_3_OUT_SIZE ((CNN_BRANCH_3_OUT_CH)*8 * 8)
+#define CNN_OUT_BUF_SIZE \
+ (((CNN_BRANCH_0_OUT_SIZE) + (CNN_BRANCH_1_OUT_SIZE) + \
+ (CNN_BRANCH_2_OUT_SIZE) + (CNN_BRANCH_3_OUT_SIZE)))
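+
+// With the channel counts above, CNN_TOT_OUT_CH is 20 + 4 + 20 + 20 = 64 and
+// CNN_OUT_BUF_SIZE works out to 20 + 4 * 2 * 2 + 20 * 4 * 4 + 20 * 8 * 8 =
+// 20 + 16 + 320 + 1280 = 1636 floats.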
+
+#define NUM_DNN_BRANCHES 4
+#define NUM_CNN_LAYERS 5
+#define BRANCH_0_NUM_DNN_LAYERS 2
+#define BRANCH_1_NUM_DNN_LAYERS 2
+#define BRANCH_2_NUM_DNN_LAYERS 2
+#define BRANCH_3_NUM_DNN_LAYERS 2
+#define CNN_LAYER_0_HEIGHT 5
+#define CNN_LAYER_0_WIDTH 5
+#define CNN_LAYER_0_IN_CH 1
+#define CNN_LAYER_0_OUT_CH 20
+#define CNN_LAYER_0_HORZ_STRIDE 4
+#define CNN_LAYER_0_VERT_STRIDE 4
+#define CNN_LAYER_1_HEIGHT 2
+#define CNN_LAYER_1_WIDTH 2
+#define CNN_LAYER_1_IN_CH 20
+#define CNN_LAYER_1_OUT_CH 20
+#define CNN_LAYER_1_HORZ_STRIDE 2
+#define CNN_LAYER_1_VERT_STRIDE 2
+#define CNN_LAYER_2_HEIGHT 2
+#define CNN_LAYER_2_WIDTH 2
+#define CNN_LAYER_2_IN_CH 20
+#define CNN_LAYER_2_OUT_CH 20
+#define CNN_LAYER_2_HORZ_STRIDE 2
+#define CNN_LAYER_2_VERT_STRIDE 2
+#define CNN_LAYER_3_HEIGHT 2
+#define CNN_LAYER_3_WIDTH 2
+#define CNN_LAYER_3_IN_CH 20
+#define CNN_LAYER_3_OUT_CH 4
+#define CNN_LAYER_3_HORZ_STRIDE 2
+#define CNN_LAYER_3_VERT_STRIDE 2
+#define CNN_LAYER_4_HEIGHT 2
+#define CNN_LAYER_4_WIDTH 2
+#define CNN_LAYER_4_IN_CH 4
+#define CNN_LAYER_4_OUT_CH 20
+#define CNN_LAYER_4_HORZ_STRIDE 2
+#define CNN_LAYER_4_VERT_STRIDE 2
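+// Assuming a 64x64 input block, the stride-4 first layer followed by the
+// four stride-2 layers above yields 16x16 -> 8x8 -> 4x4 -> 2x2 -> 1x1
+// feature maps, matching the 8x8, 4x4, 2x2 and 1x1 branch output sizes
+// defined earlier.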
+#define BRANCH_0_NUM_DNN_FEATURES 37
+#define BRANCH_0_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_0_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_0_NUM_LOGITS 1
+#define BRANCH_1_NUM_DNN_FEATURES 25
+#define BRANCH_1_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_1_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_1_NUM_LOGITS 1
+#define BRANCH_2_NUM_DNN_FEATURES 25
+#define BRANCH_2_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_2_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_2_NUM_LOGITS 1
+#define BRANCH_3_NUM_DNN_FEATURES 41
+#define BRANCH_3_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_3_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_3_NUM_LOGITS 1
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_0_kernel[] = {
+ 0.131894f, -0.593536f, -0.212935f, -0.00220011f, -0.396949f,
+ 0.287753f, -0.91875f, -0.0095057f, 0.804197f, -0.395239f,
+ 0.516604f, 1.16439f, 0.445784f, -0.163349f, 0.746488f,
+ -0.33891f, -0.562652f, 0.481403f, 0.755378f, -0.200753f,
+ 0.0784307f, 0.105657f, 0.0205673f, -0.524089f, -0.476146f,
+ -0.161206f, -0.65079f, 0.137474f, 0.28584f, 0.508768f,
+ -0.643386f, 0.227068f, -0.899507f, -0.413382f, 0.631466f,
+ 0.398203f, -0.544392f, 0.825155f, 0.671847f, -0.249779f,
+ 0.323121f, 0.125357f, -0.719564f, -0.0714854f, -0.168472f,
+ -0.213246f, -0.674525f, 0.330148f, -0.138414f, 0.20462f,
+ -0.518571f, -0.15091f, -0.605116f, -0.448732f, -0.475599f,
+ 0.738f, -0.328526f, 0.755035f, 0.969414f, -0.321039f,
+ -0.23068f, 0.408567f, -0.377813f, -0.273974f, 1.0684f,
+ 0.373968f, -0.450305f, 0.439258f, -0.381846f, -0.267331f,
+ 0.30613f, -0.39369f, 0.622438f, -0.52877f, -0.334991f,
+ 0.263193f, -0.402121f, 0.64142f, 0.793048f, -0.0231174f,
+ -0.68474f, -0.293338f, -0.737511f, -0.462654f, 0.474629f,
+ 0.141397f, -0.152529f, 0.345879f, -0.499991f, 0.00174024f,
+ 0.337387f, -0.131151f, 0.427385f, -0.457449f, -0.879614f,
+ -0.425908f, -0.263172f, 0.0344974f, 1.07861f, -0.00416662f,
+ 0.0208952f, 0.233905f, 0.765965f, 0.0423685f, -0.117554f,
+ -0.248237f, 0.49848f, -0.845131f, 0.223648f, -0.838709f,
+ 0.5834f, 0.309956f, -0.0625093f, -0.619619f, 0.918957f,
+ 0.358271f, -0.668459f, 0.518783f, -0.418963f, -0.206788f,
+ 0.364983f, -0.0396087f, 0.624309f, -0.138679f, -0.142453f,
+ 0.28309f, 0.895092f, -0.215713f, 0.439025f, 0.659333f,
+ -0.366025f, -0.413518f, 0.66657f, -0.265919f, 0.473471f,
+ -1.0729f, -0.526702f, 0.2838f, 0.367648f, -0.61242f,
+ 0.121656f, 0.547727f, -0.0636793f, -0.33006f, -0.306604f,
+ -0.00897731f, 0.688242f, 0.0944626f, 0.321508f, 0.0437392f,
+ -0.560035f, -0.768334f, 0.0571051f, -0.0427601f, -0.0437806f,
+ -0.816209f, -0.395829f, 0.293733f, 0.217645f, -0.646428f,
+ 0.132448f, -0.435806f, -0.0556814f, 0.0218857f, 0.348525f,
+ -0.17296f, 0.669057f, 0.638604f, -0.0995596f, -0.024099f,
+ -0.262332f, -0.548975f, 0.357894f, 0.43873f, -0.688234f,
+ -0.425519f, 0.190986f, -0.074778f, 0.294232f, -0.548969f,
+ -0.731198f, 0.03616f, -0.475969f, -0.306075f, -0.111929f,
+ -0.234146f, 0.612669f, 0.882254f, -0.622893f, 0.262431f,
+ 0.465242f, 0.245384f, -0.811016f, 0.501798f, -0.925875f,
+ 0.264373f, 0.307766f, -0.26872f, 0.113027f, -0.158875f,
+ 0.0711483f, 0.220275f, -0.0699022f, -0.0111303f, -0.435384f,
+ -0.720014f, 0.593484f, -0.964082f, 0.750925f, 0.252433f,
+ 0.964332f, -0.256904f, -0.421715f, -0.403851f, -0.188081f,
+ 0.694014f, -1.00183f, 0.798921f, 0.0603123f, 0.213814f,
+ 0.739642f, -0.0203375f, 0.72569f, -0.260224f, 0.0199516f,
+ -0.322451f, 0.318204f, -0.38392f, 0.740994f, -0.265215f,
+ -0.54541f, -0.51479f, -0.458397f, 0.519564f, 0.0509182f,
+ 0.0363331f, -0.293051f, 0.317714f, -0.327488f, -0.0840401f,
+ 0.318437f, -0.619403f, 0.641094f, -0.288435f, -0.260185f,
+ 0.181083f, -0.169294f, 0.292645f, 0.140405f, 0.0572885f,
+ -0.637428f, -0.102616f, 0.288955f, 0.817314f, 0.116855f,
+ 0.635532f, 0.283334f, -0.236391f, -0.305035f, -0.217365f,
+ -0.033021f, -0.455858f, 0.439922f, -0.104039f, 0.373376f,
+ 0.310659f, 0.388789f, 0.266341f, 0.0746306f, -0.428192f,
+ -0.202695f, -0.347625f, 0.00585741f, 0.366203f, 0.221413f,
+ 0.518856f, 0.57245f, -0.375071f, -0.2436f, -0.511895f,
+ -1.03708f, 0.681455f, -0.111544f, -0.183563f, 0.109729f,
+ -0.422646f, -0.529777f, 0.747473f, -0.270223f, -0.11435f,
+ 0.378931f, 0.420456f, 0.236331f, 0.49261f, -0.0666801f,
+ 0.0475846f, 0.906095f, -0.4146f, -0.020588f, -0.653285f,
+ 0.135335f, 0.543846f, -0.309061f, 0.11899f, -0.639168f,
+ -0.719994f, -0.219706f, -0.645631f, -0.829049f, -0.0114746f,
+ 0.834604f, 0.0378035f, 0.107957f, 0.546929f, -0.674395f,
+ -0.854817f, -1.1443f, 0.223413f, -0.326324f, 0.440971f,
+ 0.383582f, -0.495084f, 0.280091f, -0.53116f, 0.0333923f,
+ -0.354339f, -0.0449156f, -0.538896f, -0.753355f, 0.463995f,
+ 0.000969967f, -0.2832f, 0.587276f, 0.853094f, -0.481985f,
+ -0.138202f, 0.180989f, -0.349044f, -0.417534f, 0.455591f,
+ 0.287332f, 0.251496f, 0.381416f, 0.339632f, -0.0825727f,
+ 0.352739f, 0.161697f, -0.319764f, -0.258015f, 0.668833f,
+ -0.553303f, -0.578815f, -0.3758f, 0.289f, 0.247368f,
+ 0.00681103f, 0.421092f, -0.191033f, -0.425868f, -0.1239f,
+ 0.0540422f, -0.0856856f, 0.481168f, -0.0283741f, -0.196018f,
+ 0.230923f, -0.145288f, 0.52188f, 0.00628462f, -0.604556f,
+ -0.562879f, 0.319282f, 0.323799f, 0.453941f, 0.271129f,
+ -0.0520196f, 0.684571f, -0.391779f, -0.404614f, 0.134097f,
+ -0.825482f, 0.0913949f, 0.483543f, 0.159084f, 0.301637f,
+ 0.427013f, 0.196153f, 0.460091f, -0.730573f, -0.12278f,
+ 0.221665f, 0.674622f, -0.623363f, -0.0761517f, 0.637979f,
+ -0.468498f, 0.527276f, -0.596894f, -0.34675f, -0.251241f,
+ 0.418533f, -0.476696f, -0.901267f, -0.0088241f, -0.12421f,
+ -0.660316f, -0.0222117f, -0.470898f, -1.10739f, -0.441645f,
+ 0.39516f, -0.0117906f, 0.254122f, 0.00722599f, -1.00697f,
+ 0.48908f, -0.122287f, -0.378608f, -0.339145f, 0.682463f,
+ 0.305606f, 0.453628f, -0.49923f, -0.791388f, -0.202515f,
+ 0.23214f, -0.434209f, -0.778283f, -0.538015f, 0.145769f,
+ 0.446281f, -0.339329f, -0.198478f, -0.183717f, -0.855441f,
+ -0.105778f, 0.575067f, -0.18592f, -0.348094f, 0.740614f,
+ 0.041549f, -0.109663f, 0.0434492f, 0.245242f, -1.22192f,
+ 0.685896f, -0.208115f, -0.0616216f, -1.00552f, 0.31045f,
+ -0.184394f, 0.466705f, -0.0984364f, -0.506252f, 0.144874f,
+ 0.357038f, 0.675221f, -0.822171f, -0.52729f, 0.991212f,
+ 0.432422f, 0.383493f, -0.372395f, 0.35651f, -0.25369f,
+ 0.660208f, -0.117745f, -0.142433f, -0.724115f, -1.0035f,
+ -0.59178f, 0.563444f, -0.282531f, -0.599989f, 0.507424f,
+ -0.782875f, 0.755029f, -0.754962f, -0.617825f, 0.565984f,
+ -0.826878f, -0.456563f, 0.0212161f, 0.469867f, -0.144864f,
+ 0.225748f, -0.279029f, 0.21052f, -0.440183f, 0.936069f,
+ 0.170595f, 0.40966f, 0.452453f, -0.576006f, 1.50696f,
+ 0.649049f, 0.094957f, -0.167706f, -0.258342f, 0.59269f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_0_bias[] = {
+ 0.00475215f, -0.00362332f, -0.00317542f, 0.190083f, 0.0488147f,
+ -0.0268093f, -0.00432231f, 0.0112229f, 0.0626653f, -0.0025698f,
+ 0.0018675f, -0.00368139f, -0.00159125f, -0.00034354f, 0.311437f,
+ 0.000136436f, 0.0667295f, 0.0251274f, 0.00226553f, -0.000638344f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_1_kernel[] = {
+ 0.228403f, 0.241933f, 0.181079f, 0.101728f, 0.278455f,
+ -0.222078f, 0.387578f, 0.0847356f, -0.0737012f, 0.26518f,
+ -1.0817f, 0.0404161f, -0.805199f, 0.336576f, -0.541494f,
+ 0.246264f, 0.116597f, -0.756804f, -0.914136f, 0.410265f,
+ 0.413294f, 0.07873f, 0.450017f, -0.264346f, 0.549095f,
+ 1.03755f, -0.203542f, 1.61018f, 0.374131f, 0.402515f,
+ -2.36115f, 0.116427f, -0.172157f, -0.231482f, -0.905736f,
+ -0.0183059f, -0.575746f, 0.110348f, -0.268018f, 0.140399f,
+ 0.427196f, 0.0718528f, 0.247936f, -0.326661f, 0.150404f,
+ -0.659979f, -0.157148f, 0.00826241f, -0.679275f, -0.131564f,
+ -1.04822f, 1.06039f, -0.207898f, 0.510167f, 0.484233f,
+ 0.138972f, -0.0801639f, -0.184416f, 0.0741107f, -0.0299281f,
+ 0.112263f, 0.380071f, -0.0185269f, -0.0821188f, 0.918796f,
+ -0.576106f, 0.593007f, 0.479446f, 0.0440703f, 0.322379f,
+ 0.176783f, -0.147111f, 0.0953247f, -0.636377f, 0.0702104f,
+ 0.130979f, 0.293892f, -0.0112124f, -0.040347f, -0.16034f,
+ 0.3252f, -0.586802f, 0.601786f, -0.487148f, -0.458777f,
+ 0.463835f, 0.144942f, 0.00339965f, -0.779966f, 0.0585298f,
+ -1.20758f, -0.275614f, 0.292346f, -0.132781f, 0.337892f,
+ -0.357677f, 1.48511f, 0.172907f, -0.148668f, 0.243184f,
+ -0.503392f, -0.0791543f, 0.0265389f, -0.102267f, 0.213294f,
+ 0.0657801f, 0.156996f, 0.0891168f, 0.120805f, 0.261285f,
+ -0.343025f, -0.0792235f, -0.106415f, 0.133878f, -0.112981f,
+ -0.00151126f, -0.0643829f, 0.0458938f, -0.0452731f, -0.00147422f,
+ 0.1871f, -0.0208793f, 0.0752037f, 0.0794674f, 0.167666f,
+ 0.198028f, -0.361015f, -0.0661721f, -0.10672f, -0.0773641f,
+ -1.15856f, -0.516443f, -0.322702f, 0.15668f, 0.0075841f,
+ -0.157731f, 0.270926f, -0.241551f, 0.0169097f, -0.0263953f,
+ -0.303556f, -0.239237f, 0.117792f, -0.137871f, 0.122054f,
+ -0.587381f, 0.112938f, 0.0867262f, -0.27909f, -0.203622f,
+ -0.622195f, 0.42623f, 0.670704f, 0.190826f, -0.304979f,
+ -0.570075f, -0.240699f, 0.43744f, 0.632896f, -0.563846f,
+ -0.0160434f, -0.0709745f, 0.816662f, 0.269999f, -0.358734f,
+ 0.193644f, 1.19339f, -0.118223f, -0.363291f, -0.723616f,
+ -1.58825f, 0.0222856f, 0.769852f, 0.322713f, 0.0857619f,
+ -0.669756f, -1.08414f, 1.18593f, 0.486166f, -0.520646f,
+ 0.0861854f, -0.134197f, 0.258337f, 0.223345f, 0.697639f,
+ -0.57261f, 0.54031f, 0.892644f, 0.497572f, -0.287076f,
+ -1.95928f, -0.0568128f, -0.253335f, 0.00233392f, -0.192787f,
+ -0.115203f, -0.0975649f, 0.277954f, 0.000704534f, -0.315884f,
+ 0.309583f, 0.357458f, 0.0939298f, -0.072701f, 0.433045f,
+ -0.536938f, 0.534523f, 0.184585f, -0.0415175f, -0.120909f,
+ -1.2622f, 0.412449f, -0.114741f, 0.290453f, -0.441671f,
+ -0.0242497f, -0.20746f, 0.139019f, -0.422668f, -0.146732f,
+ -0.688828f, -0.00339426f, 0.04166f, 0.41755f, 0.405675f,
+ 0.562564f, 0.0216812f, 0.0271391f, 0.215227f, 0.328183f,
+ -1.6442f, -0.827838f, 0.115491f, 0.0951442f, -0.133779f,
+ -0.0482928f, 0.203177f, 0.322953f, -0.513259f, 0.0676788f,
+ -0.0877928f, 0.224448f, 0.451957f, 0.314243f, 0.307403f,
+ 0.35653f, 0.0286278f, 2.27554f, 0.569313f, -0.0488753f,
+ -2.48809f, 0.274555f, -0.248375f, -0.635634f, -0.187663f,
+ 0.1827f, -0.409634f, -0.0280568f, -0.207119f, -0.208192f,
+ -0.410268f, -0.017669f, 0.134856f, 0.434551f, 0.165201f,
+ 0.584608f, -0.389997f, -0.088713f, 0.118087f, 0.00210905f,
+ -1.07698f, -0.520967f, -0.198742f, 0.190255f, -0.162639f,
+ 0.0122759f, 0.460774f, -0.684633f, -0.149512f, 0.167556f,
+ -0.295034f, -0.0650964f, 0.0868653f, -0.691352f, 0.089795f,
+ 0.0620608f, 0.0531289f, 0.0124286f, 0.151921f, 1.51067f,
+ -0.10586f, -0.0311871f, 0.114706f, 0.0565205f, -0.159634f,
+ -0.423987f, -0.226896f, 0.0605352f, -0.36324f, -0.142205f,
+ -0.252249f, 0.0666312f, 0.316655f, 0.00687196f, 0.131079f,
+ -0.128281f, -0.293468f, 1.3327f, 0.542277f, -0.060088f,
+ -1.73475f, 0.0542297f, -0.227522f, -0.376004f, -0.147028f,
+ 0.0228252f, 0.0569538f, -0.0796497f, 0.0937596f, -0.0660153f,
+ -0.979219f, -0.377322f, 0.0523787f, 0.467299f, 0.0824278f,
+ 0.437147f, 0.263637f, 0.0325681f, 0.303581f, 0.353479f,
+ -0.142369f, -0.394797f, 0.597185f, 0.116482f, -0.0782593f,
+ 0.364539f, -0.30396f, 0.119016f, -0.0022429f, -0.044292f,
+ -0.0110531f, 0.233571f, 0.000975879f, 0.447332f, -0.0320396f,
+ 0.541609f, 0.14232f, 0.163905f, 0.848609f, 0.19954f,
+ -0.186591f, -0.44465f, -0.431672f, 0.159037f, -0.129977f,
+ -0.141778f, 0.246818f, -0.197539f, -0.70115f, 0.185449f,
+ 0.400274f, -0.0350744f, 0.239727f, -0.290504f, 0.0698443f,
+ -0.180374f, -0.759591f, -0.0569088f, -0.50246f, -0.0986616f,
+ -0.892114f, 0.306737f, -0.133937f, 0.285625f, 0.495471f,
+ -0.686222f, -0.168647f, -0.0926158f, 0.351772f, -0.0215394f,
+ 0.361223f, 0.0657142f, 0.268229f, -0.616299f, 0.0564718f,
+ -0.294013f, -0.588019f, 0.0234195f, -0.426863f, -0.511253f,
+ -0.72177f, 0.420903f, 0.0987506f, 0.309368f, 0.523532f,
+ 1.06073f, -0.33028f, 0.0818142f, 0.0130354f, 0.0180882f,
+ 0.0316898f, -0.416614f, -0.566344f, -0.163083f, 0.285085f,
+ -0.0534352f, 0.385496f, 0.151068f, -0.208295f, -0.175648f,
+ 0.0476705f, 0.190428f, -0.643391f, 0.484004f, -0.421836f,
+ -0.19829f, -0.227574f, -0.0869152f, 1.09881f, 0.345129f,
+ -0.236732f, -0.381935f, -1.46271f, 0.465914f, 0.610375f,
+ 0.689968f, -0.688546f, 1.95033f, 0.420946f, 0.0282428f,
+ 0.147823f, 0.669393f, 0.429085f, -0.328385f, -0.150439f,
+ -0.419097f, -0.828102f, 0.248743f, 0.24644f, 0.0186131f,
+ -0.384319f, -0.126294f, -0.417067f, 0.271483f, -0.0128456f,
+ -0.881351f, 0.152581f, 0.185584f, -0.745827f, 0.0551359f,
+ 0.127083f, 0.936983f, -0.0225341f, 0.575861f, 0.767417f,
+ -0.140867f, -0.762518f, 0.422446f, -0.0611973f, 0.0515641f,
+ -0.144168f, -0.298882f, 0.308461f, 0.0208704f, 0.213872f,
+ -0.258708f, 1.13186f, 0.314083f, -0.347536f, -0.137768f,
+ 0.653953f, -0.217883f, -0.56112f, -0.864661f, 0.488836f,
+ 0.268133f, -0.548664f, -0.765226f, 0.117082f, 0.326798f,
+ -0.678246f, 0.477785f, -1.27584f, 0.198912f, -0.710395f,
+ 1.39096f, -0.411577f, -0.55119f, 0.51092f, -0.295023f,
+ 0.245983f, -0.0957192f, -0.312001f, 0.0175991f, 0.524423f,
+ -0.126379f, 0.124687f, -1.53945f, -0.342856f, 0.514072f,
+ 0.400884f, -0.00581101f, -0.219327f, 0.0977873f, 0.337551f,
+ -0.058603f, 0.20034f, 0.0429945f, 0.676803f, -0.273585f,
+ -0.173435f, -0.581596f, 0.226263f, -0.0946223f, -0.060088f,
+ -0.0100809f, -0.022242f, -0.22218f, -0.030463f, -0.141389f,
+ -0.190757f, -0.00526518f, -0.77519f, -0.0825695f, 0.308403f,
+ 0.262792f, -0.601842f, 0.0783697f, 0.197527f, 0.0714048f,
+ 0.0392629f, -0.388628f, 0.172541f, -0.0222009f, 0.252096f,
+ 0.0728652f, 0.173632f, 0.192914f, -0.00969965f, 0.0530136f,
+ -0.00765759f, 0.440234f, -0.0943323f, 0.112319f, 0.0878737f,
+ -0.739021f, 0.385305f, 0.133334f, -0.396697f, 0.177818f,
+ -0.0712558f, 0.516923f, 0.102174f, 0.17158f, -0.211068f,
+ 0.295795f, -0.36198f, 0.179087f, -0.845744f, -0.242514f,
+ -1.49073f, 0.272702f, 0.59011f, -0.408184f, -0.0731313f,
+ 0.234643f, 0.589642f, -0.100778f, 0.516921f, -0.700154f,
+ 0.316432f, 0.36117f, 0.0380282f, 0.480101f, -0.0975487f,
+ 0.941452f, 0.231705f, -0.151182f, -1.20305f, 0.28255f,
+ -0.0427662f, -0.00717175f, -0.842085f, -0.357376f, 0.545581f,
+ -0.290714f, 0.741498f, 1.00377f, 0.483864f, 0.150405f,
+ 0.0834512f, -0.10031f, 0.424054f, -0.0223491f, -0.0696701f,
+ -0.134479f, -0.747227f, 0.422208f, 0.123858f, -0.392624f,
+ -0.0299847f, -0.0376142f, -0.392536f, -0.0343114f, 0.298224f,
+ -0.375899f, 0.693119f, 0.27909f, -0.53463f, 0.105459f,
+ -0.0267383f, 0.5094f, -0.411557f, 0.451749f, -0.348479f,
+ -0.0497316f, -0.353913f, -0.14858f, 0.241838f, 0.331039f,
+ 0.756607f, -0.0701661f, -0.827264f, -0.367772f, 0.447201f,
+ 0.834616f, -0.00497265f, -0.0557285f, 0.055088f, -0.300115f,
+ -0.143833f, -1.07838f, -0.106896f, 0.16945f, 0.0170324f,
+ 0.108754f, 0.335893f, -0.0923708f, 0.450209f, -0.0713308f,
+ -0.0233037f, -0.0129902f, -1.40664f, -0.0996218f, 0.711236f,
+ 0.400716f, 0.227871f, 2.01499f, 0.572926f, 0.135673f,
+ -0.0340458f, -0.316736f, 0.24257f, -0.700768f, -0.194985f,
+ 0.312011f, -0.179599f, 0.128114f, 0.0725977f, -0.193816f,
+ 0.352143f, 0.070641f, -0.467808f, -0.399047f, 0.10136f,
+ 0.671574f, -0.553965f, 0.105729f, 0.210383f, 0.065048f,
+ 0.248198f, -0.731674f, 0.588725f, -0.308237f, 0.24511f,
+ 0.00608906f, 0.170906f, 0.246175f, 0.149521f, 0.106071f,
+ 0.160246f, 0.118487f, -0.104102f, 0.872823f, 0.227478f,
+ 0.0182631f, -0.115083f, 0.0142445f, 0.307947f, -0.884925f,
+ 0.0767105f, 0.0414042f, -0.448021f, -0.0400193f, -0.0765448f,
+ -0.411931f, -0.199624f, 0.333371f, 0.17267f, -0.0431816f,
+ 0.190826f, -0.0758961f, -1.02831f, -0.0414525f, 0.605374f,
+ -0.0188181f, -0.2207f, 1.30004f, -0.207005f, -0.0333617f,
+ 0.227145f, 0.105059f, -0.0473393f, -0.448752f, -0.0342152f,
+ -0.0244812f, 0.220329f, 0.0313591f, -0.0902074f, -0.0731945f,
+ 0.88488f, 0.306306f, -0.275613f, -0.476372f, 0.00678104f,
+ 0.442029f, 0.122049f, 0.118042f, 0.270527f, -0.462538f,
+ 0.0665021f, -0.260255f, 0.209182f, 0.162321f, 0.0629934f,
+ -0.244896f, -0.078863f, 0.655585f, -0.0506617f, -0.487128f,
+ 0.118765f, -0.34408f, 0.0930615f, -0.365632f, -0.0670776f,
+ 0.44428f, 0.286734f, 0.146608f, 0.686757f, -0.0738428f,
+ -0.10034f, -0.928438f, -0.172601f, -0.0959575f, -0.010532f,
+ 0.277549f, 0.28773f, -0.318883f, 0.71254f, 0.273593f,
+ -0.382845f, -0.0104587f, -0.647769f, 0.25541f, 0.194625f,
+ 0.265197f, -0.750938f, -0.0650515f, -0.567092f, 0.070613f,
+ 0.209531f, 0.429699f, 0.130676f, 0.514914f, 0.615778f,
+ 0.594535f, -0.0878778f, 0.40593f, -0.303383f, 0.0907863f,
+ -0.320068f, 0.0137162f, -0.303424f, 0.594207f, -0.236524f,
+ -0.692627f, -0.990063f, -0.0262934f, 0.222375f, 0.503412f,
+ 0.220224f, 0.676871f, -0.150996f, 0.379777f, 0.841339f,
+ -1.05981f, 0.259943f, -0.781745f, 0.0346478f, 0.115791f,
+ -0.25171f, -0.00872158f, 0.395561f, -0.0849893f, -1.20134f,
+ -0.313938f, 0.789542f, 0.159606f, -0.782095f, -0.229754f,
+ 0.266687f, -0.0354282f, -0.3041f, 0.0338618f, -0.390001f,
+ -0.28362f, -0.436144f, 0.777351f, 0.855321f, 0.653338f,
+ -0.0382912f, -0.204577f, 1.13828f, 0.220395f, -4.60853f,
+ 0.575694f, 0.0453189f, 1.76567f, 0.466151f, -0.366109f,
+ 0.594717f, 0.278891f, -0.750676f, -0.332739f, -0.942304f,
+ 0.280363f, 0.284561f, 0.209326f, 0.238347f, -0.0124311f,
+ -0.439463f, -0.036186f, 0.165997f, 0.374717f, -0.481148f,
+ -0.626417f, 0.0223598f, 0.039337f, -0.379918f, 0.211046f,
+ 0.0795812f, 0.863355f, -0.341448f, 0.421494f, 0.410477f,
+ -0.117025f, -0.511108f, 0.565193f, -0.063582f, -0.031349f,
+ -0.0750174f, 0.387941f, 0.541266f, 0.0919753f, 1.05041f,
+ 0.263004f, 0.289006f, 0.0439694f, -1.22439f, -0.247832f,
+ 0.260967f, 0.355794f, 0.599694f, -0.69418f, 0.372805f,
+ -0.161731f, 0.0720574f, 0.0394657f, 0.122772f, -0.458067f,
+ -0.370826f, -1.34495e-05f, -0.373404f, 0.0245539f, -2.3472f,
+ -2.61448f, 0.264794f, 0.0601582f, -0.968597f, -0.196022f,
+ -0.727067f, 0.167346f, 0.517478f, 0.0035377f, 0.777219f,
+ 0.553128f, 0.727211f, 0.606202f, -0.495604f, 2.41445f,
+ 0.465214f, -0.0443004f, 0.142972f, 0.141459f, -0.17771f,
+ 0.0156117f, 0.169264f, 0.0428022f, -0.164827f, -0.240632f,
+ 0.215289f, -0.213134f, -0.184163f, 0.0161321f, -0.20025f,
+ -0.0311616f, 0.00292108f, -0.0131921f, 0.0437664f, -0.104817f,
+ -0.131906f, 0.0822771f, 0.237307f, -0.347567f, -1.2485f,
+ 0.253616f, -0.442217f, 0.0514077f, 0.337561f, -0.0147658f,
+ -0.132888f, -0.643821f, 0.445573f, -0.0146213f, 0.235511f,
+ 0.53583f, -0.640644f, 0.0280044f, 0.00628834f, 0.143885f,
+ 0.380077f, -0.542342f, 0.363101f, 0.0647334f, -0.476556f,
+ -0.822676f, 0.482454f, -0.0467326f, -0.253083f, 0.116726f,
+ 0.317333f, 0.548131f, -0.234667f, 0.579923f, -0.420683f,
+ 0.595613f, -0.279864f, -0.753204f, -0.516844f, -0.436574f,
+ -0.120682f, -0.278939f, 0.752202f, -0.183443f, -0.14632f,
+ -0.0344068f, 0.127638f, -0.225245f, 0.489391f, 0.145082f,
+ -0.73672f, 0.980065f, -0.0367412f, 0.40632f, -0.802509f,
+ 0.356897f, 0.366172f, 1.23858f, -0.978381f, -0.684924f,
+ -0.0870693f, -0.353628f, 0.695788f, -0.244593f, -1.8897f,
+ -0.257803f, 0.686937f, 0.405155f, -0.125696f, 0.258075f,
+ 0.570584f, -0.439481f, -0.59798f, 0.0745711f, -0.235162f,
+ 0.133048f, -0.243033f, 0.0415527f, -0.00118735f, 0.00980514f,
+ -0.297429f, -0.144983f, 0.463093f, 0.0965441f, -0.338508f,
+ -0.651077f, 0.817577f, -0.0364773f, -0.388465f, 0.113288f,
+ 0.231198f, 0.316208f, -0.592201f, 0.530376f, -0.431434f,
+ 0.0200985f, 0.104303f, -0.130705f, 0.4374f, 0.362342f,
+ 0.70641f, 0.20037f, 0.309128f, -0.484535f, -1.18469f,
+ 0.513893f, 0.201236f, -0.022396f, 0.179638f, -0.361289f,
+ -0.0794946f, -1.04704f, -0.0281103f, 0.0494822f, 0.00196415f,
+ 0.0625478f, -0.229033f, 0.12018f, 0.542629f, -0.222423f,
+ -0.0123321f, -0.0988525f, 0.773192f, -0.192218f, -3.19156f,
+ 0.300606f, 0.462751f, 2.2968f, 0.137182f, 0.132539f,
+ 0.165884f, 0.128818f, -0.155856f, -0.558538f, -0.231742f,
+ -0.244377f, -0.442397f, 0.250947f, 0.0850658f, -0.00820139f,
+ 0.391284f, 0.17453f, 0.306003f, -0.531499f, -0.624451f,
+ 0.564584f, -0.343953f, -0.0278713f, 0.212664f, -0.135969f,
+ -0.0179867f, -0.687887f, 0.371065f, -0.0537029f, 0.0499509f,
+ 0.0980684f, -0.0438569f, 0.186731f, 0.182105f, 0.172254f,
+ -0.149446f, -0.0247637f, 0.148098f, 1.20772f, -0.136664f,
+ 0.00983112f, 0.0181381f, -0.0147549f, -0.0846561f, -0.827022f,
+ 0.00207177f, 0.0478215f, 0.0652549f, 0.0898219f, -0.0224959f,
+ -0.0274246f, 0.0166498f, -0.0211715f, -0.502932f, 0.0961452f,
+ 0.251206f, -0.0623632f, 0.741566f, 0.0078449f, -2.99162f,
+ -0.187244f, 0.0743479f, 1.46425f, 0.0737923f, 0.0133544f,
+ 0.20922f, -0.178671f, -0.0528492f, -0.526717f, 0.0282125f,
+ -0.0363201f, 0.37406f, -0.303658f, -0.066803f, 0.132237f,
+ 0.962057f, -0.399733f, 0.191765f, -0.452606f, -0.348732f,
+ 0.444939f, 0.153025f, 0.0796317f, 0.265985f, -0.319638f,
+ 0.0278161f, -0.333734f, 0.226108f, 0.147895f, -0.124066f,
+ -0.37306f, 0.19541f, 0.200175f, -0.0593244f, 0.0333887f,
+ -0.0284278f, 0.462491f, 0.0686487f, -0.332435f, -0.437166f,
+ 0.302795f, 0.100542f, 0.0265019f, 0.767212f, -0.140621f,
+ 0.11558f, -0.70584f, -0.00017415f, 0.00793092f, -0.0490901f,
+ 0.0598338f, 0.484876f, -0.13025f, 0.660349f, 0.147503f,
+ -0.462766f, 0.0843824f, 0.218493f, 0.310921f, -0.162284f,
+ 0.210404f, -0.788799f, 0.0698512f, -0.484799f, 0.0311505f,
+ -0.308243f, 0.417298f, 0.0593723f, 0.208908f, 0.451437f,
+ 0.354546f, -0.0700888f, -0.281678f, -0.311177f, 0.00914652f,
+ -0.372084f, 0.135036f, 0.185393f, 0.461347f, -0.114241f,
+ -0.402347f, -0.692327f, 0.0376155f, -0.200267f, 0.565963f,
+ -0.0627442f, 0.429677f, 0.170514f, 0.350565f, 0.699528f,
+ -0.948126f, -0.364205f, 0.348878f, -0.137832f, -0.0791649f,
+ -0.0462295f, -0.255078f, -0.398509f, 0.136783f, -0.0164628f,
+ -0.555472f, 0.690396f, 0.147715f, 0.000523095f, 0.14874f,
+ 0.524804f, 0.162974f, 0.797599f, 0.277473f, -0.500696f,
+ 0.189917f, -0.333309f, 0.00613646f, -1.07817f, 0.0470502f,
+ 0.210766f, 0.159768f, -0.447774f, -0.252968f, -1.72739f,
+ 0.0658259f, -0.448747f, 2.26511f, 0.349651f, 0.157232f,
+ 0.956842f, 0.856676f, 0.149227f, -0.626957f, -0.566771f,
+ -0.0980846f, 0.351668f, -0.362741f, -0.0272282f, -0.113632f,
+ 0.366015f, -0.00790003f, -0.458632f, -0.31157f, -0.182257f,
+ -0.953975f, 0.0583582f, 0.164721f, -0.900107f, -0.115542f,
+ 0.0654192f, 0.99056f, -0.247976f, 0.48254f, 0.670196f,
+ 0.098585f, -0.212855f, 0.310072f, 0.0894616f, 0.151944f,
+ 0.119629f, -0.26735f, 0.162257f, -0.0305818f, 0.681526f,
+ -0.229847f, 1.01556f, 0.29132f, 0.740113f, 0.0703937f,
+ 0.537892f, -0.18653f, -0.0252359f, -0.420014f, 0.197631f,
+ -0.176629f, 0.00674754f, 0.301288f, -0.162816f, 0.636235f,
+ -0.341362f, 0.197296f, -0.589747f, -0.749363f, -0.277197f,
+ -1.27291f, -0.0857908f, -0.147591f, -0.0956297f, -0.109097f,
+ 0.0717554f, 0.359078f, 0.301457f, 0.486934f, -0.260955f,
+ -0.126821f, 1.55756f, 0.477469f, -1.45363f, 1.42198f,
+ -0.360847f, -0.0211924f, -0.0184957f, -0.110706f, -0.152136f,
+ 0.104703f, 0.267615f, 0.127392f, 0.172996f, 0.258326f,
+ 0.268578f, -0.431123f, -0.114419f, 0.0101172f, -0.195671f,
+ 0.0792025f, -0.151505f, -0.064077f, 0.0479777f, -0.141882f,
+ 0.121492f, -0.139132f, -0.348252f, 0.341043f, -0.565367f,
+ -0.0791259f, -0.781086f, 0.0140045f, 0.571094f, -0.00875077f,
+ 0.217132f, -0.202345f, 0.157213f, 0.228445f, 0.366612f,
+ -0.529989f, 0.42241f, -0.540538f, -0.0425556f, -0.207774f,
+ -0.0663941f, 0.37836f, -0.0650245f, -0.0828694f, -0.0835478f,
+ -0.795512f, 0.470268f, 0.1551f, -0.69017f, -0.116735f,
+ 0.157614f, 0.555973f, -0.293311f, 0.245428f, -0.0853701f,
+ -0.449278f, -0.0551647f, -0.00137429f, 0.709439f, -0.456796f,
+ 0.132062f, -0.0449484f, -0.308599f, 0.180608f, -2.24196f,
+ 0.421478f, -0.640946f, -0.460397f, -0.920628f, -0.184949f,
+ -0.0416982f, 0.6484f, -0.22806f, 0.412229f, -0.468079f,
+ -0.72372f, -0.347698f, -1.3899f, 0.631876f, 0.0611046f,
+ 0.0294258f, -0.128091f, -0.205615f, 0.355348f, -0.267725f,
+ -0.644835f, 0.435879f, 0.517477f, -0.338123f, -0.157764f,
+ 0.32762f, -0.166454f, 0.221007f, -0.0438278f, -0.0777725f,
+ 0.10986f, 0.941545f, -0.542284f, -0.172312f, -0.256597f,
+ -0.0181391f, 0.220623f, -0.432456f, 0.0164074f, 0.250226f,
+ -0.522576f, 0.783109f, 0.198703f, -0.784554f, -0.0929628f,
+ 0.326861f, 0.470293f, 0.442684f, 0.271879f, -0.108256f,
+ 0.0483558f, -0.403151f, 0.36183f, -0.268186f, 0.270851f,
+ -0.696826f, -0.166037f, -0.354658f, 0.405977f, -0.473447f,
+ 0.649689f, -0.0863114f, -0.147319f, 0.0869966f, 0.319792f,
+ 0.493026f, -1.07456f, 0.354751f, 0.114605f, -0.120647f,
+ -0.238315f, 0.0290955f, -0.355299f, -0.45381f, 0.0812865f,
+ -0.0180434f, 0.00861318f, -0.892943f, -0.0127801f, -1.66398f,
+ 0.290505f, 0.126832f, 2.08173f, -0.0454847f, -0.162481f,
+ 1.07426f, 0.228566f, 0.280528f, -0.537625f, -0.175288f,
+ -0.118012f, 0.649114f, -0.349926f, -0.0189864f, -0.30934f,
+ -0.363178f, -0.119822f, -0.22656f, 0.484513f, -0.173269f,
+ 0.41987f, -0.448517f, -0.0950466f, 0.482443f, 0.061558f,
+ 0.4219f, -0.536388f, 0.0781972f, 0.212489f, 0.104229f,
+ -0.0792804f, 0.402066f, -0.676313f, -0.2272f, -0.16379f,
+ 0.260145f, -0.0504658f, -0.0826579f, -1.37749f, 0.00790747f,
+ 0.0841031f, -0.0671308f, -0.00301736f, -0.386206f, 0.190311f,
+ 0.0702639f, 0.0643968f, 0.133741f, -0.0141555f, -0.0365324f,
+ 0.87028f, 0.207894f, -0.421266f, 0.689256f, 0.145037f,
+ -0.270796f, 0.212604f, -0.345326f, 0.0074631f, -1.72379f,
+ 0.0672097f, -0.273153f, 1.30503f, -1.01324f, 0.00284696f,
+ 0.851459f, 0.176847f, 0.30948f, -0.57144f, -0.0596695f,
+ -0.111189f, 0.130361f, -0.298286f, 0.0567591f, -0.0885215f,
+ -0.847601f, 0.238624f, -0.162391f, 0.452357f, -0.0192713f,
+ 0.226661f, 0.0762922f, -0.0894055f, 0.332702f, 0.424484f,
+ 0.0443207f, -0.162345f, -0.601036f, 0.280527f, -0.137362f,
+ 0.266345f, 0.729438f, -0.887182f, 0.152943f, -0.573548f,
+ -0.0201383f, -0.56521f, 0.033582f, 0.300284f, -0.144472f,
+ 0.633026f, 0.30866f, 0.0653073f, 0.316901f, 0.0721326f,
+ 0.192252f, -0.833162f, 0.194292f, -0.08663f, -0.189401f,
+ -0.178242f, 0.111488f, 0.522487f, -0.65497f, 0.457049f,
+ 0.390654f, 0.0522936f, -0.39712f, -0.293717f, -0.374656f,
+ -0.118916f, -0.853076f, -0.0829578f, -0.17335f, -0.0218694f,
+ 0.367968f, 0.478469f, 0.0913813f, 0.519251f, 0.803526f,
+ -0.272516f, -0.341329f, 0.0897285f, 0.247653f, 0.000898686f,
+ 0.313196f, 0.000587979f, -0.314189f, -0.449439f, -0.0291611f,
+ -0.356287f, -0.722904f, -0.0480958f, -0.523758f, -0.576146f,
+ 0.133754f, 0.616921f, -0.085494f, 0.487487f, 0.745129f,
+ 0.993267f, 0.256555f, 0.0822743f, 0.0411971f, 0.139388f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_1_bias[] = {
+ 0.00447951f, 0.0202534f, 0.00970833f, -0.00460874f, 0.0942288f,
+ -0.0534704f, 0.00829869f, -0.0255174f, -0.0809143f, 0.00169117f,
+ 0.0177427f, 0.0259387f, 0.0291077f, -0.0267599f, 0.100275f,
+ -0.00389366f, 0.0315499f, 0.0265846f, -0.000206604f, 0.0302221f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_2_kernel[] = {
+ 0.153048f, 0.0725422f, 0.068901f, -0.475608f, 0.0736706f,
+ -0.134076f, 0.229289f, 0.0217921f, 0.0449205f, -1.00002f,
+ 0.149133f, 0.0497258f, 0.118988f, 0.0741764f, 0.0385486f,
+ 0.225181f, 0.012966f, 0.155593f, -3.07175f, -0.0641051f,
+ 0.09161f, 0.0259005f, -0.209998f, -0.420298f, 0.0587126f,
+ 0.00352744f, 0.0451313f, -0.049384f, 0.11516f, 0.083135f,
+ 0.103675f, -0.0185604f, 0.0623248f, -0.0993726f, 0.0448522f,
+ 0.0134017f, -0.294776f, -0.251924f, 0.0712635f, -0.0764298f,
+ -0.463766f, -0.0295011f, -0.579168f, 0.573853f, -0.00596607f,
+ 0.0237762f, -0.0500104f, -0.0969275f, 0.155573f, 0.0515382f,
+ -0.178454f, -0.154008f, -0.278299f, -0.166421f, 0.0149533f,
+ -0.0700236f, 0.239287f, -1.19545f, -0.0744625f, 0.143037f,
+ 0.141874f, 0.086302f, 0.0838633f, -0.454179f, 0.120308f,
+ -0.0896718f, 0.254909f, 0.0714462f, 0.00471098f, -0.869494f,
+ 0.209407f, 0.138285f, 0.0816641f, 0.0666266f, 0.0848555f,
+ 0.173313f, 0.0695633f, 0.285667f, -3.15384f, 0.00140275f,
+ -0.969824f, -0.0318689f, -0.00487396f, 0.412541f, 0.0263593f,
+ -0.249824f, 0.0897776f, 0.0208836f, -0.0982745f, -0.16049f,
+ -0.12719f, -0.186166f, 0.102338f, 0.273931f, -0.0886306f,
+ -0.19513f, -0.0135712f, -0.194127f, -0.0834291f, 0.426623f,
+ -0.0705446f, 0.0327476f, 0.0800862f, 0.478757f, -0.00849111f,
+ -0.554911f, -0.0489312f, -0.184029f, -0.227428f, 0.159989f,
+ -0.0677731f, -0.0901436f, 0.00308696f, -0.352243f, 0.278715f,
+ 0.306374f, -0.0772054f, -0.0122733f, -0.0693457f, 0.074365f,
+ -0.267458f, -0.123612f, -0.495954f, 0.552604f, -0.103951f,
+ -0.121771f, 0.179966f, -0.377947f, -1.35472f, 0.153294f,
+ -0.445284f, -0.089813f, -0.00529807f, 0.254047f, -0.0378426f,
+ 0.114597f, -0.143052f, 0.0815258f, -0.10528f, 0.00833533f,
+ -0.117508f, 0.129052f, 0.0706719f, -1.39506f, 0.0124731f,
+ 0.109831f, -0.0744156f, 0.181612f, 0.0787894f, 0.0293352f,
+ 0.494929f, 0.00997207f, -0.585882f, -0.0844138f, -0.00864134f,
+ -0.109943f, 0.0713114f, 0.14883f, 0.0610554f, 0.204145f,
+ -0.00390313f, 0.0184763f, -0.111387f, 0.175442f, -0.0840215f,
+ -0.178785f, -0.0693612f, -0.254507f, -0.191549f, 0.501561f,
+ -0.0858995f, -0.164921f, 0.0250706f, -0.0916282f, 0.247085f,
+ 0.13877f, -0.419487f, -0.295065f, -0.213812f, -0.10362f,
+ 0.138243f, 0.086985f, 0.113633f, -0.459273f, 0.12388f,
+ -0.139296f, 0.253792f, 0.0421624f, 0.0665065f, -0.977282f,
+ 0.199927f, 0.115194f, 0.099045f, 0.0534806f, 0.089283f,
+ 0.0815367f, 0.150901f, 0.253458f, -3.24825f, -0.0118163f,
+ -0.544565f, 0.0201825f, -0.0682201f, 0.759028f, 0.00479696f,
+ -0.00625607f, 0.058007f, -0.0811189f, -0.114617f, -0.0998578f,
+ 0.133312f, 0.0246256f, -0.0167416f, 0.196118f, 0.109823f,
+ 0.109489f, 0.474682f, -0.763475f, 0.0818745f, 0.0798777f,
+ -0.0994905f, -0.00138143f, -0.108563f, 0.697289f, -0.103702f,
+ -0.306085f, -0.0996705f, -0.142618f, -0.130989f, 0.0813303f,
+ -0.0909275f, -0.10786f, -0.0280431f, 0.206877f, -1.70798f,
+ 0.525568f, 0.559891f, -0.166132f, -0.227574f, -0.150955f,
+ 0.0849226f, 0.00497342f, -0.168667f, -0.282575f, 0.00537805f,
+ -0.0185572f, 0.0607167f, -0.0534948f, -0.0215776f, -0.14825f,
+ -0.0164577f, -0.0611978f, 0.0347562f, 0.286917f, 0.226598f,
+ 0.149497f, -0.478101f, -0.246006f, 0.0663239f, -0.121728f,
+ 0.267087f, 0.0802681f, -0.184741f, -0.558267f, 0.0437066f,
+ 0.13816f, -0.0710939f, 0.0725697f, 0.339857f, 0.161069f,
+ 0.304871f, 0.108138f, 0.193396f, 0.0891607f, -0.0701939f,
+ -0.182038f, -0.451873f, -0.233883f, 0.0444747f, 0.0436545f,
+ -0.245894f, -0.0721136f, 0.309013f, 0.278996f, 0.0259377f,
+ 0.0278116f, 0.0686773f, -0.271237f, 0.235082f, -0.0778285f,
+ -0.456541f, -0.109303f, -0.074565f, -0.407301f, -0.162191f,
+ -0.801819f, 0.372435f, -0.559083f, -0.039189f, 0.0477762f,
+ 0.0875363f, 0.0699926f, 0.116552f, -0.308217f, 0.0341607f,
+ -0.14202f, 0.135517f, 0.0316971f, 0.153297f, -0.759722f,
+ 0.12849f, 0.114229f, 0.0814893f, 0.275402f, 0.0403976f,
+ 0.0357503f, 0.212295f, 0.0673998f, -2.59822f, -0.0475021f,
+ -0.0594725f, 0.0659163f, 0.0469717f, -0.0370461f, -0.12863f,
+ -0.381743f, -0.0445055f, -0.106843f, -0.0880648f, 0.00591106f,
+ 0.235514f, -0.165162f, -0.0696645f, 0.115374f, 0.245558f,
+ 0.192049f, -0.388628f, -0.48291f, 0.154313f, -0.160207f,
+ 0.125928f, 0.122039f, 0.0713794f, -0.161244f, 0.128082f,
+ -0.234659f, 0.0680219f, 0.0597933f, 0.208421f, -0.163623f,
+ 0.196873f, 0.156603f, 0.184179f, -0.278331f, -0.0481286f,
+ 0.0828152f, 0.247004f, 0.0915582f, -0.0906229f, -0.20376f,
+ 0.136593f, 0.0740336f, -0.0134935f, -0.355048f, 0.0898485f,
+ -0.0962068f, 0.185804f, -0.0145596f, 0.0966589f, -0.515784f,
+ 0.121602f, 0.0320428f, 0.11093f, -0.0559421f, 0.0355484f,
+ 0.192128f, 0.0500888f, 0.133641f, -1.73282f, -0.0624599f,
+ 0.122524f, 0.0757292f, -0.0974648f, -0.193649f, 0.0561096f,
+ 0.0159959f, 0.0334472f, -0.0168832f, -0.12386f, -0.112419f,
+ 0.19552f, 0.0308502f, 0.0537643f, -0.0181012f, 0.0392183f,
+ 0.0461833f, -0.52623f, -0.238252f, 0.0821762f, -0.212384f,
+ 0.112901f, 0.096063f, 0.0540225f, 0.0773583f, 0.143045f,
+ -0.101551f, 0.282418f, 0.0176749f, -0.00244542f, -0.780154f,
+ -0.254428f, -5.82215f, 0.106638f, 0.11746f, 0.0486823f,
+ 0.164562f, 0.0303006f, 0.229614f, -2.41845f, -0.117122f,
+ 0.0451654f, 0.0237383f, -0.208731f, 0.0721137f, 0.0761163f,
+ -0.0569416f, -0.00830511f, -0.045256f, 0.14535f, -0.0189222f,
+ -0.283363f, -3.15502f, 0.0971161f, -0.035913f, 0.00813281f,
+ 0.0187974f, -0.361573f, -0.302067f, 0.118014f, -0.0956148f,
+ -0.596567f, 0.0105443f, -0.49019f, -0.0801959f, 0.0322344f,
+ -0.0280032f, 0.0555038f, -0.111495f, -0.0994456f, 0.0178021f,
+ 0.0358362f, 1.07063f, -0.0833138f, 0.0621246f, 0.0637157f,
+ 0.0999207f, 0.191975f, -1.2811f, 0.0341681f, 0.14818f,
+ 0.0957259f, 0.109909f, 0.0566115f, 0.0585633f, 0.179939f,
+ -0.104372f, 0.309091f, 0.0172941f, 0.0243182f, -0.935252f,
+ -0.296257f, -5.83634f, 0.0899249f, 0.455347f, 0.129505f,
+ 0.220212f, 0.0214801f, 0.284802f, -2.94585f, -0.0805413f,
+ -1.01819f, 0.00534034f, -0.057203f, 0.0869331f, 0.0207575f,
+ -0.124479f, -0.0465806f, 0.0894252f, 0.32203f, 0.0858497f,
+ 0.25178f, 0.0932205f, 0.0888455f, 0.233153f, -0.446398f,
+ -0.00791233f, 0.0909603f, -0.0904397f, 0.131835f, 0.475597f,
+ -0.1236f, 0.0231622f, 0.138602f, -0.097731f, -0.0282484f,
+ -0.549095f, -0.0457428f, -0.0895407f, -0.293965f, 0.166872f,
+ 0.46719f, 0.236254f, 0.0615991f, 0.499236f, 0.540366f,
+ 0.402035f, 0.0606324f, -0.0499928f, -0.0155198f, 0.0994403f,
+ -0.14773f, -0.183433f, -0.612093f, -0.334201f, -0.110877f,
+ -0.143441f, 0.05815f, -0.318586f, -0.344235f, 0.199593f,
+ 0.51109f, -0.252281f, -0.028834f, 0.0615421f, 0.0623699f,
+ 0.210745f, -0.236448f, 0.166279f, 0.127516f, -0.0971157f,
+ -0.204389f, 0.208112f, 0.0377023f, 0.271837f, -0.00859528f,
+ 0.0797081f, -0.00582115f, 0.140018f, -0.384865f, -0.0853243f,
+ -0.586727f, -0.0664489f, -0.631436f, -0.245828f, -0.0647894f,
+ -0.171912f, -0.0801706f, 0.0731614f, -0.11725f, 0.281478f,
+ -0.03047f, 0.0363488f, -0.0481651f, -0.326329f, -0.0155898f,
+ -0.428316f, -0.0989367f, -0.271902f, -0.00263837f, 0.366168f,
+ 0.325989f, 0.165463f, 0.0668512f, -0.142202f, 0.419992f,
+ 0.164971f, -0.515479f, -0.187585f, -0.151783f, -0.0682468f,
+ 0.0910191f, 0.117086f, 0.106579f, 0.0961825f, 0.162148f,
+ -0.129645f, 0.301039f, 0.000320343f, -0.0558097f, -0.844295f,
+ -0.218919f, -5.7571f, 0.0982612f, 0.238955f, 0.0703565f,
+ 0.0969388f, 0.107202f, 0.321585f, -3.00594f, -0.058755f,
+ -0.620004f, 0.052114f, 0.128423f, -0.177673f, -0.00341509f,
+ -0.146756f, -0.0414309f, -0.0893262f, -0.0584779f, -0.129552f,
+ 0.127629f, 0.13275f, -0.0973342f, -0.215617f, 0.0724309f,
+ 0.0102229f, 0.178137f, -0.943374f, -0.171465f, 0.304949f,
+ -0.0963836f, -0.0346437f, -0.138667f, -0.234184f, 0.0344159f,
+ -0.319592f, -0.0990766f, -0.16065f, 0.369432f, 0.194911f,
+ 0.363348f, -0.356009f, -0.00736217f, 0.241788f, -2.21311f,
+ 0.704816f, 0.697019f, 0.129186f, -0.132799f, -0.11861f,
+ 0.0383451f, 0.0247782f, -0.12687f, 0.0256552f, 0.048413f,
+ 0.00660549f, 0.0457962f, -0.012819f, 0.115991f, -0.1117f,
+ -0.291045f, -0.646138f, 0.0813613f, 0.112063f, 0.191675f,
+ 0.120835f, -0.444267f, -0.340385f, 0.0391936f, -0.151132f,
+ 0.184419f, 0.124998f, -0.14089f, 0.214087f, 0.00108535f,
+ 0.119611f, 0.0236965f, 0.0715074f, -0.225997f, -0.0126552f,
+ -0.459214f, -0.490444f, 0.173716f, 0.355811f, -0.13607f,
+ -0.191091f, -0.530085f, -0.400666f, 0.011221f, 0.10527f,
+ -0.11498f, -0.011864f, 0.364376f, 0.0319587f, -0.0528563f,
+ 0.0353899f, 0.0393453f, -0.289211f, -0.347785f, -0.0417157f,
+ 0.545848f, 0.741785f, -0.0732565f, -1.29687f, -0.0433128f,
+ -1.44162f, 0.318894f, -0.377784f, 0.123751f, -0.00444347f,
+ 0.0957118f, 0.0893616f, 0.0911595f, 0.092917f, 0.127681f,
+ -0.159929f, 0.190417f, -0.0297948f, -0.00132599f, -0.742756f,
+ -0.0364169f, -4.00108f, 0.0784767f, 0.223048f, 0.0430138f,
+ 0.0180493f, 0.212842f, 0.122987f, -2.83267f, -0.0641464f,
+ -0.173247f, 0.100946f, 0.0804885f, 0.0172631f, 0.0877408f,
+ -0.353222f, 0.0108262f, -0.0452121f, -0.116127f, 0.268154f,
+ -0.132587f, -0.27481f, -0.0316914f, 0.0610525f, 0.439691f,
+ 0.00966415f, -0.78962f, -0.424823f, -0.0214365f, -0.113846f,
+ 0.100793f, 0.126482f, 0.0415354f, 0.0427995f, 0.14273f,
+ -0.315674f, 0.110095f, 0.0061568f, 0.0320474f, -0.3596f,
+ -0.12533f, -1.28837f, 0.174673f, -0.235912f, 0.00495439f,
+ 0.0695473f, 0.266489f, 0.049248f, 0.0868526f, -0.0685969f,
+ 0.102984f, 0.0924639f, -0.027535f, 0.0709277f, 0.155776f,
+ -0.190944f, 0.188273f, -0.00897471f, 0.0964232f, -0.475822f,
+ -0.209374f, -5.00252f, 0.103495f, 0.110698f, 0.00682092f,
+ 0.208586f, 0.0489575f, 0.0966254f, -1.42973f, -0.0645128f,
+ 0.0515961f, 0.0571281f, -0.0992321f, 0.00791648f, 0.0087609f,
+ 0.0607367f, 0.0315705f, 0.0183317f, 0.0756087f, -0.0292847f,
+ -0.212932f, -0.782259f, 0.0899944f, 0.102677f, 0.0681135f,
+ 0.0447764f, -0.481969f, -0.221459f, 0.0794475f, -0.229157f,
+ 0.136781f, 0.0832359f, 0.0297807f, -0.00287225f, -5.97897f,
+ -0.0960581f, 0.250945f, -0.00133314f, -0.112396f, -0.856922f,
+ 0.115776f, 0.124536f, 0.0914194f, -0.160775f, 0.128684f,
+ 0.106718f, 0.100665f, 0.139579f, -0.86141f, -0.190323f,
+ 0.0884896f, 0.0363845f, -0.19831f, 0.121601f, 0.0264453f,
+ -0.00557822f, 0.0720238f, -0.0140132f, -0.166814f, -0.266214f,
+ 0.00500545f, 0.0146905f, 0.126035f, 0.0812372f, 0.0615973f,
+ 0.0766063f, -0.420156f, -0.126157f, -0.0284299f, -0.112513f,
+ -0.567008f, -0.0100263f, -0.607567f, 0.193053f, 0.0067527f,
+ -0.0753897f, 0.00134269f, -0.0512249f, -0.161661f, 0.0667741f,
+ -0.113702f, -0.071606f, -0.300563f, 0.276479f, -0.155318f,
+ -0.0512306f, 0.0896443f, -0.987911f, 0.0440889f, 0.430958f,
+ 0.175427f, 0.101385f, 0.0303662f, 0.0672653f, -6.62463f,
+ -0.10475f, 0.228249f, -0.00482173f, -0.0608713f, -0.895836f,
+ 0.187976f, 0.162173f, 0.0747544f, 0.219953f, 0.0682489f,
+ 0.142665f, 0.100287f, 0.301887f, -1.97736f, -0.295001f,
+ -1.0733f, -0.0562668f, -0.0604295f, 0.0304073f, 0.194274f,
+ -0.243593f, 0.0727137f, 0.0610967f, -0.0692415f, -0.02967f,
+ 0.055633f, 0.0192402f, 0.105841f, 0.102236f, -0.0757102f,
+ -0.0067639f, 0.0102317f, -0.257959f, -0.0638652f, 0.45521f,
+ -0.114967f, 0.0921177f, 0.223796f, 0.277072f, -0.0613282f,
+ -0.564693f, -0.151333f, -0.158035f, 0.228491f, 0.12997f,
+ -0.192625f, -0.125344f, 0.0983258f, -0.931206f, 0.618715f,
+ 0.273759f, -0.145527f, -0.099431f, -0.119551f, 0.0663484f,
+ -0.161419f, -0.202377f, -0.545393f, 0.0917645f, 0.042263f,
+ -0.17117f, -0.178622f, -0.336977f, 0.866715f, 0.0376922f,
+ -0.319728f, -0.127406f, 0.0599384f, 0.268804f, -0.0331844f,
+ 0.355326f, -0.103902f, 0.0425935f, 0.00525512f, -0.133687f,
+ -0.122695f, 0.145582f, 0.139013f, -0.0053352f, 0.0313566f,
+ 0.327295f, -0.0117993f, 0.233524f, 0.162388f, -0.0793262f,
+ 0.454543f, 0.0442224f, -0.742673f, -0.144882f, 0.0874983f,
+ -0.0707259f, 0.0219869f, 0.201728f, 0.0204537f, 0.0788857f,
+ -0.0374329f, 0.0724169f, 0.0743593f, -0.0193526f, -0.313546f,
+ -0.418882f, -0.0815754f, -0.197144f, 0.305053f, 0.330196f,
+ -0.131006f, -0.00113249f, 0.0750458f, -0.541764f, 0.299935f,
+ 0.308516f, -0.20547f, -0.333066f, 0.0285833f, 0.191147f,
+ 0.160372f, 0.0724649f, 0.0426326f, 0.153046f, -6.59656f,
+ -0.081237f, 0.219163f, 0.0147081f, -0.0109837f, -1.01487f,
+ 0.170055f, 0.163386f, 0.106413f, 0.150188f, 0.0688875f,
+ 0.0541359f, 0.156307f, 0.178844f, -1.51054f, -0.149477f,
+ -0.504503f, 0.017878f, -0.181821f, -0.0999659f, 0.0484548f,
+ -0.32211f, 0.0406744f, 0.0017627f, 0.0220593f, 0.0900512f,
+ -0.561625f, 0.107279f, -0.0861521f, -0.0862376f, 0.0816765f,
+ 0.168072f, 0.150063f, -0.816825f, -0.13569f, 0.557555f,
+ -0.155265f, 0.025135f, -0.109304f, -0.0487062f, -0.00347487f,
+ -0.454803f, -0.0394371f, -0.214597f, -0.248898f, 0.286501f,
+ -0.249246f, -0.138935f, 0.00391409f, -0.122544f, -2.14993f,
+ 0.588942f, 0.541231f, 0.0154047f, -0.359742f, 0.0520729f,
+ 0.0667058f, 0.0418163f, -0.132533f, -0.184759f, 0.0546118f,
+ -0.131198f, 0.109664f, -0.0714679f, -0.114163f, -0.243081f,
+ -0.0405089f, 0.0342795f, 0.0801825f, -0.268408f, 0.192207f,
+ 0.0800494f, -0.586539f, -0.118155f, -0.0508569f, -0.193987f,
+ 0.261478f, 0.105719f, -0.125361f, -0.0956201f, 0.0233802f,
+ 0.271098f, 0.0113352f, 0.0910447f, 0.00628244f, -0.071722f,
+ 0.21439f, 0.0747191f, 0.207765f, -0.0782454f, -0.0151716f,
+ -0.196505f, -0.44798f, -0.228597f, 0.0549039f, -0.120715f,
+ -0.19388f, -0.0768461f, 0.361102f, 0.122936f, -0.0334211f,
+ -0.202503f, -0.0450776f, -0.272345f, 0.662321f, 0.109247f,
+ -0.218026f, -0.0669386f, -0.0864701f, -0.633421f, -0.158007f,
+ -1.10778f, 0.351211f, -0.541458f, -0.0171707f, 0.149606f,
+ 0.106105f, 0.0880349f, 0.0968455f, 0.113269f, -5.01949f,
+ -0.106404f, 0.175578f, -0.030045f, -0.0267249f, -0.563713f,
+ 0.173885f, 0.130772f, 0.0334519f, 0.0770157f, 0.0394389f,
+ -0.0290326f, 0.220003f, 0.180901f, -1.62203f, -0.151858f,
+ -0.202386f, -0.0067836f, 0.0287665f, -0.194183f, -0.239834f,
+ -0.484159f, 0.00671722f, -0.122459f, 0.0808959f, -0.263769f,
+ -0.015066f, -0.0429868f, -0.111255f, -0.231872f, 0.219659f,
+ -0.0437412f, -0.536618f, -0.477831f, 0.0421895f, -0.0815851f,
+ 0.119638f, 0.0786293f, -0.000668378f, 0.0305567f, -0.0868189f,
+ -0.178327f, 0.0799657f, 0.0280923f, -0.211395f, -0.464577f,
+ 0.216912f, 0.0761976f, 0.160288f, -0.416372f, -0.10286f,
+ -0.0733786f, 0.261033f, 0.0493698f, 0.143137f, -0.179979f,
+ 0.15655f, 0.0897976f, -0.0258041f, -0.152852f, -6.15512f,
+ -0.118917f, 0.227283f, -0.0514043f, -0.0786432f, -0.523485f,
+ 0.1644f, 0.0869001f, 0.0984082f, -0.428288f, 0.0791992f,
+ 0.141904f, 0.0652073f, 0.104429f, -0.775125f, -0.121479f,
+ 0.0841637f, 0.0135705f, -0.208863f, -0.0629523f, 0.0455794f,
+ 0.0513898f, -0.0147657f, 0.0401145f, 0.0660079f, 0.0210609f,
+ -0.0151801f, 0.0562111f, 0.140308f, -0.0196394f, 0.0230753f,
+ -0.0336115f, -0.422411f, -0.196974f, -0.0405748f, -0.283428f,
+ 0.15458f, 0.0876296f, 0.0314038f, 0.16389f, -7.01385f,
+ -0.117146f, 0.197273f, -0.0400688f, 0.0143951f, -0.964007f,
+ -0.0618919f, 0.0406891f, 0.07992f, -0.144132f, 0.116416f,
+ 0.0326838f, 0.103641f, 0.171805f, -1.05158f, -0.182589f,
+ 0.116991f, 0.0530774f, -0.212454f, -0.016727f, -0.0565992f,
+ 0.0712873f, 0.0445466f, -0.000107032f, -0.121449f, -0.15148f,
+ 0.0220338f, 0.0762024f, 0.12253f, 0.0622466f, 0.0835822f,
+ 0.0465119f, -0.388743f, -0.34665f, -0.0720734f, -0.101581f,
+ -0.630565f, -0.0512685f, -0.520541f, 0.0530119f, -0.0245276f,
+ -0.19116f, -0.0144446f, -0.0604486f, 0.187251f, -0.021341f,
+ -0.217823f, 0.0510256f, -0.197946f, 0.060955f, -0.0617316f,
+ 0.0741673f, 0.117591f, -1.47844f, -0.0911093f, 0.359225f,
+ 0.145027f, 0.127513f, 0.0617905f, 0.141154f, -7.63868f,
+ -0.0808127f, 0.274843f, 0.00693195f, -0.0283113f, -0.853871f,
+ -0.15737f, 0.0858904f, 0.0746279f, 0.109912f, 0.193775f,
+ 0.0698094f, 0.174159f, 0.259556f, -1.49885f, -0.156706f,
+ -1.04113f, -0.0329546f, -0.0491449f, -0.0304125f, 0.0514892f,
+ -0.244284f, 0.126814f, -0.0387081f, -0.153173f, -0.0566748f,
+ 0.294111f, -0.0170534f, 0.102381f, 0.447606f, -0.0613267f,
+ -0.0636869f, -0.0347599f, -0.259572f, -0.0657846f, 0.454352f,
+ -0.169453f, -0.00177987f, 0.133279f, -0.0863932f, -0.134423f,
+ -0.475107f, -0.00448962f, -0.214607f, 0.111413f, 0.194377f,
+ -0.0710837f, 0.0562353f, 0.0401193f, 0.248595f, 0.538374f,
+ 0.449469f, -0.39111f, 0.0125057f, 0.0448811f, -0.00707751f,
+ -0.164894f, -0.317516f, -0.56231f, -0.270262f, 0.127016f,
+ -0.12092f, -0.0881587f, -0.323908f, 0.872344f, 0.103391f,
+ 0.267971f, -0.155088f, -0.0136683f, 0.309517f, 0.119901f,
+ 0.271307f, -0.188463f, 0.185121f, -0.142777f, -0.110535f,
+ -0.163107f, 0.175502f, 0.0801924f, 0.240499f, 0.0874759f,
+ 0.308907f, -0.00222504f, 0.193366f, 0.109018f, -0.0772158f,
+ -0.520675f, 0.0259432f, -0.736666f, -0.296579f, 0.043486f,
+ -0.128932f, 0.0417669f, 0.125747f, 0.157879f, 0.112857f,
+ -0.0595681f, 0.0611936f, -0.042125f, -0.270338f, 0.120072f,
+ -0.36675f, -0.0347962f, -0.119539f, 0.0873369f, 0.296432f,
+ -0.069501f, -0.0383859f, 0.0913597f, -0.40747f, 0.234276f,
+ 0.332536f, -0.732132f, -0.312291f, 0.137759f, 0.227593f,
+ 0.14165f, 0.129068f, 0.102734f, 0.135818f, -7.35883f,
+ -0.101533f, 0.256027f, -0.0142278f, -0.0561601f, -1.09899f,
+ -0.106538f, 0.0612256f, 0.099487f, -0.0605983f, 0.134311f,
+ 0.052226f, 0.143672f, 0.219944f, -1.47539f, -0.101828f,
+ -0.429979f, 0.010478f, -0.0132605f, 0.103363f, 0.0267373f,
+ -0.338865f, 0.0090188f, 0.0810085f, -0.124368f, -0.0133776f,
+ 0.595666f, -0.00162201f, -0.212444f, -0.26342f, 0.0913656f,
+ -0.106279f, 0.414515f, -0.709901f, -0.00198859f, 0.305288f,
+ -0.188536f, -0.0377482f, -0.131909f, -0.116099f, -0.236827f,
+ -0.36356f, 0.0179455f, -0.202143f, -0.00395508f, 0.177363f,
+ 0.0630679f, -0.145173f, -0.0558639f, -0.44879f, -1.55687f,
+ 0.473398f, 0.50531f, -0.0656231f, -0.137197f, 0.064707f,
+ 0.122083f, 0.0321111f, -0.167096f, 0.0406581f, -0.0793592f,
+ -0.0777081f, 0.0321379f, -0.0108834f, -0.0652323f, -0.102918f,
+ 0.0178664f, 0.0781873f, 0.0613189f, -0.04177f, 0.159566f,
+ 0.15134f, -0.445996f, -0.384905f, 0.0951659f, -0.175046f,
+ 0.255746f, 0.177047f, -0.150632f, 0.200522f, 0.00778549f,
+ 0.232168f, -0.0304652f, 0.083155f, -0.125395f, -0.0203289f,
+ -0.23874f, 0.0349836f, 0.231701f, -0.14849f, -0.204272f,
+ -0.198309f, -0.364955f, -0.228428f, 0.0614142f, -0.040976f,
+ -0.227785f, -0.0898404f, 0.271566f, -0.209196f, 0.0226431f,
+ -0.0911715f, 0.0840369f, -0.299411f, -0.529182f, 0.0622292f,
+ 0.202475f, 0.0155583f, -0.083114f, 0.124253f, -0.22721f,
+ -1.02565f, 0.193961f, -0.54287f, -0.00849364f, 0.11124f,
+ 0.0993531f, 0.120621f, 0.0959537f, 0.136274f, -5.23358f,
+ -0.107433f, 0.155286f, -0.0136043f, -0.0246768f, -0.631187f,
+ -0.0493852f, 0.0446751f, 0.0588353f, 0.160766f, -0.0354385f,
+ -0.0672548f, 0.243743f, 0.186004f, -1.20199f, -0.151872f,
+ -0.0760096f, -0.00775123f, -0.0122227f, 0.0891327f, -0.377876f,
+ -0.469926f, -0.134715f, -0.0969362f, 0.212542f, 0.0871489f,
+ 0.164638f, -0.0485785f, -0.167754f, -0.515052f, 0.13821f,
+ 0.0515572f, -0.430691f, -0.394719f, 0.143947f, -0.00670816f,
+ 0.129623f, 0.140299f, 0.0336978f, 0.153545f, -0.350927f,
+ -0.213485f, 0.0344809f, 0.0405889f, 0.0749967f, -0.369352f,
+ -0.109398f, 0.0350649f, 0.190893f, -0.284106f, -0.185376f,
+ 0.0105842f, 0.263692f, 0.160429f, 0.0998209f, -0.127779f,
+ 0.140558f, 0.108968f, -0.0122672f, 0.102875f, -5.72172f,
+ -0.161288f, 0.135935f, -0.0143087f, 0.106556f, -0.649813f,
+ -0.123049f, -0.0108861f, 0.102918f, -0.298137f, 0.0329013f,
+ 0.100763f, 0.12018f, 0.100782f, -0.648036f, -0.111122f,
+ 0.12363f, 0.0211952f, -0.225201f, 0.0506021f, 0.0167621f,
+ 0.0608759f, -0.0245646f, 0.0503477f, -0.0972749f, -0.0415155f,
+ -0.00578366f, -0.0977591f, 0.124867f, 0.0134788f, -0.0375816f,
+ -0.00581233f, -0.272292f, -0.250393f, 0.024511f, -0.184891f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_2_bias[] = {
+ 0.182474f, 0.0223202f, 0.204111f, 0.0573683f, 0.111143f,
+ 0.0800926f, -0.0364215f, 0.192371f, 0.00498262f, 0.302543f,
+ 0.0133081f, 0.119719f, 0.237522f, -0.266705f, 0.129427f,
+ 0.0695857f, 0.22068f, 0.231667f, 0.405829f, -0.0972567f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_3_kernel[] = {
+ -0.0393876f, -0.269924f, -0.0703231f, -0.0236484f, 0.170478f,
+ 0.245566f, 0.175963f, 0.104194f, -0.0490501f, -0.157605f,
+ -0.0275165f, -0.0169499f, -0.250725f, 0.215203f, -0.00733655f,
+ 0.0111298f, 0.205606f, 0.928046f, 0.15139f, 0.0955483f,
+ -0.015115f, -0.126643f, 0.0957605f, -0.140178f, -0.0246866f,
+ 0.097097f, 0.116287f, 0.177746f, 0.0570021f, -0.0518686f,
+ -0.0446482f, -0.0125318f, 0.0116092f, 0.102431f, 0.0898519f,
+ 0.0870372f, -0.843274f, 0.383311f, -0.102761f, -0.0246494f,
+ 0.0312555f, 0.19472f, 0.111573f, 0.0920392f, -0.0555618f,
+ 0.326461f, 0.219357f, -0.133727f, -0.118399f, -0.0611432f,
+ -0.169931f, 0.123733f, -0.204607f, 0.082592f, 0.0323181f,
+ 0.201618f, -0.00388867f, -0.053583f, 0.0266333f, -0.0951787f,
+ -0.0358283f, -0.0649549f, 0.0119263f, -0.11812f, 0.209851f,
+ -0.036616f, -0.014911f, -0.138096f, -0.139664f, -0.207395f,
+ 0.0128848f, -0.201816f, 0.0899419f, 0.343308f, -0.0096243f,
+ -0.212605f, -0.0905284f, -0.0597114f, -0.055261f, -0.0653405f,
+ 0.0330484f, -0.27681f, -0.0994095f, -0.0468272f, 0.145713f,
+ 0.267216f, 0.185335f, 0.1798f, -0.0437882f, -0.200401f,
+ -0.0398117f, -0.0736501f, -0.166349f, 0.203316f, 0.0710647f,
+ 0.061825f, 0.281131f, 0.733323f, 0.215488f, 0.00145659f,
+ -0.138995f, -0.0833713f, 0.107809f, -0.105343f, -0.0672139f,
+ 0.101852f, 0.135455f, 0.132903f, 0.0312017f, -0.0643586f,
+ -0.0274546f, -0.0687466f, -0.020233f, 0.109444f, 0.0774587f,
+ 0.139497f, -0.800587f, 0.325783f, -0.0546695f, -0.092003f,
+ -0.0773301f, 0.189672f, 0.0604666f, 0.0939425f, 0.679495f,
+ 0.114789f, -0.161153f, 0.12843f, -0.0345385f, -0.134641f,
+ -0.153995f, 0.0823055f, -0.0349296f, 0.0299183f, -0.0606872f,
+ 0.137588f, 0.0449805f, -0.0555399f, -0.00553351f, -0.120719f,
+ -0.204701f, -0.0739813f, 0.0584115f, -0.104833f, -0.110989f,
+ 0.00845446f, 0.0630702f, -0.147861f, 0.0268545f, -0.216419f,
+ 0.00531986f, -0.206641f, 0.253082f, 0.413215f, -0.05909f,
+ -0.0939983f, -0.116818f, -0.0450892f, -0.0551134f, -0.00696931f,
+ -0.113003f, -0.289192f, -0.00884866f, -0.0365724f, 0.0401887f,
+ 0.238622f, 0.149151f, 0.175751f, -0.157425f, -0.138924f,
+ -0.0277598f, -0.0285915f, 0.10165f, 0.209532f, 0.0862249f,
+ 0.0256428f, 0.623204f, -0.0941196f, 0.20345f, -0.132869f,
+ 0.00947298f, -0.14753f, 0.103918f, -0.161799f, 0.125566f,
+ 0.10916f, 0.115446f, 0.135627f, -0.0181667f, -0.0734694f,
+ -0.0154729f, -0.085849f, -0.000427605f, 0.113614f, 0.0776308f,
+ 0.111899f, -0.214917f, 0.393234f, -0.132223f, 0.020783f,
+ -0.074902f, 0.217477f, 0.107883f, 0.109466f, 0.146609f,
+ 0.317061f, 0.074379f, -0.0505457f, -0.0503772f, -0.0678954f,
+ -0.220003f, 0.114878f, 0.176014f, -0.00657996f, -0.0875497f,
+ 0.065582f, 0.00238612f, -0.063395f, 0.0295323f, -0.127126f,
+ 0.099813f, -0.115452f, 0.0106309f, -0.179632f, -0.0436553f,
+ 0.0120295f, 0.0652713f, -0.131512f, -0.081714f, -0.205363f,
+ -0.0374944f, -0.196707f, 0.680568f, -0.00991824f, -0.0212223f,
+ -0.186258f, -0.432361f, -0.0291303f, -0.0475983f, -0.071383f,
+ -0.0116416f, -0.28257f, -0.0635272f, -0.0576546f, -0.280129f,
+ 0.286528f, 0.199997f, 0.192851f, 0.323829f, -0.185006f,
+ -0.04791f, -0.0882187f, -0.0496895f, 0.293135f, 0.125539f,
+ 0.0341828f, 0.993452f, 0.0369177f, 0.0453796f, 0.0329807f,
+ 0.157673f, -0.153195f, 0.122383f, -0.161983f, -0.317619f,
+ 0.105129f, 0.155673f, 0.152489f, 0.0685417f, -0.0595907f,
+ -0.026657f, -0.0954336f, -0.0359557f, 0.105617f, 0.0825066f,
+ 0.100189f, -0.22125f, 0.382508f, -0.0247677f, -0.115807f,
+ -0.0639787f, 0.177786f, 0.0566206f, 0.0496389f, 1.31533f,
+ 0.0482907f, -0.118743f, 0.190632f, 0.172867f, -0.108446f,
+ -0.200186f, 0.122572f, 0.0897468f, 0.0155328f, -0.0380217f,
+ 0.125161f, -0.141723f, -0.023157f, 0.0270805f, -0.101961f,
+ 0.12358f, -0.0866255f, 0.00306761f, -0.131764f, -0.461118f,
+ -0.00803936f, 0.0895496f, -0.153905f, 0.207623f, -0.249099f,
+ -0.0198487f, -0.160013f, 0.81136f, -0.109978f, -0.0880332f,
+ -0.0761368f, -0.0755881f, -0.0384827f, -0.0554777f, -0.0750048f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_3_bias[] = {
+ 0.0106809f, 0.136699f, 0.285316f, 0.395746f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_4_kernel[] = {
+ -0.0161019f, -0.088871f, 0.0463358f, -0.198037f, 0.038122f,
+ 0.0135483f, -0.196641f, -0.433531f, 0.527972f, -0.143716f,
+ 0.558627f, 0.459889f, 0.322864f, -0.491514f, -0.190915f,
+ -0.0765601f, 0.210329f, 0.689389f, -0.100415f, -1.8788f,
+ 0.2228f, 0.292781f, -0.954838f, -0.0788763f, -0.131402f,
+ -0.17154f, 0.049934f, -0.0541183f, -0.530529f, -0.666165f,
+ 0.195492f, 0.218548f, -0.314895f, 0.0749444f, -0.191344f,
+ 0.349469f, 0.00811248f, -0.760157f, 0.0707434f, -0.0719285f,
+ -0.264495f, -0.432009f, -0.432686f, 0.155738f, -0.020197f,
+ 0.19278f, -0.658335f, -0.273143f, -0.286079f, 0.243402f,
+ 0.497701f, 0.0121003f, -0.666308f, 0.028172f, -0.547901f,
+ -0.11755f, 0.322028f, 0.0878274f, -0.0328334f, 0.311816f,
+ 0.0951026f, -1.11429f, -0.0417486f, 0.123467f, -0.0910681f,
+ -0.0154255f, 0.311201f, -0.0156158f, -0.600437f, 0.0274156f,
+ -0.174907f, -1.29313f, -0.178656f, 0.596556f, -0.421725f,
+ -0.289137f, 0.529297f, 0.114833f, -0.0155887f, -0.308232f,
+ -0.0228361f, 0.184017f, 0.138232f, 0.146347f, -0.117867f,
+ 0.248351f, -0.282846f, -0.18058f, 0.348355f, -0.415754f,
+ 0.0657168f, 0.431728f, -0.231043f, -0.186745f, 0.137401f,
+ -0.282329f, -0.159678f, 0.754262f, 0.037824f, -1.68521f,
+ -0.290175f, 0.289588f, -0.18683f, -0.300385f, 0.285449f,
+ -0.00386456f, 0.0563485f, -0.376541f, 0.159899f, -0.697312f,
+ 0.0284389f, 0.437307f, 0.3968f, -0.372082f, -0.232535f,
+ 0.394629f, 0.00315248f, -0.38374f, 0.0311291f, -0.624353f,
+ 0.498083f, -0.342663f, -0.125978f, 0.186797f, 0.187723f,
+ 0.149335f, -0.82727f, -0.0740974f, -0.659039f, 0.42671f,
+ -0.448835f, 0.150677f, 0.830742f, -0.233148f, -0.65308f,
+ -0.0878935f, -0.407797f, -0.511826f, -0.0739023f, 0.506305f,
+ -0.187451f, 0.0284968f, -0.822238f, 0.362523f, -0.270865f,
+ 0.032335f, 0.560413f, -0.00388247f, -0.446333f, 0.163147f,
+ -0.409633f, -0.372575f, 0.306993f, 0.55953f, -0.24362f,
+ -0.0929369f, -0.520298f, -0.444022f, 0.186077f, -0.0942208f,
+ 0.624049f, -0.429625f, -0.869528f, 0.405257f, -0.120445f,
+ 0.537685f, -0.3911f, 0.142142f, 0.0913808f, -0.00375967f,
+ 0.382781f, 0.60505f, -0.271608f, -0.0630436f, -0.150625f,
+ -0.0124598f, 0.0132878f, 0.138475f, -0.106264f, -0.416581f,
+ -0.518415f, 0.185127f, -0.464622f, -0.0102925f, 0.0389567f,
+ 0.406439f, -0.0414264f, -0.366185f, -0.511867f, -0.650255f,
+ 0.278252f, 0.0270234f, 0.262788f, -0.0294793f, 0.12651f,
+ 0.421537f, 0.0300837f, 0.0742187f, 0.281954f, -0.122069f,
+ -0.450145f, -0.312206f, -0.402633f, -0.0868137f, 0.190433f,
+ -0.149602f, -0.175029f, 0.00900023f, -0.266596f, 0.21721f,
+ -0.245079f, -1.09798f, 0.319409f, -0.337938f, 0.358514f,
+ 0.0771549f, 0.447087f, -0.305507f, -0.285492f, 0.383896f,
+ 0.145933f, -0.264944f, -0.118486f, 0.068805f, -0.194231f,
+ -1.79133f, 0.363408f, -0.17434f, -0.229629f, 0.132188f,
+ 0.207548f, -0.876264f, 0.265634f, 0.139332f, 0.236206f,
+ -0.0145184f, 0.562865f, 0.526612f, -0.0333508f, -0.421885f,
+ 0.273485f, -0.110882f, 0.425557f, 0.513303f, -0.422322f,
+ 0.0563155f, -0.0409693f, 0.194768f, -0.419828f, -0.107195f,
+ -1.19224f, 0.48552f, 0.132782f, -0.00932096f, -0.225484f,
+ -0.428484f, -0.0392684f, 0.750697f, 0.337615f, 0.158476f,
+ 0.413484f, 0.326017f, -0.757107f, -0.183962f, 0.00884361f,
+ 0.126507f, -0.0751588f, -0.308782f, -0.104237f, -0.703877f,
+ -0.491806f, -0.204251f, -0.317212f, 0.0815479f, 0.296323f,
+ 0.219632f, -0.039859f, 0.556257f, 0.176144f, -0.0750654f,
+ -0.106419f, 0.00400385f, -0.172266f, 0.000178763f, 0.146532f,
+ 0.255202f, -0.427235f, -0.182198f, -0.256557f, 0.260255f,
+ -0.0143364f, 0.0868664f, -0.564373f, -0.0876947f, 0.726289f,
+ 0.0160001f, -0.381562f, -0.638214f, -0.803803f, 0.25945f,
+ -0.371542f, -0.419611f, 0.238617f, 0.371834f, -0.226777f,
+ -0.894602f, 0.37458f, -0.354866f, 0.0249312f, 0.142374f,
+ 0.433813f, -0.0218183f, -0.33248f, 0.107223f, 0.390823f,
+ -0.0271108f, -0.616878f, -0.604984f, 0.517269f, -0.293573f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_4_bias[] = {
+ -0.290371f, -0.0560272f, -0.118144f, -0.270583f, 0.401388f,
+ -0.308677f, 0.150729f, -0.0324442f, -0.135937f, 0.0875581f,
+ 0.0206493f, -0.212682f, -0.0266535f, -0.326656f, 0.0185105f,
+ -1.01429f, -0.00315052f, -0.0273938f, -0.0263379f, -0.171702f
+};
+
+static const CNN_CONFIG av1_intra_mode_cnn_partition_cnn_config = {
+ NUM_CNN_LAYERS, // num_layers
+ 0, // is_residue
+ 0, // ext_width
+ 0, // ext_height
+ 0, // strict_bounds
+ {
+ {
+ CNN_LAYER_0_IN_CH, // in_channels
+ CNN_LAYER_0_WIDTH, // filter_width
+ CNN_LAYER_0_WIDTH, // filter_height
+ CNN_LAYER_0_OUT_CH, // out_channels
+ CNN_LAYER_0_HORZ_STRIDE, // skip_width
+ CNN_LAYER_0_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_0_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_0_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ -1, // output_num
+ },
+ {
+ CNN_LAYER_1_IN_CH, // in_channels
+ CNN_LAYER_1_WIDTH, // filter_width
+ CNN_LAYER_1_WIDTH, // filter_height
+ CNN_LAYER_1_OUT_CH, // out_channels
+ CNN_LAYER_1_HORZ_STRIDE, // skip_width
+ CNN_LAYER_1_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_1_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_1_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 3, // output_num
+ },
+ {
+ CNN_LAYER_2_IN_CH, // in_channels
+ CNN_LAYER_2_WIDTH, // filter_width
+ CNN_LAYER_2_WIDTH, // filter_height
+ CNN_LAYER_2_OUT_CH, // out_channels
+ CNN_LAYER_2_HORZ_STRIDE, // skip_width
+ CNN_LAYER_2_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_2_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_2_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 2, // output_num
+ },
+ {
+ CNN_LAYER_3_IN_CH, // in_channels
+ CNN_LAYER_3_WIDTH, // filter_width
+ CNN_LAYER_3_WIDTH, // filter_height
+ CNN_LAYER_3_OUT_CH, // out_channels
+ CNN_LAYER_3_HORZ_STRIDE, // skip_width
+ CNN_LAYER_3_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_3_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_3_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 1, // output_num
+ },
+ {
+ CNN_LAYER_4_IN_CH, // in_channels
+ CNN_LAYER_4_WIDTH, // filter_width
+ CNN_LAYER_4_WIDTH, // filter_height
+ CNN_LAYER_4_OUT_CH, // out_channels
+ CNN_LAYER_4_HORZ_STRIDE, // skip_width
+ CNN_LAYER_4_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_4_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_4_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 0, // output_num
+ },
+ },
+};
+
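+/* Editorial sketch, not part of the upstream model data: the config above
+ * tags layers 1..4 with output_num 3, 2, 1 and 0 (layer 0 uses -1, i.e. no
+ * exported output), so a single forward pass yields four intermediate
+ * feature maps for the branch DNNs below. The likely consumer is
+ * av1_cnn_predict_img_multi_out() from av1/encoder/cnn.h, whose exact
+ * signature is not shown here; the helper below is a hypothetical
+ * stand-in that only illustrates the output_num indexing. */
+static void cnn_route_layer_output_sketch(float *const *out_bufs,
+                                          int output_num,
+                                          const float *activations, int n) {
+  if (output_num < 0) return; /* -1: layer output stays internal */
+  for (int i = 0; i < n; ++i) out_bufs[output_num][i] = activations[i];
+}
+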
+static const float
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel[] = {
+ 0.604356f, -0.236007f, 0.342172f, 0.531397f, -0.635698f,
+ -0.591573f, 0.833872f, 0.492814f, -0.100308f, 0.186385f,
+ 0.202779f, 0.263578f, 0.330001f, -0.15531f, 0.879584f,
+ -0.0048796f, 0.490796f, 0.242254f, -0.292211f, -0.696912f,
+ 0.746664f, 0.129371f, -0.0122443f, 0.196234f, -0.251605f,
+ -0.385617f, 0.157707f, 0.699963f, 0.0432536f, -0.11141f,
+ -0.0353473f, -0.0364045f, -0.113556f, -0.520842f, 0.231248f,
+ 0.230638f, -0.323852f, -1.08633f, -0.0469168f, -0.481821f,
+ 0.366838f, 0.189627f, -0.0637262f, -0.484917f, -0.109874f,
+ 0.292237f, 0.368702f, -0.183896f, -0.109038f, -1.22613f,
+ -0.880355f, -1.63768f, 0.337426f, -0.940994f, 0.413097f,
+ -0.37879f, -0.480525f, -0.594819f, -0.0172653f, -0.499436f,
+ -0.298395f, -0.840181f, -0.0758645f, -0.772089f, -0.232727f,
+ -0.815968f, 0.160785f, -0.0767165f, 0.0064244f, -0.540491f,
+ 0.417776f, -0.384337f, -0.497377f, 0.68414f, 0.00797514f,
+ 0.262626f, 0.203732f, 0.702047f, 0.0617544f, 0.0878249f,
+ -0.315032f, -0.0169776f, 0.403986f, 0.815872f, 0.135388f,
+ 0.0858594f, 0.169172f, -0.638227f, -1.65268f, -0.0476042f,
+ -0.982685f, 0.45707f, -0.0577537f, 0.367329f, 0.176513f,
+ -0.356454f, 0.0979095f, -0.277476f, 0.257271f, -0.333451f,
+ 0.0241497f, 0.0671127f, 0.221216f, 0.106065f, 0.537151f,
+ 0.0257329f, 0.265559f, -0.348353f, 0.285569f, -0.0610511f,
+ -1.59334f, -1.63826f, -0.164898f, -0.36605f, -0.489304f,
+ 0.729241f, 0.0197627f, 0.200291f, -0.231506f, -0.255715f,
+ -0.0932264f, -0.728793f, 0.468297f, -1.09592f, -0.079791f,
+ -1.76531f, -0.182904f, -2.05897f, -0.371894f, 0.207124f,
+ 0.255029f, 0.186501f, -0.005805f, 0.00160733f, -0.178206f,
+ -0.352757f, -0.164741f, -0.557583f, -0.559692f, -0.00731467f,
+ 0.149326f, 0.409735f, 0.22083f, -0.332572f, -0.1741f,
+ -0.0519008f, -0.266402f, 0.294031f, -2.4453f, 0.339851f,
+ -0.573747f, -5.97783f, -0.084142f, 0.20286f, -0.576038f,
+ -0.111081f, 0.101238f, -5.83427f, -1.98537f, 0.322796f,
+ -0.60171f, 0.212412f, 0.247176f, 0.603694f, -0.54357f,
+ -0.693439f, 0.250725f, -4.31988f, 0.0935924f, 0.43669f,
+ -0.139706f, -0.158391f, 0.244309f, 0.619213f, -0.309154f,
+ -0.135341f, 0.475815f, -0.290804f, -0.109038f, -0.0937104f,
+ 0.0385907f, -0.29105f, -0.0597651f, -0.451187f, -1.51821f,
+ 0.141772f, 0.822204f, -0.729661f, -0.109908f, 0.178217f,
+ -0.750278f, 0.113762f, -0.0959985f, 0.066579f, -0.104209f,
+ -0.951378f, 1.4087f, -1.13175f, -1.09103f, -1.50416f,
+ -0.182273f, -1.80129f, -0.152135f, 0.356931f, 0.205591f,
+ 0.183148f, -0.498671f, -0.183034f, -0.176428f, 0.395706f,
+ -0.589908f, -0.318276f, -0.421162f, 0.658766f, -0.186752f,
+ 0.0656253f, 0.248002f, 0.289618f, -0.458111f, -0.130789f,
+ -0.542988f, 0.405804f, -0.35364f, -0.311927f, 0.218339f,
+ 0.309215f, -0.130347f, -0.0257543f, 0.0413234f, -0.190205f,
+ -0.242382f, 0.819886f, -0.255157f, -0.181219f, -0.290903f,
+ -0.301995f, -0.0469988f, 0.702936f, 0.209122f, 0.0234243f,
+ 0.598637f, 0.0305196f, 0.0423457f, -0.618799f, 0.0190867f,
+ 0.420584f, -0.224752f, -0.410077f, 0.127854f, 0.395261f,
+ -0.393685f, -0.282822f, 0.0289504f, 0.0406515f, -0.511531f,
+ -0.497611f, 0.0252715f, 0.0812549f, 0.80205f, 1.29084f,
+ 0.764972f, 0.561258f, -0.23499f, 0.217594f, -0.690935f,
+ -0.26607f, 0.357955f, 0.391608f, 0.448352f, 0.458586f,
+ -0.790071f, 0.719959f, -0.468052f, 1.24579f, 0.220705f,
+ 0.284044f, 0.141346f, 0.246687f, 0.147826f, -0.403557f,
+ -0.00648195f, 0.398034f, -0.100464f, -0.77107f, -0.188274f,
+ -0.219245f, -0.0330375f, 0.367585f, -0.220391f, 0.308736f,
+ 0.221399f, 0.340292f, 0.037597f, 0.606083f, 0.665634f,
+ -0.755529f, -0.95989f, -0.243673f, 0.233709f, -0.454628f,
+ -0.110952f, 0.776062f, 0.731136f, -0.140422f, 0.19261f,
+ 0.355086f, 0.975026f, 0.190936f, 0.776205f, 0.982781f,
+ 0.555569f, 0.42382f, -0.409721f, 0.25053f, -0.271328f,
+ 0.859941f, -0.0210901f, 0.0176916f, -0.562895f, -0.0787431f,
+ -0.861032f, -0.34022f, -0.571995f, 0.205436f, 0.346968f,
+ 0.377033f, -1.08484f, 0.297007f, -1.01693f, 0.189463f,
+ -0.483242f, 0.147058f, 0.0159503f, 0.0908779f, -0.46962f,
+ 0.174024f, -0.490704f, -0.383501f, -0.0507626f, 0.00902188f,
+ -0.202495f, 0.205047f, 0.0562261f, -0.143371f, 0.219524f,
+ -0.317294f, -0.0575756f, -0.0595825f, -0.000625279f, -0.278864f,
+ -0.0516874f, -0.225259f, 0.429046f, -0.0952421f, 0.0799135f,
+ -0.122883f, -0.262308f, -0.481006f, -0.0466122f, -0.402822f,
+ 0.150595f, -0.0919558f, -0.356765f, -0.199222f, 0.219389f,
+ -0.214452f, -0.196361f, -0.095758f, -0.115891f, -0.143777f,
+ 0.549843f, -0.113036f, 0.764895f, -0.0114812f, -0.0684054f,
+ -0.98045f, -0.0170634f, 0.247719f, -0.18718f, -0.381566f,
+ 0.150758f, -0.526257f, 1.00851f, 0.776634f, 1.69728f,
+ -0.303058f, 0.228967f, -0.414134f, 0.0858226f, -0.285472f,
+ 0.431459f, 0.315318f, 0.587835f, 0.335737f, -0.0222039f,
+ 0.18945f, 0.274008f, 0.609263f, 0.320232f, -0.214137f,
+ -0.0297668f, 0.0439046f, -0.52821f, -0.0127375f, 0.431885f,
+ 0.508846f, -0.329189f, -0.166778f, -0.94338f, -0.358807f,
+ 0.208641f, -0.517986f, -0.128278f, 0.693464f, -0.24408f,
+ -0.0669412f, -0.410287f, 0.0444145f, -0.264179f, 0.143884f,
+ 0.276842f, 0.498934f, -0.682557f, -0.217198f, -0.8249f,
+ -0.40446f, -0.115376f, 0.417934f, 0.65605f, -0.00570035f,
+ -0.365742f, -0.367625f, 0.526824f, -0.0164913f, -0.255998f,
+ 0.247292f, 0.0846536f, 0.109302f, -0.302996f, 0.160564f,
+ 0.0228132f, 0.035211f, -0.236951f, 0.493801f, 1.37315f,
+ -0.182348f, 0.234437f, -0.256906f, 0.12523f, 0.667113f,
+ -0.437981f, -0.0721831f, 0.303976f, -0.041336f, -0.145894f,
+ -0.733741f, 0.436056f, 0.368542f, -0.149072f, -0.290281f,
+ 0.0946743f, -0.0579292f, 0.264539f, 0.170048f, 0.262411f,
+ 0.049679f, 0.371369f, 0.760675f, 0.482157f, -0.0196783f,
+ 0.260888f, 0.948856f, 0.170228f, -0.134432f, -0.942235f,
+ -1.23226f, -0.373963f, -0.0381773f, -0.17947f, 0.00947998f,
+ 0.01086f, 0.389578f, -0.380389f, -0.0865851f, -0.220328f,
+ -0.171901f, -0.384325f, -0.0787615f, 0.392678f, 0.123392f,
+ -0.0895824f, 0.00480886f, -0.162918f, 0.214336f, -0.00147339f,
+ 0.203899f, -0.00292344f, -0.148594f, 0.0425697f, -0.306896f,
+ -0.342225f, -0.45088f, -0.184454f, -0.00923638f, -0.521993f,
+ -0.334464f, 0.156497f, -0.0856832f, -0.277661f, -0.0721105f,
+ -0.488781f, -0.509543f, -0.012664f, 0.0940558f, -0.29869f,
+ 0.0434843f, -0.0178945f, -0.0525666f, -0.303178f, 0.713507f,
+ -0.137413f, -0.170289f, -0.142942f, -0.316002f, 0.229125f,
+ -0.277585f, 0.0125026f, 0.508316f, -1.20614f, -0.915129f,
+ -1.63389f, -0.454604f, -0.893951f, -0.447403f, -0.751423f,
+ 1.3886f, 0.617818f, 0.611458f, -0.884173f, -0.7779f,
+ -0.608639f, -0.164759f, -0.631846f, -0.176894f, -0.459361f,
+ -0.187119f, 0.173283f, -0.477191f, -0.156736f, 0.182675f,
+ 0.598854f, -0.489941f, -0.420493f, -0.162002f, 0.344418f,
+ 0.33832f, -0.187463f, -0.388721f, -0.0733151f, -0.138835f,
+ 0.313699f, 0.0625967f, -0.291488f, 0.114088f, -0.356843f,
+ 0.197506f, 0.0320749f, 1.16745f, -0.36081f, 1.63416f,
+ 0.198392f, 1.13928f, -0.317971f, 0.531019f, 0.526518f,
+ 0.185814f, 0.0923607f, 0.192858f, -0.234378f, 0.18091f,
+ -0.228837f, 0.397216f, 0.581501f, 0.284376f, -0.130434f,
+ 0.20076f, 0.242662f, -0.0480872f, 0.131746f, 0.362712f,
+ 0.0146821f, 0.475679f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias[] = {
+ 0.477356f, 0.385222f, 0.389122f, 0.539506f, -0.0272558f, 0.581605f,
+ -0.800961f, 0.142229f, 0.117549f, -0.0724944f, 0.102095f, -0.71319f,
+ -0.0162434f, -0.132858f, 0.543411f, -0.626599f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel[] = {
+ 0.195436f, -0.623354f, 1.27907f, 0.270071f, -0.677612f,
+ 0.0266141f, 0.272991f, -0.425446f, 0.891889f, -0.299836f,
+ -0.611825f, -0.0322273f, 0.185276f, 0.238639f, -0.150954f,
+ 0.083495f, -0.472106f, 0.573506f, 1.16465f, -0.154947f,
+ 0.640631f, -1.59467f, -9.8166f, -0.22889f, -0.189912f,
+ 0.227052f, -0.540787f, 0.0840873f, -3.04293f, -0.0209975f,
+ -6.10979f, -5.92801f, 0.288467f, -0.169476f, 0.0527948f,
+ -1.21202f, -0.280915f, 0.290863f, -0.601877f, 0.0598784f,
+ -0.592136f, -0.535588f, -0.0434018f, -0.653223f, 0.00339129f,
+ -0.133273f, 0.279463f, 0.483879f, 0.463664f, -0.14174f,
+ -1.56354f, 0.560043f, -1.44639f, 0.673528f, -0.108418f,
+ -0.707313f, 0.49633f, -0.0321971f, 0.411475f, -0.382184f,
+ -0.965501f, -0.0507655f, 0.540415f, -0.977297f, 0.370382f,
+ -0.375683f, 0.0844529f, -2.0002f, -0.346289f, 0.621251f,
+ -0.489855f, 0.191252f, -0.576629f, -0.35773f, 0.023167f,
+ 0.180793f, -0.417864f, 0.0587254f, 0.167824f, 0.0612058f,
+ -0.712108f, 0.155614f, 0.900036f, -0.480124f, 0.146117f,
+ 0.467011f, 0.412525f, 0.312724f, 0.551826f, -0.179601f,
+ 0.706261f, 0.00674965f, -0.495221f, 0.140829f, -0.0619195f,
+ -0.0697912f, 0.511967f, -0.0318237f, -0.285946f, -0.28608f,
+ 0.0894142f, 0.234351f, -0.272328f, -0.350369f, -0.392605f,
+ 0.287318f, 0.310426f, 0.293524f, 0.357681f, -0.157868f,
+ 0.149652f, -0.259363f, 0.192941f, -0.850096f, 0.456507f,
+ 0.387857f, -0.491187f, -0.0541993f, -0.28118f, 0.193991f,
+ -0.0956664f, 0.0679829f, 0.0341118f, 0.141826f, 0.271538f,
+ -0.285295f, -0.68666f, 0.306414f, 0.600678f, 0.494801f,
+ -1.11907f, 0.524849f, 0.151169f, 0.474068f, -0.43441f,
+ -0.229138f, 0.0345483f, 0.682888f, -0.471534f, -0.0457066f,
+ -2.36721f, 0.446407f, 0.20396f, -1.17868f, 0.815363f,
+ -1.13897f, 0.397217f, -0.593796f, -6.95512f, 0.650695f,
+ 0.771657f, 0.15227f, -0.824519f, 0.617854f, -0.295353f,
+ -0.101207f, 0.600989f, -0.550653f, -0.722371f, 0.292006f,
+ -0.451891f, 0.54544f, 0.354278f, 0.0136258f, 0.192003f,
+ 0.258275f, -0.0443647f, 0.0928186f, 0.667775f, 0.239558f,
+ 0.0523887f, 0.71586f, 0.292563f, 0.362479f, 0.373453f,
+ 0.250638f, -0.423037f, -0.486574f, -0.619397f, 0.343888f,
+ 0.974971f, 0.574218f, 0.273989f, -0.209956f, -0.274333f,
+ 0.0553766f, 0.263918f, 0.733824f, 0.038713f, -0.0788992f,
+ 0.292014f, 0.111808f, -0.197507f, 0.593668f, -0.0245337f,
+ 0.0873662f, 0.530997f, 0.620717f, 0.310697f, -1.54861f,
+ 1.12915f, 0.0991346f, -0.59214f, 0.422325f, -0.0157936f,
+ 0.380975f, 0.626403f, 0.268064f, -0.615231f, -1.43172f,
+ 0.0928048f, 0.0949026f, -0.470912f, -0.0867527f, -0.0381206f,
+ 0.178393f, -1.13737f, 0.12798f, 0.258214f, -0.803364f,
+ 0.177506f, 0.542718f, 0.660656f, 0.145091f, 0.183056f,
+ -0.47338f, 0.469287f, 0.10832f, 0.0994899f, -0.402719f,
+ 0.157287f, 0.523071f, -0.324493f, 0.343599f, 0.664839f,
+ -0.0375519f, -0.279238f, -0.0722333f, 0.395344f, -0.289316f,
+ 0.0259298f, -0.843245f, -0.160021f, 0.741429f, -1.38726f,
+ -0.2969f, -0.240443f, 0.247731f, -1.04088f, -0.280454f,
+ -0.237054f, -0.759227f, 0.0456369f, -0.647453f, -1.02372f,
+ -0.200395f, -0.546839f, -0.104226f, -0.152727f, -0.56685f,
+ -0.0559663f, -0.425494f, -0.610679f, -0.987096f, -0.575138f,
+ -0.0887979f, 0.463646f, -1.041f, -0.49412f, -0.175298f,
+ -0.463296f, -0.955177f, 0.17852f, -1.10694f, 0.181991f,
+ -0.18998f, 0.227818f, 0.688237f, -1.10444f, 0.549108f,
+ -0.171849f, -0.245614f, 0.120624f, 1.29571f, 0.607116f,
+ 0.00809927f, 0.1041f, -1.22918f, -0.212948f, 0.430239f,
+ -1.57341f, 0.482054f, 0.275905f, 0.939785f, -1.0209f,
+ -0.355534f, 0.397337f, -0.0593077f, -0.239603f, 0.475483f,
+ -0.999101f, -0.140578f, 1.04787f, -0.591981f, -0.306989f,
+ -0.879012f, -0.994715f, 0.0343158f, 0.218509f, 0.34704f,
+ 0.0672934f, -0.178941f, 0.20509f, -0.360031f, 0.161241f,
+ -0.324775f, -0.359531f, -0.0657085f, -0.864422f, -0.444865f,
+ 0.597095f, -0.948691f, 0.240001f, -0.783159f, -0.569422f,
+ 0.974205f, -1.04539f, 0.345915f, -0.681558f, -0.246047f,
+ 0.256174f, 0.493667f, 0.681324f, 0.155613f, 0.773309f,
+ -0.647027f, -0.214744f, -0.474202f, -0.661092f, -1.02316f,
+ 0.0572593f, -0.437082f, -0.119874f, -0.464877f, -0.58067f,
+ -0.218029f, 0.319516f, -0.378983f, -0.0698695f, 0.554693f,
+ -0.537875f, 0.126429f, -0.145113f, -0.594312f, -0.218021f,
+ -0.703569f, 0.0720548f, 0.261054f, -0.81438f, 0.249921f,
+ 0.165296f, -0.079028f, -0.322647f, 0.134458f, 0.0975046f,
+ 0.538594f, -0.250126f, 0.142309f, 0.526486f, 0.0532615f,
+ -0.383332f, -0.38143f, -0.101611f, 0.519776f, -0.278364f,
+ -0.23287f, -0.29139f, 0.22353f, 0.472085f, 0.366264f,
+ 0.741187f, 0.42019f, 0.0676459f, -0.230008f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias[] = {
+ -0.48603f, -0.578556f, 0.257639f, 0.459915f, 0.178156f, -1.16663f,
+ 0.828891f, 0.620291f, 0.413257f, -1.00508f, -0.574179f, -1.20623f,
+ -0.377837f, -0.0360333f, 0.681536f, 0.137189f, -0.458718f, 0.387131f,
+ 0.0233112f, 0.126045f, 0.361304f, 0.655317f, 0.413134f, 0.769947f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_0_logits_kernel[] = {
+ 0.67244f, -2.59179f, 0.50425f, -1.86481f, 1.15891f, -1.26447f,
+ 0.761081f, 0.645117f, -1.78594f, -0.872703f, -0.192054f, -1.82359f,
+ -0.560935f, 0.838959f, 0.502264f, -1.28958f, -0.205551f, 0.635671f,
+ -1.12619f, -1.68277f, 0.83361f, 1.57235f, 1.15839f, 0.35345f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_0_logits_bias[] = {
+ 1.14463f
+};
+
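+/* Editorial sketch, not upstream code: one plausible way to evaluate the
+ * branch-0 head above. Sizes are read off the arrays themselves: 592
+ * layer-0 weights with 16 biases imply a 37-float feature input, layer 1
+ * maps 16 -> 24, and the logits layer maps 24 -> 1. The row-major
+ * per-output-node weight layout and the ReLU on hidden layers mirror
+ * av1_nn_predict() and are assumptions here; a caller would typically
+ * squash the returned logit with a sigmoid to get a split probability. */
+static void nn_dense_sketch(const float *in, int n_in, const float *w,
+                            const float *b, float *out, int n_out, int relu) {
+  for (int o = 0; o < n_out; ++o) {
+    float v = b[o];
+    for (int i = 0; i < n_in; ++i) v += w[o * n_in + i] * in[i];
+    out[o] = (relu && v < 0.0f) ? 0.0f : v;
+  }
+}
+
+static float branch_0_logit_sketch(const float feat[37]) {
+  float h0[16], h1[24], logit;
+  nn_dense_sketch(feat, 37,
+                  av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel,
+                  av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias, h0,
+                  16, 1);
+  nn_dense_sketch(h0, 16,
+                  av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel,
+                  av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias, h1,
+                  24, 1);
+  nn_dense_sketch(h1, 24, av1_intra_mode_cnn_partition_branch_0_logits_kernel,
+                  av1_intra_mode_cnn_partition_branch_0_logits_bias, &logit, 1,
+                  0);
+  return logit;
+}
+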
+static const float
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel[] = {
+ 0.364612f, 0.237868f, -0.192821f, 0.12364f, 0.522205f,
+ -0.205785f, -0.503288f, -0.426503f, -0.083073f, 0.0164429f,
+ 0.184278f, -0.426055f, 0.0717997f, -0.261968f, 0.176412f,
+ -0.101226f, 0.0400285f, -0.332051f, 0.344385f, 0.189565f,
+ 0.441162f, 0.330462f, -0.719857f, -1.14209f, 0.557831f,
+ 0.104756f, 0.0562001f, -0.465923f, -0.344592f, -0.191554f,
+ -0.0656866f, -0.640162f, 0.419388f, 0.409308f, -1.68632f,
+ -1.10829f, 0.105485f, -0.14561f, -0.944738f, 0.104629f,
+ -0.146837f, 0.538823f, -0.153157f, 0.321081f, -1.77714f,
+ -0.0559296f, 0.324136f, -0.497023f, -1.15793f, -0.740144f,
+ -0.0888472f, 0.010059f, -0.18394f, -0.234405f, -0.10586f,
+ 0.130958f, -0.101944f, -0.186483f, -0.447049f, -0.900026f,
+ 0.128444f, 0.401696f, 0.128509f, 0.123778f, 0.062168f,
+ -0.321755f, -0.0691584f, 0.254468f, -0.115212f, -0.848885f,
+ 0.817005f, 0.0615853f, 0.153363f, 0.513855f, 0.789225f,
+ 0.356168f, 0.371613f, 0.269541f, 0.268173f, 0.220481f,
+ -0.109063f, -0.00620798f, -0.0334622f, 0.236267f, -0.0235294f,
+ -0.0800253f, 0.0294184f, 0.047131f, -0.224047f, 0.0890737f,
+ -0.356293f, 0.0989534f, 0.16799f, 0.498266f, 0.612581f,
+ -0.372897f, -0.75125f, 0.77698f, 1.1032f, -0.0764679f,
+ 0.0266299f, 0.309532f, 0.461305f, 0.0193521f, -0.0939161f,
+ -0.276156f, -0.102714f, -0.0828328f, 0.40003f, 0.122542f,
+ 0.0867203f, -0.170738f, 0.0850642f, -0.130762f, 0.082324f,
+ -0.115218f, -0.0244491f, 0.0434331f, 0.216453f, 0.443733f,
+ -0.173679f, -0.161617f, 0.316209f, -0.689656f, -1.52007f,
+ -0.421018f, 0.430833f, -0.00734122f, 0.284499f, -0.0207885f,
+ 0.0572024f, -0.878942f, 0.388264f, 0.0191589f, -0.123415f,
+ -0.0461196f, -0.0444461f, -0.00383171f, 0.0945655f, -0.0597219f,
+ -0.374918f, 0.0182124f, 0.523083f, 0.00519547f, 0.80513f,
+ -0.221433f, -1.30591f, -0.416917f, -0.718173f, 0.622999f,
+ 0.941798f, 0.0477536f, 0.0303772f, 0.268078f, 0.414778f,
+ 0.394325f, 0.299733f, -0.583208f, 0.309379f, 0.416581f,
+ 0.0299948f, -0.409145f, -0.161557f, -0.214082f, -0.0098119f,
+ 0.221912f, 0.107135f, 0.0692518f, 0.00490957f, 0.107613f,
+ -0.368404f, -0.548006f, 0.208274f, 0.550475f, 0.643678f,
+ -1.65859f, 0.095938f, -0.0434245f, -0.0792685f, 0.838109f,
+ -0.0138653f, -0.527573f, -0.123472f, -0.235618f, -0.677401f,
+ -0.125877f, -0.175604f, -0.203196f, 0.113478f, -0.228323f,
+ -0.53539f, 0.134458f, 0.0534899f, -0.213006f, -0.138679f,
+ -2.15023f, 0.186303f, 0.48566f, -1.22301f, -0.240982f,
+ -0.486836f, -0.121181f, -0.131382f, -0.0320283f, 0.278828f,
+ 0.342581f, -0.182257f, -0.365193f, -0.226351f, 0.108928f,
+ -0.100159f, 0.448355f, -0.0768947f, 0.0633719f, -0.104786f,
+ 0.0456653f, 0.0965752f, 0.156403f, -0.157337f, 0.212259f,
+ 0.317939f, 0.124193f, -0.329475f, 0.206868f, -2.15986f,
+ -0.108385f, -0.396769f, -0.0317231f, -0.271524f, -0.184697f,
+ 0.662615f, 0.412926f, -0.0217462f, -0.0285475f, -0.118826f,
+ 0.0252706f, -0.137091f, 0.198973f, 0.329509f, -0.0831966f,
+ -0.621237f, 0.0896179f, 0.805261f, -0.019675f, 0.962452f,
+ 0.307433f, 0.892168f, -0.537587f, -2.46145f, 0.125606f,
+ 0.920491f, 0.219462f, 0.292765f, -0.748238f, -0.0537239f,
+ -0.224326f, 0.505492f, 0.176426f, 0.0343168f, 0.16708f,
+ -0.581393f, 0.951726f, -1.1777f, -0.561914f, -1.53288f,
+ 0.864567f, -1.19648f, -1.24141f, -0.334688f, -0.622026f,
+ 0.666876f, -0.197005f, -0.600507f, -0.851924f, 0.492299f,
+ 0.31078f, -0.0736115f, 0.030999f, -6.02463e-05f, -0.0604341f,
+ -0.0254238f, 0.139222f, 0.333235f, 0.366534f, -0.191982f,
+ -0.0156092f, 0.44234f, -0.0193213f, 0.0938745f, -0.015709f,
+ -0.12043f, 0.00895591f, 0.0464401f, 0.0530699f, -0.623018f,
+ -1.23372f, -0.538647f, -1.12389f, 0.26742f, 0.548694f,
+ 0.00540655f, -0.219703f, 0.314894f, -0.573463f, -0.241555f,
+ 0.441851f, 0.422491f, 0.253785f, -0.384683f, 0.0370165f,
+ 0.226669f, 0.245587f, 0.215265f, -0.122272f, 0.0492235f,
+ 0.000658591f, -0.312877f, 0.436487f, -0.229199f, -0.174373f,
+ 0.904268f, -0.855845f, -0.877293f, -0.65409f, 0.313795f,
+ 0.461748f, -0.737766f, -0.228523f, 0.182181f, 0.334522f,
+ 0.0629676f, -0.151087f, 0.178798f, -0.325809f, -0.331672f,
+ 0.0865837f, -0.0684225f, 0.0252008f, -0.0820631f, 0.0481863f,
+ 0.209473f, -0.0242151f, -0.0898919f, -0.163828f, -0.164282f,
+ 0.581888f, 0.816896f, 0.0607674f, 0.364855f, -0.346512f,
+ -0.764174f, 0.595561f, 0.302872f, 0.206361f, 0.106917f,
+ -0.972338f, 0.176948f, 0.6415f, -0.131897f, -0.155802f,
+ 0.216337f, -0.342511f, 0.123743f, -0.123014f, 0.0205439f,
+ 0.15173f, -0.23801f, -1.00387f, 0.651328f, 0.237439f,
+ -0.542952f, 1.066f, -0.161107f, -0.593545f, 0.219343f,
+ -0.178094f, 0.0789992f, 0.428332f, 0.23827f, -0.327421f,
+ 0.416144f, 0.00394653f, 0.052046f, -0.238289f, 0.405942f,
+ 0.00141984f, 0.161017f, 0.077111f, 0.0823985f, 0.0981208f,
+ 0.109949f, -0.0428502f, 0.343629f, -0.722978f, -0.375269f,
+ -0.111634f, -0.271523f, 0.712093f, 0.684904f, -0.572331f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias[] = {
+ 0.583367f, -0.202004f, -0.207626f, 0.412451f, -0.258311f, 0.0304954f,
+ -0.102458f, 0.450087f, -0.376851f, -0.338702f, 0.335226f, 0.889072f,
+ 0.502411f, 0.649282f, 0.15345f, -0.0109896f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel[] = {
+ 0.0214882f, -0.934339f, -0.173335f, 0.8362f, -0.764234f,
+ 0.525163f, 0.409749f, 0.821539f, -0.784157f, -0.455593f,
+ 0.446099f, 0.406756f, 0.479242f, -0.814038f, -0.419332f,
+ 0.328869f, -0.340707f, 0.133219f, 0.0320347f, 0.25089f,
+ -0.324917f, -0.0684265f, 0.0377777f, -0.262556f, 0.673458f,
+ -0.0291454f, -0.417957f, -1.0075f, -0.481537f, 0.922105f,
+ -0.000516239f, -0.40034f, 0.242067f, -0.43178f, 0.32001f,
+ 0.143599f, -0.345172f, 0.126093f, 0.148518f, -1.12151f,
+ -1.03435f, 0.551691f, -0.310001f, -0.323194f, -0.595128f,
+ -0.395689f, 0.737268f, -0.729227f, 0.590804f, -0.590022f,
+ -1.01427f, -0.521159f, -0.617579f, 1.07292f, -0.613047f,
+ -0.619093f, 0.335268f, 0.473753f, -0.795027f, 1.24635f,
+ -0.556193f, 0.241046f, -0.0354181f, -0.354215f, 0.716752f,
+ -0.00200745f, -1.25171f, -0.440731f, -0.763918f, -0.588614f,
+ -0.183901f, -0.396056f, 0.226903f, 0.921471f, 1.10465f,
+ 0.207053f, 0.57681f, -0.555699f, 0.235469f, -0.92149f,
+ 0.625808f, 0.29653f, -0.81775f, -0.307889f, -1.41384f,
+ -0.136205f, -0.365314f, -0.516741f, 0.748052f, 0.617947f,
+ 0.0973239f, 0.839607f, 0.530668f, -0.227032f, -0.449044f,
+ -1.04725f, -0.244363f, -0.396888f, -0.146161f, 0.359789f,
+ 0.0436599f, 1.21645f, -0.336069f, 0.0534646f, -0.00200328f,
+ 0.658551f, -0.156142f, -1.0728f, 0.0951015f, 0.234837f,
+ -0.380525f, 0.041783f, -0.269273f, 0.0386013f, -0.455589f,
+ -0.174338f, 0.0345251f, 0.17116f, -0.507642f, 0.210453f,
+ 0.739987f, -0.0438776f, 0.570145f, -0.118811f, 0.0548662f,
+ 0.153458f, -0.89887f, 0.493704f, 0.283351f, 0.785441f,
+ -0.586002f, -0.0616167f, -0.714328f, -0.145941f, -0.449656f,
+ 0.850117f, 0.279997f, 0.204143f, -0.31356f, 0.947057f,
+ -0.135787f, 0.747071f, 0.0145968f, -0.81414f, 0.431009f,
+ -0.275824f, -0.342928f, -0.0528272f, -0.592183f, 0.433915f,
+ -0.251752f, -0.311815f, -1.47533f, -1.43677f, 0.0698436f,
+ 1.01341f, 0.305063f, -0.252003f, -0.428915f, -0.00104153f,
+ -0.368267f, -0.354523f, -0.27956f, -0.771664f, 0.232092f,
+ -0.428495f, 0.424952f, -0.343229f, 0.196899f, -0.761084f,
+ -0.0110293f, -0.335361f, 0.571637f, -0.423489f, -0.52773f,
+ 0.0108043f, -0.504715f, -1.1419f, -0.402904f, -0.160747f,
+ -0.329184f, 0.375374f, -1.02604f, -0.601371f, 0.631652f,
+ 0.0742486f, -0.464765f, 0.467445f, 0.240562f, -0.38211f,
+ -0.459004f, 0.704196f, 0.021357f, 0.860785f, -1.16731f,
+ -0.479029f, -0.139644f, -0.444087f, 0.322326f, -0.25455f,
+ 0.874399f, 0.477696f, 0.0464487f, 1.20658f, 0.0993356f,
+ 0.00682712f, -0.10163f, -0.371765f, -0.629513f, -0.679196f,
+ -0.193935f, 0.47405f, -0.18238f, 0.254918f, -0.35306f,
+ -0.375611f, 0.119771f, -0.257282f, -0.565124f, 0.162667f,
+ -0.356128f, 0.870351f, 0.241847f, -0.264712f, -0.384322f,
+ 0.31807f, 0.211621f, -0.180767f, 0.764944f, 0.368646f,
+ 0.186111f, 1.02458f, -0.494252f, -0.483375f, -0.699664f,
+ 0.00415657f, -0.189376f, -0.677103f, -0.030319f, 0.667087f,
+ 0.810951f, -0.488237f, -0.387355f, -0.726579f, -0.304763f,
+ 1.10392f, -0.775977f, -0.247731f, 0.532396f, 1.24089f,
+ 0.206621f, -0.670568f, -1.08142f, -0.342503f, 0.189854f,
+ -0.200846f, 0.784204f, 0.641112f, -0.509346f, 0.0805264f,
+ -1.40006f, 0.322084f, -0.823739f, -1.12965f, -0.215668f,
+ 0.099673f, 0.425966f, 0.771697f, 0.338834f, 0.345364f,
+ -0.297826f, -0.176746f, -0.297299f, -1.80029f, -0.178348f,
+ 0.421194f, -0.19155f, 0.417653f, 0.374441f, -0.135654f,
+ -0.895843f, 0.220647f, 0.368264f, 0.369233f, 0.382707f,
+ 0.0800511f, 0.542053f, 0.318896f, -0.385539f, 0.313305f,
+ -1.01166f, -0.222379f, -1.53708f, 1.32407f, -0.665444f,
+ -0.102348f, 0.0410504f, -0.616825f, 1.3108f, 0.405902f,
+ 1.27777f, 0.0630558f, -0.172696f, 0.16224f, -1.10111f,
+ -3.31326f, -0.242566f, 0.831422f, 0.917397f, 0.311749f,
+ -0.238613f, 0.438007f, -0.407089f, -0.0202555f, -1.82502f,
+ -0.907965f, -0.300031f, -0.616669f, -0.767921f, 0.285919f,
+ -0.112019f, 0.252677f, 0.350892f, 0.000214244f, 0.315915f,
+ 0.260344f, 0.327362f, -0.0211213f, -0.41241f, 0.0418355f,
+ 0.103328f, -0.0158439f, -0.230505f, -0.0215114f, 0.266739f,
+ -0.234376f, -0.352583f, 0.0709437f, -0.90649f, -0.535843f,
+ 1.21322f, -1.05144f, -0.983682f, -0.189956f, 1.14208f,
+ -0.0188492f, -0.254821f, -0.463214f, -0.708714f, 0.0447348f,
+ -0.220831f, 0.476299f, 0.102544f, 1.1173f, -0.36981f,
+ -0.814102f, 0.103604f, -0.247871f, 0.0610701f, -0.356616f,
+ -0.144093f, 1.66496f, 0.180206f, -1.04384f, -0.65883f,
+ 0.0290771f, -0.622728f, 0.761523f, -0.909091f, -0.0340348f,
+ 0.666895f, -0.0232575f, 0.962643f, -2.50103f, -1.69745f,
+ -0.0482305f, 0.771811f, -1.32233f, -0.778722f, -0.203309f,
+ 0.395875f, -0.171812f, 0.253794f, 0.432799f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias[] = {
+ -0.152159f, 0.552347f, -0.806068f, 0.227901f, 0.335896f, 0.180785f,
+ 0.75277f, 0.982208f, 0.409823f, -0.17755f, -0.125365f, 0.738114f,
+ 0.202331f, 0.751737f, -0.360511f, 0.149254f, 0.085073f, -0.214542f,
+ 0.529727f, -0.0348777f, -2.13162f, -0.893332f, -0.136952f, -0.71258f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_1_logits_kernel[] = {
+ -0.632145f, 0.738727f, -0.750737f, -0.931571f, -1.79763f, -2.31153f,
+ 0.912733f, 0.879995f, -1.00602f, -1.02467f, 0.0536835f, 1.76011f,
+ -0.898546f, 1.06959f, 1.60471f, -1.7312f, -0.877168f, -0.681185f,
+ -1.57286f, -1.16038f, -4.11303f, -3.06351f, -3.02536f, -2.92186f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_1_logits_bias[] = {
+ 1.33207f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel[] = {
+ 0.0419551f, 0.0924078f, -0.153084f, 0.191642f, 0.069586f,
+ -0.530661f, 0.431968f, 0.000453838f, 0.793047f, 0.0161817f,
+ -0.476075f, -0.156638f, -0.219066f, 0.372716f, -0.0642299f,
+ 0.156813f, -0.105819f, -0.0519422f, 0.149935f, 0.295544f,
+ 0.192037f, -0.0450383f, 0.828794f, -0.0510661f, -1.22549f,
+ -0.100293f, -0.178274f, 0.0304427f, -0.0664097f, -0.0438936f,
+ 0.948248f, 0.425486f, -0.238206f, 1.3744f, 0.336897f,
+ 0.0760769f, -0.583508f, 0.0735519f, -0.117024f, 0.0501598f,
+ 0.332212f, 0.199531f, 0.424764f, 0.206712f, 0.342868f,
+ 0.592673f, -0.0961148f, -0.190113f, -0.155027f, 0.00789871f,
+ -0.0514839f, -0.416154f, -0.290309f, 0.407541f, 0.48534f,
+ 0.126564f, 0.0709566f, -0.0469664f, 0.735403f, -0.365963f,
+ 0.150295f, -0.50147f, 0.021383f, 0.76514f, 0.0085721f,
+ -0.416384f, 1.22268f, 0.0832438f, 0.367813f, -0.12012f,
+ 0.823183f, -0.0525972f, -0.325526f, -0.0983032f, 0.370128f,
+ 0.368778f, 0.138971f, -0.0397997f, 0.411058f, -0.0400404f,
+ 0.588437f, -0.29963f, -0.107992f, -1.75238f, -0.274387f,
+ 0.430418f, 0.495152f, 0.283172f, -0.441166f, 0.195339f,
+ -0.436182f, -0.252613f, 0.176204f, -0.126541f, -0.474833f,
+ -0.0721603f, -0.496599f, -0.0608464f, 0.0333451f, -0.0621485f,
+ 0.0843859f, 0.0637854f, -0.145291f, 0.14876f, 0.181665f,
+ -0.675805f, 0.294903f, 0.301118f, -0.225957f, 0.0105897f,
+ -0.136427f, -0.555925f, -0.158853f, -0.216779f, 0.0612481f,
+ -0.107158f, 0.352451f, 0.140536f, -0.0148237f, 0.189371f,
+ -0.091046f, -0.0476226f, 0.366054f, -0.0723413f, 0.389883f,
+ -0.0213411f, 0.0279539f, 0.194827f, -0.271502f, -0.166474f,
+ 0.0690549f, 0.0584665f, 0.0198415f, -0.442348f, 0.1571f,
+ -0.113463f, -0.16822f, -0.0580659f, -0.13441f, -0.0022386f,
+ 0.251521f, -0.160494f, -0.0753547f, 0.0897289f, 0.137917f,
+ 0.129836f, 0.0816833f, -0.626288f, 0.0643293f, -1.20001f,
+ 0.085631f, -0.195602f, 0.251244f, 0.0321744f, 0.0493178f,
+ -0.220616f, 0.724075f, -0.00831514f, 2.00319f, 0.407932f,
+ 0.0710799f, -0.166128f, 0.0126611f, -0.229644f, -0.0984299f,
+ 0.632041f, -0.0946141f, 0.295315f, 0.100934f, 0.184883f,
+ -0.236173f, 0.158081f, 0.195775f, 0.413542f, 0.789801f,
+ 0.767741f, 0.166275f, -0.348271f, -0.384074f, -0.291648f,
+ -0.119899f, 0.0368354f, 0.0751987f, 1.04217f, -0.159002f,
+ -2.71592f, -0.788502f, -1.06268f, 0.536057f, 0.0575876f,
+ 1.06811f, 0.12033f, 0.198578f, -0.0419196f, 0.0631388f,
+ 0.623138f, -0.142226f, 1.33129f, 0.0868059f, -0.0287825f,
+ 0.139378f, -0.143037f, 0.307452f, 0.0363987f, -0.0976368f,
+ 0.040544f, 0.0269327f, -0.0845524f, 0.0674699f, 0.104501f,
+ -0.0351155f, 0.167071f, 0.00986971f, 0.10284f, 0.0300016f,
+ 0.192601f, 0.0397177f, 0.0251346f, -0.00912908f, -0.0452825f,
+ 0.0164356f, -0.0275149f, 0.194846f, 0.0943608f, 1.61674f,
+ 0.0124345f, 0.523787f, 0.0397258f, -0.17208f, -0.147808f,
+ -1.23583f, 0.676385f, 0.551994f, 0.0233041f, 0.0116391f,
+ -0.466706f, 0.154725f, -0.207371f, 0.606662f, 0.247286f,
+ 0.31216f, 0.173765f, -0.268033f, 0.224422f, 0.314649f,
+ 0.481922f, -0.190604f, -0.0129162f, 0.270552f, 0.135195f,
+ 0.0927735f, -0.226099f, 0.53897f, 0.103309f, -0.0257271f,
+ -0.0246776f, 0.442013f, -0.179246f, -1.02581f, 0.206176f,
+ -0.326365f, 0.391623f, -0.103549f, 0.115645f, 0.0269328f,
+ -0.584517f, -0.237502f, 0.157996f, 0.0447407f, -0.161f,
+ -0.126072f, -0.148967f, -0.416347f, 0.0236496f, -1.12612f,
+ 0.0120709f, -0.00979376f, 0.0507126f, -0.172262f, 0.0697059f,
+ -0.212334f, 0.335731f, -0.0301362f, -0.839583f, -0.238539f,
+ 0.0636752f, -0.0467217f, -0.0372118f, -0.144615f, -0.161773f,
+ -0.648242f, 0.158197f, -0.051471f, -0.0615805f, -0.0426936f,
+ -0.0745554f, 0.358975f, 0.358297f, 0.0568553f, -1.14383f,
+ -0.103955f, 0.728194f, -0.224945f, -0.31659f, -0.204458f,
+ 0.171763f, -0.465666f, 0.899234f, -0.37042f, -0.0894774f,
+ 0.11478f, -0.334957f, 0.0896514f, 0.413251f, 0.359471f,
+ 1.41597f, 0.558082f, 0.153486f, 0.0270558f, -0.0178797f,
+ 0.124983f, -0.12273f, -1.04516f, -0.125375f, 0.370336f,
+ -0.209423f, -0.36816f, -0.66077f, -0.0180773f, -0.628921f,
+ -0.178542f, 0.0346841f, 0.0319309f, -0.470138f, 0.172763f,
+ 0.0798846f, -0.259737f, -0.652461f, -0.386283f, -0.474447f,
+ -0.924054f, -0.0154613f, -0.613712f, -0.138068f, -0.337842f,
+ 0.217921f, -0.0711405f, 0.000404091f, -0.703766f, 0.0364683f,
+ 0.150173f, 0.0126249f, 0.170594f, 0.0371879f, -0.0862515f,
+ -0.23454f, -0.0144143f, 0.164947f, 0.45591f, 0.115703f,
+ 0.069752f, -0.011993f, 0.0402097f, 0.00697581f, 0.0811613f,
+ 0.384752f, 0.341977f, 0.06087f, 0.0590107f, 0.00812679f,
+ 0.121211f, -0.0612108f, 0.167851f, 0.195781f, -1.62162f,
+ 0.336292f, -0.0772523f, -0.310786f, 0.188257f, -0.0325804f,
+ -0.240098f, 0.158748f, -0.265264f, 3.19593f, -0.449251f,
+ -1.33102f, -0.482856f, -0.435731f, 0.300808f, 0.346503f,
+ 2.67378f, -0.152379f, 0.219322f, -0.146119f, -0.0584806f,
+ -0.0276895f, -0.21955f, -0.479179f, -0.689545f, 0.152799f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias[] = {
+ -0.296575f, 0.101072f, -0.208429f, 0.111585f, 0.699552f, -0.379484f,
+ 0.313244f, -0.746369f, 0.867757f, 0.457318f, -0.0190943f, -0.290745f,
+ 0.45592f, -0.160465f, -0.634243f, 0.0829737f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel[] = {
+ 0.27511f, -2.14172f, 1.25755f, -0.554772f, 0.589508f,
+ 0.228307f, 0.0754914f, 1.07061f, 0.293323f, 0.65162f,
+ -0.272016f, -1.33519f, -0.606759f, -0.57827f, 0.368807f,
+ -1.48668f, 0.162439f, 0.0821667f, 0.225535f, -0.795996f,
+ 0.0328293f, 0.975476f, -0.187514f, 2.47069f, -1.5638f,
+ -0.461524f, 0.00310062f, 1.1556f, -0.286206f, 0.00426021f,
+ 0.585836f, 0.900007f, 0.384055f, 0.189435f, -0.157291f,
+ -0.0710573f, -0.0663986f, -0.710772f, -0.669136f, -0.379493f,
+ -1.2634f, -0.377524f, 0.824094f, 0.312308f, 0.125368f,
+ -0.382737f, 0.637109f, 0.61907f, -0.741184f, 0.00257198f,
+ -0.0151343f, -0.669826f, -0.439855f, 0.564852f, -0.0588036f,
+ -1.38123f, -1.1126f, 0.701831f, 0.198686f, 0.266866f,
+ 0.270172f, -0.692401f, 0.272533f, -1.70914f, 0.66064f,
+ 0.0886659f, -0.132233f, 0.270531f, -0.479581f, 0.704338f,
+ -0.307039f, -0.111792f, -2.05753f, -0.231749f, 0.300528f,
+ 0.383266f, -0.130857f, -0.373944f, 1.21025f, 0.704655f,
+ -0.589422f, 0.267185f, -0.109065f, -0.195991f, 0.20209f,
+ -0.0676526f, -0.183926f, 0.164894f, 0.0877923f, 0.565943f,
+ -0.0610466f, -0.86354f, -0.80853f, -0.176111f, -1.45016f,
+ -2.29078f, -0.124524f, -0.139305f, -0.187858f, -0.0250151f,
+ -0.572544f, 0.185336f, -0.69275f, -0.430354f, -0.30861f,
+ -0.754258f, -0.468221f, -0.160487f, -0.766692f, -0.636418f,
+ -0.71016f, 0.576125f, -0.240476f, -0.954556f, -0.104693f,
+ 0.155557f, -0.840224f, -0.685457f, -0.0346927f, -0.644882f,
+ -1.92475f, -0.314544f, 0.463569f, 0.323569f, -0.990124f,
+ -0.213658f, 0.407183f, 1.19797f, -4.77004f, -0.0613379f,
+ -2.40345f, -0.0591791f, -0.477622f, -0.303556f, 0.104077f,
+ -0.974128f, -0.035172f, 1.47064f, 0.233727f, -0.0754056f,
+ 0.158553f, 0.0614361f, -1.38865f, 0.690729f, 0.568455f,
+ 0.205866f, -0.0236852f, -0.0921077f, -0.538954f, 0.336613f,
+ -0.427115f, 0.791754f, -1.819f, -0.404432f, 0.670242f,
+ -0.0343869f, -0.37191f, 0.0271262f, 0.988161f, -0.547343f,
+ 0.925304f, 0.548079f, -0.430343f, -0.214109f, 0.242013f,
+ 1.39027f, 0.37648f, -1.63524f, -0.158864f, -0.572779f,
+ -0.766801f, -2.62032f, 0.47799f, -1.12025f, -0.115283f,
+ 1.22349f, -0.262132f, -0.151274f, 0.390483f, -0.496482f,
+ 1.06166f, -0.183052f, 0.54647f, 0.847486f, 0.0229506f,
+ 0.653309f, -0.020736f, -1.27453f, 0.48386f, -0.366625f,
+ -0.515725f, -1.31196f, 0.140701f, -0.183636f, 0.000413912f,
+ 0.300993f, -0.849529f, -0.59764f, -0.212992f, -0.933365f,
+ -1.4054f, -0.091982f, 0.41695f, 0.264004f, -0.26379f,
+ -0.0738219f, 0.434052f, 1.16617f, -0.639624f, -0.146465f,
+ 0.0409936f, -0.900182f, 0.73517f, 0.805746f, -0.208088f,
+ 1.74459f, -0.0592751f, 0.624865f, -0.62325f, -0.446315f,
+ 0.150526f, 0.0526697f, 0.374254f, -0.658043f, 1.02623f,
+ -0.941758f, 0.381217f, -0.359448f, 0.160051f, 0.556455f,
+ 0.239382f, 0.75851f, 0.437583f, -0.122221f, 0.746136f,
+ 0.218286f, -0.426729f, 0.0353903f, -0.830513f, -0.877586f,
+ 0.488077f, -0.132354f, -0.180756f, 0.736163f, -0.202934f,
+ -0.882534f, 0.166305f, 0.183122f, 0.0599858f, 0.442687f,
+ 0.0522908f, -1.17755f, -1.03733f, 0.392363f, 0.672718f,
+ -1.44704f, 0.360623f, 0.390298f, -0.213968f, 0.169783f,
+ -0.717536f, -0.830984f, -0.445049f, 0.196772f, -0.730634f,
+ -1.09497f, 0.344012f, -0.292802f, -0.67966f, 0.138515f,
+ -0.361803f, 0.936778f, -0.189802f, 0.197777f, -0.367507f,
+ -0.293653f, 0.447759f, -0.409245f, -0.687568f, -0.431301f,
+ -0.271234f, -0.585413f, -0.936414f, -0.396049f, -0.29388f,
+ -0.0930843f, 0.0179339f, 0.262463f, -0.166598f, 0.0171466f,
+ -0.329641f, 0.39343f, 0.657445f, -0.579052f, -0.312444f,
+ -0.0915881f, -0.432622f, -0.247645f, 0.485749f, -0.602508f,
+ -0.347936f, 0.287353f, 0.288705f, 0.168397f, 0.568228f,
+ -0.493586f, 1.04155f, -0.097956f, 0.658928f, -0.561007f,
+ 0.0457783f, 2.12744f, 0.182683f, -0.690282f, 0.183302f,
+ 0.0309499f, -0.722251f, 0.0660448f, -0.333277f, 0.198929f,
+ -0.724102f, -0.405597f, 0.614868f, -0.292862f, 0.886513f,
+ 0.142353f, -1.48934f, -0.97273f, 0.199683f, 0.522121f,
+ 0.0877478f, -0.172593f, -1.58858f, 0.113191f, -0.436178f,
+ 0.640895f, -0.504676f, 0.0658654f, -0.361301f, 0.604323f,
+ 0.315196f, -0.423021f, -0.323484f, -0.563163f, 0.118989f,
+ -0.404508f, -0.0550995f, -0.0359236f, -0.126574f, -0.357288f,
+ -0.0494502f, 1.04959f, -0.31646f, -0.0376684f, -0.300744f,
+ -0.135016f, 0.102696f, -0.392333f, -1.17502f, 0.505227f,
+ 0.337608f, -0.348831f, -0.420815f, 0.202791f, -0.154264f,
+ -0.563686f, 0.0942187f, 0.353862f, 0.0303509f, -0.132794f,
+ 0.420746f, 0.143529f, 0.455822f, -1.28348f, -1.35662f,
+ -0.850688f, -1.76361f, -0.717546f, 0.443111f, 0.227155f,
+ -0.863307f, -0.452033f, -0.278151f, 1.86233f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias[] = {
+ -0.103218f, -0.359587f, 0.619666f, -0.473497f, -0.649803f, 0.86992f,
+ -0.115561f, 0.335114f, -0.285044f, -0.59295f, 0.24497f, 0.611583f,
+ 0.38568f, 0.137913f, -0.281191f, -0.0107777f, 0.487236f, -0.262363f,
+ 0.696962f, 0.121565f, 0.312511f, 0.430916f, 0.694134f, 0.393632f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_2_logits_kernel[] = {
+ -2.42496f, -1.239f, 0.832673f, 1.56923f, -2.6175f, -1.42492f,
+ -0.311387f, -1.94237f, 0.54071f, -2.50391f, 0.352205f, -0.96572f,
+ 1.47144f, -2.04702f, -1.12372f, -0.709186f, 0.812238f, 0.310389f,
+ 0.789163f, -0.65236f, 1.77018f, 0.273867f, 1.19506f, 1.07022f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_2_logits_bias[] = {
+ 0.953424f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel[] = {
+ 0.0485154f, 0.0496279f, 0.0268229f, -0.0584843f, -0.166928f,
+ 0.0316731f, -0.0895094f, -0.0433243f, -0.00893639f, -0.0886265f,
+ -0.0345622f, -0.235395f, -0.213754f, -0.00212398f, 0.0218857f,
+ -0.0054983f, -0.0248236f, 0.081822f, -0.0355708f, -0.0795593f,
+ -0.106995f, -0.0596378f, 0.0350686f, -0.133863f, -0.00582928f,
+ 0.114963f, 0.193906f, -0.00419085f, 0.0430529f, -0.128318f,
+ 0.0614715f, -0.000952935f, -0.0345722f, -0.109459f, 0.074204f,
+ -0.0865131f, 0.0649158f, -0.0942417f, -0.10122f, -0.047551f,
+ -1.27825f, -0.0125456f, -0.019722f, -0.152058f, 0.280306f,
+ -0.121231f, -0.0565484f, 0.0959188f, 0.0603919f, 0.0457468f,
+ 0.967589f, 0.105892f, -0.118326f, 0.198933f, 0.163437f,
+ -0.056824f, -0.0302956f, -0.07366f, -0.681407f, -0.0781575f,
+ 0.255732f, -0.0712105f, 0.177882f, 0.709206f, -0.232457f,
+ 1.33809f, -0.0328557f, 0.0572231f, -1.01361f, 0.130676f,
+ -0.205159f, 0.975398f, 0.356293f, 0.0766364f, -0.297397f,
+ -0.0261066f, -0.0933549f, 0.0568851f, -0.0123034f, -0.0433538f,
+ 0.131003f, 0.890705f, 0.0084565f, 0.00547395f, 0.00157634f,
+ 0.0047937f, -0.0511092f, 0.0300034f, -0.00604993f, -0.0133502f,
+ -0.000274302f, 0.129728f, -0.00532916f, 0.0855351f, 0.136885f,
+ 0.0175562f, -0.0123633f, -0.000512229f, -0.019924f, -0.0316328f,
+ 0.422972f, 0.0460336f, 0.0170841f, -0.00086795f, -0.0655137f,
+ 0.0287308f, -0.0375644f, -0.0329215f, -0.0273072f, 0.0241426f,
+ -0.0429052f, 0.0221593f, -0.063881f, -0.0347391f, -6.44339e-07f,
+ 0.0476934f, -0.0150068f, 0.0146403f, -0.0653099f, 0.0107635f,
+ 0.012407f, 0.0048935f, 1.50975f, 0.322256f, 0.17881f,
+ 0.0943775f, -0.100583f, -0.367022f, -0.156525f, -0.0397161f,
+ 0.0752784f, -0.00219022f, -0.887456f, 0.0153415f, -0.0148185f,
+ -0.56435f, 0.163996f, -0.0221024f, -0.0115872f, -0.0529284f,
+ 0.156838f, -1.13813f, -0.207863f, -0.00484959f, 0.135719f,
+ 0.131004f, 0.0417939f, 0.31453f, 0.121719f, -0.101515f,
+ 0.267951f, 0.219727f, 0.0398821f, 0.0713504f, 3.65918e-06f,
+ -0.00659998f, 0.477343f, -0.128426f, 0.0648877f, 0.111884f,
+ 0.224552f, 0.0617426f, 0.117742f, 0.031377f, 0.0586865f,
+ -0.459293f, 0.100211f, -0.14127f, 0.624412f, 0.014659f,
+ -1.41807f, -0.382452f, -0.695931f, -0.103153f, 0.145808f,
+ 0.333526f, -0.256367f, 0.096842f, 0.102458f, -0.181224f,
+ 0.729272f, 0.151177f, 1.46729f, 0.111044f, -4.28813f,
+ 0.0178379f, 0.47641f, -6.57533f, 0.0633335f, 0.496934f,
+ -0.154657f, -9.07298e-05f, 0.848937f, -5.40143f, 0.375685f,
+ 0.23586f, -0.166591f, -0.0191648f, -0.039862f, -3.25093f,
+ 0.168472f, -0.260317f, -5.51548f, 0.0575334f, 0.328979f,
+ 0.112644f, 0.231339f, -0.122641f, 0.0567331f, 1.19541f,
+ -0.038735f, 0.0630576f, 0.176668f, 0.0757184f, -0.833104f,
+ 0.133669f, 0.982669f, 0.0311783f, 0.0908558f, -0.10065f,
+ -0.0386599f, -0.231587f, -0.83876f, -0.347148f, 0.225529f,
+ -1.29625f, 0.0806834f, 0.369648f, -1.63367f, 0.118057f,
+ -0.311948f, 0.95022f, -0.354807f, -0.648657f, -1.72048f,
+ 0.260397f, 0.915555f, 0.057737f, -0.162019f, -0.453543f,
+ -1.70388f, -0.311632f, -0.731593f, -0.678089f, 0.10438f,
+ -0.293911f, 0.144864f, 0.039212f, 0.0289241f, -0.0685266f,
+ 0.634592f, -0.0798614f, -0.119197f, -0.00517433f, -0.04653f,
+ -0.127568f, -0.0582645f, 0.0735302f, -0.0946823f, 0.00865585f,
+ 0.0115748f, 0.0194847f, 0.0455664f, 0.181006f, -0.0824601f,
+ 0.0869093f, 0.264767f, -0.0750432f, 0.135136f, 0.316511f,
+ 0.399015f, 0.0994808f, -0.166944f, -0.102126f, 0.457858f,
+ 0.300488f, 0.467582f, 0.830244f, -0.0511439f, -0.522892f,
+ -0.183049f, 0.2626f, 0.118382f, 0.241674f, 0.250399f,
+ -0.0963507f, -0.83231f, -0.227699f, -0.133314f, 0.231718f,
+ -0.0700274f, 0.891311f, 0.224742f, -0.572836f, 0.402798f,
+ -0.191576f, 0.740922f, -0.00374073f, 0.658178f, -0.209364f,
+ -0.416259f, 0.166297f, 0.0095577f, -0.0876076f, 0.424954f,
+ 0.265226f, -0.129343f, -0.203146f, -0.194637f, -0.818142f,
+ -0.164152f, -0.368962f, 0.273373f, 0.599927f, -0.19859f,
+ 0.0939651f, -0.12458f, -0.751816f, -0.302997f, -0.139176f,
+ -0.372737f, 0.332704f, -0.206045f, -0.00593763f, -0.452363f,
+ -0.2704f, -0.198846f, 0.0976308f, -0.216124f, 0.110122f,
+ -0.220342f, 0.00763426f, -0.0272775f, -0.190395f, -0.0359411f,
+ -0.0395759f, 0.000941162f, -1.49959f, 0.0914233f, 0.448346f,
+ -0.420435f, -0.0102102f, -0.0757978f, -0.0177687f, -0.0231492f,
+ -0.142125f, 1.31774f, 0.0269368f, 0.134566f, 0.152079f,
+ -0.139933f, 0.139226f, -0.214467f, -0.194446f, -0.555893f,
+ 0.271197f, -0.111047f, 0.0888069f, -0.198121f, 0.0871713f,
+ 0.100612f, 0.429782f, -0.3787f, 0.123147f, -0.12538f,
+ 0.235678f, 0.139237f, 0.223326f, 0.85806f, -0.00554756f,
+ 0.285095f, 0.0954683f, 0.0464989f, 0.100806f, -0.0211297f,
+ 0.121672f, 0.242473f, 0.0810475f, -0.834356f, 0.119629f,
+ 0.111338f, -0.227126f, 0.159296f, -0.0584685f, -0.108265f,
+ -0.0909221f, -0.21749f, 0.0929309f, -0.176815f, 0.178067f,
+ -0.0025905f, 0.317883f, 0.313045f, 0.26774f, -0.589329f,
+ -1.19882f, -0.285513f, -0.109478f, 0.309441f, -0.0604479f,
+ 0.947461f, -0.142342f, -0.9086f, -0.814788f, 0.184588f,
+ -0.0736317f, 0.276237f, 0.13132f, -0.3931f, -0.381744f,
+ -0.0122719f, 0.0246101f, -0.0920412f, 0.11331f, -0.110355f,
+ 0.00848064f, 0.0931248f, -0.0638655f, -4.30869e-05f, -0.300367f,
+ 0.0489508f, 0.464441f, -0.0466243f, -0.0137732f, 0.0099241f,
+ -0.223972f, 0.188966f, -0.653173f, -0.354322f, 0.189237f,
+ -0.624276f, -1.46218f, -0.075161f, -0.516172f, 0.40993f,
+ 0.291178f, -1.95088f, -0.0352157f, 0.196354f, -0.335897f,
+ 0.0857039f, 0.605319f, -1.12923f, -0.638387f, 1.41868f,
+ 0.0955757f, -0.00913477f, 0.315935f, -0.671223f, -0.851436f,
+ -0.157464f, -0.296763f, 0.182277f, -0.139309f, 0.232789f,
+ 0.869562f, 0.248894f, 0.242709f, 0.195479f, 0.106153f,
+ 0.358881f, 0.167443f, 0.982987f, 0.104767f, -0.033925f,
+ -0.0263185f, 0.0045304f, 0.0722479f, -0.111307f, 0.00128896f,
+ 0.406128f, -0.00944947f, 0.121592f, 0.546284f, -0.00175696f,
+ 0.776588f, 0.238846f, 0.064469f, 0.27082f, 0.269187f,
+ 0.0294455f, 0.62364f, -0.27872f, -0.0488013f, 0.229024f,
+ 0.154457f, 0.0445898f, 0.349943f, 0.0710998f, 0.0820674f,
+ 0.0279449f, 0.172826f, -0.122156f, -0.164688f, 0.0292124f,
+ 0.0496112f, -0.741762f, 0.0673926f, 0.108159f, -0.0942327f,
+ -0.0562883f, 0.558231f, 0.0552399f, 0.211393f, 0.0376817f,
+ -0.275788f, 0.0548436f, 0.212732f, 0.163603f, 0.0663363f,
+ -0.0252315f, 0.164533f, 0.0826088f, 0.0301389f, 0.345705f,
+ -0.0378046f, -0.139581f, 1.30162f, 1.23551f, -0.446693f,
+ 0.682534f, -0.0831157f, -0.0121595f, 1.50505f, 0.0839017f,
+ -0.953413f, 0.0820985f, -0.125556f, 0.699796f, -0.140453f,
+ 0.168438f, -0.110966f, 0.173806f, 0.114683f, 0.132502f,
+ -0.0453539f, -0.133096f, 0.511947f, -0.180657f, -0.0298605f,
+ 0.291437f, -0.0275017f, -0.229703f, -0.0504205f, 0.559622f,
+ 0.384601f, 0.111024f, -0.0773559f, -0.0591752f, -0.0866182f,
+ -0.189437f, -0.262345f, -0.0372182f, 0.149925f, 0.154644f,
+ -0.188298f, 0.236949f, -0.199328f, -0.378909f, -0.680128f,
+ 0.277184f, -0.172784f, 0.184717f, -0.23899f, 0.0712069f,
+ 0.0235425f, 0.4225f, -0.441487f, 0.177434f, -0.298303f,
+ 0.295696f, 0.17346f, 0.220542f, -0.680116f, 0.00266223f,
+ -0.0408459f, -0.15486f, 0.24335f, 0.237258f, -0.0283245f,
+ 0.19703f, -0.100027f, 0.0554843f, -1.03081f, 0.151745f,
+ 0.538582f, 0.370368f, 0.196683f, 0.0222123f, -0.0831401f,
+ -0.0832803f, -0.286743f, -0.686003f, 0.0995004f, 0.148901f,
+ -0.0436037f, -0.316508f, 0.00391835f, -0.228452f, 0.940058f,
+ 0.520047f, -0.334211f, 0.652142f, -0.0755971f, 0.0965123f,
+ -0.98191f, 0.394096f, -0.420466f, 0.327284f, -0.134651f,
+ 0.849297f, -0.523372f, 0.010327f, 0.133636f, 0.298119f,
+ -0.257389f, 0.0376153f, -0.198298f, 0.0736235f, 0.608809f,
+ 0.0291836f, -0.290005f, -0.141316f, 0.0184599f, 0.0554437f,
+ 0.0621519f, 0.485276f, 0.617062f, -0.0924811f, -0.0120834f,
+ 0.0817611f, 0.100421f, -0.0153553f, -0.135958f, -0.0185322f,
+ -0.395803f, -0.204862f, 0.547916f, -0.438117f, 0.0229788f,
+ 0.406981f, 0.795584f, -2.02756f, -0.8355f, -0.386789f,
+ 0.00968368f, 1.2147f, -0.740869f, -1.18415f, -0.954918f,
+ -0.541142f, 0.0596003f, 0.107189f, -0.411708f, -0.964593f,
+ 0.511906f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias[] = {
+ -0.485545f, 0.131552f, 0.796833f, -0.157582f, -0.0948124f, 0.00818613f,
+ -0.485562f, 0.3826f, -0.0839326f, 0.170998f, 0.279545f, -0.287143f,
+ 0.184986f, -0.0719864f, 0.19748f, 0.404145f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel[] = {
+ 1.30172f, 0.720189f, 0.261675f, -0.466201f, 1.21773f,
+ 0.495525f, 0.62398f, 0.44567f, -0.330993f, -0.269798f,
+ 0.835161f, -0.294874f, 0.186981f, 0.0162467f, 0.367654f,
+ 0.658468f, 1.08325f, 1.01558f, 0.12783f, -0.280581f,
+ 2.2204f, 0.0337286f, -0.403649f, -0.230908f, -0.35188f,
+ 0.437712f, -0.103634f, -0.645929f, 1.17407f, 0.157385f,
+ 0.212438f, 1.41874f, 0.284242f, -0.493105f, 1.0703f,
+ 0.00632116f, 1.18222f, -0.26003f, 0.276795f, -0.823156f,
+ 0.29577f, -0.157467f, -0.18092f, 0.0237336f, 0.205715f,
+ -0.295679f, 0.165443f, -0.628279f, 1.00804f, 0.361232f,
+ 0.646155f, -0.028651f, 1.64317f, 0.334251f, -1.50713f,
+ -1.51685f, -0.488522f, 0.169694f, -0.593176f, -0.372682f,
+ -1.50223f, 0.35076f, -0.24641f, -0.237189f, 0.190502f,
+ -0.948191f, -0.303346f, 0.45108f, -0.794368f, -2.3116f,
+ 0.404008f, -2.67269f, -0.941992f, -0.45336f, 0.0655987f,
+ -0.288432f, 0.106068f, 0.286978f, 0.121403f, 0.462739f,
+ 0.0130292f, 0.240597f, -2.30983f, -0.453309f, -0.149335f,
+ 0.856424f, -0.186576f, 0.769961f, -0.0657097f, -0.976188f,
+ 0.972971f, -0.532728f, -0.699334f, -0.168803f, 0.361945f,
+ 0.950769f, 1.5368f, -0.223899f, 1.17547f, -0.281483f,
+ 0.533619f, 0.315344f, 0.0854543f, 0.464701f, 0.346828f,
+ 0.271794f, -0.0185388f, 0.109517f, 0.371662f, -0.10852f,
+ 0.244092f, 0.491959f, -0.750281f, 1.41865f, -3.51221f,
+ 0.298194f, -0.0790832f, -0.134158f, -0.424084f, 0.189593f,
+ -0.238361f, -0.407872f, -0.366222f, -0.606813f, -0.230498f,
+ 0.387248f, -0.102734f, -0.190544f, -1.43649f, 0.141338f,
+ -0.0438917f, 0.204628f, 1.57033f, 0.0366937f, -0.14733f,
+ 0.048198f, -0.122631f, 0.183354f, 0.0658753f, -0.243381f,
+ 0.0246889f, -0.768798f, -0.0644054f, 0.775073f, 1.63419f,
+ 0.491624f, 0.21898f, -0.358944f, 3.31304f, 0.0195916f,
+ 0.236174f, 0.530704f, 0.140124f, 0.0736778f, -0.27361f,
+ -0.598836f, -1.01659f, 0.361765f, 0.00455986f, -0.345222f,
+ 1.68731f, 0.764082f, 0.193555f, 0.322782f, 1.19801f,
+ 0.538935f, -0.0393231f, -0.0248292f, -0.151168f, 0.479879f,
+ -0.208582f, 0.22798f, 0.335473f, -0.00295455f, 0.139539f,
+ 0.400814f, 0.478307f, -0.189376f, 0.540084f, 0.466072f,
+ 0.920231f, 0.398774f, -0.472403f, -0.0431972f, -0.581665f,
+ -0.990058f, 0.258995f, -0.0148889f, 0.27105f, 0.340334f,
+ 0.223576f, -0.0405193f, -1.23888f, -1.45229f, -1.44543f,
+ -0.376146f, 0.132601f, -0.4064f, -0.583611f, -0.374588f,
+ 0.0659428f, 0.325652f, -0.338456f, 0.253767f, -0.0181164f,
+ 0.681732f, 0.222041f, 0.837496f, 1.09735f, 0.156328f,
+ 0.177236f, -0.702702f, 0.473689f, 0.322118f, 0.43343f,
+ 0.315441f, -0.40798f, 0.0811291f, 0.631431f, 0.361929f,
+ 0.0723276f, 0.0164498f, 0.0293847f, 0.156406f, -1.10453f,
+ 0.837977f, -1.03449f, -0.348408f, 1.71953f, -0.401765f,
+ 0.64272f, -0.182438f, -0.233954f, 0.364597f, 0.269177f,
+ -0.578512f, 0.397216f, 0.0425122f, -0.258728f, 1.41621f,
+ -0.688768f, 0.0944726f, 0.253163f, -0.989037f, 1.72726f,
+ 1.15976f, -0.0460612f, 0.534186f, -0.136814f, 0.49327f,
+ 0.115744f, -0.633052f, -0.433855f, -1.01874f, -0.324035f,
+ 0.489487f, 1.08696f, 0.836376f, -0.423477f, -0.421309f,
+ 1.07348f, 0.323266f, 0.717604f, 0.366422f, 0.32983f,
+ 0.336583f, 0.749292f, -0.210666f, 0.387101f, -0.583376f,
+ 0.0391101f, -1.07537f, 0.914591f, -0.51303f, 1.15023f,
+ -0.0378782f, 0.262889f, -0.841128f, 0.41619f, -0.669704f,
+ -0.109995f, 1.01825f, -0.194853f, 0.120739f, 0.627889f,
+ -0.00269221f, 0.751152f, -0.529865f, -1.50238f, 0.184521f,
+ 0.795464f, 0.106099f, 1.83117f, 0.0883305f, 0.306844f,
+ -0.0671504f, -0.169306f, -0.214575f, -0.121606f, -0.234965f,
+ 0.109752f, -0.35831f, -0.07894f, 0.497203f, -2.63013f,
+ 0.815608f, -0.193593f, -0.62292f, 0.338941f, 0.0970922f,
+ -0.531178f, 0.723346f, 0.35063f, 0.182647f, -0.257013f,
+ 0.784924f, -0.217915f, -0.0797363f, -0.399706f, -0.485602f,
+ 1.23155f, 0.345998f, 0.322949f, -0.168196f, -0.173313f,
+ 0.282205f, 0.45117f, 0.918706f, -0.046172f, -0.0873883f,
+ 0.56103f, -0.485768f, 0.546199f, 0.254997f, 0.394296f,
+ 0.607178f, 0.667532f, -0.343883f, 0.374402f, -0.531439f,
+ 2.27782f, -1.13255f, 0.505867f, -0.514742f, 0.998571f,
+ -1.60984f, -0.172873f, -0.0604094f, 0.719791f, -0.733982f,
+ 0.348905f, 1.39008f, -0.895343f, -0.677064f, -1.84221f,
+ 0.0434018f, -0.534794f, 0.0434753f, -0.266576f, 0.268099f,
+ -0.242935f, 0.00166289f, 0.0263789f, -0.224794f, -0.113493f,
+ -0.236397f, 0.0879936f, 0.510895f, -0.511789f, -1.48962f,
+ -2.78268f, -0.0495784f, -0.0343907f, 0.440459f, -0.364209f,
+ 0.833223f, -0.0589337f, 0.00181418f, 0.455499f, 0.101762f,
+ -1.16424f, 0.270405f, 0.219033f, -4.91105f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias[] = {
+ -0.40114f, -0.372342f, -0.216186f, -0.240014f, -0.341773f, -0.344489f,
+ -0.113037f, 0.198479f, 0.482958f, -0.630072f, -0.728704f, -0.171963f,
+ 0.519883f, 0.253003f, -0.121618f, -0.0569875f, -0.485568f, -0.147577f,
+ 0.533305f, -0.587251f, -0.120837f, -0.483953f, 0.445641f, -0.125136f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_3_logits_kernel[] = {
+ -1.57431f, -1.09069f, 1.67996f, -0.669702f, 0.499807f, -3.03145f,
+ -0.878135f, 0.637818f, -1.58419f, -3.79756f, 0.62755f, -0.446646f,
+ 0.653269f, -0.667854f, -2.19774f, -3.53349f, 2.6107f, -0.685892f,
+ -1.2603f, -0.89707f, -0.715551f, 0.382202f, 2.09574f, 0.469386f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_3_logits_bias[] = {
+ -0.022787f
+};
+
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_0_dnn_config = {
+ BRANCH_0_NUM_DNN_FEATURES,
+ BRANCH_0_NUM_LOGITS,
+ BRANCH_0_NUM_DNN_LAYERS,
+ {
+ BRANCH_0_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_0_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_0_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_0_logits_bias,
+ },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_1_dnn_config = {
+ BRANCH_1_NUM_DNN_FEATURES,
+ BRANCH_1_NUM_LOGITS,
+ BRANCH_1_NUM_DNN_LAYERS,
+ {
+ BRANCH_1_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_1_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_1_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_1_logits_bias,
+ },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_2_dnn_config = {
+ BRANCH_2_NUM_DNN_FEATURES,
+ BRANCH_2_NUM_LOGITS,
+ BRANCH_2_NUM_DNN_LAYERS,
+ {
+ BRANCH_2_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_2_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_2_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_2_logits_bias,
+ },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_3_dnn_config = {
+ BRANCH_3_NUM_DNN_FEATURES,
+ BRANCH_3_NUM_LOGITS,
+ BRANCH_3_NUM_DNN_LAYERS,
+ {
+ BRANCH_3_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_3_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_3_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_3_logits_bias,
+ },
+};
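+
+// A minimal sketch (not part of the library) of the forward pass the configs
+// above describe, assuming the weight layout used by av1_nn_predict() in
+// av1/encoder/ml.c: weights[l] stores each unit's input weights contiguously
+// (unit-major), hidden units use ReLU, and the final logits layer is linear.
+// NN_MAX_NODES_PER_LAYER comes from av1/encoder/ml.h;
+// cnn_partition_dnn_forward_sketch is a hypothetical name for illustration.
+static inline void cnn_partition_dnn_forward_sketch(const float *input,
+                                                    const NN_CONFIG *cfg,
+                                                    float *output) {
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  int num_inputs = cfg->num_inputs;
+  int buf_index = 0;
+  for (int layer = 0; layer <= cfg->num_hidden_layers; ++layer) {
+    const int is_logits = layer == cfg->num_hidden_layers;
+    const int num_units =
+        is_logits ? cfg->num_outputs : cfg->num_hidden_nodes[layer];
+    float *out = is_logits ? output : buf[buf_index];
+    for (int node = 0; node < num_units; ++node) {
+      float val = cfg->bias[layer][node];
+      for (int i = 0; i < num_inputs; ++i)
+        val += cfg->weights[layer][node * num_inputs + i] * input[i];
+      // ReLU on hidden layers; leave the logits linear.
+      out[node] = is_logits ? val : (val > 0.0f ? val : 0.0f);
+    }
+    input = out;
+    num_inputs = num_units;
+    buf_index = 1 - buf_index;
+  }
+}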
+
+#undef NUM_DNN_BRANCHES
+#undef NUM_CNN_LAYERS
+#undef BRANCH_0_NUM_DNN_LAYERS
+#undef BRANCH_1_NUM_DNN_LAYERS
+#undef BRANCH_2_NUM_DNN_LAYERS
+#undef BRANCH_3_NUM_DNN_LAYERS
+#undef CNN_LAYER_0_HEIGHT
+#undef CNN_LAYER_0_WIDTH
+#undef CNN_LAYER_0_IN_CH
+#undef CNN_LAYER_0_OUT_CH
+#undef CNN_LAYER_0_HORZ_STRIDE
+#undef CNN_LAYER_0_VERT_STRIDE
+#undef CNN_LAYER_1_HEIGHT
+#undef CNN_LAYER_1_WIDTH
+#undef CNN_LAYER_1_IN_CH
+#undef CNN_LAYER_1_OUT_CH
+#undef CNN_LAYER_1_HORZ_STRIDE
+#undef CNN_LAYER_1_VERT_STRIDE
+#undef CNN_LAYER_2_HEIGHT
+#undef CNN_LAYER_2_WIDTH
+#undef CNN_LAYER_2_IN_CH
+#undef CNN_LAYER_2_OUT_CH
+#undef CNN_LAYER_2_HORZ_STRIDE
+#undef CNN_LAYER_2_VERT_STRIDE
+#undef CNN_LAYER_3_HEIGHT
+#undef CNN_LAYER_3_WIDTH
+#undef CNN_LAYER_3_IN_CH
+#undef CNN_LAYER_3_OUT_CH
+#undef CNN_LAYER_3_HORZ_STRIDE
+#undef CNN_LAYER_3_VERT_STRIDE
+#undef CNN_LAYER_4_HEIGHT
+#undef CNN_LAYER_4_WIDTH
+#undef CNN_LAYER_4_IN_CH
+#undef CNN_LAYER_4_OUT_CH
+#undef CNN_LAYER_4_HORZ_STRIDE
+#undef CNN_LAYER_4_VERT_STRIDE
+#undef BRANCH_0_NUM_DNN_FEATURES
+#undef BRANCH_0_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_0_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_0_NUM_LOGITS
+#undef BRANCH_1_NUM_DNN_FEATURES
+#undef BRANCH_1_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_1_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_1_NUM_LOGITS
+#undef BRANCH_2_NUM_DNN_FEATURES
+#undef BRANCH_2_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_2_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_2_NUM_LOGITS
+#undef BRANCH_3_NUM_DNN_FEATURES
+#undef BRANCH_3_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_3_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_3_NUM_LOGITS
+
+static const float av1_intra_mode_cnn_partition_split_thresh_hdres[5] = {
+ 100.000000f, 4.750139f, 1.655964f, 3.711212f, 0.963839f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_hdres[5] = {
+ -100.000000f, -2.404842f, -3.858223f, -2.041206f, -1.573735f,
+};
+
+static const float av1_intra_mode_cnn_partition_split_thresh_midres[5] = {
+ 100.000000f, 3.218737f, 2.657764f, 0.868458f, 2.454447f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_midres[5] = {
+ -100.000000f, -3.842426f, -4.005076f, -3.642994f, -2.467197f,
+};
+
+static const float av1_intra_mode_cnn_partition_split_thresh_lowres[5] = {
+ 100.000000f, 1.890757f, 2.658417f, 1.450626f, 1.833180f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_lowres[5] = {
+ -100.000000f, -4.100921f, -4.564202f, -5.695176f, -1.483546f,
+};
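+
+// Reading of the six threshold arrays above (inferred from the naming, not
+// stated here): each entry corresponds to one quadtree level, a branch logit
+// above the split threshold forces a split, one below the no-split threshold
+// prunes the split, and the +/-100 sentinels at index 0 effectively disable
+// the shortcut at that level.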
+
+static const float av1_intra_mode_cnn_partition_mean[1] = {
+ 1.191922f,
+};
+
+static const float av1_intra_mode_cnn_partition_std[1] = {
+ 1.730044f,
+};
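+
+// A single mean/std pair, presumably used to standardize a scalar model
+// input or output as (x - mean) / std before it reaches the thresholds.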
+
+static const int quad_to_linear_0[1] = { 0 };
+static const int quad_to_linear_1[4] = { 0, 1, 2, 3 };
+static const int quad_to_linear_2[16] = { 0, 1, 4, 5, 2, 3, 6, 7,
+ 8, 9, 12, 13, 10, 11, 14, 15 };
+static const int quad_to_linear_3[64] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 16, 17, 24, 25, 18, 19, 26, 27,
+ 4, 5, 12, 13, 6, 7, 14, 15, 20, 21, 28, 29, 22, 23, 30, 31,
+ 32, 33, 40, 41, 34, 35, 42, 43, 48, 49, 56, 57, 50, 51, 58, 59,
+ 36, 37, 44, 45, 38, 39, 46, 47, 52, 53, 60, 61, 54, 55, 62, 63
+};
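+
+// The quad_to_linear_* tables above map the i-th block visited in quadtree
+// order (top-left, top-right, bottom-left, bottom-right quadrants,
+// recursively) to its raster-scan index in a 2^k x 2^k grid. A minimal
+// sketch of how such a table can be generated (illustration only;
+// fill_quad_to_linear is a hypothetical helper, not part of the encoder):
+static inline void fill_quad_to_linear(int *table, int level, int stride,
+                                       int row, int col, int *next) {
+  if (level == 0) {
+    // Leaf block: record its raster index in the next quad-order slot.
+    table[(*next)++] = row * stride + col;
+    return;
+  }
+  const int half = 1 << (level - 1);
+  fill_quad_to_linear(table, level - 1, stride, row, col, next);         // TL
+  fill_quad_to_linear(table, level - 1, stride, row, col + half, next);  // TR
+  fill_quad_to_linear(table, level - 1, stride, row + half, col, next);  // BL
+  fill_quad_to_linear(table, level - 1, stride, row + half, col + half,
+                      next);  // BR
+}
+// e.g. { int t[16], n = 0; fill_quad_to_linear(t, 2, 4, 0, 0, &n); }
+// reproduces quad_to_linear_2.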
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
new file mode 100644
index 0000000000..71c1ace782
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -0,0 +1,5646 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// TODO(chiyotsai@google.com): The performance of these models is getting
+// worse due to the changes in the encoder. We should retrain the models here
+// to get better performance once we have the time.
+
+#define FEATURE_SIZE 10
+#define LABEL_SIZE 16
+// nn model for ab partition pruning, 128x128.
+static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = {
+ -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f,
+ 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f,
+ 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f,
+ -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f,
+ -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f,
+ -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f,
+ 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f,
+ 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f,
+ 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f,
+ 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f,
+ -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f,
+ -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f,
+ 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f,
+ 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f,
+ 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f,
+ 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f,
+ 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f,
+ -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f,
+ 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f,
+ -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f,
+ -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f,
+ -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f,
+ -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f,
+ -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f,
+ 0.033223f, 0.977341f, -0.372982f, -0.052337f, 0.154124f, 0.396787f,
+ 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f,
+ 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f,
+ 0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f,
+ -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f,
+ -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f,
+ -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f,
+ -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f,
+ 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f,
+ 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f,
+ 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f,
+ 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f,
+ -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f,
+ 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f,
+ -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f,
+ 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f,
+ 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f,
+ 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f,
+ 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f,
+ -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f,
+ -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f,
+ 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f,
+ -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f,
+ 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f,
+ -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f,
+ -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f,
+ -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f,
+ -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f,
+ -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f,
+ -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f,
+ 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f,
+ 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f,
+ 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f,
+ -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f,
+ -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f,
+ 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f,
+ -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f,
+ 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f,
+ 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f,
+ -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f,
+ -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f,
+ 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f,
+ -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f,
+ 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f,
+ -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f,
+ -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f,
+ 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f,
+ -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f,
+ 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f,
+ 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f,
+ -0.121620f, -0.205517f, -0.180144f, -0.264208f, 0.151500f, -0.159378f,
+ 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f,
+ 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f,
+ -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f,
+ 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f,
+ -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f,
+ 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f,
+ -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f,
+ 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f,
+ -0.395983f, -0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f,
+ 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f,
+ -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f,
+ -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f,
+ -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f,
+ 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f,
+ -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f,
+ -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f,
+ -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f,
+ 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f,
+ 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f,
+ -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f,
+ 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f,
+ -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f,
+ 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f,
+ 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f,
+ -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f,
+ 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f,
+ -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f,
+ 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f,
+ 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f,
+ -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f,
+ 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f,
+ 0.420104f, -0.072042f, -0.006404f, 0.171680f,
+};
+
+static const float av1_ab_partition_nn_bias_128_layer0[64] = {
+ 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f,
+ -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f,
+ 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f,
+ -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f,
+ 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f,
+ 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f,
+ 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f,
+ 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f,
+ 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f,
+ 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f,
+ 0.853918f, 0.002504f, -0.190403f, 0.452050f,
+};
+
+static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = {
+ 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f,
+ -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f,
+ 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f,
+ 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f,
+ 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f,
+ -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f,
+ -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f,
+ -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f,
+ -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f,
+ 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f,
+ -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f,
+ 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f,
+ 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f,
+ 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f,
+ 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f,
+ 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f,
+ 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f,
+ 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f,
+ -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f,
+ 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f,
+ 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f,
+ -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f,
+ 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f,
+ 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f,
+ -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f,
+ -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f,
+ -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f,
+ 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f,
+ 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f,
+ 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f,
+ -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f,
+ -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f,
+ 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f,
+ 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f,
+ 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f,
+ 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f,
+ -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f,
+ -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f,
+ 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f,
+ 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f,
+ 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f,
+ 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f,
+ 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f,
+ -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f,
+ -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f,
+ -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f,
+ 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f,
+ -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f,
+ -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f,
+ 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f,
+ -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f,
+ -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f,
+ 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f,
+ -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f,
+ -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f,
+ -0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f,
+ 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f,
+ 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f,
+ 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f,
+ -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f,
+ -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f,
+ -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f,
+ 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f,
+ -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f,
+ -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f,
+ 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f,
+ -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f,
+ -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f,
+ -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f,
+ -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f,
+ -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f,
+ 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f,
+ 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f,
+ -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f,
+ -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f,
+ 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f,
+ -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f,
+ -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f,
+ 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f,
+ 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f,
+ 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f,
+ -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f,
+ 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f,
+ -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f,
+ -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f,
+ -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f,
+ 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f,
+ 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f,
+ -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f,
+ 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f,
+ 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f,
+ -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f,
+ -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f,
+ -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f,
+ -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f,
+ 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f,
+ -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f,
+ 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f,
+ -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f,
+ 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f,
+ 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f,
+ 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f,
+ -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f,
+ 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f,
+ -0.521336f, 0.176206f, 0.142733f, 0.139248f, 0.411682f, 0.181544f,
+ 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f,
+ -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f,
+ 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f,
+ -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f,
+ -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f,
+ -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f,
+ -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f,
+ -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f,
+ 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f,
+ -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f,
+ -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f,
+ 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f,
+ -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f,
+ -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f,
+ -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f,
+ -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f,
+ -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f,
+ -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f,
+ -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f,
+ -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f,
+ -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f,
+ -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f,
+ -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f,
+ -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f,
+ -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f,
+ -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f,
+ -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f,
+ -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f,
+ -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f,
+ -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f,
+ 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f,
+ 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f,
+ -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f,
+ 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f,
+ -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f,
+ -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f,
+ -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f,
+ 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f,
+ -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f,
+ -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f,
+ -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f,
+ 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f,
+ -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f,
+ -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f,
+ -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f,
+ 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f,
+ -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f,
+ -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f,
+ -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f,
+ 0.533229f, -0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f,
+ 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f,
+ -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f,
+ 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f,
+ 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f,
+ 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f,
+ 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f,
+ -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f,
+ -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f,
+ -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f,
+ 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f,
+ -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f,
+ 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f,
+ 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f,
+ 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f,
+ -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f,
+ 0.823643f, -0.119781f, -0.098359f, 0.093119f,
+};
+
+static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+ -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f,
+ -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f,
+ 0.325655f, -0.107123f, 0.591049f, 0.358744f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_128_layer0,
+ av1_ab_partition_nn_weights_128_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_128_layer0,
+ av1_ab_partition_nn_bias_128_layer1,
+ },
+};
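+
+// Hypothetical usage sketch (ab_partition_scores_sketch is not an encoder
+// function): assuming av1_nn_predict() from av1/encoder/ml.h keeps its
+// (input, config, reduce_prec, output) signature, the config above turns
+// FEATURE_SIZE block features into LABEL_SIZE raw partition scores that the
+// caller can threshold to prune AB partition candidates.
+static inline void ab_partition_scores_sketch(
+    const float features[FEATURE_SIZE], float scores[LABEL_SIZE]) {
+  av1_nn_predict(features, &av1_ab_partition_nnconfig_128, 1, scores);
+}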
+
+// nn model for ab partition pruning, 64x64.
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = {
+ -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f,
+ -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f,
+ -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f,
+ -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f,
+ 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f,
+ -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f,
+ -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f,
+ 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f,
+ 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f,
+ -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f,
+ 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f,
+ 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f,
+ -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f,
+ 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f,
+ -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f,
+ 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f,
+ 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f,
+ -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f,
+ -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f,
+ -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f,
+ 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f,
+ 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f,
+ -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f,
+ -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f,
+ -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f,
+ -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f,
+ 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f,
+ 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f,
+ 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f,
+ -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f,
+ -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f,
+ -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f,
+ -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f,
+ -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f,
+ -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f,
+ -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f,
+ -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f,
+ -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f,
+ 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f,
+ -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f,
+ -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f,
+ -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f,
+ -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f,
+ -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f,
+ -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f,
+ 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f,
+ -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f,
+ -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f,
+ 0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f,
+ -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f,
+ -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f,
+ -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f,
+ -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f,
+ -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f,
+ -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f,
+ -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f,
+ -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f,
+ 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f,
+ 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f,
+ -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f,
+ -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f,
+ -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f,
+ -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f,
+ -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f,
+ 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f,
+ -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f,
+ -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f,
+ 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f,
+ 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f,
+ 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f,
+ -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f,
+ -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f,
+ -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f,
+ -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f,
+ -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f,
+ -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f,
+ -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f,
+ -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f,
+ 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f,
+ -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f,
+ -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f,
+ -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f,
+ 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f,
+ -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f,
+ -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f,
+ -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f,
+ -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f,
+ -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f,
+ 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f,
+ -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f,
+ 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f,
+ -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f,
+ -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f,
+ 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f,
+ 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f,
+ -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f,
+ -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f,
+ -1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f,
+ 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f,
+ 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f,
+ -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f,
+ -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f,
+ -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f,
+ -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f,
+ -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f,
+ -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f,
+ 0.230343f, -0.034318f, -0.022687f, -0.047090f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer0[64] = {
+ -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f,
+ -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f,
+ -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f,
+ -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f,
+ 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f,
+ -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f,
+ -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f,
+ 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f,
+ -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f,
+ -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f,
+ -0.125366f, -0.063576f, -0.212169f, -0.047463f,
+};
+
+static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = {
+ -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f,
+ -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f,
+ 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f,
+ -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f,
+ 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f,
+ 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f,
+ 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f,
+ -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f,
+ 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f,
+ 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f,
+ 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f,
+ -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f,
+ 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f,
+ -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f,
+ -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f,
+ -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f,
+ 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f,
+ 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f,
+ 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f,
+ -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f,
+ 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f,
+ 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f,
+ 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f,
+ 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f,
+ 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f,
+ -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f,
+ -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f,
+ -0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f,
+ 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f,
+ -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f,
+ -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f,
+ -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f,
+ -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f,
+ 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f,
+ -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f,
+ 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f,
+ -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f,
+ -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f,
+ 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f,
+ 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f,
+ -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f,
+ -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f,
+ 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f,
+ 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f,
+ 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f,
+ 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f,
+ -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f,
+ -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f,
+ 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f,
+ 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f,
+ -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f,
+ 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f,
+ -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f,
+ -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f,
+ -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f,
+ -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f,
+ -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f,
+ -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f,
+ -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f,
+ -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f,
+ -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f,
+ -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f,
+ -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f,
+ -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f,
+ -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f,
+ 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f,
+ 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f,
+ -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f,
+ 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f,
+ 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f,
+ -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f,
+ 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f,
+ 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f,
+ -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f,
+ -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f,
+ -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f,
+ 3.613420f, -1.525951f, -0.026738f, 0.209150f, -2.103534f, 2.019689f,
+ -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f,
+ -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f,
+ -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f,
+ 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f,
+ 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f,
+ -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f,
+ 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f,
+ 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f,
+ 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f,
+ -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f,
+ -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f,
+ -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f,
+ -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f,
+ -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f,
+ -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f,
+ -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f,
+ -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f,
+ -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f,
+ 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f,
+ -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f,
+ -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f,
+ -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f,
+ -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f,
+ -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f,
+ -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f,
+ -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f,
+ 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f,
+ 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f,
+ -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f,
+ 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f,
+ -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f,
+ 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f,
+ -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f,
+ -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f,
+ 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f,
+ -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f,
+ -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f,
+ 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f,
+ 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f,
+ -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f,
+ -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f,
+ 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f,
+ -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f,
+ -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f,
+ -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f,
+ -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f,
+ -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f,
+ -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f,
+ 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 0.135496f,
+ -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f,
+ 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f,
+ -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f,
+ 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f,
+ 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f,
+ 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f,
+ -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f,
+ 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f,
+ -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f,
+ 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f,
+ 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f,
+ -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f,
+ -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f,
+ 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f,
+ 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f,
+ -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f,
+ -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f,
+ 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f,
+ 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f,
+ -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f,
+ -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f,
+ -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f,
+ -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f,
+ -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f,
+ -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f,
+ -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f,
+ 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f,
+ -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f,
+ -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f,
+ -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f,
+ -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f,
+ -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f,
+ 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f,
+ 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f,
+ -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f,
+ -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f,
+ -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f,
+ -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f,
+ -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f,
+ -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f,
+ -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f,
+ 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f,
+ 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f,
+ -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f,
+ -0.114126f, -0.193834f, -0.025759f, 0.263183f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f,
+ -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f,
+ -0.872737f, 0.718723f, 0.703398f, 2.560015f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_64_layer0,
+ av1_ab_partition_nn_weights_64_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_64_layer0,
+ av1_ab_partition_nn_bias_64_layer1,
+ },
+};
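+
+// A minimal sketch (not part of the upstream sources) of how a
+// one-hidden-layer NN_CONFIG such as av1_ab_partition_nnconfig_64 is
+// evaluated. In the encoder this is done by the generic evaluator
+// av1_nn_predict(); the helper below only illustrates the layout assumed
+// by the tables above: layer-0 weights stored row-major per hidden node,
+// ReLU on the hidden layer, and a linear output layer.
+#if 0  // illustrative only
+static void nn_forward_sketch(const float *features, const NN_CONFIG *cfg,
+                              float *scores) {
+  float hidden[64];  // all ab-partition models above use 64 hidden nodes
+  const int num_hidden = cfg->num_hidden_nodes[0];
+  for (int i = 0; i < num_hidden; ++i) {
+    const float *w = &cfg->weights[0][i * cfg->num_inputs];
+    float sum = cfg->bias[0][i];
+    for (int j = 0; j < cfg->num_inputs; ++j) sum += w[j] * features[j];
+    hidden[i] = sum > 0.0f ? sum : 0.0f;  // ReLU
+  }
+  for (int i = 0; i < cfg->num_outputs; ++i) {
+    const float *w = &cfg->weights[1][i * num_hidden];
+    float sum = cfg->bias[1][i];
+    for (int j = 0; j < num_hidden; ++j) sum += w[j] * hidden[j];
+    scores[i] = sum;  // raw scores; callers threshold or softmax these
+  }
+}
+#endif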
+
+// nn model for ab partition pruning, 32x32.
+static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = {
+ -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f,
+ -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f,
+ 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f,
+ 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f,
+ -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f,
+ 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f,
+ -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f,
+ 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f,
+ 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f,
+ 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f,
+ -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f,
+ -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f,
+ -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f,
+ -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f,
+ 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f,
+ -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f,
+ -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f,
+ 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f,
+ -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f,
+ -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f,
+ -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f,
+ -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f,
+ 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f,
+ -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f,
+ -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f,
+ -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f,
+ -0.037022f, 0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f,
+ 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f,
+ 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f,
+ -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f,
+ -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f,
+ -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f,
+ -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f,
+ -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f,
+ 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f,
+ -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f,
+ -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f,
+ -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f,
+ -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f,
+ -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f,
+ -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f,
+ 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f,
+ -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f,
+ -0.064569f, -0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f,
+ 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f,
+ -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f,
+ -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f,
+ 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f,
+ 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f,
+ -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f,
+ -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f,
+ -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f,
+ -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f,
+ -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f,
+ -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f,
+ 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f,
+ -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f,
+ -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f,
+ -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f,
+ -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f,
+ -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f,
+ -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f,
+ -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f,
+ -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f,
+ -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f,
+ -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f,
+ -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f,
+ 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f,
+ 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f,
+ -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f,
+ -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f,
+ -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f,
+ -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f,
+ 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f,
+ -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f,
+ -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f,
+ -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f,
+ -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f,
+ -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f,
+ -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f,
+ -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f,
+ -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f,
+ -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f,
+ -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f,
+ 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f,
+ -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f,
+ -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f,
+ -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f,
+ -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f,
+ -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f,
+ 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f,
+ -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f,
+ -0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f,
+ -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f,
+ 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f,
+ -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f,
+ -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f,
+ -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f,
+ -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f,
+ -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f,
+ 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f,
+ 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f,
+ -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f,
+ -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f,
+ -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f,
+ -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f,
+ -0.827145f, -0.225277f, 0.275800f, 1.696635f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer0[64] = {
+ -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f,
+ 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f,
+ -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f,
+ 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f,
+ 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f,
+ -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f,
+ 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f,
+ -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f,
+ -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f,
+ 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f,
+ -1.384604f, -0.201713f, -0.271948f, 0.372351f,
+};
+
+static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = {
+ -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f,
+ -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f,
+ -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f,
+ 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f,
+ 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f,
+ -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f,
+ 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f,
+ -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f,
+ 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f,
+ -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f,
+ 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f,
+ -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f,
+ -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f,
+ 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f,
+ -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f,
+ 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f,
+ 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f,
+ -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f,
+ 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f,
+ 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f,
+ -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f,
+ 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f,
+ 0.195889f, 0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f,
+ 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f,
+ -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f,
+ 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f,
+ -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f,
+ 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f,
+ 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f,
+ 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f,
+ -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f,
+ -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f,
+ -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f,
+ 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f,
+ -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f,
+ -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f,
+ -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f,
+ -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f,
+ -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f,
+ -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f,
+ -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f,
+ 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f,
+ 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f,
+ 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f,
+ 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f,
+ 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f,
+ -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f,
+ 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f,
+ -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f,
+ -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f,
+ -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f,
+ -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f,
+ 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f,
+ -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f,
+ 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f,
+ -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f,
+ 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f,
+ -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f,
+ -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f,
+ 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f,
+ 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f,
+ -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f,
+ -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f,
+ -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f,
+ -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f,
+ 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f,
+ -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f,
+ -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f,
+ -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f,
+ -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f,
+ -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f,
+ -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 0.140993f,
+ -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f,
+ 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f,
+ -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f,
+ -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f,
+ 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f,
+ -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f,
+ -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f,
+ -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f,
+ 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f,
+ -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f,
+ -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f,
+ 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f,
+ -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f,
+ -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f,
+ -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f,
+ 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f,
+ -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f,
+ -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f,
+ 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f,
+ 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f,
+ -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f,
+ -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f,
+ -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f,
+ -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f,
+ -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f,
+ 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f,
+ -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f,
+ -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f,
+ 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f,
+ -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f,
+ -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f,
+ -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f,
+ -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f,
+ 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f,
+ 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f,
+ -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f,
+ -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f,
+ -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f,
+ 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f,
+ -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f,
+ -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f,
+ -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f,
+ 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f,
+ -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f,
+ 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f,
+ 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f,
+ -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f,
+ -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f,
+ 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f,
+ 0.340622f, 0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f,
+ 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f,
+ -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f,
+ 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f,
+ -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f,
+ -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f,
+ -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f,
+ -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f,
+ -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f,
+ 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f,
+ -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f,
+ -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f,
+ -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f,
+ -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f,
+ -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f,
+ -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f,
+ -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f,
+ -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f,
+ -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f,
+ -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f,
+ 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f,
+ -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f,
+ 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f,
+ -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f,
+ -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f,
+ -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f,
+ 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f,
+ -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f,
+ -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f,
+ -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f,
+ -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f,
+ 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f,
+ -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f,
+ -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f,
+ -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f,
+ 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f,
+ -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f,
+ -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f,
+ -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f,
+ -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f,
+ 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f,
+ 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f,
+ -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f,
+ 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f,
+ -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f,
+ -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f,
+ -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f,
+ -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f,
+ 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f,
+ -1.251640f, -0.353531f, -0.304009f, -0.153376f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f,
+ 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f,
+ -0.010198f, 0.130597f, 1.276752f, 2.028188f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_32_layer0,
+ av1_ab_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_32_layer0,
+ av1_ab_partition_nn_bias_32_layer1,
+ },
+};
+
+// nn model for ab partition pruning, 16x16.
+static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = {
+ 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f,
+ 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f,
+ 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f,
+ 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f,
+ -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f,
+ 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f,
+ -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f,
+ 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f,
+ -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f,
+ 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f,
+ 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f,
+ 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f,
+ -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f,
+ 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f,
+ -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f,
+ -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f,
+ 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f,
+ -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f,
+ 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f,
+ -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f,
+ 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f,
+ -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f,
+ -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f,
+ -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f,
+ -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f,
+ -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f,
+ -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f,
+ -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f,
+ -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f,
+ -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f,
+ -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f,
+ -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f,
+ 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f,
+ -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f,
+ -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f,
+ 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f,
+ -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f,
+ -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f,
+ 0.146090f, -0.357530f, 0.097644f, -0.000932f, 0.446603f, -0.066793f,
+ 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f,
+ -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f,
+ 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f,
+ -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f,
+ 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f,
+ 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f,
+ 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f,
+ -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f,
+ -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f,
+ -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f,
+ -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f,
+ 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f,
+ -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f,
+ 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f,
+ 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f,
+ -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f,
+ -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f,
+ -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f,
+ 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f,
+ 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f,
+ -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f,
+ -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f,
+ -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f,
+ -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f,
+ -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f,
+ -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f,
+ -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f,
+ -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f,
+ 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f,
+ -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f,
+ 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f,
+ -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f,
+ -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f,
+ -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f,
+ -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f,
+ -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f,
+ -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f,
+ -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f,
+ -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f,
+ -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f,
+ -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f,
+ 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f,
+ -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f,
+ 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f,
+ 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f,
+ 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f,
+ -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f,
+ -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f,
+ -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f,
+ -0.090675f, -0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f,
+ -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f,
+ -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f,
+ -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f,
+ 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f,
+ -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f,
+ -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f,
+ 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f,
+ -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f,
+ 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f,
+ 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f,
+ -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f,
+ -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f,
+ -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f,
+ -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f,
+ -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f,
+ -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f,
+ -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f,
+ -0.021087f, 0.110220f, -0.128671f, 0.044219f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer0[64] = {
+ 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f,
+ -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f,
+ 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f,
+ 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f,
+ -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f,
+ -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f,
+ -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f,
+ 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f,
+ -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f,
+ -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f,
+ 0.123809f, -0.109797f, 0.200510f, -0.147542f,
+};
+
+static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = {
+ -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f,
+ -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f,
+ -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f,
+ -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f,
+ 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f,
+ 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f,
+ 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f,
+ -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f,
+ 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f,
+ 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f,
+ -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f,
+ 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f,
+ -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f,
+ 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f,
+ 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f,
+ 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f,
+ -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f,
+ 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f,
+ -0.145549f, 0.075592f, 0.172336f, 0.108175f, 0.333751f, 1.090501f,
+ 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f,
+ 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f,
+ -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f,
+ 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f,
+ 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f,
+ 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f,
+ 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f,
+ -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f,
+ -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f,
+ 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f,
+ 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f,
+ 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f,
+ 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f,
+ 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f,
+ -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f,
+ 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f,
+ 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f,
+ 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f,
+ -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f,
+ 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f,
+ 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f,
+ 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f,
+ -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f,
+ -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f,
+ -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f,
+ -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f,
+ -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f,
+ -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f,
+ 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f,
+ 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f,
+ -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f,
+ -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f,
+ 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f,
+ -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f,
+ -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f,
+ -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f,
+ -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f,
+ -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f,
+ -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f,
+ 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f,
+ 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f,
+ -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f,
+ -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f,
+ -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f,
+ -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f,
+ 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f,
+ 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f,
+ -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f,
+ 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f,
+ 0.133889f, -0.238673f, -0.094726f, 1.683835f, -0.215629f, -0.198890f,
+ -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f,
+ -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f,
+ -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f,
+ 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f,
+ 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f,
+ 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f,
+ 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f,
+ -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f,
+ 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f,
+ -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f,
+ -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f,
+ -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f,
+ -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f,
+ -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f,
+ 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f,
+ -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f,
+ 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f,
+ -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f,
+ 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f,
+ 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f,
+ -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f,
+ -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f,
+ 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f,
+ -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f,
+ -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f,
+ 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f,
+ -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f,
+ 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f,
+ -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f,
+ -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f,
+ -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f,
+ 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f,
+ 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f,
+ 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f,
+ 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f,
+ -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f,
+ -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f,
+ -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f,
+ 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f,
+ -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f,
+ 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f,
+ -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f,
+ -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f,
+ -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f,
+ 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f,
+ 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f,
+ -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f,
+ -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f,
+ -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f,
+ -0.295260f, 0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f,
+ -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f,
+ -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f,
+ -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f,
+ -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f,
+ -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f,
+ -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f,
+ -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f,
+ 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f,
+ 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f,
+ 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f,
+ 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f,
+ -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f,
+ -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f,
+ 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f,
+ 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f,
+ 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f,
+ 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f,
+ -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f,
+ -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f,
+ -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f,
+ -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f,
+ -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f,
+ -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f,
+ -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f,
+ 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f,
+ -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f,
+ -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f,
+ 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f,
+ 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f,
+ -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f,
+ -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f,
+ -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f,
+ 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f,
+ 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f,
+ 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f,
+ 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f,
+ 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f,
+ -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f,
+ -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f,
+ -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f,
+ -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f,
+ 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f,
+ -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f,
+ -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f,
+ -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f,
+ -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f,
+ 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f,
+ -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f,
+ 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, -0.075932f,
+ -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f,
+ -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f,
+ 0.172790f, -0.172982f, 0.041258f, -0.299379f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f,
+ 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f,
+ -0.658522f, 0.723479f, 0.544264f, 1.035225f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_16_layer0,
+ av1_ab_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_16_layer0,
+ av1_ab_partition_nn_bias_16_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#define FEATURE_SIZE 18
+#define LABEL_SIZE 4
+
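+// nn model for 4-way partition pruning, 16x16.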
+static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = {
+ -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f,
+ 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f,
+ 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f,
+ 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f,
+ -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f,
+ -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f,
+ 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f,
+ 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f,
+ -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f,
+ -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f,
+ -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f,
+ -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f,
+ 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f,
+ 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f,
+ -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f,
+ -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f,
+ -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f,
+ -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f,
+ 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f,
+ -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f,
+ -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f,
+ -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f,
+ -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f,
+ -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f,
+ -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f,
+ 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f,
+ 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f,
+ -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f,
+ 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f,
+ -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f,
+ 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f,
+ 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f,
+ -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f,
+ -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f,
+ 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f,
+ -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 0.064151f,
+ 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f,
+ -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f,
+ 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f,
+ -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f,
+ 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f,
+ 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f,
+ -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f,
+ 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f,
+ 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f,
+ 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f,
+ 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f,
+ -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f,
+ -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f,
+ -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f,
+ 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f,
+ 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f,
+ -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f,
+ -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f,
+ 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f,
+ -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f,
+ -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f,
+ -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f,
+ 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f,
+ -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f,
+ -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f,
+ 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f,
+ 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f,
+ 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f,
+ 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f,
+ -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f,
+ 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f,
+ -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f,
+ -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f,
+ -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f,
+ 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f,
+ 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer0[24] = {
+ 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f,
+ -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f,
+ 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f,
+ -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f,
+};
+
+static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = {
+ -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f,
+ 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f,
+ -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f,
+ -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f,
+ 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f,
+ -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f,
+ -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f,
+ 0.120245f, 0.182920f, -1.117797f, -0.239594f, -0.296296f, -0.718093f,
+ 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f,
+ -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f,
+ 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f,
+ -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f,
+ 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f,
+ -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f,
+ -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f,
+ -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.462133f,
+ 0.465060f,
+ 0.062211f,
+ 0.401786f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_16_layer0,
+ av1_4_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_16_layer0,
+ av1_4_partition_nn_bias_16_layer1,
+ },
+};
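+
+// A minimal usage sketch (not part of the upstream sources): the 4-way
+// models emit LABEL_SIZE (= 4) raw scores that a caller would convert to
+// probabilities and compare against a pruning threshold. av1_nn_predict()
+// and av1_nn_softmax() are the generic helpers declared in
+// av1/encoder/ml.h; the feature vector and threshold here are
+// placeholders, not the encoder's actual tuning.
+#if 0  // illustrative only
+static int prune_4way_sketch(const float features[FEATURE_SIZE],
+                             const NN_CONFIG *cfg, int label,
+                             float prune_thresh) {
+  float scores[LABEL_SIZE], probs[LABEL_SIZE];
+  av1_nn_predict(features, cfg, 1, scores);
+  av1_nn_softmax(scores, probs, LABEL_SIZE);
+  // Prune the candidate partition when its predicted probability is low.
+  return probs[label] < prune_thresh;
+}
+#endif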
+
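+// nn model for 4-way partition pruning, 32x32.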
+static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = {
+ -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f,
+ 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f,
+ -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f,
+ 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f,
+ -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f,
+ -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f,
+ -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f,
+ -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f,
+ -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f,
+ -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f,
+ 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f,
+ -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f,
+ -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f,
+ 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f,
+ -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f,
+ -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f,
+ -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f,
+ -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f,
+ -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f,
+ -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f,
+ 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f,
+ -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f,
+ -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f,
+ 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f,
+ 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f,
+ -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f,
+ 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f,
+ 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f,
+ -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f,
+ -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f,
+ -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f,
+ 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f,
+ -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f,
+ 0.731897f, -0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f,
+ -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f,
+ -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f,
+ 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f,
+ -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f,
+ 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f,
+ -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f,
+ -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f,
+ -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f,
+ -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f,
+ 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f,
+ 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f,
+ -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f,
+ -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f,
+ 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f,
+ 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f,
+ 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f,
+ 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f,
+ -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f,
+ 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f,
+ 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f,
+ -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f,
+ -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f,
+ -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f,
+ -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f,
+ -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f,
+ -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f,
+ -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f,
+ -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f,
+ 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f,
+ -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f,
+ 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f,
+ -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f,
+ 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f,
+ 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f,
+ 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f,
+ -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f,
+ 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f,
+ 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f,
+ 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f,
+ -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f,
+ 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f,
+ 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f,
+ -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f,
+ 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f,
+ -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f,
+ -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f,
+ -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f,
+ -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f,
+ -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 0.181750f,
+ 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f,
+ -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f,
+ 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f,
+ -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f,
+ -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f,
+ 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f,
+ -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f,
+ -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f,
+ 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f,
+ -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f,
+ -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f,
+ -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f,
+ 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer0[32] = {
+ 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f,
+ -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f,
+ -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f,
+ -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f,
+ -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f,
+ 0.109579f, -0.082685f,
+};
+
+static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = {
+ 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f,
+ 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f,
+ 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f,
+ -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f,
+ 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f,
+ 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f,
+ -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f,
+ 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f,
+ 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f,
+ 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f,
+ -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f,
+ 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f,
+ -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f,
+ -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f,
+ 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f,
+ -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f,
+ 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f,
+ 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f,
+ 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f,
+ 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f,
+ -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f,
+ -0.800926f, -0.134132f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.019518f,
+ 0.198546f,
+ 0.339015f,
+ -0.261961f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_32_layer0,
+ av1_4_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_32_layer0,
+ av1_4_partition_nn_bias_32_layer1,
+ },
+};
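+
+// A minimal sketch (illustrative, not part of the upstream model data) of how
+// a one-hidden-layer NN_CONFIG such as av1_4_partition_nnconfig_32 above would
+// be evaluated, assuming the field layout implied by the initializers and the
+// encoder's usual conventions: row-major weight matrices, ReLU on the hidden
+// layer, raw (linear) logits at the output. The helper name is hypothetical.
+#if 0
+static void nn_forward_sketch(const float *features, const NN_CONFIG *cfg,
+                              float *logits) {
+  float hidden[64];  // assumes num_hidden_nodes[0] <= 64 (32 for these models)
+  for (int i = 0; i < cfg->num_hidden_nodes[0]; ++i) {
+    float sum = cfg->bias[0][i];
+    for (int j = 0; j < cfg->num_inputs; ++j)
+      sum += cfg->weights[0][i * cfg->num_inputs + j] * features[j];
+    hidden[i] = sum > 0.0f ? sum : 0.0f;  // ReLU
+  }
+  for (int i = 0; i < cfg->num_outputs; ++i) {
+    float sum = cfg->bias[1][i];
+    for (int j = 0; j < cfg->num_hidden_nodes[0]; ++j)
+      sum += cfg->weights[1][i * cfg->num_hidden_nodes[0] + j] * hidden[j];
+    logits[i] = sum;  // linear; the caller applies softmax/sigmoid as needed
+  }
+}
+#endif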
+
+static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = {
+ -0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f,
+ -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f,
+ 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f,
+ -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f,
+ -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f,
+ 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f,
+ 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f,
+ 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f,
+ 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f,
+ -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f,
+ -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f,
+ 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f,
+ -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f,
+ 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f,
+ -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f,
+ -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f,
+ 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f,
+ -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f,
+ 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f,
+ -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f,
+ -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f,
+ -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f,
+ -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f,
+ -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f,
+ -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f,
+ -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f,
+ -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f,
+ 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f,
+ 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f,
+ -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f,
+ -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f,
+ 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f,
+ -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f,
+ 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f,
+ -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f,
+ 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f,
+ 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f,
+ -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f,
+ -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f,
+ 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f,
+ 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f,
+ 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f,
+ 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f,
+ -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f,
+ -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f,
+ 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f,
+ -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f,
+ 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f,
+ -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f,
+ 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f,
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f,
+ -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f,
+ 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f,
+ -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f,
+ -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f,
+ -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f,
+ -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f,
+ -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f,
+ 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f,
+ 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f,
+ -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f,
+ -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f,
+ -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f,
+ 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f,
+ 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f,
+ -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f,
+ -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f,
+ 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f,
+ 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f,
+ 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f,
+ -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f,
+ 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer0[24] = {
+ 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f,
+ -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f,
+ -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f,
+ -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f,
+};
+
+static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = {
+ -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f,
+ 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f,
+ 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f,
+ -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f,
+ -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f,
+ 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f,
+ -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f,
+ 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f,
+ 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f,
+ -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f,
+ -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f,
+ -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f,
+ 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f,
+ -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f,
+ -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f,
+ -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.478735f,
+ 0.292948f,
+ 0.293172f,
+ 0.040013f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_64_layer0,
+ av1_4_partition_nn_weights_64_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_64_layer0,
+ av1_4_partition_nn_bias_64_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#define FEATURE_SIZE 4
+static const float
+ av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = {
+ -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f,
+ -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f,
+ 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f,
+ -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f,
+ -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f,
+ -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f,
+ -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f,
+ -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f,
+ 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f,
+ 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f,
+ -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f,
+ -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f,
+ 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f,
+ -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f,
+ -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f,
+ -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f,
+ 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f,
+ -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f,
+ -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f,
+ -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f,
+ 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f,
+ -0.007193f, -0.257836f,
+ };
+
+static const float av1_partition_breakout_nn_bias_128_layer0[32] = {
+ 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f,
+ -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f,
+ 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f,
+ 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f,
+ -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f,
+ 0.429660f, -8.439470f,
+};
+
+static const float av1_partition_breakout_nn_weights_128_layer1[32] = {
+ -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f,
+ 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f,
+ 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f,
+ -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f,
+ -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f,
+ -0.039662f, 0.131499f,
+};
+
+static const float av1_partition_breakout_nn_bias_128_layer1[1] = {
+ 0.86678213f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_128_layer0,
+ av1_partition_breakout_nn_weights_128_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_128_layer0,
+ av1_partition_breakout_nn_bias_128_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
+ 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f,
+ -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f,
+ 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f,
+ 0.003208f, -0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f,
+ -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f,
+ 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f,
+ 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f,
+ -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f,
+ 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f,
+ -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f,
+ -2.407131f, -0.062304f, 0.000874f, 0.108786f,
+ };
+
+static const float av1_partition_breakout_nn_bias_64_layer0[16] = {
+ 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f,
+ -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f,
+ -0.337413f, 4.492778f, 0.000000f, 17.043072f,
+};
+
+static const float av1_partition_breakout_nn_weights_64_layer1[16] = {
+ -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f,
+ 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f,
+ -0.038572f, 0.307899f, -0.294283f, 0.118323f,
+};
+
+static const float av1_partition_breakout_nn_bias_64_layer1[1] = {
+ -1.33438122f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_64_layer0,
+ av1_partition_breakout_nn_weights_64_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_64_layer0,
+ av1_partition_breakout_nn_bias_64_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = {
+ -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f,
+ 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f,
+ -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f,
+ -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f,
+ -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f,
+ 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f,
+ 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f,
+ -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f,
+ -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f,
+ -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f,
+ -0.520814f, -0.045386f, -0.443123f, -0.484209f,
+ };
+
+static const float av1_partition_breakout_nn_bias_32_layer0[16] = {
+ 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f,
+ 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f,
+ -0.423808f, 0.000000f, 6.352258f, -0.155787f,
+};
+
+static const float av1_partition_breakout_nn_weights_32_layer1[16] = {
+ 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f,
+ 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f,
+ -0.004171f, 0.157694f, 0.117845f, 0.272115f,
+};
+
+static const float av1_partition_breakout_nn_bias_32_layer1[1] = {
+ 0.09049262f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_32_layer0,
+ av1_partition_breakout_nn_weights_32_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_32_layer0,
+ av1_partition_breakout_nn_bias_32_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = {
+ 0.209371f, 0.028758f, 0.005764f, -0.384401f, -0.625777f, -0.005647f,
+ -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f,
+ -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f,
+ -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f,
+ -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f,
+ -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f,
+ -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f,
+ -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f,
+ -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f,
+ -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f,
+ -0.509287f, -0.048877f, -0.001512f, 0.077086f,
+ };
+
+static const float av1_partition_breakout_nn_bias_16_layer0[16] = {
+ 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f,
+ 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f,
+ 5.625762f, 0.615822f, 0.040057f, 16.668884f,
+};
+
+static const float av1_partition_breakout_nn_weights_16_layer1[16] = {
+ -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f,
+ 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f,
+ 0.269773f, -0.021105f, -0.146698f, 0.188764f,
+};
+
+static const float av1_partition_breakout_nn_bias_16_layer1[1] = {
+ 1.60751927f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_16_layer0,
+ av1_partition_breakout_nn_weights_16_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_16_layer0,
+ av1_partition_breakout_nn_bias_16_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = {
+ -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f,
+ 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f,
+ -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f,
+ -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f,
+ 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f,
+ -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f,
+ -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f,
+ -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f,
+ -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f,
+ -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f,
+ -0.596269f, 0.098494f, -0.005765f, 0.173652f,
+ };
+
+static const float av1_partition_breakout_nn_bias_8_layer0[16] = {
+ 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f,
+ 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f,
+ 2.336705f, -0.278834f, 0.231905f, 7.954366f,
+};
+
+static const float av1_partition_breakout_nn_weights_8_layer1[16] = {
+ -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f,
+ -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f,
+ 0.055858f, 0.230970f, -0.056466f, 0.119780f,
+};
+
+static const float av1_partition_breakout_nn_bias_8_layer1[1] = {
+ 1.27784479f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_8_layer0,
+ av1_partition_breakout_nn_weights_8_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_8_layer0,
+ av1_partition_breakout_nn_bias_8_layer1,
+ },
+};
+#undef FEATURE_SIZE
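+
+// Sketch (hypothetical helper, not upstream code): each breakout net above
+// emits one raw logit; reading it as the probability that partition search can
+// stop early is a logistic transform. Requires <math.h> for expf().
+#if 0
+static float nn_logit_to_prob_sketch(float logit) {
+  return 1.0f / (1.0f + expf(-logit));  // sigmoid
+}
+#endif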
+
+#define FEATURE_SIZE 9 // Input layer size
+#define NUM_NODES 32 // Hidden layer size
+#define LABEL_SIZE 3 // Output layer size
+
+static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f,
+ -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f,
+ 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f,
+ -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f,
+ 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f,
+ 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f,
+ 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f,
+ -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f,
+ 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f,
+ 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f,
+ -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f,
+ -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f,
+ 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f,
+ 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f,
+ -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f,
+ 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f,
+ -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f,
+ 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f,
+ 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f,
+ -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f,
+ -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f,
+ -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f,
+ 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f,
+ 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f,
+ -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f,
+ 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f,
+ -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f,
+ -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f,
+ -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f,
+ 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f,
+ -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f,
+ -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f,
+ -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f,
+ 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f,
+ 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f,
+ -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f,
+ 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f,
+ 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f,
+ 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f,
+ 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f,
+ -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f,
+ -1.08228f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = {
+ 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, -0.12017f, 0.35362f,
+ -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f,
+ 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f,
+ -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f,
+ -0.22638f, 1.40940f, -0.09309f, 0.05828f,
+};
+
+static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f,
+ -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f,
+ -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f,
+ -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f,
+ -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f,
+ 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f,
+ -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f,
+ 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f,
+ 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f,
+ -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f,
+ -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f,
+ 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f,
+ -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f,
+ -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = {
+ 1.70665f,
+ -0.77954f,
+ -0.92709f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_8_layer0,
+ av1_rect_partition_nn_weights_8_layer1 },
+ { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f,
+ -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f,
+ 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f,
+ -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f,
+ 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f,
+ -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f,
+ 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f,
+ 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f,
+ 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f,
+ -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f,
+ 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f,
+ 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f,
+ 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f,
+ 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f,
+ 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f,
+ -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f,
+ -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f,
+ 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f,
+ -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f,
+ -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f,
+ -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f, 1.03673f,
+ 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f,
+ 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f,
+ -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f,
+ -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f,
+ -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f,
+ 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f,
+ 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f,
+ -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f,
+ -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f,
+ -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f,
+ -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f,
+ -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f,
+ 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f,
+ 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f,
+ 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f,
+ -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f,
+ -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f,
+ 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f,
+ -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f,
+ -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f,
+ -0.05573f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = {
+ -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f,
+ 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f,
+ 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f,
+ -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f,
+ -0.12044f, 1.65478f, -0.75153f, 1.18441f,
+};
+
+static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f,
+ 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f,
+ 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f,
+ 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f,
+ -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f,
+ 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f,
+ 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f,
+ 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f,
+ 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f,
+ -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f,
+ -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f,
+ -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f,
+ 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f,
+ -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ 2.68750f,
+ -1.31894f,
+ -1.36768f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_16_layer0,
+ av1_rect_partition_nn_weights_16_layer1 },
+ { av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f,
+ -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f,
+ -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f,
+ -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f,
+ -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f,
+ -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f,
+ -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f,
+ -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f,
+ -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f,
+ 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f,
+ -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f,
+ -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f,
+ 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f,
+ 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f,
+ -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f,
+ -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f,
+ 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f,
+ 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f,
+ 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f,
+ 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f,
+ 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f,
+ -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f,
+ 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f,
+ 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f,
+ -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f,
+ -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f,
+ -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f,
+ -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f,
+ -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f,
+ -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f,
+ 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f,
+ -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f,
+ -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f,
+ 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f,
+ 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f,
+ -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f,
+ 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f,
+ 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f,
+ 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f,
+ -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f,
+ -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f,
+ 0.33984f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = {
+ -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f,
+ 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f,
+ 0.05660f, -0.32013f, 0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f,
+ -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f,
+ -0.27602f, -1.98063f, 0.20816f, -0.01315f,
+};
+
+static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f,
+ -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f,
+ 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f,
+ -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f,
+ 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f,
+ 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f,
+ 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f,
+ 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f,
+ 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f,
+ -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f,
+ 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f,
+ -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f,
+ -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f,
+ -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ 2.47332f,
+ -1.65756f,
+ -0.81573f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_32_layer0,
+ av1_rect_partition_nn_weights_32_layer1 },
+ { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f,
+ 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f,
+ 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f,
+ 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f,
+ 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f,
+ 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f,
+ 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f,
+ -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f,
+ 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f,
+ 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f,
+ -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f,
+ -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f,
+ -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f,
+ -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f,
+ 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f,
+ 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f,
+ 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f,
+ -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f,
+ -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f,
+ -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f,
+ -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f,
+ -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f,
+ 1.49530f, 1.05069f, -2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f,
+ 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f,
+ 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f,
+ -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f,
+ -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f,
+ 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f,
+ 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f,
+ 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f,
+ -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f,
+ -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f,
+ -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f,
+ 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f,
+ -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f,
+ -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f,
+ 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f,
+ -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f,
+ -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f,
+ 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f,
+ -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f,
+ 0.09101f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = {
+ 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f,
+ -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f,
+ -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f,
+ -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f,
+ 0.59835f, -0.31269f, -0.30585f, -1.66212f,
+};
+
+static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f,
+ -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f,
+ 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f,
+ 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f,
+ 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f,
+ -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f,
+ -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f,
+ 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f,
+ -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f,
+ 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f,
+ -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f,
+ -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f,
+ -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f,
+ 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ 0.32215f,
+ -0.57522f,
+ 0.25314f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_64_layer0,
+ av1_rect_partition_nn_weights_64_layer1 },
+ { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f,
+ 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f,
+ 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f,
+ 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f,
+ -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f,
+ 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f,
+ 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f,
+ 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f,
+ 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f,
+ 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f,
+ -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f,
+ 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f,
+ -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f,
+ -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f,
+ 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f,
+ -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f,
+ -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f,
+ -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f,
+ -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f,
+ -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f,
+ -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f,
+ -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f,
+ -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f,
+ 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f,
+ 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f,
+ -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f,
+ -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f,
+ 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f,
+ 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f,
+ 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f,
+ -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f,
+ 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f,
+ -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f,
+ 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f,
+ -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f,
+ -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f,
+ 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f,
+ -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f,
+ -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f,
+ -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f,
+ 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f,
+ 2.02519f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = {
+ 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f,
+ 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f,
+ -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f,
+ -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f,
+ 0.66120f, 0.61119f, -1.42293f, 0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f,
+ 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+ -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+ 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f,
+ 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f,
+ 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f,
+ 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f,
+ 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f,
+ -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+ -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f,
+ 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f,
+ 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f,
+ 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f,
+ 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+ 1.09014f,
+ -0.53317f,
+ -0.55668f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_128_layer0,
+ av1_rect_partition_nn_weights_128_layer1 },
+ { av1_rect_partition_nn_bias_128_layer0,
+ av1_rect_partition_nn_bias_128_layer1 }
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
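+
+// Sketch (hypothetical helper, not upstream code): the rect-partition nets
+// above emit three raw logits; a max-subtracted softmax turns them into a
+// probability distribution over the candidate rectangular partition decisions.
+// Requires <math.h> for expf().
+#if 0
+static void nn_softmax_sketch(const float *logits, float *probs, int n) {
+  float max_logit = logits[0];
+  for (int i = 1; i < n; ++i)
+    if (logits[i] > max_logit) max_logit = logits[i];
+  float sum = 0.0f;
+  for (int i = 0; i < n; ++i) {
+    probs[i] = expf(logits[i] - max_logit);  // exponent <= 0: no overflow
+    sum += probs[i];
+  }
+  for (int i = 0; i < n; ++i) probs[i] /= sum;
+}
+#endif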
+
+// Below are the models used for simple_motion_search_based_split.
+// Thresholds:
+// The first index is aggressiveness, the second is frame resolution, and the
+// third is bsize. The logit-to-probability mapping and the gating logic are
+// sketched after the two threshold tables below.
+static const float av1_simple_motion_search_split_thresh[4][3][5] = {
+ // Aggressiveness = 0
+ {
+ // lowres
+ {
+ 1.40402595879f, // p = 0.8028197
+ 4.72845183649f, // p = 0.99123732
+ 1.86517797783f, // p = 0.86589934
+ 1.58715223005f, // p = 0.83021506
+ 7.22695596987f, // p = 0.9992738
+ },
+ // midres
+ {
+ 5.839480f, // p = 0.997098
+ 1.877167f, // p = 0.867285
+ 3.073499f, // p = 0.955783
+ 1.405601f, // p = 0.803071
+ 2.555636f, // p = 0.927951
+ },
+ // hdres
+ {
+ 5.839480f, // p = 0.997098
+ 1.877167f, // p = 0.867285
+ 3.073499f, // p = 0.955783
+ 1.405601f, // p = 0.803071
+ 2.555636f, // p = 0.927951
+ },
+ },
+ // Aggressiveness = 1
+ {
+ // Lowres
+ {
+ 100.0000f, // p = 1.000000
+ 4.952535f, // p = 0.992984
+ 1.720880f, // p = 0.848242
+ 1.426233f, // p = 0.806314
+ 1.491905f, // p = 0.816364
+ },
+ // Midres
+ {
+ 100.0000f, // p = 1.000000
+ 3.137263f, // p = 0.958404
+ 2.703262f, // p = 0.937219
+ 1.877166f, // p = 0.867285
+ 2.221149f, // p = 0.902133
+ },
+ // Hdres
+ {
+ 4.417680f, // p = 0.988082
+ 3.086898f, // p = 0.956349
+ 3.966704f, // p = 0.981416
+ 1.532565f, // p = 0.822381
+ 3.449975f, // p = 0.969230
+ },
+ },
+ // Aggressiveness = 2
+ {
+ // lowres
+ {
+ 100.000000f, // p = 0.998048
+ 1.484020f, // p = 0.815179
+ 1.866781f, // p = 0.866085
+ 1.706711f, // p = 0.846409
+ 2.080369f, // p = 0.888980
+ },
+ // midres
+ {
+ 100.000000f, // p = 0.0
+ 3.265763f, // p = 0.963235428881
+ 2.024598f, // p = 0.883355591569
+ 1.846446f, // p = 0.863709256976
+ 2.240962f, // p = 0.903868036126
+ },
+ // hdres
+ {
+ 3.133026f, // p = 0.958234684141
+ 2.940954f, // p = 0.949834204693
+ 2.484544f, // p = 0.923051170045
+ 1.702972f, // p = 0.845922460525
+ 1.655562f, // p = 0.839641385729
+ },
+ },
+ // Aggressiveness = 3
+ {
+ // lowres
+ { 100.000000f, 1.41409519484f, 0.606066095487f, 0.0993410805635f,
+ 0.762099214988f },
+ // midres
+ { 100.000000f, 0.702207995397f, 0.503550081119f, 0.0403228785199f,
+ 0.557298794638f },
+ // hdres
+ { 1.21895384144f, 1.26798450469f, 0.872537808115f, 0.975869438148f,
+ 1.86572095242f },
+ },
+};
+
+static const float av1_simple_motion_search_no_split_thresh[4][3][5] = {
+ // Aggressiveness = 0
+ {
+ // lowres
+ {
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ },
+ // midres
+ {
+ -3.38168078f, // p = 0.032872917
+ -4.08610739f, // p = 0.016526795
+ -1.78302370f, // p = 0.15270848
+ -100.000000f, // p = 0.0
+ -100.000000f, // p = 0.0
+ },
+ // hdres
+ {
+ -100.000000f, // p = 0.0
+ -100.000000f, // p = 0.0
+ -2.98718897f, // p = 0.048008
+ -100.000000f, // p = 0.0
+ -3.33229488f, // p = 0.03447975
+ },
+ },
+ // Aggressiveness = 1
+ {
+ // Lowres
+ {
+ -100.0000f, // p = 0.0
+ -4.893793f, // p = 0.007437
+ -3.387766f, // p = 0.032680
+ -2.982806f, // p = 0.048209
+ -2.330372f, // p = 0.088639
+ },
+ // Midres
+ {
+ -100.0000f, // p = 0.000000
+ -6.131853f, // p = 0.002168
+ -2.346579f, // p = 0.087338
+ -2.712849f, // p = 0.062219
+ -3.195430f, // p = 0.039338
+ },
+ // Hdres
+ {
+ -3.491416f, // p = 0.029557
+ -2.192853f, // p = 0.100394
+ -3.620180f, // p = 0.026079
+ -2.030855f, // p = 0.116001
+ -2.797586f, // p = 0.057455
+ },
+ },
+ // Aggressiveness = 2
+ {
+ // lowres
+ {
+ -100.0000f, // p = 0.0
+ -3.617350f, // p = 0.026151
+ -5.902503f, // p = 0.002725
+ -4.677840f, // p = 0.009213
+ -2.168378f, // p = 0.102626
+ },
+ // midres
+ {
+ -100.0000f, // p = 0.0
+ -3.204195f, // p = 0.0390081679555
+ -2.354128f, // p = 0.0867382128969
+ -2.523326f, // p = 0.0742390077132
+ -3.112328f, // p = 0.0426016085803
+ },
+ // hdres
+ {
+ -5.047760f, // p = 0.00638270448225
+ -3.414994f, // p = 0.0318301469487
+ -5.628090f, // p = 0.00358255438917
+ -2.122691f, // p = 0.10691083145
+ -1.972387f, // p = 0.122132728355
+ },
+ },
+ // Aggressiveness = 3
+ {
+ // lowres
+ { -100.000000f, -2.04766486133f, -1.00442099188f, -1.15077982642f,
+ -1.0830321897f },
+ // midres
+ { -100.000000f, -0.985686808303f, -0.757739584866f, -0.890120107569f,
+ -0.228236297886f },
+ // hdres
+ { -1.03535679263f, -1.57431743203f, -0.564851540156f, -0.35442301663f,
+ -1.36741555171f },
+ },
+};
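+
+// The probabilities annotated above are the logistic transform of the
+// thresholds, p = 1 / (1 + exp(-thresh)); e.g. 1 / (1 + exp(-1.404026))
+// ~= 0.8028. Entries clamped to +/-100.0f effectively disable a rule (a few of
+// their annotations do not follow this mapping). A sketch of how the two
+// tables would gate the decision (hypothetical helper; the real control flow
+// lives in the partition search code):
+#if 0
+static void sms_split_gating_sketch(float logit, float split_thresh,
+                                    float no_split_thresh, int *force_split,
+                                    int *prune_split) {
+  *force_split = logit > split_thresh;     // confident enough to always split
+  *prune_split = logit < no_split_thresh;  // confident enough to never split
+}
+#endif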
+
+static const float av1_simple_motion_search_split_mean_128[17] = {
+ 14.119120f, 14.087010f, 12.016185f, 11.966075f, 12.042454f, 11.994805f,
+ 12.152105f, 12.100394f, 12.178377f, 12.128937f, 4.779944f, 0.714786f,
+ 3.535450f, 3.566207f, 0.835913f, 3.315452f, 3.302908f,
+};
+
+static const float av1_simple_motion_search_split_std_128[17] = {
+ 1.832420f, 1.835338f, 2.019207f, 2.020793f, 2.008731f, 2.008403f,
+ 1.900999f, 1.907081f, 1.908915f, 1.913122f, 2.109345f, 0.451517f,
+ 1.407097f, 1.372501f, 0.370355f, 1.321495f, 1.319665f,
+};
+
+static const float av1_simple_motion_search_split_mean_64[17] = {
+ 12.363721f, 12.314348f, 10.404341f, 10.333541f, 10.405775f, 10.336996f,
+ 10.402246f, 10.330084f, 10.405584f, 10.334330f, 4.554232f, 0.896393f,
+ 2.819613f, 2.855845f, 0.926296f, 2.808782f, 2.798229f,
+};
+
+static const float av1_simple_motion_search_split_std_64[17] = {
+ 1.878920f, 1.882255f, 1.950167f, 1.953289f, 1.913869f, 1.914781f,
+ 1.920096f, 1.924454f, 1.880200f, 1.882499f, 2.050922f, 0.304750f,
+ 1.144391f, 1.125088f, 0.261289f, 1.145059f, 1.131215f,
+};
+
+static const float av1_simple_motion_search_split_mean_32[17] = {
+ 10.750278f, 10.679627f, 8.745625f, 8.644149f, 8.757436f, 8.656657f,
+ 8.759780f, 8.656299f, 8.772563f, 8.669839f, 4.208026f, 0.958573f,
+ 2.308769f, 2.347375f, 0.961685f, 2.323464f, 2.296322f,
+};
+
+static const float av1_simple_motion_search_split_std_32[17] = {
+ 1.879269f, 1.883531f, 1.935828f, 1.935677f, 1.915823f, 1.914773f,
+ 1.909733f, 1.910315f, 1.890451f, 1.890032f, 1.913318f, 0.199276f,
+ 0.988825f, 0.972115f, 0.191956f, 0.977131f, 0.951418f,
+};
+
+static const float av1_simple_motion_search_split_mean_16[17] = {
+ 9.076768f, 8.974986f, 7.078364f, 6.926072f, 7.088739f, 6.936111f,
+ 7.096697f, 6.942841f, 7.114978f, 6.961046f, 3.865480f, 0.982632f,
+ 1.886023f, 1.912892f, 0.981492f, 1.926059f, 1.891233f,
+};
+
+static const float av1_simple_motion_search_split_std_16[17] = {
+ 1.922965f, 1.925609f, 1.851980f, 1.847558f, 1.848410f, 1.843990f,
+ 1.843931f, 1.839582f, 1.840304f, 1.836144f, 1.760042f, 0.130639f,
+ 0.841086f, 0.833523f, 0.134780f, 0.840790f, 0.831309f,
+};
+
+static const float av1_simple_motion_search_split_mean_8[17] = {
+ 7.120238f, 6.957731f, 5.176309f, 4.889594f, 5.178396f, 4.886607f,
+ 5.195322f, 4.905566f, 5.198845f, 4.904745f, 3.648933f, 0.993198f,
+ 1.496831f, 1.520804f, 0.991864f, 1.489763f, 1.460761f,
+};
+
+static const float av1_simple_motion_search_split_std_8[17] = {
+ 1.698498f, 1.696000f, 1.629605f, 1.614641f, 1.632476f, 1.618831f,
+ 1.618352f, 1.603742f, 1.623089f, 1.609674f, 1.668587f, 0.082193f,
+ 0.759407f, 0.759684f, 0.089830f, 0.742797f, 0.730632f,
+};
+
+static const float *const av1_simple_motion_search_split_mean[5] = {
+ av1_simple_motion_search_split_mean_128,
+ av1_simple_motion_search_split_mean_64,
+ av1_simple_motion_search_split_mean_32,
+ av1_simple_motion_search_split_mean_16,
+ av1_simple_motion_search_split_mean_8,
+};
+
+static const float *const av1_simple_motion_search_split_std[5] = {
+ av1_simple_motion_search_split_std_128, av1_simple_motion_search_split_std_64,
+ av1_simple_motion_search_split_std_32, av1_simple_motion_search_split_std_16,
+ av1_simple_motion_search_split_std_8,
+};
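+
+// The tables above hold per-feature normalization constants: before
+// inference, each 17-dimensional feature vector is presumably standardized
+// as (f - mean) / std using the entries for the current block size. A
+// minimal sketch under that assumption (helper name and buffers are
+// illustrative, not library API):
+static inline void sms_normalize_features(const float *features,
+                                          const float *mean, const float *std,
+                                          int num_features, float *out) {
+  for (int i = 0; i < num_features; ++i) {
+    out[i] = (features[i] - mean[i]) / std[i];  // z-score per feature
+  }
+}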
+
+#define NUM_HIDDEN_LAYERS_128 1
+#define NUM_FEATURES_128 17
+#define NUM_LAYER_0_UNITS_128 20
+#define NUM_LOGITS_128 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_128[] = {
+ 0.24095f, -0.397761f, -0.388619f, -0.0629548f, -0.44577f, 0.688212f,
+ -0.20889f, -1.08227f, -0.0313894f, -0.615505f, -0.401839f, 0.40233f,
+ -0.171305f, 0.439803f, 1.58527f, -0.968535f, -1.29255f, 1.14846f,
+ 0.885777f, 0.116412f, -0.225704f, 0.316506f, 0.793951f, -0.63591f,
+ 0.097789f, -0.327027f, -0.778396f, -0.231667f, -0.9622f, 1.0044f,
+ 0.32594f, 0.179768f, -0.115529f, -0.499395f, -1.14727f, -1.26111f,
+ 0.269818f, -0.0882028f, -0.349107f, 0.100901f, 0.0249506f, 0.528929f,
+ 0.113961f, 0.929794f, 0.242494f, -0.122828f, -0.0477379f, 0.170659f,
+ 0.0500187f, 0.28859f, 0.78783f, 0.482412f, 0.795298f, 0.179517f,
+ 0.453911f, -0.298029f, -0.903332f, 0.510615f, 0.691994f, 0.433383f,
+ -0.140802f, -1.11635f, -0.547326f, 1.11318f, 0.71905f, 0.978538f,
+ 0.097444f, -0.0386012f, 0.713599f, 0.465164f, 0.391278f, -0.472864f,
+ 0.230224f, -0.279508f, 0.558192f, -0.468625f, 0.55995f, -0.57507f,
+ -1.39947f, -0.755819f, -1.04512f, -0.411552f, -0.830444f, -0.106571f,
+ -0.0972184f, 0.251842f, 0.269955f, 0.230492f, -0.290581f, -0.484799f,
+ 0.0151041f, 0.171047f, 0.829999f, -0.384581f, 0.220301f, -0.121687f,
+ 1.88848f, -0.482809f, -0.48185f, 1.34482f, -0.716438f, -0.284482f,
+ -1.78592f, -1.29333f, 0.886867f, 0.80106f, 0.456415f, 0.649095f,
+ 0.231093f, 0.361562f, 0.290018f, 0.128009f, -0.196343f, 0.0607802f,
+ 0.576761f, -0.0413836f, 0.0300984f, -0.318998f, 0.204434f, -0.712524f,
+ 0.833394f, -0.81168f, 0.765488f, -0.720973f, 1.12866f, -0.838694f,
+ 1.295f, -0.159127f, 1.05404f, 0.736519f, 0.248662f, 0.229233f,
+ 0.0434302f, 0.0551856f, 0.197862f, 0.354823f, -0.32429f, -0.227353f,
+ -0.132198f, -0.438118f, -0.210401f, -0.81046f, 0.653555f, 0.826737f,
+ 0.154235f, 0.228945f, 0.123089f, 0.614964f, -0.0940471f, -0.00676807f,
+ 0.24996f, 0.949233f, 0.746526f, -0.044474f, 0.386414f, 0.503221f,
+ 0.155133f, -0.698848f, -0.735356f, -0.255091f, 0.413235f, -0.335295f,
+ -0.145757f, 0.326299f, -0.602629f, -0.844474f, -0.346722f, -0.42598f,
+ -0.491016f, -0.447732f, -0.965366f, -0.0242841f, 0.836606f, -0.104877f,
+ 1.23236f, 0.683986f, 0.787005f, -0.0253437f, 1.2145f, 1.29554f,
+ -1.24302f, -0.229495f, 0.439415f, 0.885087f, -0.408704f, -0.119299f,
+ -0.0960972f, 0.60148f, 0.683271f, -0.057129f, -0.180295f, -0.264815f,
+ -0.363184f, 0.638271f, 0.631083f, -0.252899f, -0.164364f, -1.31274f,
+ 0.354408f, 0.0429172f, 0.371154f, -1.0978f, 0.0433642f, -0.467394f,
+ -0.706572f, 1.57198f, -0.0701271f, 1.93149f, -0.446267f, 1.4519f,
+ -1.29567f, 0.309978f, -0.878062f, 0.891494f, 0.364005f, -0.209611f,
+ -0.125927f, 0.184097f, 0.0629695f, -0.43375f, -0.0980562f, 1.08547f,
+ 0.578312f, 0.16566f, -0.198852f, -0.241854f, -0.523934f, -0.206037f,
+ -0.867721f, 1.00041f, 1.09848f, -2.12562f, -0.19992f, -0.186128f,
+ -0.03507f, 0.0484884f, 0.160856f, 0.10802f, -0.805141f, -1.06902f,
+ 0.290363f, 0.0222096f, -0.849266f, 0.112932f, 0.148682f, -0.0457585f,
+ 1.139f, 1.79141f, 0.194122f, -0.342508f, -0.403572f, 0.133678f,
+ 0.217553f, -0.263759f, 0.18441f, 0.254529f, 0.0471115f, 0.733178f,
+ -0.416205f, 0.441447f, -0.443335f, 0.725005f, -0.78946f, 0.71301f,
+ -0.644969f, 1.5445f, 0.365277f, -0.455775f, -0.365066f, 0.4742f,
+ -0.381714f, -0.545794f, -0.0464861f, -0.222768f, -0.0106466f, -0.069743f,
+ 0.0335566f, 0.378348f, -0.249663f, 0.922286f, 0.125711f, -0.894619f,
+ 0.444682f, 0.447893f, -1.98936f, -1.41978f, 0.0406667f, -0.199928f,
+ -0.199786f, 0.463481f, 0.334931f, -0.396222f, -0.0732259f, 0.796684f,
+ -0.140817f, -0.26878f, 0.194642f, 0.895784f, -0.369976f, -2.26981f,
+ -0.0791776f, -0.0492268f, 0.6715f, 0.281805f, 0.0156664f, -0.779785f,
+ 0.17743f, 0.188786f, -0.588077f, -0.359153f, 0.258319f, 0.881688f,
+ 0.846894f, 1.00292f, 0.838134f, 0.680632f, 0.273098f, -0.329261f,
+ 0.217757f, -0.506726f, -0.336523f, -0.695875f, -0.252006f, 0.751216f,
+ 0.334409f, -0.0151467f, 0.0885474f, 0.0973114f, -0.248754f, -0.263716f,
+ 0.369906f, -0.213749f, -0.0355395f, -0.137799f, 2.43233f, -0.944233f,
+ -0.745167f, 0.318558f, 0.316608f, 0.568678f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_128[] = {
+ 0.821344f, 1.11542f, -1.24172f, 1.03642f, 1.13511f,
+ 1.16414f, -0.278655f, -1.35558f, -1.26788f, -1.63189f,
+ -0.323271f, 1.21319f, -0.888415f, 0.987145f, -1.16767f,
+ 0.255833f, -0.1392f, 1.43265f, -1.54952f, 1.65159f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_128[] = {
+ 0.3565753f, 0.5490161f, -1.015597f, 0.565366f, 0.751604f,
+ 0.922747f, -1.931846f, 1.759353f, -0.7362949f, 0.5707034f,
+ -1.092127f, 0.936767f, 2.034499f, 2.08148f, 0.9509507f,
+ -1.342504f, -0.834566f, 0.618184f, 0.844113f, 1.182693f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_128[] = {
+ 1.819351f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_128 = {
+ NUM_FEATURES_128,
+ NUM_LOGITS_128,
+ NUM_HIDDEN_LAYERS_128,
+ {
+ NUM_LAYER_0_UNITS_128,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_128,
+ av1_simple_motion_search_split_logits_kernel_128,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_128,
+ av1_simple_motion_search_split_logits_bias_128,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_128
+#undef NUM_FEATURES_128
+#undef NUM_LAYER_0_UNITS_128
+#undef NUM_LOGITS_128
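+
+// Each NN_CONFIG in this file bundles one small fully connected model, in
+// initializer order: feature count, logit count, hidden-layer count,
+// per-layer unit counts, then the kernel and bias arrays per layer. libaom
+// evaluates these with av1_nn_predict(); the sketch below is an illustrative
+// stand-in for the one-hidden-layer case, assuming row-major kernels
+// (one row of weights per output node), a ReLU hidden activation, and raw
+// (linear) logits. The helper name is an assumption.
+static inline void sms_nn_forward_1layer(
+    const float *input, int num_in, const float *hidden_kernel,
+    const float *hidden_bias, int num_hidden, const float *logits_kernel,
+    const float *logits_bias, int num_logits, float *logits) {
+  float hidden[64];  // enough for every model in this file (<= 32 units)
+  for (int h = 0; h < num_hidden; ++h) {
+    float acc = hidden_bias[h];
+    for (int i = 0; i < num_in; ++i)
+      acc += hidden_kernel[h * num_in + i] * input[i];
+    hidden[h] = acc > 0.0f ? acc : 0.0f;  // ReLU
+  }
+  for (int o = 0; o < num_logits; ++o) {
+    float acc = logits_bias[o];
+    for (int h = 0; h < num_hidden; ++h)
+      acc += logits_kernel[o * num_hidden + h] * hidden[h];
+    logits[o] = acc;
+  }
+}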
+
+#define NUM_HIDDEN_LAYERS_64 1
+#define NUM_FEATURES_64 17
+#define NUM_LAYER_0_UNITS_64 24
+#define NUM_LOGITS_64 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_64[] = {
+ -1.40663f, -0.851503f, -0.0613111f, 0.741591f, 0.302754f,
+ 0.184001f, 0.0474853f, 0.371096f, 0.0541624f, 0.381508f,
+ 0.355427f, 0.0428822f, 0.154916f, -0.00490099f, 0.025484f,
+ 0.0208921f, 0.140596f, -0.292525f, -0.459067f, -0.081393f,
+ 0.109824f, -0.290183f, 0.720236f, 0.385835f, -0.150643f,
+ -0.078518f, 0.0979819f, -0.102135f, 0.137152f, -0.0786457f,
+ 0.0171441f, 0.991338f, -0.546583f, -1.0714f, -0.0842851f,
+ 0.244072f, 0.427379f, 0.146775f, -0.921613f, -0.912093f,
+ 0.393566f, -0.232375f, 0.19963f, 0.312355f, 0.55659f,
+ -0.104714f, -0.137563f, 0.0985237f, 0.0788307f, -0.225514f,
+ 0.0228832f, -0.288733f, -0.00737685f, -0.711657f, -0.256796f,
+ 0.0869605f, 0.583977f, 0.384306f, 1.46692f, -0.741126f,
+ -0.21105f, -0.276604f, -0.0151463f, -0.0227997f, -0.0403232f,
+ 0.044122f, 0.0185784f, -0.0451951f, 0.00489513f, -0.387131f,
+ 0.0966724f, -0.599174f, -0.00243351f, -0.21439f, 0.302043f,
+ 0.130334f, -0.191251f, 0.863261f, -1.50112f, 0.00901057f,
+ 0.000324294f, -0.0572545f, 0.0117685f, -0.0734682f, -0.0570435f,
+ -0.126253f, 1.2313f, -0.328267f, 0.211788f, -0.175438f,
+ -0.0419298f, 0.166447f, -0.178739f, -0.326221f, -0.0439188f,
+ 1.01182f, -0.390678f, -0.426343f, 0.0944665f, -0.225042f,
+ -0.183344f, 0.0500763f, -0.377393f, -0.673401f, -0.436907f,
+ -0.00366876f, -0.363412f, 0.195194f, 0.250248f, -0.397193f,
+ -0.0917222f, -0.0221579f, 1.7693f, -0.0694484f, -0.0410764f,
+ -0.134571f, -0.159992f, -0.170359f, -0.249333f, -0.128056f,
+ -0.617054f, -0.808701f, -0.540642f, 0.396391f, 0.147787f,
+ 0.346916f, 0.709852f, 0.116064f, 0.0509731f, 0.073713f,
+ -0.365082f, -1.09287f, -0.618214f, 0.20545f, 0.126161f,
+ -0.140012f, 0.62592f, 0.316326f, -0.392765f, -0.15934f,
+ 0.337617f, -0.41669f, -0.295225f, 0.0602025f, -0.0150657f,
+ -0.319629f, 0.783729f, -0.0661199f, -0.362657f, 0.390042f,
+ -0.043614f, -0.0414596f, 0.121155f, -0.309775f, -0.284761f,
+ -0.243932f, 0.279855f, -0.266823f, 0.734824f, -0.164028f,
+ 0.261776f, -0.105585f, 0.10733f, -0.180469f, 1.18875f,
+ -1.12836f, -0.173008f, 0.150221f, 0.111598f, 0.148306f,
+ -1.2833f, -1.06346f, 0.233546f, 0.16432f, 0.00142378f,
+ 0.340574f, -0.0140885f, 0.634761f, -0.122096f, 0.821487f,
+ 0.421424f, -0.0256687f, -0.035503f, -0.0453547f, -0.0215179f,
+ -0.0671277f, -0.0486862f, -0.962761f, -0.208383f, 0.109573f,
+ -0.210668f, -0.176485f, 0.421279f, 0.41605f, 0.342084f,
+ 0.619364f, 0.103718f, -0.00341643f, 0.00266677f, 0.249089f,
+ -0.22848f, -0.0368968f, 1.12092f, -0.64912f, -0.456579f,
+ 0.477823f, 0.418345f, 1.41515f, 0.0936279f, 0.886155f,
+ -0.785656f, -0.217109f, -0.561829f, -0.286435f, -0.884068f,
+ -0.148839f, -0.282848f, 0.0683745f, 0.0962815f, -0.111975f,
+ 0.0509158f, -0.211274f, 0.744909f, -0.8982f, 0.315232f,
+ -0.78624f, 0.598387f, -0.530952f, 0.677357f, 0.0371339f,
+ 0.99209f, -0.681899f, -0.291416f, -0.224822f, -0.26049f,
+ -0.0436525f, -0.380004f, -0.27187f, 0.534779f, 0.717939f,
+ 0.418197f, -0.152539f, -0.0684039f, -0.186308f, -0.0653121f,
+ 0.194145f, -0.196367f, 0.256997f, -0.726269f, -0.307672f,
+ -0.153362f, 0.450827f, 0.708842f, -0.0667079f, 0.555564f,
+ 0.0486892f, 0.0715072f, -0.7211f, -0.849797f, 0.0650271f,
+ 1.2747f, -0.646738f, -0.53042f, 0.182197f, 0.928203f,
+ 0.180621f, -0.00640791f, -0.171416f, 0.092688f, -0.391275f,
+ -0.0650657f, 0.0843773f, 0.170824f, 0.378085f, 0.0596657f,
+ 0.844398f, -1.3083f, -1.27828f, -0.199179f, 0.557855f,
+ 0.241479f, 0.385804f, 0.169533f, -0.0028072f, 0.0538041f,
+ 0.00136234f, 0.0130481f, 0.0349449f, -0.0366494f, -0.000474055f,
+ 0.437956f, 0.286724f, -0.298187f, 0.461967f, 0.43065f,
+ -0.0877194f, -0.19133f, 0.379121f, -0.687751f, -1.64077f,
+ -0.375191f, -0.336836f, -0.323904f, -0.101859f, 0.0126672f,
+ -0.346332f, 0.112303f, -0.863336f, 0.155538f, 0.366509f,
+ -0.0976829f, 0.635278f, -0.681967f, -0.527729f, 0.591839f,
+ 0.366678f, 0.189981f, 0.0208007f, -0.565809f, 0.70183f,
+ -0.282844f, -0.327485f, 0.347243f, -1.13014f, -0.373378f,
+ -0.514978f, 0.662994f, -0.144931f, 0.1402f, -0.820049f,
+ 0.711498f, 0.681156f, 1.06515f, -0.423409f, -0.0392664f,
+ 0.0675396f, -0.0508602f, 0.0431443f, 0.0212639f, -0.0279887f,
+ -0.62611f, -0.202064f, 0.701934f, 1.28452f, -0.00858481f,
+ -0.517249f, 0.0615832f, -0.260215f, 0.0949119f, -0.28423f,
+ -0.39573f, -0.0574246f, -0.318658f, 0.0601775f, -0.0629386f,
+ -0.134208f, 0.111686f, -0.23355f, 0.078667f, 0.741023f,
+ 0.828523f, -0.345067f, -0.315135f, -0.0957154f, 0.522825f,
+ -0.190057f, -0.473789f, -0.390489f, 0.200677f, -0.0271802f,
+ 0.110336f, 0.493302f, 0.663126f, 0.570148f, -0.380042f,
+ -0.437349f, -0.660884f, 0.301908f, 0.0644179f, 0.172494f,
+ 0.461917f, 0.330938f, -0.140041f, -0.0430205f, -1.51003f,
+ -0.410984f, -0.182161f, 0.0235313f, -0.364849f, 0.154183f,
+ -0.592465f, 0.272701f, 0.192389f, -0.0497777f, -0.924467f,
+ -0.179513f, -0.592217f, 0.436363f, -0.0716164f, 0.189094f,
+ -0.574697f, -0.304303f, 0.326441f, -0.0865553f, 0.735948f,
+ 0.266912f, 0.435824f, -0.123322f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_64[] = {
+ -1.19333f, 1.01834f, -1.10844f, 0.0454873f, -1.45506f, 0.580864f,
+ -0.040979f, -0.505681f, -1.15072f, 0.692697f, -0.520812f, -0.479384f,
+ 0.529652f, 0.507252f, -1.08619f, 0.0586375f, 0.0929614f, -0.46753f,
+ -0.701857f, -0.362933f, -0.291983f, -0.133933f, -0.0131351f, -0.267582f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_64[] = {
+ -3.32501f, 0.43082f, -1.060692f, 1.328908f, 0.8892894f, 0.6488833f,
+ -1.096516f, -0.664786f, -1.301339f, 0.508805f, -2.128406f, -0.757304f,
+ 0.383839f, 0.694763f, -0.591725f, 0.770385f, 1.021594f, 0.589181f,
+ -0.76238f, 1.488826f, 0.709135f, -0.575738f, 0.26421759f, -0.2484219f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_64[] = {
+ 0.699037f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_64 = {
+ NUM_FEATURES_64,
+ NUM_LOGITS_64,
+ NUM_HIDDEN_LAYERS_64,
+ {
+ NUM_LAYER_0_UNITS_64,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_64,
+ av1_simple_motion_search_split_logits_kernel_64,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_64,
+ av1_simple_motion_search_split_logits_bias_64,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_64
+#undef NUM_FEATURES_64
+#undef NUM_LAYER_0_UNITS_64
+#undef NUM_LOGITS_64
+
+#define NUM_HIDDEN_LAYERS_32 1
+#define NUM_FEATURES_32 17
+#define NUM_LAYER_0_UNITS_32 20
+#define NUM_LOGITS_32 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_32[] = {
+ -0.980626f, -0.946611f, 0.103761f, 0.408899f, 0.498149f,
+ 0.0490161f, 0.253279f, 0.332029f, 0.00367441f, 0.364401f,
+ -0.236433f, 0.0592119f, -0.0978848f, 0.159733f, -0.018052f,
+ -1.10726f, 1.16167f, -0.244982f, -0.147819f, -0.147095f,
+ 0.111404f, -0.349502f, 0.441178f, 0.0984191f, -0.135537f,
+ -0.0423312f, 0.0123079f, 0.358012f, -0.266796f, 0.0125811f,
+ 0.196563f, 0.337093f, -1.07266f, -1.25134f, 0.57337f,
+ -0.521717f, 0.259824f, 0.537383f, -0.463688f, -0.336128f,
+ 0.373385f, 0.483443f, -0.229293f, -0.33373f, -0.656021f,
+ 0.768647f, 0.179279f, 0.315415f, 0.187749f, 1.07839f,
+ 0.0626629f, -0.230299f, 0.662606f, -0.414154f, 0.459334f,
+ -0.6312f, 0.427704f, -0.249849f, 0.701056f, -0.707969f,
+ 0.057401f, 0.620434f, 0.665748f, -0.501356f, -0.230685f,
+ 0.0722371f, -0.0988625f, -0.114035f, -0.653799f, 0.571353f,
+ 0.268276f, 1.13251f, -1.0695f, -0.225607f, -0.984355f,
+ -0.42213f, 0.300422f, 1.21492f, -0.139931f, -0.000726004f,
+ 0.045964f, -0.0817352f, -0.0278813f, -0.0102341f, -0.0144087f,
+ -0.475882f, 1.20682f, -0.359919f, 0.277189f, -0.166401f,
+ 0.599211f, -0.129872f, 0.574211f, -0.247573f, 0.824405f,
+ -1.53329f, -0.202151f, -0.328698f, -0.516322f, -0.281416f,
+ -0.383651f, -0.252862f, -0.43185f, 0.456802f, -0.430055f,
+ -0.55245f, -0.6884f, -0.541456f, -0.281376f, 1.10425f,
+ -0.140706f, 1.59816f, -0.0343895f, -0.00920039f, -0.0307667f,
+ 0.0560132f, -0.0340302f, -0.10848f, 0.0593314f, -0.951795f,
+ 0.876831f, -1.00548f, -0.566244f, 0.430061f, 1.10109f,
+ -0.634212f, -0.0755369f, -0.108953f, 1.03191f, 0.109036f,
+ -0.0415309f, 0.0681162f, -0.0611775f, -0.0231938f, 0.0973158f,
+ -0.0558169f, -0.823484f, -0.918509f, 0.16756f, 0.27087f,
+ 0.286074f, 0.174069f, 0.1304f, 0.386074f, 0.433953f,
+ 0.0291467f, -1.74087f, 0.0296094f, -0.00793714f, -0.13041f,
+ 0.00990992f, -0.0137848f, -0.0742606f, -0.251029f, -0.645316f,
+ 0.640029f, 0.550607f, 0.470097f, 0.549451f, -0.285723f,
+ -0.164759f, -0.128166f, -0.391496f, -0.80287f, 0.0769472f,
+ 1.34391f, 0.0215005f, 0.0669497f, 0.131919f, 0.291674f,
+ 0.0952889f, -0.677953f, -0.364054f, 0.144823f, 0.246198f,
+ -0.12393f, 0.363661f, 0.215091f, -0.239658f, 0.18491f,
+ 0.118703f, 0.0064156f, 1.38619f, -1.3845f, 0.0567323f,
+ 1.20812f, -0.720374f, -1.92158f, -1.48657f, 0.335601f,
+ 0.409379f, 0.373618f, 0.231274f, 0.292194f, 0.368619f,
+ 0.2398f, 0.473579f, 0.83402f, -0.0133751f, -0.00344358f,
+ 2.20688e-05f, 0.00836757f, 0.00405377f, 0.0110539f, -0.260154f,
+ 0.192112f, -0.666986f, 0.302875f, -0.113302f, 0.17882f,
+ -0.221493f, 0.146161f, -0.448697f, 0.584187f, 0.122109f,
+ 0.989981f, -1.14706f, -0.734042f, 0.0638213f, 0.213357f,
+ 0.068543f, -0.808558f, 0.404741f, 0.808313f, 1.57523f,
+ -0.113448f, 0.254102f, -0.350065f, -0.615f, 0.0753549f,
+ -0.540936f, -0.0250732f, -0.225681f, -0.161384f, 0.0128342f,
+ -0.0933368f, -0.286904f, 0.130133f, -0.874747f, 0.392585f,
+ -0.493135f, 0.169708f, 0.0909804f, 1.89921f, -0.469954f,
+ 0.65165f, -0.953401f, -0.21595f, -0.37479f, 0.0451146f,
+ 0.0234621f, -0.0596903f, -0.0682308f, -0.0830426f, 0.130011f,
+ -0.409141f, 0.0627038f, -0.581148f, -0.513922f, 0.631676f,
+ 0.0637034f, 0.0539081f, 0.0638872f, 0.515863f, -0.0123463f,
+ 0.177238f, 0.279506f, -0.930345f, 1.23726f, 0.202851f,
+ 0.708792f, -0.445086f, -0.0267075f, -0.913822f, -0.0714978f,
+ -0.281107f, -0.0770565f, -0.23086f, -0.165893f, -0.319683f,
+ 0.216235f, -0.490999f, 2.04841f, -0.0524071f, -0.239043f,
+ -0.0526375f, 0.023002f, -0.132685f, -0.155354f, -0.186503f,
+ -0.904296f, 0.166478f, 0.063268f, -0.302842f, -0.27179f,
+ -0.428299f, 0.50193f, 0.480717f, -0.864275f, 0.317096f,
+ 0.40698f, 0.0286107f, 0.189432f, -0.0374374f, 0.0671728f,
+ 0.203681f, -0.457959f, -0.155776f, 0.340948f, 0.542841f,
+ 0.342675f, -0.000952399f, 0.470957f, 0.744418f, -1.11763f,
+ -0.658812f, -0.044832f, 0.0688237f, -0.357766f, 0.428662f,
+ -0.087152f, -0.291903f, 0.373244f, -0.587853f, 0.415895f,
+ -0.535694f, 0.621785f, -0.143648f, 0.0451373f, 0.00068827f,
+ 1.84432f, -1.26239f, -0.432087f, -0.152307f, 0.0293551f,
+ 0.184744f, -0.0173156f, -0.00572154f, -0.0305062f, -0.0900071f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_32[] = {
+ 0.160011f, 0.903856f, -0.13738f, 0.358221f, -0.0906044f,
+ -0.606558f, -0.0215651f, -0.03377f, -1.67017f, -0.144554f,
+ -0.201482f, -0.87719f, 0.639815f, -0.51976f, -0.309922f,
+ -1.33421f, 0.721328f, -0.889354f, -1.7158f, -0.285963f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_32[] = {
+ -0.2745374f, 0.333548f, -0.2437388f, 0.288009f, 0.55635f,
+ 0.4560176f, 0.2970518f, 0.391192f, 1.311854f, -0.231219f,
+ -0.2968651f, -1.819984f, 0.2775824f, 0.28929857f, 0.419126f,
+ -0.32868411f, -0.916399f, -0.1921077f, -0.617489f, 0.637953f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_32[] = {
+ 0.208473f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_32 = {
+ NUM_FEATURES_32,
+ NUM_LOGITS_32,
+ NUM_HIDDEN_LAYERS_32,
+ {
+ NUM_LAYER_0_UNITS_32,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_32,
+ av1_simple_motion_search_split_logits_kernel_32,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_32,
+ av1_simple_motion_search_split_logits_bias_32,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_32
+#undef NUM_FEATURES_32
+#undef NUM_LAYER_0_UNITS_32
+#undef NUM_LOGITS_32
+
+#define NUM_HIDDEN_LAYERS_16 1
+#define NUM_FEATURES_16 17
+#define NUM_LAYER_0_UNITS_16 20
+#define NUM_LOGITS_16 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_16[] = {
+ 0.0136957f, 0.182135f, -0.583394f, 0.0556956f, 0.211152f,
+ 0.168234f, -0.694203f, -0.678216f, 0.289943f, 1.00014f,
+ -0.0427784f, -0.0427538f, -0.0276009f, -0.00133608f, 0.0901944f,
+ 0.0674892f, 0.104068f, -0.308582f, -0.43596f, 0.855997f,
+ -0.223414f, 0.0390026f, 0.366492f, 0.216065f, -0.386863f,
+ -0.148823f, -0.297022f, 0.0529546f, -0.202885f, 1.26471f,
+ -0.861163f, -0.0949431f, 0.573627f, -0.00277083f, -0.616063f,
+ -0.626927f, 0.371583f, -0.411743f, 0.173387f, -0.209734f,
+ 0.293697f, -0.260714f, 0.442728f, -0.594486f, 1.38987f,
+ 0.208025f, -0.0433776f, 0.01173f, 0.921766f, -0.168379f,
+ 0.000697326f, 0.209967f, -0.304577f, 0.149551f, -0.196658f,
+ 0.389251f, -0.449106f, -0.456329f, 0.669073f, -0.163806f,
+ 0.083348f, -0.0783998f, 0.0678355f, 0.0510435f, 0.103964f,
+ 0.104537f, -0.778093f, -1.0641f, -0.626102f, -2.02131f,
+ 0.159591f, 0.254161f, -0.000362642f, 0.289859f, 0.192713f,
+ 0.139801f, -0.0251327f, 0.164002f, 1.22892f, -0.0852193f,
+ 0.0769487f, 0.0296408f, -0.0418688f, 0.0936023f, 0.0448523f,
+ 0.674015f, -0.0732944f, 0.313575f, -0.593432f, 0.642067f,
+ -1.06063f, 0.468223f, -0.769085f, -0.173798f, -0.175663f,
+ 0.692808f, 0.00753295f, -0.123327f, -0.0234937f, -0.0923153f,
+ 0.0216917f, -0.0690157f, -0.397488f, 0.426628f, 0.264475f,
+ 0.342074f, -0.139817f, 0.215915f, 0.422544f, -0.321102f,
+ 0.0355587f, 0.460193f, 0.0315326f, 0.080556f, -0.0256533f,
+ -0.0857874f, -0.488283f, -0.299653f, -0.245987f, 0.104383f,
+ 0.203731f, 0.328734f, 0.668104f, -0.586909f, -0.501335f,
+ -0.661292f, -0.359811f, 0.00951363f, 0.816315f, -0.0124104f,
+ 0.0545827f, 0.089863f, 0.0125486f, 0.043609f, -0.0259544f,
+ 0.0123911f, 0.12557f, -0.539875f, -0.0556721f, 0.16532f,
+ 0.265834f, -0.384171f, 0.646496f, 0.366147f, -0.111272f,
+ 0.262096f, -0.0845724f, 0.382724f, 0.165783f, 0.1025f,
+ 0.392988f, 0.290525f, 0.038659f, 0.540269f, -0.485586f,
+ -0.273065f, -0.154052f, -0.0896895f, -0.35394f, 0.193214f,
+ -0.423728f, 0.654576f, -0.373321f, 0.814914f, 0.026278f,
+ -0.0328304f, -0.220913f, -0.0442121f, 0.487545f, -0.509537f,
+ -0.777581f, -1.23886f, 0.223482f, 0.206009f, 0.20391f,
+ 0.194628f, 0.226762f, 0.171609f, -0.219037f, 0.557892f,
+ -0.312011f, 1.27709f, 0.064013f, 0.105384f, 0.0493933f,
+ 0.074059f, -0.0100078f, -0.0176888f, -0.440005f, 0.302922f,
+ -0.197456f, 0.296128f, -0.326647f, 0.305323f, -0.30696f,
+ 0.201951f, -0.15874f, -0.793042f, 0.0197254f, 0.0569867f,
+ -0.0295468f, -0.0215012f, 0.025855f, -0.0196102f, 0.215558f,
+ -0.253069f, 0.298469f, 0.261269f, 0.435305f, 0.0120354f,
+ -0.384789f, -0.2772f, 0.0366613f, -0.494994f, 0.149072f,
+ 1.32981f, -0.427717f, 0.43938f, -0.16375f, -0.444342f,
+ 0.548214f, 0.127955f, -1.24387f, 0.0863676f, 0.175071f,
+ 0.172673f, -0.0906204f, 0.444454f, -0.546669f, 0.215857f,
+ -0.100621f, 0.200699f, -0.0985915f, 0.134706f, -0.256396f,
+ 0.393427f, 0.119606f, -0.214278f, -0.0183637f, 0.194266f,
+ -0.238025f, 0.182203f, 0.599718f, 0.846933f, 0.0607852f,
+ -0.183434f, -0.723743f, -0.72414f, -0.124701f, 0.0227527f,
+ -0.0664636f, -0.0385867f, -0.0257377f, -0.149054f, 0.12077f,
+ 0.678029f, -0.624456f, 0.189644f, -0.518604f, 0.134397f,
+ -0.189777f, -0.309376f, -0.00377086f, 0.701132f, -0.170915f,
+ 0.00736111f, -0.121906f, 0.329136f, 0.165514f, 0.0328356f,
+ 0.171275f, 0.248619f, 0.247704f, -0.449933f, 0.0841684f,
+ 0.136982f, 0.122703f, -0.0169439f, -0.0726496f, 0.302648f,
+ -0.128556f, 0.0667425f, -0.289717f, -0.207532f, -1.20269f,
+ -0.68892f, 0.045259f, 0.0973945f, 0.0988314f, -0.944748f,
+ -0.180401f, 0.134331f, 0.033834f, 0.109023f, 0.265723f,
+ 0.38063f, -0.106518f, -0.0686953f, 0.3744f, -1.0957f,
+ 0.0302782f, 0.0515164f, 0.00188222f, 0.0014413f, -0.0404425f,
+ 0.0124618f, -0.0828645f, 0.506166f, -0.776352f, -0.405138f,
+ -0.123887f, 0.0732116f, 0.379928f, 0.604524f, -0.492317f,
+ 0.439191f, 0.0744193f, 0.389101f, 0.0604518f, 0.0943165f,
+ 0.0339942f, 0.0917975f, 0.0161988f, 0.512227f, 0.538021f,
+ -0.411495f, 0.307281f, 0.33746f, -0.218639f, 0.265742f,
+ 0.39738f, -0.12442f, 0.125236f, -0.0845223f, -0.150396f,
+ 0.0334878f, -0.00391915f, 0.0406864f, -0.0487059f, 0.0377073f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_16[] = {
+ 0.0535976f, -0.0130279f, 0.150146f, -0.511132f, -0.357698f,
+ 0.6719f, -1.27877f, -0.0208048f, 0.0961914f, 0.263603f,
+ 0.704574f, -1.48998f, 0.728063f, 0.941829f, -0.199981f,
+ 0.797802f, -0.29816f, -0.60894f, -0.116624f, -1.16723f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_16[] = {
+ 0.343153f, -0.2110482f, -0.487199f, 0.3274144f, -2.1975f,
+ -0.6051438f, 0.1901127f, 0.4741924f, -0.24029f, -0.185018f,
+ -0.652635f, 2.57714f, -0.31033031f, -0.307222f, 0.329035f,
+ -0.430181f, 0.3429f, 0.742292f, 0.3269808f, 0.4142165f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_16[] = {
+ -0.783658f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_16 = {
+ NUM_FEATURES_16,
+ NUM_LOGITS_16,
+ NUM_HIDDEN_LAYERS_16,
+ {
+ NUM_LAYER_0_UNITS_16,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_16,
+ av1_simple_motion_search_split_logits_kernel_16,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_16,
+ av1_simple_motion_search_split_logits_bias_16,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_16
+#undef NUM_FEATURES_16
+#undef NUM_LAYER_0_UNITS_16
+#undef NUM_LOGITS_16
+
+#define NUM_HIDDEN_LAYERS_8 1
+#define NUM_FEATURES_8 17
+#define NUM_LAYER_0_UNITS_8 20
+#define NUM_LOGITS_8 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_8[] = {
+ 0.079443f, -1.04068f, 0.336819f, -0.20901f, 0.796251f,
+ 0.181066f, 0.0118876f, -0.207145f, 0.250671f, -0.402119f,
+ -0.0847227f, 1.88683f, 0.303469f, 0.0718458f, 0.0338589f,
+ 0.158896f, 0.0540238f, -0.385426f, 0.955925f, 0.424506f,
+ 0.492584f, -0.795058f, -0.248667f, -0.905349f, -0.316989f,
+ 0.545471f, 0.63762f, -0.232613f, -0.238947f, -0.395338f,
+ -0.322673f, -0.0761563f, -0.125357f, 0.0694415f, -0.371599f,
+ 0.358387f, -0.486841f, 0.403863f, -0.0295666f, 0.283074f,
+ -0.424396f, 0.156318f, -0.685355f, 0.6663f, 0.337949f,
+ 0.273198f, 0.517448f, 0.458911f, 0.157252f, 0.692096f,
+ 0.64965f, -0.23987f, -1.08431f, -0.252475f, -0.332614f,
+ -0.712291f, -0.380973f, 0.460545f, 0.48936f, 0.337601f,
+ 0.489223f, 1.65336f, -0.223585f, 0.17367f, -0.235057f,
+ -0.456773f, 0.327877f, -0.221192f, -0.940151f, -1.06616f,
+ 0.687084f, -0.109973f, 0.106636f, 0.445895f, 0.163432f,
+ 0.378306f, 0.201902f, 0.176811f, 0.693082f, 1.62156f,
+ -0.178346f, 0.455175f, 1.61943f, 0.231376f, 0.0890932f,
+ -0.889693f, -1.03298f, 0.778196f, -0.0289539f, 0.137848f,
+ 0.18707f, 0.171889f, 0.119157f, 0.24893f, -0.313628f,
+ 0.00250735f, -0.0758209f, 0.272974f, -0.229825f, 2.47926f,
+ -0.0354665f, 0.175366f, 0.0411555f, -1.52149f, -0.0258663f,
+ 0.253027f, -0.0520839f, -0.0189782f, 0.362387f, -0.371154f,
+ 0.622929f, 0.0447056f, 0.242529f, -0.168391f, 0.308935f,
+ -0.117294f, 2.16307f, 0.0673638f, 0.080771f, -0.460779f,
+ -0.940176f, 0.473266f, -0.0125302f, 0.475145f, -0.218187f,
+ 0.43258f, -0.0380196f, 0.413607f, -0.110856f, -1.52076f,
+ 0.0896812f, 0.246636f, -0.0612008f, 0.189583f, 0.0106902f,
+ -0.158403f, -0.629377f, -0.0634279f, -0.0864584f, -0.226568f,
+ -0.286234f, -0.0721132f, -0.43702f, 0.113702f, 0.433372f,
+ 0.743396f, 0.14312f, 0.29914f, 0.801188f, 0.7609f,
+ 0.385046f, 0.480314f, 0.171119f, -1.59058f, -1.18853f,
+ 0.150676f, 0.408123f, -0.00677924f, 0.398145f, 0.0914611f,
+ 0.176945f, 0.0677457f, 0.316478f, 0.998219f, -0.22618f,
+ 0.0756793f, -0.0156674f, 0.105716f, 0.0496245f, -0.0827133f,
+ -0.423119f, -0.161033f, 0.212962f, -0.234453f, 0.743366f,
+ 1.04108f, 0.0597604f, -0.285993f, -0.114829f, -0.557364f,
+ -0.840051f, 0.326509f, -0.192508f, -0.141769f, 0.370626f,
+ -0.126353f, 0.00672923f, 0.493623f, -0.852076f, 0.466798f,
+ -0.226436f, 0.259268f, -0.452662f, 0.0721126f, 0.0198245f,
+ 0.2048f, 0.02506f, 0.316194f, 0.814651f, 1.01288f,
+ -0.569607f, -0.0838994f, 1.37146f, -0.613135f, 0.441761f,
+ -0.643901f, 0.364269f, -0.147177f, 0.338001f, -0.332376f,
+ 0.518875f, -0.628964f, -0.291889f, -0.050736f, 0.108047f,
+ 1.05673f, 0.0479492f, 0.466756f, -0.0867334f, -0.0355575f,
+ 0.57626f, -0.227583f, -0.146421f, 0.0990489f, 0.117351f,
+ -0.103858f, -0.0336936f, 0.0201903f, -0.0766383f, -0.010211f,
+ 0.0400779f, 0.0725462f, 0.137142f, 0.478261f, 0.287869f,
+ 0.0882359f, -0.739754f, -0.853521f, -0.43703f, 0.316856f,
+ 0.27593f, 0.312149f, 0.175575f, 0.441839f, 0.264325f,
+ 0.0148051f, -0.005559f, 0.373176f, 0.933701f, -0.0197615f,
+ 0.0219723f, -0.0559883f, -0.103456f, -0.0323009f, 0.0773202f,
+ -0.390838f, 0.855488f, -0.596525f, -0.249093f, 0.124262f,
+ 0.220172f, 0.0552478f, 1.04041f, -0.960992f, -0.495255f,
+ -0.211612f, 0.350007f, -0.238998f, -0.0265068f, 0.384686f,
+ -0.0815808f, -0.0570019f, 0.123903f, -0.485114f, -0.00282573f,
+ -0.0649603f, 0.163719f, -0.469479f, -0.439713f, 0.0602562f,
+ -0.527993f, -0.111458f, 2.48686f, -0.180723f, 0.0553895f,
+ 0.0560679f, -0.0978928f, -0.216063f, 0.089457f, -1.5602f,
+ -1.62332f, -0.147388f, 0.736155f, 0.440409f, 0.243519f,
+ 0.0622638f, 0.522932f, 0.109686f, 0.422849f, 0.510589f,
+ 1.01116f, 0.174019f, 0.0191171f, -0.0717751f, -0.0068308f,
+ 0.172932f, -0.834888f, -0.635788f, 0.32012f, 0.298656f,
+ 0.274309f, -0.155456f, 0.1755f, -0.175171f, 0.343498f,
+ -0.122832f, -0.107696f, 0.279924f, -0.797633f, -0.344658f,
+ 0.162669f, 0.389092f, 0.644479f, -0.635216f, -0.181868f,
+ 0.0579244f, -0.0568976f, 0.433003f, -0.591067f, 0.71013f,
+ -0.165515f, 0.225725f, -0.358156f, 0.0541944f, 1.95485f,
+ -0.315223f, 0.61537f, -0.0401568f, 0.22811f, 0.271147f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_8[] = {
+ 1.63441f, -0.616459f, -0.437775f, -0.71669f, 1.56616f, 2.28109f, 1.64054f,
+ -1.51476f, 0.0274108f, 0.935156f, -0.966329f, 0.906069f, 1.19954f, -1.25867f,
+ -1.7376f, -0.594211f, 0.322242f, 0.438631f, -1.01682f, 1.30032f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_8[] = {
+ -0.463187f, 0.2936127f, 0.16762f, -0.1663271f, -0.292418f,
+ -0.421457f, -0.378265f, 1.053049f, 0.32432879f, -0.49775575f,
+ 0.427357f, -0.239251f, -0.1631546f, 0.335468f, 0.255371f,
+ 0.276901f, -0.665683f, -0.7021493f, 0.381513f, -0.1339761f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_8[] = {
+ -1.739754f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_8 = {
+ NUM_FEATURES_8,
+ NUM_LOGITS_8,
+ NUM_HIDDEN_LAYERS_8,
+ {
+ NUM_LAYER_0_UNITS_8,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_8,
+ av1_simple_motion_search_split_logits_kernel_8,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_8,
+ av1_simple_motion_search_split_logits_bias_8,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_8
+#undef NUM_FEATURES_8
+#undef NUM_LAYER_0_UNITS_8
+#undef NUM_LOGITS_8
+
+static const NN_CONFIG *const av1_simple_motion_search_split_nn_config[5] = {
+ &av1_simple_motion_search_split_nn_config_128,
+ &av1_simple_motion_search_split_nn_config_64,
+ &av1_simple_motion_search_split_nn_config_32,
+ &av1_simple_motion_search_split_nn_config_16,
+ &av1_simple_motion_search_split_nn_config_8,
+};
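+
+// The five configs are indexed by square block size, largest first
+// (128, 64, 32, 16, 8), in step with the per-size mean/std tables above.
+// A minimal illustrative lookup (helper name is an assumption):
+static inline int sms_split_model_idx(int block_width) {
+  switch (block_width) {
+    case 128: return 0;
+    case 64: return 1;
+    case 32: return 2;
+    case 16: return 3;
+    case 8: return 4;
+    default: return -1;  // no split model for this size
+  }
+}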
+
+// Model based on simple_motion_search for pruning rectangular partitions.
+// Thresholds. The first index level is aggressiveness, the second is frame
+// resolution, and the third is bsize.
+static const float av1_simple_motion_search_prune_rect_thresh[4][3][5] = {
+ // Aggressiveness = 0
+ {
+ // Lowres
+ { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+ 0.000961189195907f, 0.0f },
+ // Midres
+ { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+ 0.000961189195907f, 0.0f },
+ // Hdres
+ { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+ 0.000961189195907f, 0.0f },
+ },
+ // Aggressiveness = 1
+ {
+ // Lowres
+ {
+ 0.000000f,
+ 0.116076f,
+ 0.049759f,
+ 0.057747f,
+ 0.006001f,
+ },
+ // Midres
+ {
+ 0.000000f,
+ 0.017380f,
+ 0.026077f,
+ 0.078111f,
+ 0.064477f,
+ },
+ // Hdres
+ {
+ 0.002994f,
+ 0.103093f,
+ 0.076408f,
+ 0.010456f,
+ 0.187211f,
+ },
+ },
+ // Aggressiveness = 2
+ {
+ // Lowres
+ {
+ 0.000000f,
+ 0.003111f,
+ 0.144294f,
+ 0.144884f,
+ 0.069924f,
+ },
+ // Midres
+ {
+ 0.000000f,
+ 0.013696f,
+ 0.055203f,
+ 0.152271f,
+ 0.078886f,
+ },
+ // Hdres
+ {
+ 0.030577f,
+ 0.082486f,
+ 0.040690f,
+ 0.140924f,
+ 0.067608f,
+ },
+ },
+ // Aggressiveness = 3
+ {
+ // Lowres
+ { 0.0f, 0.352338114654f, 0.171190796972f, 0.322629318068f,
+ 0.287219697095f },
+ // Midres
+ { 0.0f, 0.30938393361f, 0.271772875141f, 0.240627957104f,
+ 0.178833795641f },
+ // Hdres
+ { 0.285731215187f, 0.37521798723f, 0.142380566244f, 0.338288917819f,
+ 0.21329309279f },
+ },
+};
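+
+// Unlike the logit-valued split thresholds earlier in this file, these
+// entries are already probabilities, indexed as
+// thresh[aggressiveness][resolution][bsize]. A plausible use, given that
+// libaom provides av1_nn_softmax(), is to convert the prune-rect logits to
+// probabilities and prune a candidate whose probability falls below the
+// looked-up threshold. An illustrative numerically stable softmax (expf()
+// comes from <math.h>, included above; the helper name is an assumption):
+static inline void sms_softmax(const float *logits, int n, float *probs) {
+  float max_logit = logits[0];
+  for (int i = 1; i < n; ++i) {
+    if (logits[i] > max_logit) max_logit = logits[i];
+  }
+  float sum = 0.0f;
+  for (int i = 0; i < n; ++i) {
+    probs[i] = expf(logits[i] - max_logit);  // shift exponents for stability
+    sum += probs[i];
+  }
+  for (int i = 0; i < n; ++i) probs[i] /= sum;
+}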
+
+// Mean and std
+static const float av1_simple_motion_search_prune_rect_mean_128[25] = {
+ 13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f,
+ 10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f,
+ 12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f,
+ 12.152315f, 11.517566f, 11.465651f, 5.383040f, 0.757934f,
+ 4.012611f, 4.052191f, 0.853365f, 3.954503f, 3.944135f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_128[25] = {
+ 2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f,
+ 3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f,
+ 2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f,
+ 1.208679f, 0.353742f, 1.228122f, 1.211777f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_64[25] = {
+ 11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f,
+ 9.084122f, 8.559063f, 8.499496f, 8.095865f, 8.041795f,
+ 10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f,
+ 10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f,
+ 3.306144f, 3.351039f, 0.928582f, 3.319739f, 3.287726f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_64[25] = {
+ 2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f,
+ 3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f,
+ 2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f,
+ 1.081292f, 0.257521f, 1.112510f, 1.089404f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_32[25] = {
+ 9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f,
+ 7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f,
+ 8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f,
+ 2.751266f, 0.963302f, 2.716584f, 2.709725f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_32[25] = {
+ 1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f,
+ 1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f,
+ 1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f, 0.973824f,
+ 0.952221f, 0.188018f, 0.985295f, 0.946228f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_16[25] = {
+ 8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f,
+ 6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f,
+ 7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f,
+ 2.131698f, 0.981005f, 2.110868f, 2.106539f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_16[25] = {
+ 1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f,
+ 1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f,
+ 1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f,
+ 0.829935f, 0.136507f, 0.828972f, 0.808563f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_8[25] = {
+ 6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f,
+ 4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f,
+ 6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f,
+ 1.531762f, 0.989606f, 1.496581f, 1.484139f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_8[25] = {
+ 1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f,
+ 1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f,
+ 1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f,
+ 0.754040f, 0.101419f, 0.738239f, 0.729455f,
+};
+
+static const float *const av1_simple_motion_search_prune_rect_mean[5] = {
+ av1_simple_motion_search_prune_rect_mean_128,
+ av1_simple_motion_search_prune_rect_mean_64,
+ av1_simple_motion_search_prune_rect_mean_32,
+ av1_simple_motion_search_prune_rect_mean_16,
+ av1_simple_motion_search_prune_rect_mean_8,
+};
+
+static const float *const av1_simple_motion_search_prune_rect_std[5] = {
+ av1_simple_motion_search_prune_rect_std_128,
+ av1_simple_motion_search_prune_rect_std_64,
+ av1_simple_motion_search_prune_rect_std_32,
+ av1_simple_motion_search_prune_rect_std_16,
+ av1_simple_motion_search_prune_rect_std_8,
+};
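+
+// These 25-feature mean/std tables presumably play the same normalization
+// role for the prune-rect models that the 17-feature tables above play for
+// the split models; e.g. (illustrative, reusing the sketch helper defined
+// earlier):
+//
+//   sms_normalize_features(features,
+//                          av1_simple_motion_search_prune_rect_mean[idx],
+//                          av1_simple_motion_search_prune_rect_std[idx],
+//                          25, normalized);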
+
+#define NUM_HIDDEN_LAYERS_128 1
+#define NUM_FEATURES_128 25
+#define NUM_LAYER_0_UNITS_128 8
+#define NUM_LOGITS_128 4
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_128[] = {
+ -0.129103f, 0.457758f, -0.489986f, 0.65462f, -0.184312f, 3.81202f,
+ -0.444407f, -0.64198f, -0.575008f, 0.0311711f, 0.525243f, -20.892f,
+ 1.08811f, -65.0976f, -12.3973f, -1.38278f, -0.264233f, 0.241636f,
+ -10.6925f, -0.725414f, -18.8987f, -40.2284f, -16.08f, 0.995331f,
+ 1.47614f, -0.964864f, 0.405506f, 0.140449f, 0.459534f, -1.9093f,
+ 0.398452f, 0.696949f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_128[] = {
+ 1.22789f, -1.34527f, 0.759048f, 0.315086f,
+ 1.0834f, -1.58019f, -0.465158f, 1.20716f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_128[] = {
+ -0.668677f, 0.58694f, -0.417094f, 0.754735f, -0.7859f,
+ 0.377479f, -0.0415929f, -0.0140585f, -0.730001f, 0.747528f,
+ -0.135247f, 0.406505f, -0.234184f, 0.956362f, -0.637555f,
+ 0.791884f, 0.0303722f, 1.04424f, -0.727859f, -0.274321f,
+ -0.122986f, 0.066312f, -0.00559175f, -0.239643f, -0.0188767f,
+ -0.102787f, -0.262967f, 0.071882f, -0.283398f, 0.111607f,
+ -0.425826f, 0.02699f, 0.108873f, -0.180558f, -0.0794057f,
+ 0.29665f, -0.0252969f, -0.0266213f, -0.277462f, -0.361973f,
+ 0.512552f, 0.395011f, -0.225876f, 0.301924f, 0.136954f,
+ 0.507259f, 1.23425f, 0.0137135f, 0.662572f, 0.591583f,
+ 0.101564f, 0.416805f, -0.645081f, -0.179086f, -0.36747f,
+ -0.332213f, 0.095177f, 0.220739f, -0.153256f, 0.706155f,
+ 0.161701f, 0.696815f, -1.21531f, -0.115059f, 0.486764f,
+ -0.396093f, 0.784883f, 0.535357f, -0.278021f, 0.143496f,
+ -0.44931f, -0.144543f, 0.319326f, 0.0190167f, -0.206295f,
+ 0.373995f, -0.247897f, -0.608095f, -0.41796f, -0.137129f,
+ -0.709562f, 0.678273f, 0.537607f, 0.557474f, 0.453308f,
+ 0.21405f, -0.0466495f, 0.519139f, -0.168832f, 0.902911f,
+ 0.681131f, -0.139876f, -0.2052f, -0.393271f, 0.262222f,
+ -0.246246f, -0.213993f, 0.646619f, 0.0496181f, -0.00354157f,
+ 0.822927f, 0.0939522f, 0.180738f, 0.118355f, 0.120456f,
+ -0.0472214f, -0.144958f, 0.173405f, -0.886644f, -0.0949769f,
+ -0.813518f, -0.3947f, -0.128021f, 0.356196f, 0.469169f,
+ -0.413702f, 1.04242f, 0.428853f, -0.387293f, 0.0850877f,
+ 0.279409f, -0.142276f, 0.0579376f, 0.211112f, 0.0703013f,
+ -1.9274f, -0.729147f, 0.534193f, 0.773586f, 0.922864f,
+ 0.642881f, 1.15127f, 0.621032f, 0.933942f, 1.01837f,
+ -0.660282f, -0.40059f, -1.11279f, -0.77088f, -0.43349f,
+ 0.202361f, -0.0840912f, 0.0935707f, 0.056333f, -0.0779369f,
+ 0.0173447f, -0.0104756f, 0.0115005f, -0.0195593f, 0.03592f,
+ -0.343454f, -0.618048f, 0.258172f, -0.412322f, -0.0463746f,
+ -0.0413654f, -0.0400194f, 0.615981f, -0.452094f, 0.644555f,
+ 0.0822476f, -0.359791f, -0.0904274f, 0.209427f, 0.0116338f,
+ -0.190978f, 0.890233f, 0.737769f, -1.66663f, -0.392605f,
+ 0.0785728f, -0.224553f, -0.128258f, -0.227227f, -0.0777773f,
+ 0.685976f, 0.347042f, -0.555325f, -0.249221f, 0.0919837f,
+ -0.0660016f, -0.272316f, 0.0390632f, -0.619624f, -0.0565801f,
+ 0.585026f, 0.597375f, 0.54114f, 0.593389f, 0.604391f,
+ 0.0820294f, -0.85339f, -1.40741f, -0.391675f, 0.0579205f,
+ -0.197626f, 0.130044f, -0.234488f, -0.0373991f, -0.0717973f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_128[] = {
+ 1.58571f, -4.6314f, -2.00273f, 0.543699f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_128 = {
+ NUM_FEATURES_128,
+ NUM_LOGITS_128,
+ NUM_HIDDEN_LAYERS_128,
+ {
+ NUM_LAYER_0_UNITS_128,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_128,
+ av1_simple_motion_search_prune_rect_logits_kernel_128,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_128,
+ av1_simple_motion_search_prune_rect_logits_bias_128,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_128
+#undef NUM_FEATURES_128
+#undef NUM_LAYER_0_UNITS_128
+#undef NUM_LOGITS_128
+
+#define NUM_HIDDEN_LAYERS_64 1
+#define NUM_FEATURES_64 25
+#define NUM_LAYER_0_UNITS_64 32
+#define NUM_LOGITS_64 10
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_64[] = {
+ 0.10424f, -0.346025f, 0.534547f, -0.385925f, 2.58341f, -0.256414f,
+ -0.232498f, 0.329823f, -0.0777376f, -0.590939f, 0.062657f, -0.628252f,
+ 0.0934588f, 2.04029f, -0.224448f, 0.371168f, -0.385348f, -0.589883f,
+ -3.73627f, -0.943144f, 0.346409f, -0.211215f, -0.351008f, 0.418807f,
+ 0.943663f, 0.173267f, 1.16585f, -0.0840888f, 0.227464f, 0.374412f,
+ 0.0422597f, -0.338868f, 0.222576f, 0.431713f, 1.12366f, 0.00753411f,
+ 0.248412f, -0.0902425f, 0.542455f, -0.665629f, -0.311245f, -0.205639f,
+ -0.447149f, -0.0502733f, -0.290186f, -0.794384f, 0.0940881f, -0.0686117f,
+ -0.0199961f, -0.587965f, 0.777096f, -0.083381f, -1.21282f, 0.652959f,
+ -1.18238f, 0.539991f, 0.352497f, -0.540076f, -0.26222f, -0.568556f,
+ 0.409102f, -0.131146f, -0.407161f, -0.188287f, -0.478657f, 0.000401932f,
+ -0.689324f, 0.351064f, -1.43704f, -0.315185f, -0.868726f, 0.376341f,
+ -0.0566277f, 0.364831f, 0.611298f, -0.495253f, -0.0193132f, 0.617978f,
+ 0.189586f, -0.236758f, -0.608246f, -0.149017f, -1.78303f, 0.143023f,
+ 0.698386f, -0.994086f, -0.673327f, 0.233868f, 0.360425f, 0.0294123f,
+ -0.248683f, -0.148392f, 0.0861829f, -0.190843f, -0.414906f, 0.607378f,
+ -0.756715f, -0.511713f, -0.321556f, 1.0078f, -1.18141f, 0.519751f,
+ 0.834629f, -0.359343f, 0.612262f, -0.0730553f, 0.262935f, 0.488276f,
+ 0.387071f, -1.44123f, 1.08269f, 0.554402f, -0.069f, 0.14113f,
+ 0.323817f, 0.824314f, -0.431417f, -0.349448f, 0.950728f, -0.587836f,
+ -0.83914f, -0.10844f, 0.26602f, 0.831933f, -0.271315f, 0.231563f,
+ 0.417049f, 0.190627f, -0.0940667f, 0.255363f, -0.0741022f, -0.0987662f,
+ -0.847522f, 0.00287554f, 0.0615741f, -0.0832218f, 0.0847148f, -0.392843f,
+ -0.938068f, -0.10621f, -0.260859f, -0.825175f, -0.401039f, 0.315213f,
+ -0.108269f, 0.288036f, -8.66166f, -0.970752f, -0.66678f, -0.593405f,
+ -0.518294f, -0.138722f, -0.454698f, -0.22969f, -0.553006f, -0.440111f,
+ 0.462661f, -0.536854f, 0.0108295f, -0.522888f, 0.00111157f, 0.229999f,
+ 0.0267768f, 0.176266f, -1.57043f, 0.0318106f, 0.257534f, -0.198583f,
+ 0.175564f, -0.251465f, -0.262441f, -1.65283f, -0.319603f, -0.875282f,
+ -0.301303f, 0.0170948f, -0.227075f, 0.0299545f, -4.98346f, 0.470046f,
+ -1.28051f, -0.213809f, -0.486585f, -0.906463f, -0.169984f, -0.333153f,
+ -0.376733f, 0.108016f, 0.486744f, -0.186936f, -0.429259f, 0.056501f,
+ -0.266545f, 0.265447f, -0.137718f, -0.490687f, -0.935668f, -0.16229f,
+ -0.696932f, 0.173157f, 0.434959f, -0.140595f, 0.345845f, -1.08013f,
+ -0.0205929f, -0.815874f, -0.179812f, 0.02767f, -0.141727f, 0.471936f,
+ -7.29453f, -1.04362f, -0.745482f, -0.28725f, -0.214997f, -0.0850651f,
+ -0.748471f, 0.161325f, -1.04387f, -0.705305f, 0.489427f, -0.765373f,
+ -0.301576f, 0.0742467f, -0.331282f, 0.0372328f, -0.90298f, -0.0608646f,
+ -2.18756f, 0.170384f, -0.258357f, 0.106287f, -0.161684f, -0.103799f,
+ -0.127774f, -0.156313f, 0.0705286f, -0.977908f, -0.281191f, -0.056757f,
+ -0.309474f, 0.050476f, -9.78198f, -2.42795f, -0.289626f, -1.07579f,
+ -0.439256f, -1.09948f, -0.564671f, 0.0913182f, -0.417216f, -1.19909f,
+ 0.287063f, 0.402315f, -0.17646f, 0.540488f, 0.00840239f, 0.397492f,
+ 0.702393f, -0.10566f, 0.655296f, -0.0443876f, 0.154918f, -0.760479f,
+ -0.0523153f, -0.366199f, -1.08212f, -0.398556f, -0.415203f, -1.10488f,
+ 0.208349f, 0.27079f, 0.101546f, -0.205752f, -13.7923f, -0.218637f,
+ -1.10077f, 0.355735f, -0.306196f, 0.627434f, -0.473101f, -0.308027f,
+ -1.12724f, 0.301597f, 0.660785f, 0.0576217f, -0.155925f, -0.56107f,
+ -0.223537f, 0.114299f, -0.53803f, -0.252674f, -2.66103f, -0.185245f,
+ -0.314673f, 0.403337f, 0.679821f, -0.69231f, 0.506264f, -0.999705f,
+ -0.549097f, 0.353745f, 0.188249f, 0.414484f, -0.615853f, 0.525681f,
+ -5.23065f, -3.05174f, 1.02074f, -0.965499f, -0.158947f, 0.0436088f,
+ -0.485824f, 0.0375094f, -1.39985f, -0.481392f, 0.485785f, -0.24874f,
+ -0.359633f, 0.668108f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_64[] = {
+ 0.0735592f, -0.045064f, -0.0114103f, 1.39246f, -0.683467f, 0.155765f,
+ -0.667652f, -0.202425f, -0.585433f, -0.146752f, -0.0812931f, 0.580642f,
+ 0.578542f, -0.831916f, 0.610063f, 0.0101856f, -0.235863f, 0.538141f,
+ -2.91334f, -1.71887f, 0.126616f, 0.582497f, -0.438879f, 0.221833f,
+ 0.850773f, -0.280886f, 0.443233f, -0.0964873f, -0.216161f, 0.34413f,
+ 0.656818f, 0.0169274f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_64[] = {
+ -0.310947f, -0.232675f, 0.0171092f, 0.0834474f, 0.373977f,
+ 0.300429f, 0.215072f, -0.454074f, 0.187565f, 0.282742f,
+ 0.562562f, -0.0419322f, 0.000978486f, -0.298267f, 0.216934f,
+ -0.388722f, -0.146866f, -0.275946f, 0.202361f, 0.225847f,
+ 1.42868f, 0.473127f, -0.145747f, -0.104986f, 0.153459f,
+ 0.69382f, 0.162266f, 0.0207715f, -0.45095f, -0.412071f,
+ -0.235109f, -0.130199f, 0.231741f, 0.460193f, 0.0378202f,
+ 0.429516f, 0.387691f, -0.272479f, 0.0723884f, -0.453914f,
+ -0.150618f, -0.10745f, -0.258615f, 0.0838312f, -0.00554958f,
+ 0.105377f, -0.0415479f, 0.13228f, 1.09044f, -0.73053f,
+ -0.422553f, -0.435842f, 0.211416f, 0.420332f, 0.0181353f,
+ -0.030891f, 0.522788f, 0.613526f, 0.374032f, 0.287986f,
+ -0.403118f, -0.287362f, -1.11523f, -0.577713f, -0.020228f,
+ 0.86465f, -0.0590579f, 0.341274f, -0.0115644f, -0.260236f,
+ 0.192123f, -0.0849825f, 0.0501709f, 0.444382f, 0.0762727f,
+ 0.0926596f, -0.101157f, -0.142787f, 0.40861f, 0.555805f,
+ -0.00614654f, -0.122846f, 0.203163f, 0.234266f, 0.409795f,
+ -0.0206245f, -0.224679f, 0.025081f, 0.518044f, -0.287186f,
+ 0.016494f, -0.0886331f, 0.236438f, -1.01032f, 0.118332f,
+ 0.364217f, 0.061438f, 0.0381303f, 0.128418f, 0.0257077f,
+ -0.975751f, -0.694894f, 0.00351914f, 0.278179f, 0.29363f,
+ 0.525576f, 0.0604849f, 0.531734f, 0.406643f, 0.812497f,
+ -0.403196f, -0.16664f, -0.620887f, -0.428194f, 0.275401f,
+ 0.432063f, -0.00378342f, 0.295758f, 0.105615f, -0.00683626f,
+ 0.00396146f, 0.00598654f, -0.0131701f, -0.0115787f, 0.00386643f,
+ -0.69686f, -0.139623f, -0.440817f, 0.0542873f, 0.217962f,
+ 0.527035f, -0.0201046f, 0.0471354f, 0.0271858f, -0.0775197f,
+ -0.309797f, 0.184879f, -0.232854f, -0.407081f, 0.706227f,
+ -0.0877534f, 0.306843f, 0.455075f, -0.333961f, 0.0759148f,
+ 0.0444791f, -0.0693626f, -0.0850289f, -0.513063f, -0.643971f,
+ -0.630279f, -0.153889f, 0.123315f, 0.00548238f, 0.170707f,
+ 0.734339f, -0.176988f, 0.322519f, 0.178365f, 0.183519f,
+ -0.698683f, -0.12043f, -0.349914f, -0.0696762f, -0.53986f,
+ -0.104738f, 1.05264f, 0.983568f, -0.109035f, 0.0113748f,
+ 0.0815189f, -0.0628812f, 0.0769389f, 0.010261f, 0.146573f,
+ -0.433194f, -0.211572f, -0.000397392f, 0.445325f, 0.145091f,
+ -0.0625902f, 0.29394f, 0.302315f, 0.0892226f, -0.209504f,
+ -0.0150374f, 0.242608f, 0.216223f, 0.366857f, 0.209829f,
+ -0.540035f, 0.117599f, -0.329315f, 0.0471133f, -0.0115449f,
+ -0.0638235f, 0.0527461f, 0.348149f, 0.360802f, 1.06624f,
+ -0.615991f, -0.341396f, 0.18972f, 0.0709888f, -0.0414466f,
+ -0.0193809f, 0.0938933f, 0.209058f, 0.575042f, 0.483608f,
+ -0.285875f, -0.115905f, -0.363637f, 0.375425f, 0.336217f,
+ 0.0336358f, -0.00265618f, -0.406854f, -0.792959f, -0.219354f,
+ 0.0331615f, 0.0298859f, -0.211446f, -0.00280773f, -0.194011f,
+ 0.262109f, 0.548076f, 0.120183f, -0.661603f, 0.241855f,
+ -0.501428f, 0.00102718f, -0.347331f, -0.58306f, 0.0977254f,
+ 0.117491f, 0.0840667f, 0.00693675f, 0.000600294f, 0.649569f,
+ -0.0553811f, -0.197198f, 0.397236f, -0.523737f, -0.564192f,
+ -0.374679f, -0.249344f, 0.00861428f, 0.00393439f, -0.0834608f,
+ 0.124389f, -0.0393049f, 0.0425391f, -0.153383f, -0.182346f,
+ 0.420953f, 0.464221f, 0.288984f, 0.570921f, -0.239965f,
+ 0.247239f, -0.083434f, 0.714418f, 0.986323f, -0.460244f,
+ -0.260993f, -0.947743f, -1.0789f, -0.0391231f, 0.612407f,
+ -0.0306767f, 0.281419f, 0.0072426f, -0.37623f, 0.188744f,
+ 0.221666f, -0.424914f, 0.29703f, 0.261715f, 0.277809f,
+ -0.0617616f, -0.000611999f, -0.0547053f, -0.0901018f, -0.347669f,
+ 0.856072f, 0.596675f, -0.467639f, -1.09324f, -0.184224f,
+ -0.56051f, -0.0144704f, 0.102894f, -0.122982f, -0.0020749f,
+ -0.0423487f, 0.0328702f, -0.0154263f, 0.0349021f, -0.00315595f,
+ 0.0254802f, -0.729191f, 0.207296f, -0.0212349f, -0.207078f,
+ 0.20636f, -0.156883f, 0.429765f, -0.42672f, 0.138775f,
+ -0.0267343f, 0.631528f, 0.300646f, -0.4793f, -0.273833f,
+ -0.0135367f, -0.530819f, -0.534881f, 0.830896f, 0.0266992f,
+ 0.473744f, 0.210334f, 0.0234739f, 0.255394f, 0.123531f,
+ -0.489341f, -0.796627f, 0.372617f, 0.190136f, 0.275342f,
+ 0.739505f, 0.402354f, 0.782806f, 0.437374f, 1.04948f,
+ -0.55963f, 0.382704f, -0.698321f, 0.0817868f, -0.440108f,
+ -0.0635004f, -0.277851f, -0.524194f, 0.286157f, -0.01097f,
+ -0.0293145f, -0.0405071f, -0.035662f, -0.012871f, -0.0516409f,
+ -0.406671f, 0.709259f, -0.525177f, 0.521123f, -0.44813f,
+ 0.48412f, -0.0546513f, 0.305253f, -0.468328f, 0.316453f,
+ -0.36307f, 0.497515f, -0.0606276f, 0.315764f, -0.422066f,
+ 0.554025f, -0.679183f, 0.616914f, 0.00283324f, -0.000643824f,
+ 0.0639999f, 0.0488285f, -0.141031f, 0.068003f, -0.0792678f,
+ -0.425307f, -0.152235f, 0.269917f, -0.352327f, 0.44792f,
+ -0.116514f, -0.465868f, 0.154287f, 0.0161028f, -0.16848f,
+ -0.255487f, 0.189832f, 0.254883f, 0.0240822f, 0.432638f,
+ -0.136564f, 0.137036f, 0.0375734f, 0.989246f, -0.126287f,
+ 0.111416f, -0.0271002f, 0.718755f, -0.0412969f, 0.00645681f,
+ 0.253811f, -0.0186998f, 0.691971f, -0.282042f, -0.0783915f,
+ 0.274592f, -0.358449f, 0.34155f, -0.186374f, -0.136907f,
+ -0.192334f, -0.251168f, -0.100874f, -0.166578f, -0.336507f,
+ 0.402373f, 0.173695f, 0.108788f, 0.00885581f, -0.310063f,
+ 1.05545f, 0.0295867f, 0.180785f, -0.173469f, -0.469924f,
+ -0.224155f, 0.665862f, -0.126546f, 0.240691f, -0.0415301f,
+ -0.598534f, 0.0012723f, -0.122297f, -0.558947f, 0.268844f,
+ 0.241193f, 0.0524422f, -0.1683f, 0.575588f, -0.139012f,
+ 0.0636691f, -0.446709f, -0.094532f, 0.883809f, -0.112981f,
+ -0.224047f, 0.0811193f, -0.140571f, -0.09683f, -0.0796143f,
+ -0.102246f, -0.863392f, -0.0755124f, 0.23125f, -0.0301361f,
+ -0.153029f, -0.172238f, -0.0286382f, -0.338495f, -0.317216f,
+ -0.146629f, -0.242264f, -0.702306f, -0.285052f, 0.0623479f,
+ 0.265735f, 0.00674475f, 0.666196f, 0.883586f, 0.278416f,
+ -0.341692f, -0.509931f, -0.156263f, 0.635885f, -0.544143f,
+ -0.572632f, -0.213285f, 0.443396f, -0.268329f, 0.0638439f,
+ -0.185397f, 0.071126f, 0.386503f, -0.402212f, -0.140784f,
+ -0.411661f, 0.049398f, -0.0672907f, -0.267034f, -0.0560875f,
+ 0.0607937f, 0.0445484f, -0.547651f, 0.574718f, 0.417189f,
+ -0.0610166f, 0.0632293f, 0.391619f, -0.00671215f, -0.136883f,
+ -0.339346f, 0.0356183f, 0.511993f, 0.178676f, 0.286998f,
+ 0.136511f, -0.00796929f, 0.203985f, 0.0423532f, -0.175196f,
+ 0.378534f, 0.770417f, 0.593778f, 0.0256067f, -0.82394f,
+ -0.500691f, -0.425725f, -0.623708f, -0.0406241f, -0.00226464f,
+ 0.0207836f, 0.30732f, -0.00784268f, 0.0065445f, -0.0991039f,
+ -0.20871f, -0.206835f, 0.281219f, 0.119361f, 0.259346f,
+ -0.102713f, 0.186488f, -0.034455f, -0.00198392f, -0.279107f,
+ -0.638993f, -0.374404f, -0.48601f, -0.262345f, 0.624532f,
+ 0.620632f, -0.227014f, 0.433579f, -0.0455096f, 1.22123f,
+ -0.429156f, 0.12396f, 0.0815152f, -0.0837355f, 0.0282623f,
+ -0.407475f, 0.787321f, -0.434974f, 0.312904f, -0.230805f,
+ 0.213042f, -0.250929f, 0.302997f, -0.354709f, 0.0504905f,
+ -0.561706f, 0.595558f, 0.374951f, 0.802969f, -0.674902f,
+ 0.33136f, 0.156606f, 0.0218968f, -0.694188f, -0.0221949f,
+ -0.00639123f, 0.0146536f, 0.0104145f, 0.021635f, -0.0499428f,
+ -0.575116f, -0.239035f, -0.0588276f, 0.599722f, 0.541932f,
+ 0.437433f, 0.716268f, 0.193207f, 0.548351f, 0.326951f,
+ -0.197124f, 0.0355353f, -0.0952009f, -0.217265f, -0.389789f,
+ 0.0528124f, -0.21334f, -0.190296f, -1.17367f, 0.108905f,
+ 0.109397f, -0.0192577f, 0.0343813f, 0.085004f, -0.0556737f,
+ -0.0411158f, -0.534989f, 0.0361896f, 0.124415f, 0.291603f,
+ -0.0311974f, -0.326726f, 0.343131f, 0.0276456f, -0.231827f,
+ -0.373894f, -0.208898f, -0.273011f, 0.061323f, -0.0910538f,
+ -0.30746f, -0.108644f, -0.190736f, 1.58048f, -0.0739711f,
+ -0.0623489f, -0.137967f, -0.0601359f, -0.133004f, -0.0857153f,
+ 0.00955987f, -0.365561f, -0.0329051f, 0.463463f, 0.14758f,
+ -0.512256f, -0.227463f, -0.26008f, -0.567777f, 0.0646234f,
+ 1.02161f, 0.66157f, -0.16733f, 0.264921f, -0.242036f,
+ 0.214622f, 0.0712054f, -0.260377f, 0.0849665f, 0.735094f,
+ 0.11001f, 0.297301f, -0.333342f, 0.066978f, -0.123625f,
+ 1.07596f, 0.401263f, 0.0800875f, -0.340862f, -0.115587f,
+ -0.32692f, -0.300842f, 0.0277397f, 0.0630788f, -0.261198f,
+ 0.428695f, -0.0544757f, -0.124511f, 0.036992f, 0.126322f,
+ 0.0317603f, 0.0820762f, 0.117277f, -1.14594f, -0.108076f,
+ -0.0258198f, -0.00337525f, -0.00512531f, 0.1274f, -0.0660535f,
+ -0.640733f, 0.197142f, 0.147278f, 0.489271f, 0.226507f,
+ -0.0668414f, 0.0946318f, 0.0994164f, -0.820516f, 0.512939f,
+ -0.305172f, -0.715187f, -0.195125f, 0.279346f, 0.462144f,
+ 0.913882f, -0.453879f, 0.0582033f, -0.462866f, 0.0538736f,
+ 0.0115737f, 0.00626993f, -0.0185185f, 0.0114601f, -0.0181164f,
+ 0.41588f, -0.0447331f, 0.611756f, 0.43385f, 0.834465f,
+ 0.122019f, -0.352983f, 0.340429f, -0.245425f, -0.365328f,
+ -0.521825f, 0.0371057f, 0.172188f, -0.387949f, 0.221054f,
+ 0.0126359f, 0.422958f, 0.584198f, -0.581498f, -0.019466f,
+ -0.0271737f, -0.0740885f, 0.00540879f, 0.186086f, -0.0324402f,
+ -0.563462f, -0.458759f, -0.425296f, -0.0118862f, -0.641508f,
+ 0.0132084f, 0.0581128f, 0.0231444f, 0.468587f, 0.258838f,
+ 0.0296665f, 0.0562801f, 0.630014f, 0.381816f, -0.269761f,
+ -0.135515f, 0.046186f, 1.07632f, -0.050616f, 0.104987f,
+ 0.29991f, 0.119316f, 0.117248f, 0.0795009f, 0.242573f,
+ 0.0416634f, -0.0577639f, -0.0974078f, 0.106255f, -0.13098f,
+ 0.0141486f, -0.00418257f, 0.144848f, -0.463934f, 0.0452591f,
+ 0.252617f, 0.205222f, -0.189843f, 0.0652245f, -0.135386f,
+ 0.0500646f, -0.200368f, -0.0142312f, -0.0286832f, -0.254355f,
+ -1.02752f, -0.73549f, 0.0364518f, 0.0416227f, -0.13185f,
+ -0.0886515f, -0.502314f, -0.102916f, 0.410911f, -0.355655f,
+ 0.400416f, -0.340217f, 0.208829f, 0.245972f, 0.149739f,
+ -0.49458f, 0.589482f, 0.550827f, 0.912709f, -0.351275f,
+ -0.128076f, -0.285172f, -0.672752f, 0.090583f, -0.245286f,
+ -0.737297f, -0.201515f, -0.025122f, -0.109854f, 0.36738f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_64[] = {
+ 0.346819f, 0.442965f, -0.0216032f, 0.0229235f, -0.402797f,
+ -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_64 = {
+ NUM_FEATURES_64,
+ NUM_LOGITS_64,
+ NUM_HIDDEN_LAYERS_64,
+ {
+ NUM_LAYER_0_UNITS_64,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_64,
+ av1_simple_motion_search_prune_rect_logits_kernel_64,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_64,
+ av1_simple_motion_search_prune_rect_logits_bias_64,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_64
+#undef NUM_FEATURES_64
+#undef NUM_LAYER_0_UNITS_64
+#undef NUM_LOGITS_64
+
+#define NUM_HIDDEN_LAYERS_32 1
+#define NUM_FEATURES_32 25
+#define NUM_LAYER_0_UNITS_32 28
+#define NUM_LOGITS_32 10
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_32[] = {
+ 0.486581f, 0.340847f, -0.109226f, 0.467224f, -0.541561f,
+ 0.0943619f, -0.429442f, -0.207442f, 0.959963f, 0.618666f,
+ -0.0636751f, 0.144508f, -0.0278289f, 0.332293f, -0.751493f,
+ 0.245438f, -0.917758f, 0.612128f, -0.32648f, 0.534618f,
+ -0.615239f, 2.71641f, 0.233759f, 0.820558f, -0.249758f,
+ -0.427783f, -0.359361f, 0.0375732f, 0.806973f, 0.352512f,
+ -0.0532192f, 0.0576861f, -0.464178f, -0.334877f, -0.697042f,
+ 0.0538218f, 0.0919659f, -0.00765812f, 0.0603847f, -0.460315f,
+ 0.37979f, -0.0867612f, -0.670683f, -0.188619f, -0.570586f,
+ 0.233418f, 0.153581f, 0.290905f, -0.624885f, -0.557842f,
+ -0.555567f, 0.463773f, -0.123909f, -0.277731f, 0.0374468f,
+ 0.409903f, 0.287638f, -0.593066f, -0.223434f, 0.154263f,
+ -0.250464f, -0.077696f, 0.229652f, -0.304174f, 0.308053f,
+ 0.33155f, -0.502825f, 0.361216f, -0.499294f, 0.00595444f,
+ -0.307201f, 0.5766f, -0.438384f, -0.093701f, -0.118586f,
+ 0.202337f, -0.486623f, 0.261552f, 0.139756f, -0.655642f,
+ -0.0627001f, -0.213053f, -0.243037f, 0.205918f, 0.0718368f,
+ 0.188041f, 0.141529f, -0.132239f, 0.425827f, -0.218353f,
+ 0.153114f, 0.33268f, 0.0226116f, 0.167394f, 0.269854f,
+ -0.457001f, 0.1973f, -0.526087f, 0.467528f, 0.290934f,
+ 1.16267f, 0.0823663f, -0.754389f, -0.83716f, 0.270157f,
+ -1.41229f, 0.148511f, -0.286832f, 0.664796f, 0.492254f,
+ 0.360567f, -0.533993f, 0.0435672f, -0.103001f, 0.220668f,
+ 0.594621f, -0.0213356f, -0.347638f, -0.694457f, 0.0759505f,
+ 0.161358f, -0.389384f, -0.0455192f, -0.61252f, -0.174173f,
+ -0.00788878f, -1.22487f, 0.332233f, -0.0457021f, -0.225918f,
+ -0.197657f, -0.115408f, -0.240589f, -2.05681f, 0.00914629f,
+ -1.92213f, 0.0268578f, -0.49076f, -0.0120123f, 0.291157f,
+ 0.267116f, -0.0775724f, 0.181115f, -0.392441f, -0.488114f,
+ -0.28842f, -0.115465f, 0.128974f, -0.0829899f, -0.14096f,
+ -0.140145f, -0.700281f, 0.0368945f, -0.437598f, 0.243485f,
+ -1.00301f, 0.332324f, 0.125014f, -0.0604481f, -0.0652028f,
+ -0.207295f, -1.0209f, -0.341525f, 0.191326f, -0.147578f,
+ 0.0878327f, 0.129827f, -0.0848319f, 0.187381f, -1.28663f,
+ 0.00537885f, -0.134277f, -0.0411126f, -0.3434f, -0.0456494f,
+ 0.37861f, 0.409095f, 0.237177f, -0.396855f, -0.205418f,
+ -1.31701f, -0.319032f, -0.123404f, -0.240005f, -0.305206f,
+ -0.0258176f, -0.26367f, -0.142396f, 0.191672f, -1.44061f,
+ 0.0554776f, -0.571839f, -0.284789f, -0.425677f, -0.0307376f,
+ 0.20275f, -0.223146f, 0.144612f, 0.0212636f, 0.0238303f,
+ -0.253802f, -0.188922f, -0.0637066f, -0.340836f, 0.124774f,
+ 0.130474f, -0.154099f, -0.0292733f, 0.158148f, -0.246989f,
+ -0.259059f, 0.220224f, 0.228449f, -0.41956f, -0.321848f,
+ -0.2396f, -0.316449f, -1.3363f, 0.0264099f, -1.46865f,
+ 0.113073f, 0.0722885f, -0.166986f, -0.164877f, 0.0360911f,
+ 0.534472f, -0.551152f, -0.328501f, 0.0781121f, -0.378112f,
+ -0.459502f, 0.28015f, -0.212302f, -0.521641f, 0.618993f,
+ -0.347709f, 0.266253f, -0.0280894f, 0.348511f, -0.0155031f,
+ -0.100693f, 0.0447673f, 0.277519f, -0.233998f, -0.0796738f,
+ -1.73644f, -0.160776f, 0.53092f, -0.180406f, 0.056447f,
+ 0.385356f, -0.262337f, -0.241479f, -0.271426f, -0.457354f,
+ -0.266788f, 0.367371f, -0.103065f, 0.47783f, -0.188327f,
+ -0.159636f, 0.00142907f, -0.409756f, 0.454889f, -0.24566f,
+ -0.0760084f, 0.286355f, 0.462102f, 0.0431695f, -0.127395f,
+ -0.200476f, -0.350557f, 0.217275f, -0.23975f, 0.255148f,
+ -0.280626f, 0.42476f, 0.157411f, 0.0358675f, -0.192591f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_32[] = {
+ 0.940498f, 0.15602f, -0.234831f, 0.0268585f, 0.144769f, 0.243081f,
+ 0.611406f, 0.366093f, 0.361868f, 0.39668f, 0.401479f, 0.369467f,
+ 0.0909503f, 0.710595f, 0.032786f, 0.525891f, -1.0232f, 0.732557f,
+ -0.064425f, 0.865222f, -0.042917f, -0.237191f, -0.527006f, -0.0172101f,
+ 0.59681f, -0.472405f, 0.0969218f, -0.250624f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_32[] = {
+ 0.355607f, 0.126701f, -0.0825159f, 0.200675f, -0.011308f,
+ -0.280057f, 0.559816f, 0.142689f, 0.0422419f, -0.151692f,
+ -0.0275637f, -0.283101f, -0.20822f, -0.200394f, 0.465427f,
+ 0.344491f, -0.525319f, -0.358813f, -0.39767f, 0.0974486f,
+ 0.00559058f, -0.00546089f, 0.0506486f, 0.114475f, -0.0436463f,
+ -0.574152f, -0.376294f, 0.16563f, -0.0967032f, 0.00579838f,
+ 0.0639909f, -0.037129f, 0.407574f, -0.231428f, 0.489326f,
+ -0.221566f, -0.270382f, -0.784628f, -0.155502f, 0.481698f,
+ -0.0296057f, 0.431855f, 0.840807f, 0.112291f, 0.773874f,
+ -0.0610936f, -0.012892f, 0.365154f, 0.0267687f, -0.0751114f,
+ 0.25043f, 0.516472f, -0.186133f, -0.12762f, -0.168804f,
+ -0.146309f, 0.139314f, -0.367113f, -0.601079f, 0.0559856f,
+ 0.176081f, 0.22397f, 0.434113f, 0.0363256f, 0.313051f,
+ 0.0143976f, 0.190076f, 0.474607f, -0.681134f, -0.0709097f,
+ -0.253289f, -0.216277f, -0.0593789f, -0.107795f, -0.194842f,
+ 0.513945f, 0.239171f, -0.720561f, 0.0136723f, -0.391147f,
+ -0.272043f, -0.164766f, 0.124248f, 0.147178f, -0.35497f,
+ 0.397725f, -0.117603f, 0.262937f, -0.331964f, 0.182418f,
+ 0.315671f, -0.0385649f, 0.488769f, -0.334568f, 0.00596018f,
+ 0.0661557f, -0.0446985f, -0.0928255f, -0.0221032f, -0.019045f,
+ -0.20881f, 0.197907f, -0.381881f, 0.0598071f, -0.0434551f,
+ 0.159283f, -0.110631f, 0.266996f, -0.0265494f, 0.135199f,
+ -0.00833162f, 0.804482f, -0.114698f, -0.15066f, -0.479553f,
+ 0.448407f, -0.344069f, -0.0280952f, -0.208211f, -0.102269f,
+ -0.679066f, -0.37476f, -0.0228875f, 0.0535049f, 0.111015f,
+ -0.18125f, -0.167584f, 0.0110497f, 0.262723f, -0.413839f,
+ -0.0611238f, 0.358499f, 0.0807514f, 0.208254f, 0.214499f,
+ 0.11137f, -0.14262f, -0.0513973f, 0.243718f, -0.373716f,
+ -0.00413366f, 0.216501f, -0.164149f, -0.064935f, -0.0840282f,
+ 0.0566148f, 0.0377686f, 0.289835f, 0.769388f, 0.891198f,
+ -0.592739f, 0.40744f, -0.153095f, 0.657311f, 0.140737f,
+ 0.28209f, 0.158344f, 0.353546f, 0.0868246f, 0.116887f,
+ 0.402004f, 0.437184f, 0.589219f, 0.760594f, -0.575419f,
+ -0.754308f, -0.709219f, -0.297814f, -0.418609f, -0.0262104f,
+ 0.0411959f, 0.0597708f, -0.143728f, -0.136642f, 0.099614f,
+ -0.257601f, -0.2404f, 0.305893f, 0.254009f, -0.0301398f,
+ -0.0653091f, -0.459002f, -0.163404f, 0.123152f, -0.0284252f,
+ -0.457272f, 0.00788622f, -0.828399f, -0.0534199f, 0.586877f,
+ 0.982728f, 0.424581f, 0.0891856f, 0.383182f, -0.122053f,
+ 0.0808408f, -0.00384914f, -0.0560201f, -0.0524772f, -0.263444f,
+ -0.239287f, -0.882777f, 0.0180592f, -0.0948711f, -0.177946f,
+ 0.0296473f, 0.096082f, 0.0455604f, -0.108608f, 0.00777951f,
+ -0.140896f, 0.117187f, -0.342467f, -0.0691604f, 0.0761611f,
+ -0.0892053f, 0.111386f, -0.167456f, 1.40616f, -0.00478793f,
+ 0.00547665f, -0.0441829f, 0.0151323f, -0.0674099f, -0.0380578f,
+ 0.16072f, 0.31882f, 0.245486f, -0.424318f, 0.101845f,
+ -0.203343f, -0.197402f, -0.163025f, -0.0771961f, -0.264435f,
+ 0.319429f, 0.250076f, 0.782726f, 0.386003f, 0.00700673f,
+ -0.375715f, 0.151453f, -0.296265f, -0.560183f, -0.00767249f,
+ -0.109593f, -0.119419f, -0.0161516f, 0.0380283f, -0.156417f,
+ 0.131708f, 0.396268f, -0.221796f, 0.232099f, 0.128852f,
+ 0.0567268f, 0.297297f, 0.173269f, 0.213411f, 0.0384426f,
+ -0.290985f, -0.0426841f, -0.488292f, -0.087101f, -0.311582f,
+ 0.83009f, -0.153163f, 0.903335f, -1.15644f, -0.0378635f,
+ -0.0552129f, -0.126362f, -0.176945f, 0.0653115f, 0.0989368f,
+ -0.333543f, -0.330586f, 0.29775f, -0.103535f, 0.210824f,
+ -0.00300509f, 0.317105f, 0.216852f, 0.479718f, 0.0485808f,
+ -0.15662f, 0.718199f, 0.327513f, 0.115169f, -0.423598f,
+ -0.456633f, -0.575814f, -0.494454f, 0.304411f, 0.0493055f,
+ -0.381171f, 0.467251f, -0.122872f, -0.167441f, 0.017253f,
+ -0.0583646f, -0.1586f, 0.214046f, -0.0284424f, -0.217112f,
+ 0.606567f, -0.107533f, 0.36615f, -0.0709227f, 0.604761f,
+ -0.244657f, -0.296651f, -0.595611f, -0.156629f, -0.693468f,
+ -0.310603f, 0.499272f, 0.282941f, 0.295043f, -0.178704f,
+ 0.281186f, 0.014329f, -0.120819f, 0.154234f, 0.0131325f,
+ -0.472231f, -0.631281f, 0.422955f, 0.711432f, -0.118025f,
+ 0.0864996f, 0.343971f, -0.301477f, -0.246638f, 0.165068f,
+ 0.218044f, 0.224236f, -0.0848522f, 0.00671216f, 0.401141f,
+ -0.218857f, -0.0298495f, -0.135725f, -0.377618f, 0.022473f,
+ 0.106955f, -0.0582005f, 0.0468484f, -0.0217442f, 0.130911f,
+ -0.0926905f, 0.383007f, -0.159353f, -0.222711f, -0.0286419f,
+ 0.372315f, -0.469095f, 0.797571f, -0.301315f, 0.239327f,
+ -0.997507f, -0.363409f, 0.353717f, 0.676686f, -0.0500028f,
+ 0.0638539f, -0.431927f, 0.243852f, 0.000884826f, -0.00166585f,
+ 0.0613292f, -0.029558f, -0.0248432f, -0.0125607f, -0.0309674f,
+ -0.743308f, 0.0409806f, 0.0921015f, 0.167816f, 0.406849f,
+ 0.095677f, 0.0308913f, 0.139956f, -0.400472f, 0.396617f,
+ 0.936517f, 0.355057f, -0.423816f, -0.232472f, -0.220188f,
+ -0.399746f, -0.409623f, -0.158797f, 0.361153f, 0.0327019f,
+ 0.0690844f, -0.032197f, 0.0248558f, 0.00438518f, 0.0222724f,
+ -0.326832f, -0.314295f, 0.156563f, 0.0562703f, 0.332694f,
+ 0.299424f, 0.228206f, 0.322038f, 0.0136098f, 0.0060297f,
+ -0.165851f, -0.306512f, 0.0796508f, -0.37158f, 0.239395f,
+ -0.349442f, 0.198515f, -0.253854f, -1.13694f, 0.0202873f,
+ -0.0504009f, -0.130528f, -0.017126f, -0.0370001f, -0.087458f,
+ -0.119952f, -0.130404f, 0.0333733f, -0.184736f, 0.182162f,
+ 0.227776f, -0.166563f, -0.156162f, 0.118215f, -0.220183f,
+ 0.00474779f, -0.107792f, 0.260493f, 0.11884f, 0.156587f,
+ 0.303936f, -0.131788f, -0.314774f, 0.310606f, 0.0935523f,
+ 0.790767f, 0.26461f, 0.0236426f, 0.0629469f, 0.0344072f,
+ -0.151513f, 0.211498f, 0.0245435f, 0.0629973f, 0.052019f,
+ -0.03308f, 0.123487f, 0.0885027f, 0.159172f, -0.0510615f,
+ 0.0298033f, -0.130515f, -0.121799f, -0.104915f, 0.208822f,
+ -0.310496f, -0.314106f, 0.303307f, -0.0196736f, 0.0420045f,
+ 0.461777f, -0.433699f, 0.00345407f, 0.703139f, -0.655637f,
+ -0.210767f, -0.201278f, 0.163694f, -0.236534f, 0.300877f,
+ 0.0769982f, -0.282453f, 0.149721f, -0.0303466f, -0.191473f,
+ -0.406056f, -0.213472f, 0.1619f, -0.245953f, 0.00544399f,
+ -0.121434f, 0.193012f, -0.307165f, 1.45431f, -0.161468f,
+ -0.12444f, -0.146129f, -0.0528212f, -0.0925165f, -0.134528f,
+ -0.479475f, 0.315525f, 0.133845f, 0.382158f, -0.0799693f,
+ -0.151041f, 0.255772f, 0.409536f, -0.240663f, -0.323741f,
+ -0.205876f, 0.03699f, -0.217541f, 0.108511f, 0.640628f,
+ 0.705993f, -0.423899f, -0.78314f, -0.100733f, -0.00859087f,
+ 0.0251879f, 0.0458335f, 0.00210128f, -0.047576f, -0.0560518f,
+ -1.23869f, -0.829914f, 0.0346551f, 0.350505f, 0.193688f,
+ 0.459154f, 0.137898f, 0.503818f, 0.260867f, 0.649539f,
+ 0.0150802f, 0.0239274f, -0.276069f, -0.0621478f, -0.193106f,
+ -0.0375665f, -0.654529f, 0.189493f, 0.446625f, -0.0208265f,
+ 0.019838f, -0.0201955f, 0.00180428f, -0.0110678f, -0.0172414f,
+ 0.0276489f, -0.252882f, -0.0351807f, -0.0518874f, 0.279098f,
+ -0.245122f, 0.101287f, -0.114202f, -0.0812187f, 0.572429f,
+ -0.0821731f, 0.564183f, 0.0222552f, 0.190111f, -0.0417497f,
+ -0.00385925f, -0.182995f, -0.240482f, -0.291572f, -0.0450444f,
+ 0.0962974f, -0.165973f, -0.0954637f, -0.163841f, -0.833405f,
+ -1.31541f, -0.336473f, -0.0920702f, 0.816105f, 0.393377f,
+ 0.0340241f, -0.0844545f, 0.61729f, -0.17596f, 0.241149f,
+ -0.42825f, -0.59091f, -0.290702f, 0.0796465f, 0.0982819f,
+ 0.466934f, 0.261666f, 0.0373333f, 0.332509f, -0.0266694f,
+ -0.0476951f, -0.00642167f, -0.0132542f, -0.000320841f, 0.00475532f,
+ 0.000502778f, 0.296534f, -0.13297f, -0.113082f, -0.327923f,
+ 0.35901f, -0.302246f, 0.189799f, -0.37994f, 0.16107f,
+ -0.20414f, 0.548575f, -0.460821f, 0.591878f, -0.213113f,
+ -0.169373f, -0.07332f, 0.228841f, 0.682302f, -0.0665316f,
+ -0.142456f, -0.0873117f, 0.00607451f, 0.0376443f, 0.0536673f,
+ -0.0109536f, -0.400279f, 0.550058f, 0.820871f, -0.666373f,
+ -0.471962f, -0.315925f, -0.313142f, 0.952742f, 0.473928f,
+ -0.119006f, 0.153241f, -0.0383078f, 0.631869f, -0.343423f,
+ -0.233473f, -0.218195f, -0.077688f, -0.728291f, 0.0382408f,
+ -0.00662886f, -0.0419666f, 0.0309776f, -0.0281592f, 0.0154229f,
+ -0.198534f, 0.0206324f, 0.0152272f, -0.235067f, 0.0330486f,
+ 0.139198f, -0.0612118f, 0.133154f, -0.258675f, 0.0900275f,
+ -0.127771f, 0.157322f, -0.00767807f, -0.329258f, 0.327458f,
+ 0.0528581f, -0.181125f, 0.409995f, -0.162979f, -0.0193475f,
+ 0.186009f, 0.0519501f, 0.651877f, -0.37821f, -1.10341f,
+ -0.189776f, -0.0922788f, 0.460256f, 0.168011f, 0.440295f,
+ 0.478135f, 0.374573f, 0.384048f, 0.116953f, 0.68886f,
+ -0.427727f, -0.36676f, -0.500013f, -0.228685f, -0.218859f,
+ 0.208396f, -0.0173765f, -0.0680241f, -0.00538013f, -0.0674409f,
+ -0.092764f, 0.0295707f, -0.0462887f, -0.00636006f, 0.0334169f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_32[] = {
+ 0.176459f, 0.154405f, 0.281821f, 0.375264f, -0.882863f,
+ -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_32 = {
+ NUM_FEATURES_32,
+ NUM_LOGITS_32,
+ NUM_HIDDEN_LAYERS_32,
+ {
+ NUM_LAYER_0_UNITS_32,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_32,
+ av1_simple_motion_search_prune_rect_logits_kernel_32,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_32,
+ av1_simple_motion_search_prune_rect_logits_bias_32,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_32
+#undef NUM_FEATURES_32
+#undef NUM_LAYER_0_UNITS_32
+#undef NUM_LOGITS_32
+
+#define NUM_HIDDEN_LAYERS_16 1
+#define NUM_FEATURES_16 25
+#define NUM_LAYER_0_UNITS_16 32
+#define NUM_LOGITS_16 10
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_16[] = {
+ -0.520913f, 0.395611f, 0.0369091f, -0.318591f, -0.463252f,
+ 0.134992f, -0.43154f, -0.0739112f, -0.118817f, 0.476373f,
+ -0.281406f, 0.3413f, 0.456255f, 0.33307f, 0.2942f,
+ 0.1317f, 0.498113f, 1.95406f, -0.165726f, -0.219306f,
+ -0.302656f, -1.31157f, -0.433662f, 0.151716f, -0.214817f,
+ 0.504523f, -0.710049f, 0.359616f, -0.412695f, -0.103193f,
+ 0.341912f, 0.351378f, -0.181486f, 0.573862f, -0.0396254f,
+ -0.17855f, -0.276163f, 0.0367465f, -0.353905f, -0.204689f,
+ 0.309581f, -0.0439686f, -0.147855f, 0.152745f, 0.290871f,
+ 0.131049f, -0.27808f, -0.142997f, 0.207843f, -1.23074f,
+ -0.267714f, -0.336923f, 0.313781f, -0.61488f, -0.161984f,
+ 0.238059f, -0.0879942f, -0.085543f, -0.260156f, -0.13614f,
+ -0.242196f, 0.201216f, -0.248691f, 0.0936671f, -0.350522f,
+ -0.35002f, -0.156583f, -0.00579001f, 0.300578f, -0.341269f,
+ -0.290712f, 0.354802f, -0.31629f, 0.509107f, -0.236953f,
+ -0.0923519f, 0.544509f, -0.280991f, -0.017437f, -0.202721f,
+ -0.116388f, -0.7191f, 0.324586f, 0.254249f, 0.125505f,
+ 0.00658697f, -0.333322f, -0.126537f, -0.140004f, -0.0241202f,
+ -0.172466f, 0.210035f, -0.270833f, 0.0579044f, 0.0950352f,
+ -0.120382f, 0.063292f, -0.394925f, 0.482165f, 0.147753f,
+ 0.331465f, -0.187444f, 0.1083f, 0.414028f, 0.279238f,
+ -0.486889f, -0.674349f, -0.313656f, -0.131186f, -0.100662f,
+ 0.238191f, -1.19083f, -0.30667f, -2.4324f, 0.235311f,
+ 0.108605f, 1.67197f, 0.476157f, 0.30055f, 0.0839538f,
+ 0.408469f, -0.473517f, 0.560283f, -0.0188136f, 0.273824f,
+ -0.43707f, -0.0346978f, -0.438315f, -0.0196275f, -0.0567921f,
+ -0.220166f, 0.216175f, -0.0180461f, 0.0116429f, -0.0096949f,
+ -0.32613f, 0.176829f, -0.243563f, -0.240972f, -0.621819f,
+ -0.00619648f, -0.145525f, 0.124324f, -0.0306925f, 0.172208f,
+ -2.04631f, -0.200087f, -0.594135f, -0.352303f, -0.309826f,
+ 0.0922786f, -0.698371f, -0.0366823f, 0.0244036f, 0.338775f,
+ -0.115947f, 0.144971f, -0.0607037f, -0.762412f, 0.0125584f,
+ -0.262427f, -0.0830273f, -0.291252f, -0.176059f, -0.203983f,
+ 0.0871455f, -0.0894925f, 0.0426263f, -0.060001f, -0.542355f,
+ -0.407837f, -0.0419273f, 0.226608f, -0.114844f, 0.158733f,
+ -0.187237f, 0.113163f, -1.86337f, -0.367544f, -0.547048f,
+ -0.24192f, -0.226764f, 0.090912f, 0.819604f, 0.433766f,
+ -0.841657f, 0.446987f, -0.622761f, -0.0296385f, -0.130176f,
+ -0.0518136f, -0.640326f, -0.330107f, -0.137832f, -0.0119033f,
+ 0.39401f, 0.111331f, -0.141367f, -0.230289f, 0.171054f,
+ -0.924059f, -0.107317f, -0.347983f, 0.0261109f, 0.423002f,
+ -0.305817f, 0.247696f, 0.0436002f, 0.0305862f, -1.52448f,
+ -0.595587f, -0.155552f, -1.11949f, -0.513937f, 0.138347f,
+ -0.301487f, 0.352144f, -0.615801f, 0.0326701f, -0.215322f,
+ -0.0608176f, -0.416557f, -0.306073f, -0.441512f, -0.0569277f,
+ -0.709768f, -0.602527f, -0.311134f, 0.152471f, -0.255299f,
+ 0.354505f, 0.194464f, 0.0144251f, 0.110732f, -0.4452f,
+ -0.804814f, 0.205325f, -0.0957486f, 0.502684f, 0.09112f,
+ -0.533087f, -1.77979f, 0.556992f, -0.176157f, -0.642633f,
+ 0.11553f, -0.232561f, 0.161277f, -0.0631125f, -0.20759f,
+ 0.489253f, -0.067533f, 0.0231024f, -0.179831f, -0.272985f,
+ -0.390059f, 0.3089f, 0.185733f, -0.257065f, -0.508838f,
+ -0.550028f, 0.0665621f, -0.138288f, -0.413188f, 0.191193f,
+ -1.32969f, -0.431025f, 0.270242f, -0.340062f, 0.0817257f,
+ 0.0376051f, -0.18633f, 0.0828274f, 0.00670051f, -0.431295f,
+ -0.450316f, -0.173042f, -0.322248f, 0.370628f, 0.10019f,
+ 0.317293f, -0.266613f, 0.0752441f, -0.425656f, -0.112223f,
+ 0.557991f, -0.324368f, -0.195261f, -0.0526129f, -0.807472f,
+ -0.387466f, 0.192186f, 0.353213f, -0.120238f, 0.107686f,
+ 0.200678f, -0.75363f, 0.466857f, -0.282345f, -0.0849236f,
+ -0.0490695f, -0.00643182f, 0.123047f, -0.207805f, -0.130456f,
+ -1.09455f, 0.340973f, 0.334784f, 0.0706643f, -1.65681f,
+ -0.319952f, -0.198514f, -0.0787972f, 0.089524f, 0.0531034f,
+ -0.202705f, -0.0852339f, -0.62572f, -0.0734234f, -0.838088f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_16[] = {
+ -0.0616197f, 0.939947f, 0.521161f, 0.213886f, 0.130324f, -0.127443f,
+ -0.0538715f, 0.708746f, 0.445031f, 0.418781f, -0.114539f, 0.521941f,
+ 1.13719f, 0.606545f, -0.32193f, -0.150788f, 0.158487f, -0.224005f,
+ 0.654715f, 0.115729f, -0.286506f, -2.06223f, 0.0117697f, 0.503905f,
+ -0.102339f, 0.653256f, -0.813561f, 0.905235f, -0.417269f, -0.206265f,
+ 0.661496f, 0.95533f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_16[] = {
+ -0.203489f, 0.00686229f, -0.161414f, 0.0637276f, 0.27516f,
+ 0.512219f, 0.164205f, 0.00326062f, -0.41914f, -0.400334f,
+ 0.554419f, 0.715772f, -0.295569f, -0.703503f, 0.0137744f,
+ -0.0934259f, 0.174234f, -0.148618f, -0.0360558f, -0.0986598f,
+ -0.138502f, -0.0770713f, 0.122922f, -0.00784415f, 0.0953234f,
+ -0.255754f, -0.310967f, 0.185306f, 0.464554f, 0.147338f,
+ -0.0612304f, 0.164783f, 0.301097f, 0.161364f, -0.12723f,
+ -0.0265984f, -0.471361f, 0.0578776f, -0.362865f, 0.425789f,
+ 0.402758f, -0.190235f, 0.00549738f, -0.570908f, 1.27206f,
+ 0.048868f, -0.0097675f, 0.0708324f, 0.0456103f, 0.0149062f,
+ -0.563032f, -0.420573f, 0.107278f, 0.0938258f, 0.142712f,
+ -0.00251036f, -0.250583f, 0.522272f, 0.0113175f, 0.126751f,
+ -0.433028f, -0.035542f, -0.536686f, -0.0668722f, 0.253094f,
+ 0.254007f, -0.435505f, 0.343001f, 0.0531542f, -0.361914f,
+ -0.102664f, 0.0404874f, 0.132686f, 0.0762298f, 0.0236971f,
+ -0.419454f, 0.230877f, -0.223714f, 0.037813f, 0.0818604f,
+ 0.383705f, -0.235028f, -0.0554801f, 0.429851f, 0.0845829f,
+ 0.166295f, 0.355111f, -0.421197f, 0.298949f, 0.0218224f,
+ 0.445705f, -0.392217f, -0.429578f, -0.076276f, -0.0963531f,
+ -0.631425f, -0.225977f, 8.06349e-06f, 0.0676679f, 0.0779651f,
+ 0.0706891f, 0.101377f, 0.517103f, 0.0945502f, -0.52522f,
+ -0.312022f, 0.0358089f, 0.616509f, -0.0507444f, -0.465814f,
+ -0.0326024f, 0.591298f, 0.188544f, -0.0633316f, -0.199987f,
+ 0.403118f, -0.511281f, -0.696263f, 0.112996f, 0.103875f,
+ 0.0495595f, -0.0107449f, 0.521539f, -0.0123823f, -0.0642751f,
+ 0.08548f, -0.0679207f, 0.526558f, 0.0651114f, -0.342643f,
+ -0.349934f, 0.307437f, 0.368763f, -0.194851f, -0.134117f,
+ 0.102448f, -0.0520666f, 0.0415824f, -0.175085f, 0.272685f,
+ 0.0675856f, 0.120627f, 0.391408f, -0.135249f, -0.357024f,
+ 0.019666f, -0.0622677f, 0.407427f, 0.22655f, -0.129432f,
+ -0.165327f, 0.004893f, 0.5479f, 0.0613981f, -0.479682f,
+ -0.144228f, -0.130106f, 0.206458f, -0.342086f, 0.12691f,
+ -0.113554f, 0.231164f, -0.051419f, 0.0401286f, -0.560429f,
+ -0.070609f, 0.420232f, 0.442465f, -0.237501f, -0.000293732f,
+ -1.017f, -0.210222f, 0.0157063f, 0.0488178f, 0.0734721f,
+ -0.52626f, -0.276441f, -0.521579f, 0.443532f, -0.0819051f,
+ -0.0732633f, -0.17999f, 0.258525f, -0.0374872f, 0.150115f,
+ 0.0510939f, 0.168116f, 0.473372f, 0.824489f, 0.302195f,
+ -0.348613f, 0.238569f, 0.176444f, -0.633945f, -0.0567195f,
+ -0.0305827f, -0.0551851f, 0.85822f, -0.0628099f, 0.0364294f,
+ -0.234823f, 0.179067f, 0.143208f, -0.0511014f, -0.404191f,
+ 0.428035f, 0.0235506f, 0.371991f, -0.312909f, 0.550933f,
+ -0.389265f, -0.271813f, -0.293461f, -0.583752f, 0.179991f,
+ 0.191698f, 0.659094f, 1.07941f, -0.509555f, -0.100638f,
+ 0.079988f, -0.0519107f, -0.112723f, -0.0663326f, 0.0353569f,
+ -0.795055f, -0.465999f, 0.283579f, 0.340913f, 0.152738f,
+ 0.294664f, 0.527839f, 0.187735f, 0.359461f, 0.164629f,
+ 0.107512f, 0.390402f, 0.236702f, 0.114674f, -0.525655f,
+ -0.555476f, -0.6589f, -0.266601f, -0.0946547f, 0.6306f,
+ 0.0248513f, 0.038497f, 0.432706f, -0.0715465f, 0.0410172f,
+ -0.115313f, -0.428684f, 0.136283f, 0.0913185f, 0.11277f,
+ 0.0968689f, -0.00437052f, 0.0888981f, 0.10304f, 0.02442f,
+ -0.211315f, 0.00981596f, -0.0974827f, 0.208611f, 0.140644f,
+ 0.0315567f, 0.350332f, -0.291049f, -0.0715449f, -0.352992f,
+ -0.858004f, 0.828658f, 0.439092f, 0.0151291f, 0.0503828f,
+ 0.0656112f, -0.710749f, -0.0951757f, 0.193908f, 0.00908018f,
+ 0.141486f, -0.0657711f, 0.099791f, 0.153729f, -0.419576f,
+ -0.892636f, -0.0449268f, -0.170786f, -0.156564f, 0.384511f,
+ 0.296565f, 0.0569815f, -0.103938f, 1.27479f, -0.0406475f,
+ 0.154083f, -0.186442f, 0.0282588f, 0.0312102f, -0.188994f,
+ 0.284243f, -0.564693f, 0.425525f, -0.00924596f, 0.810003f,
+ 0.233812f, -0.0180273f, 0.121082f, -0.209096f, 0.151437f,
+ 0.286921f, -0.348095f, 0.174813f, -0.413798f, 0.108994f,
+ -0.34266f, -0.0337981f, -0.459f, -0.409812f, -0.0890104f,
+ 0.0834802f, -0.00259191f, -0.105914f, -0.164207f, 0.0697689f,
+ -0.312098f, -0.00650536f, -0.486758f, -0.248486f, 0.24314f,
+ -0.0857144f, 0.0884781f, -0.65615f, -0.121744f, 0.0709335f,
+ -0.0237193f, 0.10764f, -0.0409452f, -0.0824305f, 0.42329f,
+ 0.138258f, 0.502607f, 0.228545f, 0.0687789f, 0.0361586f,
+ 0.39074f, 0.0722654f, -0.0133148f, 0.283278f, 0.0743384f,
+ 0.310292f, -0.297675f, -0.359935f, 0.521021f, -0.10082f,
+ -0.272333f, 0.0120283f, 0.138118f, -0.123711f, -0.0711386f,
+ 0.0170747f, 0.831039f, 0.0509626f, 0.790608f, -0.0863406f,
+ -0.31962f, 0.0631013f, 0.0873453f, -0.472331f, -0.0826027f,
+ -0.241722f, 0.148835f, -0.131611f, 0.000195347f, -0.0615804f,
+ -0.838663f, -0.586979f, 0.247713f, 0.362254f, 0.492727f,
+ -0.132163f, 0.0516545f, 0.477838f, -0.0395182f, 0.0124993f,
+ -0.771514f, 0.0386912f, -0.118525f, -0.346172f, -0.265905f,
+ -0.175257f, -0.406287f, 0.393837f, 0.409096f, -0.408501f,
+ -0.0207146f, 0.0487809f, 0.0636982f, 0.0276368f, 0.0878249f,
+ 0.0425889f, 0.0868633f, 0.17423f, -0.128217f, -0.477068f,
+ -0.321294f, 0.0393771f, 0.00812823f, -0.350529f, -0.129012f,
+ 0.439953f, 0.396662f, 0.410475f, -0.123129f, -0.565966f,
+ 0.0298635f, -0.614611f, -0.477514f, 0.453651f, 0.0617068f,
+ 0.0530563f, 0.0479074f, 0.213551f, 0.039034f, 0.0449095f,
+ -1.06868f, -1.2654f, -0.175482f, 0.595068f, -0.230095f,
+ 0.719838f, -0.272148f, 0.696564f, 0.0485396f, 0.468584f,
+ 0.0695439f, -0.0842122f, -0.228978f, 0.161397f, -0.000441421f,
+ -0.0297514f, -0.250599f, 0.196656f, 0.608423f, -0.0112096f,
+ 0.0236881f, -0.00167311f, 0.0040709f, 0.015495f, 0.00757698f,
+ -0.165886f, 0.359767f, -0.0214696f, 0.377208f, 0.0303547f,
+ 0.0657094f, 0.140775f, 0.21867f, -0.203922f, 0.263878f,
+ -0.0529099f, 0.202438f, -0.243226f, 0.156659f, -0.627056f,
+ -0.845036f, -0.500873f, 0.172588f, 0.402972f, -0.147734f,
+ 0.151792f, -0.075579f, 0.443519f, 0.0311335f, -0.0328222f,
+ -0.0299781f, 0.435956f, -0.0987376f, 0.288402f, 0.135902f,
+ -0.173584f, -0.186255f, 0.224524f, -0.249645f, 0.123702f,
+ -0.0846244f, 0.491317f, 0.544846f, 0.338677f, -0.258885f,
+ -0.617434f, -0.629003f, -0.347233f, 0.181262f, -0.0606015f,
+ -0.537766f, 0.215089f, -0.334527f, 0.0488534f, 0.0577997f,
+ -1.12431f, -0.932292f, -0.11559f, 0.573715f, 0.151128f,
+ 0.693818f, -0.16956f, 0.802591f, -0.231531f, 1.04318f,
+ -0.476417f, 0.293452f, -0.610136f, 0.27506f, -0.384012f,
+ 0.305366f, -0.0540464f, -0.337583f, -0.174285f, 0.157248f,
+ 0.0477345f, -0.0229535f, 0.0475766f, -0.00603319f, 0.00856119f,
+ -0.702893f, -0.0579673f, 0.183024f, -0.166222f, 0.109763f,
+ -0.148019f, -0.258873f, -0.0820157f, -0.186716f, -0.449265f,
+ -0.0534138f, 0.15732f, 0.46357f, 0.00502591f, -0.0282085f,
+ 0.152277f, -0.855199f, -0.357115f, 0.0366159f, 0.0131101f,
+ -0.0407758f, 0.0462835f, 0.146309f, -0.00276278f, -0.0591814f,
+ -0.109437f, 0.506764f, -0.044421f, 0.465907f, 0.114444f,
+ -0.241053f, -0.362649f, -0.432615f, 0.199989f, -0.00635866f,
+ -0.521886f, 0.0958924f, -0.485725f, 0.0430527f, 0.069746f,
+ 0.681091f, -0.288144f, 0.505671f, 0.0489065f, -0.0373836f,
+ 0.266079f, 0.145173f, -0.011481f, -0.225074f, -0.754501f,
+ -0.122939f, -0.294213f, 0.334738f, 0.281561f, 0.558977f,
+ -0.21551f, -0.346507f, -0.0625635f, 0.0782034f, -0.236999f,
+ -0.803783f, -0.601117f, 0.091192f, 0.636122f, -0.250626f,
+ 0.0354961f, 0.103915f, 0.508571f, 0.329911f, -0.0425999f,
+ -0.0867587f, -0.0385824f, 1.13914f, -0.0261992f, 0.00484478f,
+ 0.124603f, -0.012173f, -0.377358f, -0.243563f, 0.236094f,
+ 0.145663f, -0.132752f, 0.347497f, -0.529315f, 0.271632f,
+ -0.372805f, 0.0261836f, 0.126169f, 0.0941008f, 0.283773f,
+ 0.765701f, -0.226477f, -0.181549f, -0.306896f, 0.110165f,
+ -0.0784234f, -0.0827892f, -0.0374252f, -0.0950872f, -0.451015f,
+ -0.995793f, -0.452663f, 0.293338f, -0.380865f, 0.032683f,
+ 0.0178248f, 0.0699194f, -0.0811722f, -0.0866096f, 0.139289f,
+ 0.296604f, 0.192293f, -0.0589607f, -0.179878f, 0.00360266f,
+ -0.0905794f, 0.136744f, -0.191555f, 1.31877f, -0.0592033f,
+ -0.158766f, 0.0214746f, -0.190113f, -0.116671f, 0.0449292f,
+ -0.109533f, -0.709307f, 0.386424f, 0.40201f, 0.262211f,
+ -0.155244f, 0.233988f, -0.0166317f, 0.462665f, 0.0484462f,
+ 0.210902f, -0.352798f, 0.38698f, -0.228261f, -0.084309f,
+ -0.220751f, -0.170879f, -0.352617f, -1.24277f, 0.266004f,
+ -0.0125749f, -0.0380073f, 0.101838f, -0.0483024f, -0.0629178f,
+ -0.0695577f, -0.103439f, 0.242131f, -0.0796858f, 0.349718f,
+ -0.332045f, 0.0138352f, -0.380235f, -0.28717f, -0.176276f,
+ 0.865903f, 0.36593f, 0.243925f, -0.422289f, -0.117327f,
+ 0.21876f, 0.245393f, -0.426134f, -0.186077f, 0.0352515f,
+ -0.123742f, 0.249376f, 1.3281f, 0.0707771f, 0.071415f,
+ -0.286827f, -0.131691f, -0.270881f, -0.434378f, 0.376064f,
+ 0.35966f, 0.513374f, 0.439378f, -0.222716f, -0.5874f,
+ 0.487997f, -0.293271f, -0.184245f, -0.037256f, 0.17723f,
+ -0.438651f, 0.428184f, 0.112983f, -0.449287f, -0.0451963f,
+ 0.0854929f, 0.0735442f, -0.0148642f, -0.0586782f, -0.176455f,
+ -0.438979f, -0.127109f, 0.211478f, 0.388035f, -0.0372021f,
+ 0.220575f, 0.382144f, 0.302121f, 0.0857121f, 0.193445f,
+ -0.488858f, -0.195288f, -0.316184f, -0.314026f, -0.111956f,
+ 0.0744768f, 0.292709f, 0.30187f, -0.285506f, -0.105006f,
+ 0.0851402f, -0.082318f, 0.277518f, 0.725294f, -0.756304f,
+ 0.0155309f, -0.378542f, 0.293377f, -0.347252f, -0.338458f,
+ 0.221449f, -0.176443f, -0.131972f, 0.0129163f, -0.290649f,
+ 0.198596f, -0.0721333f, 0.620591f, 0.568736f, 0.174001f,
+ -0.205186f, -0.265606f, -0.249155f, 0.299163f, 1.11842f,
+ 0.17423f, 0.196417f, -0.014484f, 0.0735422f, 0.26329f,
+ 0.12284f, -0.750305f, -0.351337f, 0.121994f, -0.00542878f,
+ -0.295707f, -0.094124f, 0.300993f, 0.412408f, -0.170761f,
+ -0.0676329f, -0.106638f, -0.419785f, -0.43878f, 0.22421f,
+ 0.0339903f, 0.619851f, 0.0615381f, 0.514631f, 1.35424f,
+ -0.0679228f, -0.203457f, 0.131948f, -0.0041251f, -0.209054f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_16[] = {
+ 0.304025f, 0.131887f, 0.259279f, -0.561564f, -0.161729f,
+ -0.208036f, 0.102206f, -0.162937f, -1.42311f, -0.708305f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_16 = {
+ NUM_FEATURES_16,
+ NUM_LOGITS_16,
+ NUM_HIDDEN_LAYERS_16,
+ {
+ NUM_LAYER_0_UNITS_16,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_16,
+ av1_simple_motion_search_prune_rect_logits_kernel_16,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_16,
+ av1_simple_motion_search_prune_rect_logits_bias_16,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_16
+#undef NUM_FEATURES_16
+#undef NUM_LAYER_0_UNITS_16
+#undef NUM_LOGITS_16
+
+#define NUM_HIDDEN_LAYERS_8 1
+#define NUM_FEATURES_8 25
+#define NUM_LAYER_0_UNITS_8 32
+#define NUM_LOGITS_8 4
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_8[] = {
+ -0.266303f, -0.387676f, 0.204501f, -0.120842f, -0.0752326f, 0.0337739f,
+ 0.0243477f, -0.356748f, 0.0143051f, -0.16403f, -0.139013f, 0.175003f,
+ -0.206754f, 0.349059f, 0.181763f, 0.212768f, -0.313783f, 0.182829f,
+ 0.00205376f, -0.939525f, -0.0992424f, 0.306254f, 0.083329f, -0.133137f,
+ -0.179022f, -0.0237902f, 0.0601026f, -0.216698f, -0.551149f, 0.081711f,
+ -0.442191f, 0.0680832f, -0.0353678f, 0.237704f, 0.23155f, -0.36097f,
+ 0.123389f, -0.288927f, 0.178133f, -0.152222f, -0.235648f, -0.0495293f,
+ -0.316522f, 0.034207f, 0.0463139f, -0.817825f, 0.417443f, -0.110984f,
+ -0.402371f, 0.0341694f, -0.37383f, 0.414532f, 0.093993f, 0.0039505f,
+ 0.0803175f, -0.511859f, -0.0154802f, 0.0979595f, 0.0909049f, -0.120938f,
+ -0.577382f, -0.155041f, -0.404295f, 0.122223f, -0.084703f, 0.00415336f,
+ 0.149135f, 0.113219f, 0.124236f, -0.240905f, 0.163909f, -0.154202f,
+ -0.208917f, 0.00200158f, -0.71796f, 0.105984f, -0.131996f, -0.539603f,
+ 0.223768f, -0.0710733f, -0.346679f, -0.0745909f, 0.171032f, 0.215701f,
+ 0.218519f, 0.105981f, -0.096209f, -0.166453f, -0.468894f, -0.401578f,
+ -0.239222f, 0.111382f, 0.38747f, -0.164734f, -0.175955f, 0.336621f,
+ -0.0305501f, -0.0576765f, 0.0672671f, -0.183692f, 0.412082f, -0.262951f,
+ -0.153429f, -0.128589f, -0.530472f, 0.0936412f, -1.08296f, -0.45147f,
+ 0.0714904f, -3.96842f, 0.438125f, -0.313945f, 0.231104f, -0.00183851f,
+ -0.0192768f, -0.637531f, -0.109296f, 0.0531702f, 0.00262162f, -0.615951f,
+ -0.546241f, -0.635305f, -0.0762367f, 0.0122019f, 0.423693f, -0.129142f,
+ -0.112242f, 0.295184f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_8[] = {
+ -2.16023f, -3.12831f, -0.213206f, -2.97875f, -1.83791f, -2.84713f,
+ -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f,
+ -0.853224f, -3.29503f, -0.537517f, 0.923106f, -3.18665f, -1.29905f,
+ 1.64506f, -1.99848f, -2.24315f, 0.408613f, 0.503671f, -3.83393f,
+ -2.88388f, -3.52337f, 1.46818f, -1.67169f, -3.83253f, 1.52644f,
+ -0.490783f, -0.415782f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_8[] = {
+ -0.702198f, -0.102148f, 0.0564545f, -0.0555548f, 0.16184f,
+ 0.0950792f, 0.136974f, -0.00824146f, 0.05746f, 0.0447542f,
+ 0.145978f, 0.0855769f, -0.041449f, 0.301347f, -0.0206691f,
+ -0.0662514f, -0.0525079f, -0.0998387f, -0.0891438f, 0.110545f,
+ -0.863098f, -1.83798f, 0.238818f, 0.127797f, 0.116872f,
+ -0.270655f, -0.21057f, 0.197013f, -0.123332f, 0.137104f,
+ -0.174766f, -0.00803025f, 0.0234369f, -0.0894175f, -0.0380927f,
+ 0.00827928f, -0.134148f, 0.110575f, -0.250173f, 0.116273f,
+ 0.0197749f, 0.270391f, 0.108437f, 0.173197f, -0.0650348f,
+ 0.0884626f, 0.262792f, 0.0649228f, 0.5573f, -2.81315f,
+ -0.479801f, -1.15825f, 0.0807932f, -0.19144f, 0.404016f,
+ -0.211521f, 0.233269f, -0.391414f, 0.160381f, -0.277233f,
+ 0.426354f, 0.156839f, 0.494315f, -0.214259f, -0.0132062f,
+ 0.148628f, -0.0899568f, 0.161845f, 0.467689f, 0.229474f,
+ 0.590634f, -0.705793f, -0.0486113f, -0.439088f, 0.994566f,
+ 0.679065f, 0.777869f, -0.225291f, -0.0303006f, -0.638782f,
+ -0.0824632f, -0.128561f, -0.327603f, 0.105624f, 0.567581f,
+ -0.396135f, -0.471028f, 0.181286f, 0.274604f, 0.180169f,
+ 0.0612144f, -0.865004f, 0.0306804f, 0.142985f, -0.0914358f,
+ -0.243284f, 0.358359f, -0.443847f, -0.371978f, 0.606933f,
+ -0.900408f, -0.52076f, 0.472118f, 0.0610973f, 0.152526f,
+ -0.550379f, 0.309331f, -0.141573f, 0.203046f, -0.231485f,
+ 0.505156f, 0.393224f, 0.435487f, -0.218681f, 0.123707f,
+ -0.270383f, -0.033565f, 0.210373f, -2.33967f, 0.367434f,
+ 0.0308118f, -0.205771f, 0.546141f, 0.19837f, 0.035648f,
+ -0.467007f, -1.50995f, -0.0314176f, 0.11762f, -0.15307f,
+ 0.618257f, -0.139502f, 0.303386f, -0.00758681f, 0.228107f,
+ -0.594499f, -0.201984f, -0.239666f, 0.114878f, -0.922174f,
+ -0.530137f, -0.379366f, -0.319582f, 0.0889624f, -0.00544663f,
+ 0.316264f, -0.204262f, -0.0959358f, 0.23552f, 0.141369f,
+ -0.207129f, -1.04067f, -0.0780501f, 0.226768f, -0.246752f,
+ 0.0823105f, 0.114783f, 0.49315f, 0.0197732f, 0.705433f,
+ 0.158076f, -0.250584f, -0.157326f, -0.0439547f, -0.139047f,
+ 0.090531f, -0.38833f, 0.743143f, -1.47418f, -0.155009f,
+ 0.511466f, -0.726716f, -0.181075f, 0.450133f, -0.390204f,
+ 0.292725f, 0.00811462f, -0.347738f, 0.613381f, -0.237124f,
+ 0.750748f, -0.383123f, 0.410309f, -0.204166f, 0.667199f,
+ -0.313197f, 0.436059f, -0.607571f, 0.193681f, 0.409399f,
+ 0.631747f, -0.0454149f, 0.198232f, 0.345591f, -0.0137374f,
+ -0.307014f, -0.535515f, 0.764678f, -0.225686f, -0.451621f,
+ -2.75564f, -1.52877f, 0.0511933f, 0.905979f, 0.145029f,
+ 0.759615f, 0.130166f, 0.83827f, 0.0655081f, 1.07555f,
+ -0.529777f, 0.682967f, -0.412052f, 0.611947f, -0.83676f,
+ 0.940695f, -0.465681f, 0.51505f, -0.883659f, -0.105524f,
+ -0.0344173f, -0.0683618f, -0.00698688f, -0.139349f, 0.135741f,
+ -0.294455f, -0.377834f, -0.602084f, -1.00128f, 0.483291f,
+ 1.25327f, 0.178987f, 0.75068f, -0.520731f, -0.325517f,
+ 0.272032f, 0.144144f, -0.279453f, 0.564907f, 0.144036f,
+ 0.297448f, -0.504243f, -0.250508f, -1.26395f, 0.4816f,
+ 0.392771f, -0.389961f, -0.261585f, -0.127124f, -0.202945f,
+ -0.709716f, -0.174719f, 0.113613f, 0.477753f, -0.226659f,
+ 0.0697828f, -0.177994f, 0.300726f, -0.185504f, 0.339424f,
+ -0.316746f, 0.369693f, -0.339723f, -0.143886f, -0.0326589f,
+ -0.268761f, -0.241094f, 0.284876f, -0.0270867f, -0.207397f,
+ -1.42738f, 0.495612f, -0.0277732f, 0.199675f, 1.48638f,
+ -0.659257f, -1.28199f, 0.498702f, 0.140695f, 0.571152f,
+ 0.416368f, 0.14153f, 0.126876f, 0.521114f, -0.00150571f,
+ 0.375581f, 0.00537624f, 0.1286f, -0.332227f, 0.417663f,
+ -0.539023f, 0.217124f, -0.787111f, -0.0335266f, 1.56751f,
+ 0.0640563f, -0.158791f, 0.118195f, 0.000970493f, -0.0403852f,
+ -0.0572557f, -0.0201181f, -0.10255f, 0.63237f, 0.156662f,
+ 0.418696f, -0.274802f, -0.663923f, -0.375232f, -0.40846f,
+ 0.462092f, 1.2176f, -0.301532f, -0.779704f, -0.112876f,
+ 0.0806591f, -0.0141923f, 0.00960801f, -0.663557f, 0.0979948f,
+ -0.0575999f, -0.012847f, 0.0403853f, -0.133666f, -0.00330217f,
+ -0.931518f, -0.774599f, -0.21391f, 0.377601f, -0.183365f,
+ 0.299094f, 0.0238552f, 0.206716f, -0.18959f, 0.346013f,
+ -0.150991f, -0.192817f, -0.293962f, -0.0537604f, -0.0648171f,
+ -0.275941f, -0.144854f, -0.224092f, 2.43113f, 0.0422494f,
+ -0.047236f, -0.0262028f, 0.0282119f, -0.175553f, 0.0888502f,
+ 0.580682f, 0.951055f, -0.284441f, -0.120133f, -0.268058f,
+ -0.312083f, -0.411556f, 0.21431f, -0.28033f, 0.324851f,
+ -1.02787f, -0.936816f, -0.577628f, 0.544743f, 0.295807f,
+ 0.406157f, 0.447927f, 0.25369f, -0.811421f, -0.0424979f,
+ -0.189867f, 0.00778673f, -0.113587f, -0.116175f, -0.0542222f,
+ -1.80089f, -1.44175f, -0.35332f, 0.191314f, -0.236691f,
+ -0.0261926f, -0.502363f, 0.252278f, -0.485478f, 0.296495f,
+ 0.455612f, -0.0489631f, 0.227255f, 0.170975f, 0.473487f,
+ 0.257812f, 0.178048f, 0.2506f, 2.04637f, -0.173857f,
+ 0.0583379f, 0.00765589f, -0.025772f, -0.162666f, -0.016214f,
+ -0.607486f, -0.0808025f, 0.0551611f, -0.0772291f, 0.126421f,
+ 0.10869f, -0.0877463f, -0.111527f, -0.0775766f, 0.503886f,
+ -0.002757f, -0.0421354f, -0.247857f, 0.140827f, 0.383576f,
+ 0.228232f, -0.157877f, -0.0927911f, 0.344687f, 0.191181f,
+ 0.236533f, 0.00102869f, -0.0184502f, -1.4509f, -1.15945f,
+ -0.521978f, -0.643225f, 0.133139f, 0.0660321f, 0.0851957f,
+ 0.0303648f, 0.0296239f, 0.0455713f, 0.175647f, 0.080532f,
+ 0.0445691f, -0.257356f, -0.125602f, -0.138829f, -0.167057f,
+ -0.0992552f, -0.13944f, 0.507531f, 0.444997f, 0.221452f,
+ -0.308384f, -0.327554f, 0.13235f, 2.1487f, -1.15453f,
+ -0.280239f, -0.363582f, -0.00358745f, 0.012866f, 0.251088f,
+ 0.0676416f, 0.178492f, -0.136631f, 0.197938f, -0.078198f,
+ 0.812439f, 1.1173f, 0.712113f, 1.10124f, -0.836503f,
+ -1.22433f, -1.07894f, -1.29215f, 0.56057f, 2.23928f,
+ -0.419029f, 0.282178f, -0.0719266f, -0.172192f, 0.28034f,
+ -2.99124f, -2.01481f, 0.0688982f, 0.697466f, 0.00635555f,
+ 0.566069f, 0.047534f, 0.507755f, -0.00690707f, 0.712594f,
+ -0.191467f, 0.355733f, -0.480016f, 0.664669f, -0.390619f,
+ 0.351199f, -0.482342f, 0.325005f, 1.9089f, 0.155987f,
+ 0.17032f, 0.132729f, 0.0402649f, 0.146991f, 0.0314905f,
+ -0.775316f, -0.208892f, -0.105993f, 0.0181653f, -0.12735f,
+ 0.0897852f, 0.0470231f, 0.25807f, 0.127406f, -0.0893252f,
+ -0.279776f, 0.190844f, 0.110384f, -0.148833f, 0.025293f,
+ 0.239838f, 0.00932245f, 0.35103f, -0.128268f, -0.0536754f,
+ 0.506899f, -0.16793f, 0.0955582f, -2.01108f, 0.721433f,
+ -2.31413f, -2.08646f, 0.033315f, 0.689828f, -0.271213f,
+ 0.790425f, -0.114234f, 0.755325f, -0.211533f, 0.774544f,
+ -0.263268f, 0.795762f, -0.551455f, 0.953602f, -0.168454f,
+ 0.529055f, -0.768991f, 0.882371f, 0.29763f, -0.155017f,
+ 0.00464101f, 0.121093f, 0.948271f, 0.113138f, -0.110332f,
+ -2.0492f, -1.31322f, -0.129212f, 0.464778f, -0.181465f,
+ 0.618403f, 0.0627984f, 0.465228f, 0.165729f, 0.278277f,
+ -0.563276f, -0.358358f, -0.590638f, 0.0104993f, 0.731206f,
+ 0.752569f, 0.631615f, 0.811822f, 0.129804f, -0.0558327f,
+ 0.570081f, -0.417922f, -0.168275f, 0.0703671f, 0.269127f,
+ 0.240457f, -0.197159f, -0.00179261f, 0.220065f, 0.463511f,
+ 0.0714626f, -0.716477f, -0.441865f, -0.717028f, -0.149176f,
+ 0.452182f, 0.662699f, -0.906534f, -0.817133f, 0.237747f,
+ 0.26024f, -7.7441e-05f, 0.0934616f, 0.824641f, -0.0404494f,
+ -0.088297f, -0.157899f, 0.037408f, 0.132435f, -0.316155f,
+ -0.276785f, 0.0117868f, 0.185008f, 0.32369f, -0.465855f,
+ -0.302127f, 0.303289f, 0.338597f, -0.665408f, -0.507594f,
+ 0.526979f, 0.532091f, 0.234395f, 0.754063f, 0.116769f,
+ 0.0800309f, -0.939344f, -1.51269f, 1.4583f, 0.178444f,
+ 0.0106756f, -0.213468f, -0.00369439f, 0.071015f, -0.192798f,
+ -0.0933147f, -0.129901f, -0.368279f, -0.246564f, 0.126966f,
+ 0.478565f, -0.476246f, -0.762863f, 0.168883f, 0.536136f,
+ -0.272969f, 0.2573f, -0.161577f, 0.311428f, -0.777994f,
+ -1.29752f, 0.216046f, 0.329016f, 1.57265f, 0.168075f,
+ -0.192518f, 0.0829308f, -0.073533f, -0.0202034f, 0.114716f,
+ -0.34888f, -0.519215f, 0.190809f, 0.0138507f, 0.133635f,
+ 0.14194f, 0.410618f, -0.165106f, 0.214438f, 0.0438265f,
+ -0.8481f, -1.19182f, -1.07878f, -0.882217f, 0.45616f,
+ 0.977385f, 0.74929f, 0.918466f, 0.904704f, 0.041938f,
+ 0.0362776f, 0.0757255f, 1.14007f, 0.0516825f, -0.160068f,
+ 0.219535f, 0.638634f, -0.0284544f, -0.222849f, -0.0344915f,
+ -0.0350256f, -0.0504452f, -0.0458416f, 0.146099f, 0.0783083f,
+ 0.206579f, 0.241264f, 0.28401f, 0.0425312f, -0.802049f,
+ -0.746271f, -0.578969f, -0.078218f, 0.436176f, -0.281465f,
+ -2.5539f, 0.237868f, -0.121796f, 0.0715619f, 0.106992f,
+ -0.621862f, -0.167142f, 0.153716f, 0.0570912f, -0.06525f,
+ -0.923773f, 0.130759f, 0.0517066f, 0.0729862f, -0.873064f,
+ 0.0403328f, -0.186499f, -0.0831918f, -0.223723f, 0.144697f,
+ 0.212845f, 0.416876f, 0.361598f, 0.138229f, 0.0728777f,
+ -1.95419f, -0.00382816f, -0.0440387f, 0.433627f, 0.44781f,
+ -1.05229f, -1.54506f, 0.564827f, -0.263456f, 0.296105f,
+ -0.158055f, 0.388274f, -0.366639f, 0.212006f, -0.245619f,
+ 0.593064f, 0.088727f, 0.410632f, -0.263462f, 0.507075f,
+ -0.0974155f, 0.275268f, -0.1293f, 0.136679f, 1.98276f,
+ 0.411766f, 0.391987f, 0.34283f, -0.114077f, 0.258462f,
+ -0.302443f, 0.301138f, -0.00726621f, 0.276441f, -0.291582f,
+ 0.66498f, -0.321451f, -0.332805f, 0.0943272f, 0.572253f,
+ -0.45818f, -0.0219593f, -0.151679f, 0.402033f, -1.15502f,
+ -0.882955f, 0.772904f, 0.88126f, -0.149555f, 0.709525f,
+ 0.350116f, -0.21531f, 0.797893f, 0.0230234f, 0.0203034f,
+ 0.2744f, 1.08273f, 0.039349f, 0.503909f, -0.45892f,
+ -0.579516f, -0.344058f, 0.390628f, -0.386941f, -0.430317f,
+ -0.0807066f, 0.435906f, 0.522996f, 0.724476f, -0.74371f,
+ -0.05376f, -0.340898f, -0.962646f, -0.0278005f, 0.0981149f,
+ -0.0811161f, 0.00237994f, 0.850042f, 0.0665473f, 0.134413f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_8[] = {
+ 1.63404f, -0.715866f, -1.0132f, -2.08745f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_8 = {
+ NUM_FEATURES_8,
+ NUM_LOGITS_8,
+ NUM_HIDDEN_LAYERS_8,
+ {
+ NUM_LAYER_0_UNITS_8,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_8,
+ av1_simple_motion_search_prune_rect_logits_kernel_8,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_8,
+ av1_simple_motion_search_prune_rect_logits_bias_8,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_8
+#undef NUM_FEATURES_8
+#undef NUM_LAYER_0_UNITS_8
+#undef NUM_LOGITS_8
+
+static const NN_CONFIG
+ *const av1_simple_motion_search_prune_rect_nn_config[5] = {
+ &av1_simple_motion_search_prune_rect_nn_config_128,
+ &av1_simple_motion_search_prune_rect_nn_config_64,
+ &av1_simple_motion_search_prune_rect_nn_config_32,
+ &av1_simple_motion_search_prune_rect_nn_config_16,
+ &av1_simple_motion_search_prune_rect_nn_config_8,
+ };
+
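+/* Usage note (editor's sketch, not part of the upstream tables): the five
+ * configs above are indexed by square block size, 128x128 at index 0 down to
+ * 8x8 at index 4. Each model takes 25 normalized features and emits one logit
+ * per rectangular-partition candidate (10 logits for the 64/32/16 models, 4
+ * for the 8x8 model). A minimal, hedged example assuming the av1_nn_predict()
+ * helper declared in "av1/encoder/ml.h"; the feature extraction and the
+ * pruning threshold are illustrative assumptions:
+ *
+ *   float features[25];  // filled by the caller with normalized features
+ *   float logits[10];
+ *   const NN_CONFIG *cfg =
+ *       av1_simple_motion_search_prune_rect_nn_config[2];  // index 2 -> 32x32
+ *   av1_nn_predict(features, cfg, 1, logits);
+ *   // Candidates whose logit falls below a caller-chosen (hypothetical)
+ *   // threshold are pruned from the rectangular partition search.
+ */
+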
+// NN model for predicting the maximum square partition level of a superblock.
+#define NUM_HIDDEN_LAYERS 1
+#define NUM_FEATURES 13
+#define NUM_LAYER_0_UNITS 48
+#define NUM_LOGITS 4
+
+static const float av1_max_part_pred_logits_kernel[] = {
+ -0.304561f, 0.0885596f, -0.988539f, 1.08147f, 0.215213f,
+ 0.202965f, -0.828457f, -0.233945f, -0.0866977f, -0.115521f,
+ 0.02079f, 0.196491f, -0.0285075f, 0.05067f, -0.00872862f,
+ 0.00281844f, -0.238954f, 0.0253801f, 0.0257775f, 0.339269f,
+ 0.176174f, -0.152545f, -0.0588704f, -1.62275f, -0.189329f,
+ 0.0808033f, 0.233844f, -4.53798f, 0.674968f, -0.0361688f,
+ -0.0754075f, 1.16129f, -0.0188879f, 0.113255f, -3.04378f,
+ 0.814728f, -0.568517f, -0.00179383f, -3.61223f, -1.67535f,
+ -2.20417f, -0.197196f, 0.0507745f, -0.0909394f, -0.0507879f,
+ -1.27999f, -0.055623f, 0.0318497f, 0.192867f, 0.138726f,
+ 0.0443392f, -0.595075f, -0.166774f, 0.0882958f, -0.348161f,
+ 0.0214428f, -0.0599275f, -0.0995385f, -0.82358f, 0.141205f,
+ -0.053232f, 0.00508296f, -1.90872f, 1.15004f, -0.194219f,
+ 0.0229019f, -0.00354318f, 0.22016f, 0.154101f, -0.159231f,
+ -0.0446647f, -0.197503f, 0.0408453f, 0.197659f, 0.797858f,
+ -0.189722f, 0.343653f, 0.124666f, -1.03083f, 0.603059f,
+ 0.101565f, 0.0932993f, 0.462484f, 0.295984f, 1.11198f,
+ 0.143709f, -0.846232f, -0.464392f, -1.06058f, -0.124889f,
+ 0.0727475f, 1.18446f, -0.100302f, 0.0641918f, -0.101622f,
+ 0.10219f, 0.130189f, 0.0915623f, -0.166904f, -1.10606f,
+ -0.16726f, -0.146152f, 0.145443f, -0.177091f, -0.0215214f,
+ 0.0158506f, -0.553294f, 0.0784749f, -0.0416628f, -0.027785f,
+ 0.280027f, 0.484898f, -0.164225f, 0.0238317f, -0.0345254f,
+ 0.0410244f, 0.131529f, 0.0239622f, -0.0749436f, -0.0224914f,
+ 0.128926f, 0.224539f, 0.413297f, 0.0638572f, 0.103308f,
+ 0.0913242f, -0.119274f, 0.0163103f, 0.113828f, 0.119809f,
+ 0.297057f, -0.124889f, -0.533108f, -0.181408f, -0.129896f,
+ 0.0221064f, -0.0773281f, -0.0386467f, 0.0342961f, 0.126575f,
+ -0.24114f, 0.0735576f, 0.0524791f, 0.246896f, -0.130674f,
+ -0.03979f, 0.173639f, 1.95193f, -0.113029f, -0.0305852f,
+ -0.00671737f, 0.157159f, -0.00102858f, -0.543688f, 0.566772f,
+ 0.124124f, -0.0294064f, -0.0699021f, -0.0704103f, -0.766097f,
+ -0.0625802f, -0.0906173f, -0.0520414f, -0.0272724f, 0.283064f,
+ 0.236213f, -0.127319f, 0.019392f, 0.170042f, -0.0214542f,
+ 0.0740938f, 0.356578f, -0.236257f, 0.269021f, 0.114759f,
+ -0.641166f, 0.136308f, -0.0386959f, -0.112024f, -0.361209f,
+ 0.686095f, 0.183906f, 0.288656f, 0.182007f, 0.337458f,
+ 0.058974f, -0.305512f, -0.841708f, -0.243779f, -0.0614058f,
+ 0.208747f, 0.448697f
+};
+
+static const float av1_max_part_pred_layer_0_bias[] = {
+ -0.776544f, -2.0022f, -0.330294f, 2.47665f, 1.90206f, -1.61571f,
+ 0.536246f, 1.00455f, 5.24561f, 1.55111f, -0.816399f, -4.88703f,
+ -1.06417f, -1.15359f, -0.145289f, 1.91831f, 0.630915f, -1.94256f,
+ -3.35239f, -1.05007f, -1.05186f, 1.36824f, -5.2878f, 1.10482f,
+ -5.00077f, -0.0445198f, 3.41427f, 2.3439f, -0.413306f, -1.88152f,
+ -2.28638f, 8.24783f, -1.91961f, -1.49324f, 1.96599f, -6.32309f,
+ -0.332426f, -0.425506f, 4.06511f, 5.84386f, 4.15747f, 1.22402f,
+ 2.8512f, 2.53027f, 0.0170272f, -1.43966f, -0.997785f, 5.43064f
+};
+
+static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f,
+ 1.96217f, 0.728905f };
+
+static const float av1_max_part_pred_layer_0_kernel[] = {
+ 0.992471f, 0.533006f, 0.143743f, -2.51788f, -0.468337f,
+ -0.201376f, -0.151834f, 0.479883f, 1.16061f, -0.278878f,
+ -0.814954f, -0.152405f, -0.0521608f, 0.797104f, -2.08912f,
+ 0.385839f, -2.22889f, -0.106858f, -0.239766f, -0.951128f,
+ -0.698753f, 0.0831051f, 1.1702f, 0.342834f, -0.0352795f,
+ -0.0847639f, -0.802086f, 0.258982f, 1.14174f, 0.645885f,
+ -1.19226f, -0.592888f, -0.343659f, 1.1912f, 1.45411f,
+ -1.22927f, 0.152858f, 0.00373585f, -1.60637f, 0.592611f,
+ 0.0857475f, -0.346147f, -0.150784f, -0.0817408f, -0.189918f,
+ -0.804952f, -1.33036f, -1.03307f, 0.0248769f, 0.16607f,
+ -2.896f, -2.1293f, 0.12293f, -0.173179f, -0.212128f,
+ -6.76221f, 0.033188f, 0.0231787f, 0.905957f, 0.0551327f,
+ -0.356276f, 0.0181795f, 0.0977523f, -0.0352873f, -0.0396386f,
+ 2.3241f, 0.0632874f, -0.11804f, -6.32521f, 0.0224659f,
+ -0.00188896f, 0.267992f, 0.272337f, 0.00936963f, 0.659969f,
+ -2.25707f, -0.0278229f, -0.0185089f, -1.14466f, 0.104827f,
+ 0.0435885f, 0.558586f, -0.00697004f, 0.0312611f, 0.540574f,
+ -0.568625f, 0.218608f, 0.378911f, -0.0289192f, -0.0734742f,
+ -1.08782f, -2.42069f, -0.0127239f, 0.0493651f, -1.15837f,
+ 0.261831f, 0.401824f, -1.04545f, 0.284173f, 0.784972f,
+ -0.511243f, -0.982599f, -0.106134f, -0.325964f, -1.44107f,
+ -1.42434f, -1.02402f, -1.52034f, 0.0737116f, 0.0462242f,
+ 0.628722f, -1.0405f, -0.113718f, 2.20573f, -4.33951f,
+ -0.0192695f, -0.0229314f, -1.89156f, 0.645942f, 0.375708f,
+ -1.97447f, -0.267014f, 0.0989443f, -0.450534f, -1.01737f,
+ -0.642416f, -0.0897288f, -2.08724f, -0.190965f, -0.279135f,
+ -0.830178f, 0.808754f, -0.139091f, 1.11004f, -0.454439f,
+ -0.479238f, -1.44001f, 0.0888059f, 0.885689f, -0.642505f,
+ -0.00773651f, -0.0265721f, -0.906346f, 1.68504f, 0.084257f,
+ -0.951101f, -8.06495f, 0.19231f, 0.16389f, -0.193678f,
+ 0.729837f, -1.98392f, -5.98513f, 3.32638f, -0.0658378f,
+ -0.0910426f, -0.666567f, -0.315339f, 0.123124f, -2.66375f,
+ -0.714852f, -0.136176f, -0.460166f, -0.567551f, -1.06193f,
+ -1.21389f, -0.83865f, 0.00280695f, -0.199519f, -0.534704f,
+ 0.419311f, -0.149008f, -3.68707f, 0.00285113f, -0.0718198f,
+ -1.41026f, -1.34155f, -0.538687f, -0.623666f, -2.56462f,
+ -0.0183333f, -0.323532f, -1.27141f, -0.0212039f, 0.198633f,
+ 0.459554f, -4.65103f, -1.01293f, -1.39512f, -0.289026f,
+ 0.208724f, -0.665226f, 1.13369f, -1.96734f, -1.45442f,
+ -3.46172f, 0.810681f, -0.603973f, 0.842764f, -3.90371f,
+ -0.394561f, -3.61363f, -2.88085f, 0.031645f, -0.23125f,
+ -2.63898f, -1.35314f, -0.46726f, 1.33145f, 1.20269f,
+ 1.38682f, -0.331637f, 0.069021f, 0.149523f, -1.24957f,
+ -0.878857f, -0.200368f, 0.465744f, 1.01365f, -0.0122221f,
+ -0.550586f, -1.12581f, -0.422132f, -0.0744868f, -2.4804f,
+ -1.07072f, -0.479006f, 0.101817f, -0.118947f, 0.341576f,
+ -1.0538f, -0.812346f, -1.13727f, -0.00939806f, 10.1571f,
+ -0.0441302f, 0.00280407f, -21.5044f, 0.0181152f, -0.0143246f,
+ 3.23462f, -1.38624f, -1.80416f, 4.89763f, -2.67364f,
+ 2.31771e-05f, 0.000393989f, 0.352204f, -0.193455f, 0.531455f,
+ 0.488757f, -0.442555f, -0.518528f, 0.431482f, -2.67727f,
+ -2.00626f, -0.39729f, -0.221494f, -0.0188888f, -0.0377649f,
+ -1.80169f, 0.0810332f, -0.0408335f, -1.28675f, -0.0353824f,
+ -0.666723f, -1.07281f, 0.252912f, -1.24547f, -1.7831f,
+ -1.14354f, -0.137662f, 0.00230182f, 0.736862f, 0.175872f,
+ -0.187556f, 0.43963f, -0.796524f, 0.056219f, -0.387874f,
+ 0.0710224f, -0.16548f, -0.100993f, 0.931481f, -3.20738f,
+ -0.0197576f, 0.266148f, -0.173909f, -0.337795f, -0.0682381f,
+ 0.176844f, 0.140286f, 1.12033f, 0.429064f, -2.24192f,
+ -1.54682f, 2.23646f, -0.0371138f, -0.0475339f, -3.21766f,
+ 0.0412858f, 0.387811f, 6.6711f, 0.140649f, 0.0559547f,
+ -0.802839f, 0.599977f, 0.64552f, -2.08103f, -0.503401f,
+ -0.0407036f, -0.0299199f, 0.0849445f, -0.111657f, -1.63462f,
+ 3.33762f, 0.0441394f, 0.0466889f, -0.951806f, 0.0723954f,
+ 0.00348661f, -1.36903f, 2.24625f, -0.0348915f, -0.0508893f,
+ -0.240891f, -0.120143f, -0.17991f, -2.09137f, 0.0150871f,
+ 0.0480333f, 1.72012f, 0.0309551f, -0.0370507f, -0.377075f,
+ 0.103916f, -0.0169255f, -0.0145395f, -4.02144f, 0.83193f,
+ -0.316502f, 6.3832f, -1.70038f, -1.97215f, -1.94501f,
+ 1.45479f, 0.711725f, -0.348496f, -0.279056f, -1.13396f,
+ -1.51744f, -0.853307f, 1.53131f, -0.0032358f, 1.41808f,
+ -1.32989f, -0.245221f, -0.161614f, -0.500845f, -0.449252f,
+ 0.0724151f, -0.116333f, -0.0946182f, -2.0945f, 0.0564572f,
+ 0.393261f, -1.06861f, -0.111458f, -0.839943f, -0.0880348f,
+ 0.0365742f, 0.415339f, -1.57494f, -0.713697f, 1.02349f,
+ -0.221371f, -0.0446281f, 1.89223f, -0.0811754f, -0.402773f,
+ -0.930987f, 0.0243194f, 0.0678332f, -0.0233014f, 0.165372f,
+ -0.44083f, -1.2404f, 0.35675f, -0.040916f, -0.0512548f,
+ -2.9071f, 0.861174f, -0.778133f, 2.14436f, -0.688427f,
+ -0.480371f, -1.69032f, 0.706687f, -0.281982f, -2.30451f,
+ 1.61541f, -0.0213638f, -0.740509f, -0.266677f, 0.0268434f,
+ -0.0116908f, -3.17595f, 0.0114825f, 0.0196997f, -0.144005f,
+ 0.0550181f, -0.851459f, -0.000285073f, -0.538441f, -0.0254868f,
+ -0.0104454f, -0.0661998f, -0.196469f, -0.346372f, -5.52892f,
+ -0.643683f, -0.622224f, -0.31463f, -0.555956f, -0.520132f,
+ -0.843166f, -2.59479f, -0.750195f, 0.00635995f, -0.338615f,
+ -0.216676f, -0.391544f, -1.62185f, -0.718471f, -0.475406f,
+ -0.782041f, -0.608824f, -1.09633f, -1.27308f, -0.560719f,
+ -0.207539f, -0.0196445f, -1.05519f, -0.575249f, -1.0642f,
+ 1.01615f, -0.873633f, -0.417953f, -0.428051f, 0.350259f,
+ -2.53833f, -2.72203f, 0.672846f, -0.503094f, -1.1374f,
+ 0.214291f, 0.013305f, 0.0112064f, 1.10532f, 0.030455f,
+ 0.0239614f, 0.628072f, 0.0539135f, -0.472441f, -0.688439f,
+ -0.32044f, -0.0234867f, -0.0158436f, -0.949314f, -0.0453161f,
+ -1.18306f, 0.626845f, -0.426925f, -0.688371f, 0.415062f,
+ 0.0640985f, -0.638387f, -2.01399f, -0.209744f, -0.762892f,
+ -0.0753296f, -0.879315f, -0.520433f, -0.111375f, 0.389742f,
+ -0.398862f, -0.643227f, -0.246396f, 0.0317051f, 1.06973f,
+ 0.413617f, 0.180506f, -0.0507897f, -0.00650435f, 0.620892f,
+ 0.046312f, 0.475032f, 0.906993f, -0.0388061f, -0.256271f,
+ -1.03323f, 0.0125266f, -0.31116f, -0.377611f, -0.0386407f,
+ -0.0232745f, -0.353644f, -2.27289f, 0.0571779f, -0.00865006f,
+ 1.65101f, 0.0175711f, 0.0184585f, 0.558458f, 0.2213f,
+ -0.285089f, 0.433445f, -0.427177f, -0.0103682f, -0.0101273f,
+ 0.214085f, -0.0459885f, 0.00761981f, 0.836381f, 0.0175293f,
+ 0.02508f, -1.51778f, 0.0143956f, -0.162589f, 0.595418f,
+ 0.21445f, -0.0335848f, -0.0136684f, -0.16686f, -0.14612f,
+ 0.0816238f, 0.499636f, 0.12458f, -2.41673f, -0.261721f,
+ -0.676805f, -1.88366f, 0.730462f, 0.69196f, -0.0288489f,
+ -2.38272f, 0.329876f, 0.014517f, -0.115145f, -3.48151f,
+ -0.00209072f, -0.0732377f, 0.820443f, -0.0118701f, 0.112145f,
+ 0.272315f, 0.137531f, -0.0200997f, -0.0397883f, -2.19458f,
+ 0.183554f, -0.639716f, 0.481605f, -0.621639f, -0.0980299f,
+ -0.710534f, -0.143105f, -6.77626f, -1.65139f, -2.37718f,
+ -0.533127f, -1.12574f, 3.34182f, -0.0758663f, 0.0334238f,
+ -9.48647f, 0.0674974f, 0.0507665f, 0.523007f, -0.0668f,
+ 0.5736f, -0.589761f, -1.1692f, -0.0236497f, -0.00828928f,
+ -0.265823f, 1.15284f, 0.307927f, -0.695308f, 0.13725f,
+ -0.20394f, -0.363965f, -0.331159f, -1.50927f, -1.20051f,
+ -0.0205825f, -0.0381859f, -0.0579876f, -1.6913f, -1.94626f,
+ 3.4214f, 3.3922f, -2.13798f, -0.679848f, -0.890735f,
+ 0.235017f, -0.253202f, -1.0571f, 1.40354f, 0.00719052f,
+ -1.54365f, -0.7289f, -1.05492f, 0.0238169f, -0.00543592f,
+ -0.0510353f, -0.175386f, -0.724207f, -0.788936f, 0.039976f,
+ 1.36966f, 0.869475f, -0.0302774f, -0.0537556f
+};
+
+static const NN_CONFIG av1_max_part_pred_nn_config = {
+ NUM_FEATURES,
+ NUM_LOGITS,
+ NUM_HIDDEN_LAYERS,
+ {
+ NUM_LAYER_0_UNITS,
+ },
+ {
+ av1_max_part_pred_layer_0_kernel,
+ av1_max_part_pred_logits_kernel,
+ },
+ {
+ av1_max_part_pred_layer_0_bias,
+ av1_max_part_pred_logits_bias,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS
+#undef NUM_FEATURES
+#undef NUM_LAYER_0_UNITS
+#undef NUM_LOGITS
+
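+/* Editor's sketch (hedged, not upstream code): the config above maps 13
+ * superblock features to 4 logits, one per candidate maximum square partition
+ * level. A minimal example assuming av1_nn_predict() and av1_nn_softmax()
+ * from "av1/encoder/ml.h"; the feature semantics and the argmax policy are
+ * illustrative assumptions:
+ *
+ *   float features[13];  // superblock statistics, filled by the caller
+ *   float logits[4], probs[4];
+ *   av1_nn_predict(features, &av1_max_part_pred_nn_config, 1, logits);
+ *   av1_nn_softmax(logits, probs, 4);
+ *   int best = 0;  // index of the most probable max partition level
+ *   for (int i = 1; i < 4; ++i)
+ *     if (probs[i] > probs[best]) best = i;
+ */
+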
+// Early termination in the second pass.
+static const float av1_simple_motion_search_term_none_mean_128[28] = {
+ 12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f,
+ 11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f,
+ 11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f,
+ 5.940535f, 0.770746f, 4.292692f, 4.309581f, 0.848423f, 4.292334f,
+ 4.298179f, 8.514713f, 14.911736f, 19.825352f,
+};
+
+static const float av1_simple_motion_search_term_none_std_128[28] = {
+ 1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f,
+ 1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f,
+ 1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f,
+ 1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_64[28] = {
+ 10.904455f, 10.853546f, 9.247903f, 9.184479f, 9.251985f, 9.186686f,
+ 9.253490f, 9.190190f, 9.270079f, 9.204357f, 10.086511f, 10.031060f,
+ 10.100875f, 10.045429f, 10.069688f, 10.013173f, 10.082980f, 10.024640f,
+ 4.888378f, 0.878113f, 3.598450f, 3.628491f, 0.925833f, 3.560971f,
+ 3.573322f, 8.807137f, 13.348477f, 18.269117f,
+};
+
+static const float av1_simple_motion_search_term_none_std_64[28] = {
+ 1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f,
+ 1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f,
+ 1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f,
+ 0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_32[28] = {
+ 9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f, 8.012570f,
+ 7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f, 8.859187f,
+ 8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f, 2.645082f,
+ 2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f,
+};
+
+static const float av1_simple_motion_search_term_none_std_32[28] = {
+ 1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f,
+ 1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f,
+ 1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f,
+ 0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_16[28] = {
+ 8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f, 7.088727f,
+ 6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f, 7.967233f,
+ 8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f, 1.901347f,
+ 1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f,
+};
+
+static const float av1_simple_motion_search_term_none_std_16[28] = {
+ 1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f,
+ 1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f,
+ 1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f,
+ 0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f,
+};
+
+static const float av1_simple_motion_search_term_none_model_128[] = {
+ -0.6106842357f, -1.0402954455f, 0.6054417656f, -0.2116623578f,
+ 0.2447714930f, 0.3782256209f, 0.5095592479f, -0.3275620904f,
+ 0.3886188013f, 0.2629499420f, -0.1979599415f, -0.5389565605f,
+ 0.1209207902f, -0.4913347466f, 0.3798542731f, -0.2812861709f,
+ -0.1049824167f, -0.1088672020f, 0.4059596517f, -0.1347896613f,
+ 0.2276868621f, 0.0506386970f, 0.0071088411f, 0.0467952100f,
+ 0.2091247458f, -0.7371964736f, 0.1368935545f, 0.3175247786f,
+ -0.5493146094f,
+};
+
+static const float av1_simple_motion_search_term_none_model_64[] = {
+ -0.4150046575f, -0.3954358561f, 0.1997997444f, 0.3395826831f,
+ 0.2827215753f, 0.3395683652f, 0.2483140395f, 0.2722216476f,
+ 0.2610308009f, 0.3724974359f, -0.0551479654f, -0.1721616359f,
+ -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f,
+ -0.3169539902f, -0.0269429900f, 0.9891530919f, -0.0125084982f,
+ 0.0972182377f, 0.0008889801f, 0.0205418050f, 0.0057237854f,
+ 0.1005222691f, -0.2851321920f, -1.5150336445f, 0.1893942436f,
+ -0.4337360901f,
+};
+
+static const float av1_simple_motion_search_term_none_model_32[] = {
+ -0.4667392852f, -0.3893302767f, 0.1603498635f, 0.2304974726f,
+ 0.1404975592f, 0.2505516225f, 0.1423053884f, 0.2189318406f,
+ 0.1379765409f, 0.2638241296f, -0.1342865463f, -0.0549054345f,
+ -0.1925223436f, -0.1142702769f, 0.0127811659f, 0.0868639997f,
+ -0.0643197251f, 0.0279496470f, 0.9904395769f, -0.0095178685f,
+ 0.1179410649f, -0.0013411972f, 0.0095060660f, 0.0195730400f,
+ 0.0779717771f, -0.2498860763f, -0.8168817125f, -0.4798397348f,
+ -0.6609679881f,
+};
+
+static const float av1_simple_motion_search_term_none_model_16[] = {
+ -0.3021081992f, -0.4620153673f, 0.0448577479f, 0.1738455035f,
+ 0.0663209177f, 0.1629614573f, 0.0555168744f, 0.1631870212f,
+ 0.0425805150f, 0.1688564954f, 0.0434083772f, -0.0046603915f,
+ -0.0271580056f, -0.0183879127f, 0.1073730471f, 0.0314201476f,
+ 0.0576891756f, 0.0119723753f, 0.9084332022f, -0.0188429077f,
+ 0.0755089811f, -0.0172550234f, 0.0037663075f, 0.0022094472f,
+ 0.0500247894f, -0.2944572004f, -0.8908521199f, -0.2555515792f,
+ -0.5396254205f,
+};
+
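+/* Editor's sketch (hedged): unlike the NN configs above, the *_model_* arrays
+ * are plain linear models with 29 entries: 28 weights for the 28 features,
+ * plus what is assumed here to be a trailing bias term. The matching mean/std
+ * tables suggest the features are z-score normalized first. Illustrative use
+ * for the 64x64 case, with the >= 0 decision rule an assumption:
+ *
+ *   const float *mean = av1_simple_motion_search_term_none_mean_64;
+ *   const float *std = av1_simple_motion_search_term_none_std_64;
+ *   const float *model = av1_simple_motion_search_term_none_model_64;
+ *   float score = model[28];  // assumed bias term
+ *   for (int i = 0; i < 28; ++i)
+ *     score += model[i] * (features[i] - mean[i]) / std[i];
+ *   const int early_terminate = (score >= 0.0f);  // assumed decision rule
+ */
+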
+#define FEATURES 31
+#define HIDDEN_NODES 32
+static const float av1_early_term_after_split_nn_weights_64_layer0[] = {
+ -0.306296f, -0.691664f, 0.335148f, -0.298465f, -0.509241f, -0.632796f,
+ -0.527979f, -0.009904f, -0.503646f, -0.494002f, -0.575101f, 0.239911f,
+ -0.413312f, -0.622825f, -0.405448f, -0.419103f, -0.505903f, -0.392550f,
+ -0.240293f, 0.121749f, -0.489777f, -0.756647f, 0.001047f, -0.016528f,
+ 0.145714f, 0.172910f, 0.086197f, 0.162882f, -0.070588f, -0.077104f,
+ 0.502730f, -0.244954f, 0.265605f, -0.323994f, 0.223397f, -1.086453f,
+ 0.391886f, 0.200343f, 0.253878f, 0.018925f, 0.201819f, -0.205136f,
+ 0.427314f, 0.041155f, 0.070484f, 0.159925f, -0.057095f, -0.146544f,
+ -0.073792f, 0.152628f, 0.003986f, -0.515965f, -0.209754f, 0.037457f,
+ 0.070622f, -0.143571f, -0.059602f, 0.111734f, 0.319674f, 0.149894f,
+ -0.219883f, 0.206678f, 0.015809f, -0.210549f, 0.130156f, -0.189502f,
+ -0.850392f, -0.156363f, -0.060354f, 0.189044f, 0.266495f, 0.151305f,
+ -0.563677f, -0.354896f, 0.300637f, 0.257568f, -0.008359f, -0.535497f,
+ -0.003127f, 0.293054f, -0.020212f, -0.157278f, 0.229972f, -0.309799f,
+ -0.329927f, -0.077140f, 0.001177f, -0.024415f, 0.134044f, -0.181587f,
+ -0.135380f, 0.230989f, -0.281451f, 0.912282f, 0.511562f, -3.900779f,
+ -0.039917f, 1.956406f, -0.357589f, 0.292998f, -0.950158f, 0.422041f,
+ 0.526572f, 0.605746f, -0.147110f, 0.256576f, 0.090010f, 0.221641f,
+ 0.029763f, 0.351592f, 0.458324f, -0.005888f, 0.010521f, -0.389326f,
+ -0.094006f, -0.171489f, -0.013153f, 0.026333f, -0.454571f, -1.932891f,
+ -0.168211f, 0.051298f, -0.258061f, -0.028936f, -0.555937f, -0.475566f,
+ -0.304046f, -0.318113f, 0.099697f, -0.217145f, 0.139433f, -0.203986f,
+ -0.164012f, 0.051527f, 0.138603f, -0.085100f, -0.082887f, -0.242955f,
+ -0.663410f, -0.535772f, -0.181665f, -0.197883f, 0.071319f, 0.135086f,
+ 0.146200f, 0.184827f, -0.199041f, 0.162570f, -0.300167f, 0.017748f,
+ -0.140111f, 0.103553f, 0.206929f, 0.193446f, 0.123141f, -1.201898f,
+ -0.052254f, -0.750121f, 0.111741f, 0.204092f, -0.166266f, 0.124008f,
+ -0.455496f, 0.306035f, 0.275903f, 0.193599f, -0.730011f, 0.126808f,
+ 0.051059f, 0.103634f, -0.044334f, 0.048889f, 0.405228f, 0.574099f,
+ 0.061167f, 0.260576f, 0.070032f, -0.038040f, 0.229183f, -0.243269f,
+ -0.130116f, -0.538563f, -0.070199f, -0.129249f, -0.205153f, -0.268530f,
+ -0.290828f, -0.233006f, 0.068712f, 0.618085f, -0.407008f, 0.686868f,
+ 0.172247f, 0.826287f, -0.002672f, 0.239825f, -0.051548f, 0.420773f,
+ 0.218747f, 0.041057f, -0.071189f, 0.286987f, -0.113915f, 0.122561f,
+ 0.013979f, -0.049046f, 0.148175f, 0.031313f, -0.248601f, 0.209488f,
+ 0.069008f, 0.072763f, 0.332475f, 0.079986f, -0.151042f, -0.205110f,
+ -0.155550f, -0.510408f, 0.330429f, 0.577729f, 0.266524f, -0.378489f,
+ 0.228204f, 0.055318f, 0.117583f, -0.588557f, -0.778201f, 0.434622f,
+ -0.227820f, 0.611642f, 0.170548f, 0.817761f, 0.006642f, -1.005794f,
+ -0.911490f, 1.633684f, -0.290664f, 0.308128f, 0.295986f, 0.243377f,
+ -0.001275f, -0.131156f, 0.275205f, -0.041865f, -0.201951f, -0.016380f,
+ 0.336604f, -0.258118f, 0.890810f, 0.441065f, -0.968006f, 0.135989f,
+ -1.447191f, 0.353426f, -0.343235f, 0.376837f, -0.071602f, -0.319639f,
+ -0.072347f, 0.547450f, -0.215380f, 0.182141f, -0.066186f, 0.033787f,
+ 0.257482f, 0.217428f, -0.130249f, 0.057525f, 0.263991f, 0.230664f,
+ -0.245113f, 0.048610f, -0.079955f, 0.251737f, -0.070368f, -0.017968f,
+ -0.151815f, 0.025945f, -0.257769f, 0.299735f, 0.077263f, -0.565526f,
+ 0.326263f, 0.096429f, 0.113414f, 0.092754f, -0.141908f, 0.172060f,
+ 0.393117f, -0.216755f, 0.331051f, -0.363369f, -0.113363f, -0.095164f,
+ -0.072784f, 0.214572f, 0.010993f, 0.209456f, 0.260381f, -0.314747f,
+ -0.422173f, -0.189963f, -0.225130f, 0.339448f, 0.153814f, 0.265616f,
+ -0.103575f, -0.123841f, -0.106236f, 0.155894f, -0.156264f, -1.361406f,
+ -0.040736f, -0.614998f, -0.468200f, -0.266505f, -0.342786f, -0.908088f,
+ 0.105758f, 0.040788f, -0.313589f, -1.359318f, 0.071329f, 0.176404f,
+ -0.476141f, 0.010108f, -0.201440f, -0.221167f, -0.197448f, -0.013927f,
+ -0.610270f, -0.607285f, 0.178070f, 0.174320f, 0.313115f, 0.026191f,
+ -0.112330f, 0.122338f, -0.367751f, 0.196794f, 0.153709f, -0.205454f,
+ -0.397471f, -1.879336f, -0.030129f, 0.143429f, -0.079832f, 0.435259f,
+ -1.729539f, 0.518301f, -0.141393f, 0.199399f, -1.914601f, 0.142865f,
+ -0.219899f, 0.508458f, 0.086365f, -0.220740f, -0.012507f, 1.263320f,
+ 0.042136f, 0.050922f, -0.329644f, -0.188198f, 0.251522f, 0.394731f,
+ -0.047866f, -0.260853f, -0.267207f, -0.248489f, 0.146474f, 0.359257f,
+ -0.427732f, -0.100652f, 0.192129f, 0.075572f, 0.916708f, 0.255747f,
+ 0.486384f, 0.127989f, -0.556449f, -0.484913f, 0.392298f, 0.045401f,
+ -0.839551f, -0.703619f, 0.069263f, -0.040720f, 0.542265f, 0.443739f,
+ 0.862552f, -0.021726f, 0.230858f, -0.261004f, -0.125697f, -0.106435f,
+ 0.002341f, 0.013904f, 0.011034f, 0.542296f, -0.284325f, 0.135736f,
+ 0.113882f, 0.040610f, -0.255485f, 0.224061f, -0.087140f, 0.127872f,
+ -0.002638f, 0.164889f, -0.335958f, -0.031166f, -0.393581f, 0.075455f,
+ 0.055995f, 0.087934f, -0.133859f, -0.342187f, 0.002492f, -0.340722f,
+ 0.058304f, 0.104165f, -0.142136f, -0.351111f, -0.158037f, -0.079924f,
+ -0.253209f, -0.092840f, -0.174646f, -0.202772f, -0.353438f, -0.031111f,
+ 0.076088f, -0.232091f, -0.070052f, 0.097595f, 0.063173f, -0.211195f,
+ 0.126478f, -0.178828f, 0.278723f, -0.070807f, -0.179783f, 0.034123f,
+ 0.035721f, -0.200431f, 0.170640f, 0.107933f, 0.226594f, -0.301499f,
+ -0.291096f, 0.228076f, -0.272951f, 0.002490f, -0.210707f, -0.128033f,
+ -0.194009f, -0.011347f, -0.256694f, -0.011841f, -0.005167f, -0.163203f,
+ -0.253796f, -0.198877f, -0.055827f, -0.882685f, -0.443471f, 0.349601f,
+ 0.749334f, -1.161845f, 0.505480f, 0.221733f, 0.210490f, -0.234984f,
+ 0.014183f, -0.510401f, 0.238692f, -0.134111f, 0.083844f, -0.478751f,
+ -0.088434f, 0.304063f, 0.150336f, -0.749682f, -0.081999f, 0.729739f,
+ 0.412508f, 0.132571f, 0.058306f, -0.047451f, -0.117435f, -0.445395f,
+ -0.005182f, -0.025757f, 0.175051f, -0.258194f, -0.150311f, -0.196533f,
+ -1.314316f, -0.428627f, 0.512451f, 0.045138f, -0.200925f, 0.081538f,
+ -0.346151f, -0.358197f, -0.422258f, -0.028542f, -0.383534f, -0.026163f,
+ -0.419858f, -0.154321f, 0.376970f, 0.094017f, 0.783520f, 0.110641f,
+ 0.077966f, -0.093064f, 0.160522f, -0.863041f, 0.086210f, 0.560764f,
+ 0.057032f, 0.159224f, 0.323068f, -0.173109f, 0.014042f, -0.126856f,
+ -0.128237f, -0.245273f, -0.317312f, -0.257597f, -0.181977f, 0.259485f,
+ -0.215834f, 0.062076f, -0.270596f, 0.271581f, -0.153486f, -0.247165f,
+ 0.079737f, -0.157049f, -0.027459f, -0.299397f, 0.136729f, -0.334192f,
+ -0.191722f, 0.145865f, -0.031324f, -0.307165f, -0.244923f, -0.228027f,
+ 0.063807f, 0.054965f, -0.005709f, -0.041977f, -0.276245f, 0.020003f,
+ 0.133323f, -0.145992f, -0.951030f, 0.414083f, -1.063323f, 0.137872f,
+ 0.104732f, -0.123728f, 0.542532f, 0.213654f, 0.542954f, 0.155619f,
+ 0.543072f, 0.399067f, 0.191402f, -0.102552f, -0.176734f, -0.136776f,
+ -0.012814f, -0.021298f, -0.802467f, -0.957481f, -0.238787f, -0.138482f,
+ 0.058331f, 0.126601f, 0.104420f, -0.148684f, 0.343218f, 0.093604f,
+ -0.055642f, -0.383918f, -0.045250f, -0.090480f, -0.155464f, 0.278299f,
+ 0.042791f, -0.029084f, -0.373861f, -0.073233f, -0.085172f, 0.186841f,
+ -0.070898f, -0.156415f, 0.112831f, -0.065931f, -0.353007f, 0.058453f,
+ -0.136982f, 0.233393f, 0.017240f, -0.018428f, 0.229104f, -0.371440f,
+ -0.262212f, 0.203075f, -0.263293f, 0.034413f, -0.299354f, 0.227269f,
+ 0.204977f, -0.118107f, -0.359832f, -0.068252f, 0.480105f, -0.214711f,
+ -0.614381f, 0.209048f, -0.456014f, -0.188819f, -0.220995f, -0.322104f,
+ -0.191457f, 0.420874f, -0.454919f, 0.023119f, 0.291700f, -0.532885f,
+ -0.032642f, 0.043271f, 0.133974f, 0.002399f, -0.179899f, -0.044158f,
+ -0.027078f, -0.350075f, 0.236766f, 0.346771f, -0.118534f, -0.421221f,
+ 0.019544f, 0.109349f, 0.141517f, 0.403561f, 0.409102f, 0.054555f,
+ -0.561751f, 0.577183f, -0.705156f, -0.231188f, -1.969772f, 0.172289f,
+ -0.048122f, 0.205671f, -0.667130f, -0.066870f, 0.202838f, -0.095538f,
+ -0.842651f, 0.254170f, 0.046256f, -0.271891f, -0.369254f, 0.492101f,
+ 0.001189f, -0.186525f, 0.188470f, -0.207072f, 0.030086f, -0.132904f,
+ 0.127001f, 0.116662f, -0.079246f, 0.227241f, -0.462178f, 0.446304f,
+ -1.660753f, 0.241832f, -0.288040f, 0.054663f, -0.435804f, 0.296782f,
+ -0.026421f, -0.115618f, 0.163416f, 0.834001f, 0.008019f, -0.014243f,
+ 0.524658f, 0.067894f, -0.253936f, -0.100657f, 1.285389f, -0.005952f,
+ 0.087134f, -0.088375f, -0.121866f, -0.171172f, 0.279463f, -0.598593f,
+ -0.727761f, 0.189831f, -0.822575f, -0.291141f, -0.012410f, -0.069999f,
+ 0.098842f, -0.218513f, 0.009494f, 0.100106f, -0.402884f, -0.299236f,
+ -0.345668f, -0.057739f, -0.213248f, -0.426661f, -0.360268f, -0.349860f,
+ -0.382177f, -0.357802f, -0.032030f, -0.110597f, -0.155442f, -0.418794f,
+ -0.012113f, -0.032962f, -0.450648f, 0.129060f, -0.135227f, -0.298593f,
+ 0.001435f, 0.278790f, -0.272945f, 0.162759f, -0.290208f, 0.058481f,
+ -0.490971f, 0.019630f, -0.210347f, 0.000520f, -0.340413f, 0.641562f,
+ 0.023104f, 0.194832f, -0.441894f, -0.253538f, -0.228332f, 0.423264f,
+ -1.094073f, -0.475657f, -0.238752f, 0.033910f, 0.440425f, 0.036320f,
+ 0.566989f, -0.065326f, -0.297939f, 0.406098f, 0.529561f, -0.113084f,
+ 0.141472f, -0.024462f, -0.179212f, 0.187801f, -0.235787f, -0.229624f,
+ 0.357791f, 0.061110f, -0.607788f, -1.713694f, -0.651041f, 1.734283f,
+ -0.334701f, 0.161687f, 0.010215f, 0.320708f, 0.169447f, 0.513558f,
+ 0.488340f, -0.619036f, -0.525441f, -1.144352f, -0.546154f, 0.669973f,
+ 0.327028f, -0.100539f, 0.012048f, -0.223013f, -0.239680f, 0.323035f,
+ 0.165950f, -0.155110f, 0.128664f, -0.157378f, -0.124490f, 0.291553f,
+ 0.055849f, -0.221664f, 0.077770f, -0.350658f, -0.181939f, 0.110230f,
+ -0.078219f, 0.007472f, -0.031620f, 0.007708f, -0.201794f, 0.017594f,
+ -0.027480f, 0.058884f, -0.369166f, -0.369770f, 0.181635f, -0.183318f,
+ -0.389184f, -0.256661f, 0.160107f, 0.037127f, -0.082573f, -0.095815f,
+ -0.322782f, 0.072528f, -0.348875f, 0.216247f, -0.161757f, -0.385502f,
+ -0.315738f, 0.020123f, -0.155609f, 0.114403f, -0.383232f, 0.629529f,
+ 0.066142f, 0.448392f, -0.389557f, -0.083315f, 0.829535f, -0.015531f,
+ -0.050728f, -0.325127f, 0.812992f, -0.196780f, 0.021060f, -0.952647f,
+ 0.006687f, -0.512715f, -0.066778f, 0.410067f, -0.116945f, -0.288283f,
+ 0.189334f, -0.083153f, 0.159980f, -0.068208f, 0.107358f, -0.154411f,
+ -0.068914f, 0.186816f, 0.032251f, 0.109242f, 0.134825f, 0.035101f,
+ -0.253175f, 0.157309f, -0.363597f, -0.138176f, -0.334141f, -0.172697f,
+ 0.045800f, -0.286057f, 0.173403f, -0.172444f, -0.117996f, -0.383848f,
+ -0.173303f, -0.258482f, -0.021404f, -0.017898f, -0.001970f, 0.003273f,
+ 0.056121f, 0.155046f, 0.044708f, -0.295609f, -0.211688f, -0.233229f,
+ -0.264980f, 0.145549f, 0.045323f, -0.027112f, 0.175638f, -0.207251f,
+ -0.055274f, 0.092706f, 0.086200f, -0.241340f, -0.147416f, 0.024510f,
+ -0.357194f, -0.181944f, -0.050104f, -0.079024f, -0.290473f, -0.169790f,
+ -0.277982f, -0.017781f, -0.004854f, -0.094132f, -0.348555f, 0.199291f,
+ -0.343989f, -0.319299f, -0.268935f, -0.021208f, 0.020938f, -0.090609f,
+ 0.006595f, -0.200790f, 0.171856f, -0.027766f, -0.032017f, -0.006745f,
+ 0.566426f, -0.096850f, 0.727633f, -0.408065f, -0.012436f, 0.005646f,
+ -0.305148f, -0.095075f, -0.391549f, -0.020378f, -0.236498f, -0.252773f,
+ -0.231385f, -0.203175f, 0.041903f, -0.373694f, 0.058239f, -0.101116f,
+ 0.183772f, 0.164523f, -0.099046f, -0.201272f, -0.394523f, -0.157517f,
+ 0.032079f, -0.381173f, -0.238496f, -0.037990f, -0.294553f, 0.141473f,
+ 0.100268f, -0.023806f, 0.004978f, 0.184916f, 0.142699f, -0.113240f,
+ -0.213364f, -0.160059f, -0.216263f, -0.406387f, -0.301140f, -0.406355f,
+ -0.113085f, -0.279699f, -0.267434f, 0.126263f, -0.260527f, -0.153904f,
+ -0.494653f, -0.355144f, 0.030549f, -0.216400f, -0.123363f, 0.189090f,
+ 0.219122f, 0.096677f, -0.202037f, -0.014489f, -0.137859f, -0.114184f,
+ -0.279423f, -0.270683f,
+};
+
+static const float av1_early_term_after_split_nn_bias_64_layer0[] = {
+ -0.491455f, 0.464538f, -0.005742f, -0.219951f, -0.073682f, 0.102027f,
+ 0.567071f, 0.441402f, 0.277521f, 0.314498f, -0.448199f, -0.065032f,
+ 0.488139f, -0.079632f, 0.000000f, 0.521555f, -0.151950f, -0.034616f,
+ 0.393438f, -0.072242f, -0.087343f, -0.571308f, 0.017372f, -0.126144f,
+ 0.372261f, -0.451537f, -0.140238f, -0.092377f, -0.074475f, -0.068879f,
+ -0.109614f, -0.164492f,
+};
+
+static const float av1_early_term_after_split_nn_weights_64_layer1[] = {
+ -0.373195f, -0.283141f, 0.416113f, 0.483659f, 0.230583f, 0.349197f,
+ -0.168582f, -0.813338f, -0.472369f, -0.173872f, 1.297845f, 0.339355f,
+ -0.828033f, 0.019617f, 0.118757f, -0.619360f, 0.282295f, -0.054116f,
+ -0.730596f, 0.068567f, -0.248707f, 0.461225f, 0.330224f, -0.287080f,
+ -0.458103f, 0.591852f, -0.008491f, 0.632119f, -0.007872f, 0.007869f,
+ -0.230698f, -0.011437f,
+};
+
+static const float av1_early_term_after_split_nn_bias_64_layer1[] = {
+ -0.55403697f,
+};
+
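+// NN_CONFIG field order, as used by the initializers below: number of input
+// features, number of outputs, number of hidden layers, per-layer hidden
+// node counts, per-layer weight arrays, and per-layer bias arrays.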
+static const NN_CONFIG av1_early_term_after_split_nnconfig_64 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ HIDDEN_NODES,
+ },
+ {
+ av1_early_term_after_split_nn_weights_64_layer0,
+ av1_early_term_after_split_nn_weights_64_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_64_layer0,
+ av1_early_term_after_split_nn_bias_64_layer1,
+ },
+};
+
+static const float av1_early_term_after_split_nn_weights_32_layer0[] = {
+ 0.026050f, -0.226531f, 0.308107f, -0.083744f, 0.201785f, 0.098562f,
+ 0.147595f, -0.495771f, -0.245741f, 0.201616f, -0.272070f, -0.579545f,
+ -0.127261f, -0.229588f, 0.250831f, -0.176929f, -0.031689f, 0.284718f,
+ 0.085845f, -0.285027f, 0.012304f, 0.382402f, -0.204591f, 0.272514f,
+ -0.065854f, -0.054228f, -0.231174f, -0.174504f, 0.258287f, 0.195689f,
+ 0.242530f, 0.023528f, -0.294242f, -0.272132f, 0.460180f, -0.731281f,
+ -0.208103f, 0.208204f, 0.348250f, 0.016328f, 0.043707f, -0.169551f,
+ 0.108521f, 0.226895f, -0.020471f, 0.102443f, 0.429640f, -0.252555f,
+ -0.218434f, -0.163665f, 0.175531f, 0.101588f, -0.135798f, -0.158102f,
+ 0.142565f, 0.128277f, 0.174985f, -0.100073f, 0.113967f, 0.223682f,
+ -0.145576f, -0.008443f, 0.112748f, -0.037845f, 0.076954f, -0.287137f,
+ -0.518185f, -0.106833f, 0.175359f, 0.031408f, 0.219069f, -0.294440f,
+ 0.007766f, 0.067754f, -0.049168f, -0.212368f, -0.261708f, 0.309252f,
+ 0.220859f, -0.274852f, -0.653157f, 0.083438f, -0.265386f, 0.174429f,
+ -0.116931f, -0.091594f, -0.244897f, -0.089015f, 0.274453f, 0.212890f,
+ 0.272053f, -0.425315f, -0.107726f, 0.294444f, -0.354629f, 0.104402f,
+ -0.307663f, 0.558430f, 0.140334f, -0.054831f, -0.449456f, 0.058274f,
+ -0.033768f, -0.354117f, -0.331618f, -0.411772f, 0.232064f, -0.079297f,
+ -0.638571f, 0.181823f, -0.039611f, 0.206310f, -0.659157f, -0.102930f,
+ -0.067303f, -0.176881f, -0.001038f, 0.091835f, 0.079739f, -0.121923f,
+ 0.211070f, 0.362719f, -0.154915f, -0.151876f, -0.165460f, 0.023469f,
+ -0.251036f, 0.210014f, -0.537125f, 0.156832f, -0.216987f, 0.062975f,
+ -0.198462f, 0.329123f, 0.125870f, 0.225830f, 0.086377f, -0.128773f,
+ -0.179673f, -0.074612f, 0.456645f, 0.021905f, -0.243140f, 0.059145f,
+ -0.273942f, -0.277822f, 0.154556f, -0.025459f, 0.227614f, -0.313076f,
+ 0.044705f, -0.019017f, 0.108999f, -0.020243f, -0.016373f, 0.560270f,
+ -0.064818f, 0.050880f, -0.218458f, 0.825699f, -0.534056f, -0.258253f,
+ 0.222073f, 0.013295f, 0.477870f, -0.386727f, 0.388509f, 0.004128f,
+ 0.451388f, -0.175788f, 0.264093f, -0.109812f, 0.358132f, 0.500992f,
+ -0.446933f, -0.222397f, 0.345834f, 0.370943f, -0.233115f, -0.047005f,
+ -0.111335f, -0.111586f, 0.026975f, -0.052191f, -0.111800f, -0.129782f,
+ 0.225132f, 0.102524f, 0.544557f, -0.111674f, -0.857884f, 0.133258f,
+ 0.310001f, 0.043829f, 0.104143f, 0.256493f, 0.242520f, -0.342082f,
+ 0.421447f, 0.124227f, 0.061542f, -0.090206f, 0.316681f, 0.353452f,
+ -0.918408f, -0.001903f, -0.052303f, -0.004816f, -0.446393f, -0.053038f,
+ 0.255725f, -0.126346f, 0.034095f, -0.240276f, -0.135918f, 0.095682f,
+ -0.147457f, -0.338216f, -0.200426f, 0.010265f, -0.243915f, -0.231375f,
+ -0.323924f, -0.014353f, 0.150252f, -0.264346f, 0.205303f, -0.194610f,
+ -0.282527f, 0.180555f, -0.000087f, 0.027240f, -0.000903f, -0.345877f,
+ -0.353274f, -0.311829f, 0.172985f, -0.111748f, -0.309380f, 0.108110f,
+ -0.260914f, -0.164990f, 0.183625f, -0.319692f, -0.096988f, 0.094147f,
+ -0.047062f, -0.080978f, 0.227387f, -0.000450f, -0.220159f, -0.211448f,
+ -0.020885f, -0.139646f, -0.086721f, 0.067928f, -0.033084f, -0.251996f,
+ 0.090317f, 0.086313f, -0.228420f, -0.111356f, -0.314304f, -0.223664f,
+ 0.188176f, -0.002360f, -0.029491f, -0.006000f, -0.075343f, 0.173699f,
+ -0.272800f, -0.238507f, -0.272071f, -0.015000f, -0.215305f, -0.192943f,
+ -0.038595f, 0.119537f, 0.260477f, -0.168014f, -0.172751f, 0.532861f,
+ -0.753250f, -0.017485f, -0.115541f, -0.109291f, -1.098943f, 0.418559f,
+ -0.532110f, 0.359323f, -0.254786f, 0.471316f, -0.545024f, 0.291912f,
+ -0.836939f, 0.443427f, -0.441709f, 0.168866f, -0.140372f, 0.546607f,
+ -0.315465f, 0.023328f, 0.137709f, -0.083492f, -0.049986f, -0.071302f,
+ -0.293680f, -0.105049f, 0.315317f, 0.279569f, 0.220762f, 0.088161f,
+ -0.756456f, -0.074512f, 0.958318f, -0.332924f, -0.004906f, -0.629271f,
+ 0.212050f, 0.279123f, 0.311523f, -0.599580f, 0.516150f, 0.456952f,
+ 0.020255f, 0.247290f, -0.182670f, -0.335554f, 0.021203f, 0.131081f,
+ -0.208584f, 0.112530f, -0.198980f, 0.211583f, -0.101271f, -0.206453f,
+ -0.502688f, -0.294976f, -0.187019f, -0.114473f, 0.282050f, -0.165483f,
+ 0.094953f, -0.182578f, 0.055068f, 0.135605f, -0.266941f, -0.297556f,
+ 0.199181f, 0.015979f, -0.158659f, -0.226841f, 0.171306f, 0.013438f,
+ -0.286309f, -0.071753f, -0.170300f, -0.238188f, 0.093572f, -0.026230f,
+ -0.254502f, -0.297786f, -0.063480f, -0.300799f, -0.065644f, 0.074710f,
+ 0.248576f, -0.144425f, -0.113948f, -0.247297f, 0.276682f, 0.010963f,
+ -0.737786f, 0.026347f, 0.007830f, 0.753543f, 0.371904f, 0.305614f,
+ 0.105028f, 0.073530f, -0.119137f, 0.102352f, -0.080523f, 0.176366f,
+ -0.159457f, -0.339948f, 0.360131f, -0.007051f, -0.388378f, -0.101695f,
+ 0.663041f, -0.234486f, -0.142536f, -0.099931f, 0.041478f, 0.230425f,
+ 0.005743f, 0.154060f, 0.056233f, -0.080668f, -0.009754f, -0.194356f,
+ 0.185474f, -0.296474f, 0.192700f, 0.257767f, 0.348529f, 0.458265f,
+ 0.060276f, -0.130473f, 0.139889f, 0.310073f, -0.306869f, -0.272922f,
+ -0.259862f, 0.409207f, 0.431991f, -0.100357f, -0.050415f, -0.071830f,
+ -0.239665f, 0.153399f, 0.177192f, -0.611644f, -0.176114f, -0.022694f,
+ -0.033701f, -0.345842f, 0.015660f, 0.158931f, -0.097586f, 0.222001f,
+ 0.257887f, -0.171307f, -0.222607f, -0.245508f, -0.145742f, -0.096461f,
+ -0.010895f, 0.052815f, -0.265306f, -0.081059f, 0.219162f, -0.256084f,
+ -0.372676f, 0.148977f, 0.174831f, 0.086980f, 0.108518f, 0.074011f,
+ 0.038032f, -0.070856f, -0.109407f, 0.126174f, 0.022341f, -0.249786f,
+ -0.356164f, -0.202841f, -0.087437f, -0.133740f, 0.090956f, -0.017953f,
+ -0.028353f, 0.233621f, 0.109426f, 0.232798f, -0.104950f, -0.241798f,
+ -0.018995f, -0.167954f, 0.002473f, 0.060418f, -0.232717f, -0.195980f,
+ -0.283971f, -0.371881f, 0.219728f, 0.018072f, -0.166694f, -0.083301f,
+ -0.000616f, -0.212641f, -0.173158f, 0.222739f, -0.235302f, 0.237624f,
+ 0.222232f, -0.041235f, -0.342411f, 0.121194f, 0.211291f, -0.032237f,
+ -0.249401f, -0.291668f, 0.206055f, -0.148200f, 0.011824f, -0.272728f,
+ -0.194854f, 0.367175f, -0.257243f, 0.103433f, -0.231077f, 0.236734f,
+ 0.135733f, -0.362845f, 0.197147f, 0.242782f, -0.135289f, 0.123311f,
+ 0.259420f, -0.116278f, 0.127287f, 0.236789f, -0.097438f, 0.118073f,
+ 0.112796f, -0.035949f, 0.184408f, 0.200948f, -0.008859f, 0.195989f,
+ 0.161970f, -0.295320f, -0.330389f, 0.141034f, 0.066081f, -0.707857f,
+ 0.357037f, 0.149633f, 0.679877f, 0.548674f, 0.469076f, 0.194123f,
+ -0.209872f, -0.071764f, -0.126960f, 0.199420f, 0.327116f, -0.169053f,
+ -0.429156f, 0.443429f, -0.225530f, -0.130738f, -0.028351f, 0.644393f,
+ 0.049606f, -0.243602f, -0.409920f, 0.117028f, -0.258557f, 0.073865f,
+ -0.200454f, -0.139957f, -0.031314f, 0.162325f, 0.247221f, 0.071909f,
+ -0.336276f, 0.079922f, 0.192780f, -0.148882f, 0.133192f, -0.143177f,
+ -0.121327f, 0.126221f, -0.089521f, -0.181826f, 0.149923f, -0.280682f,
+ 0.391572f, 0.108990f, -0.445494f, -0.170787f, 0.225182f, 0.223313f,
+ -0.234828f, -0.071072f, -0.072673f, -0.093686f, 0.223892f, -0.049377f,
+ 0.057976f, 0.033558f, 0.068733f, -0.283353f, 0.217877f, 0.158093f,
+ -0.276761f, -0.097049f, -0.351913f, -0.383604f, 0.002863f, -0.474510f,
+ -0.096738f, 0.256940f, 0.234203f, -0.226667f, -0.260576f, -0.183403f,
+ -0.035578f, 0.141570f, 0.078764f, -0.028086f, 0.155800f, -0.251115f,
+ -0.286703f, -0.014739f, -0.072621f, -0.311506f, -0.048639f, 0.081621f,
+ 0.043057f, 0.068136f, -0.179903f, 0.143699f, -0.002571f, 0.239012f,
+ 0.197456f, 0.035745f, -0.311927f, 0.220320f, 0.102687f, -0.294105f,
+ 0.426740f, 0.209050f, 0.211907f, 0.083453f, 0.006578f, -0.143338f,
+ 0.003157f, 0.040295f, 0.234497f, 0.035344f, -0.163909f, 0.411115f,
+ 0.289453f, -0.075357f, -0.008884f, 0.469798f, -0.033304f, -0.153293f,
+ -0.229322f, -0.004162f, 0.113363f, 0.395381f, 0.067414f, -0.188966f,
+ -0.117424f, -0.166423f, 0.066839f, 0.595641f, -0.204782f, -0.451727f,
+ 0.198509f, -0.921583f, -0.246765f, -0.153411f, 0.046491f, 0.365906f,
+ 0.376710f, -0.017355f, -0.035232f, 0.138785f, -0.163918f, -0.283449f,
+ -0.094340f, 0.192127f, 0.154815f, 0.035787f, -0.029087f, 0.115649f,
+ -0.220133f, -0.452741f, 0.311667f, 0.157666f, 0.091401f, 0.236040f,
+ -0.168523f, 0.122176f, -0.219016f, -0.214856f, 0.172824f, -0.091810f,
+ 0.031520f, -0.857420f, 0.643446f, -0.017471f, 0.206082f, -0.933517f,
+ -0.020070f, -0.065091f, -0.117680f, -1.271870f, -0.069177f, -0.149409f,
+ 0.289970f, -0.889775f, -0.044741f, 0.232647f, -0.319416f, 0.073030f,
+ 0.278549f, 0.238782f, -0.202206f, 0.272540f, 0.201412f, 0.175574f,
+ -0.127971f, -0.253164f, -0.086352f, -0.005381f, 0.114714f, 0.505169f,
+ -0.175049f, -1.534280f, -0.320666f, -2.119298f, -0.023075f, -0.021259f,
+ -0.161019f, 0.344837f, 0.361958f, -0.097050f, 0.014375f, 0.267110f,
+ 0.341442f, -0.016688f, 0.073393f, 0.131500f, 0.246331f, 0.011059f,
+ 0.033597f, 0.014779f, -0.269366f, -0.504788f, 0.048651f, 0.295682f,
+ 0.237363f, 0.227484f, -0.235814f, -0.160530f, 0.182682f, -0.172999f,
+ -0.126630f, 0.168357f, -0.078729f, 0.052805f, 0.377021f, -0.004727f,
+ 0.230415f, -0.876673f, 0.458457f, 0.099401f, -0.019616f, 0.611982f,
+ -0.231508f, -0.070894f, -0.056142f, 0.548969f, -0.376599f, -0.600428f,
+ 0.241930f, -0.592893f, 0.189371f, 0.488651f, -0.092446f, -0.272569f,
+ 0.251643f, 0.315945f, -0.301468f, 0.112961f, 0.052119f, -0.066076f,
+ -0.082249f, 0.252805f, -0.195539f, 0.150386f, -0.865534f, 0.673447f,
+ 0.030177f, -0.438528f, -1.006174f, 0.575176f, -0.271656f, 0.035835f,
+ -1.056916f, 0.495267f, -0.092428f, -0.109511f, -0.192359f, 0.166669f,
+ -0.624326f, -0.000354f, -0.089075f, 0.176279f, -0.289347f, 0.021346f,
+ 0.020375f, 0.255282f, -0.045588f, 0.173675f, 0.100957f, -0.294373f,
+ 0.049303f, -0.134132f, -0.255731f, -0.025559f, -0.307463f, -0.205100f,
+ 0.079024f, 0.101113f, 0.135742f, -0.348869f, -0.026759f, -0.134155f,
+ -0.179275f, -0.054297f, -0.054948f, 0.029351f, 0.190560f, 0.102476f,
+ -0.025785f, 0.169442f, -0.271303f, 0.200667f, 0.099063f, 0.074767f,
+ -0.326533f, 0.044426f, -0.290251f, -0.082443f, -0.164482f, -0.349412f,
+ 0.045109f, -0.157330f, 0.165935f, 0.012672f, -0.059818f, 0.399140f,
+ -0.316620f, 0.386638f, -0.285399f, -0.296777f, -0.200473f, -0.144232f,
+ 0.251851f, -0.203768f, 0.001071f, -0.179063f, 0.248952f, -0.143029f,
+ 0.010423f, -0.030293f, -0.046786f, -0.196195f, -0.016845f, 0.295023f,
+ 0.322825f, 0.133683f, 0.017388f, 0.142467f, 0.221320f, 0.004059f,
+ -0.115770f, 0.143363f, 0.137972f, -0.272584f, 0.489366f, -0.091828f,
+ -0.014703f, 0.082332f, -0.476226f, -0.202859f, 0.356094f, -0.283049f,
+ 0.218086f, 0.202015f, 0.201724f, 0.012617f, 0.050720f, 0.255695f,
+ 0.244653f, 0.111296f, -0.151450f, -0.056210f, -0.757348f, 0.441724f,
+ -0.022455f, -0.244662f, 0.296205f, -0.421883f, -0.217386f, -0.254301f,
+ 0.409105f, -0.031309f, 0.050147f, -0.337170f, -0.106620f, -0.606455f,
+ 0.308024f, 0.298144f, 0.363993f, 0.704870f, -0.047292f, 0.166901f,
+ 0.105991f, -0.536757f, -0.424031f, -0.226034f, 0.213635f, -0.526754f,
+ 0.310990f, -0.116038f, 0.007775f, 0.538330f, -0.177912f, 0.445357f,
+ -0.290365f, 0.451169f, 0.030931f, 0.033388f, 0.209905f, -0.244492f,
+ -0.097792f, -0.246042f, 0.132047f, 0.032576f, 0.115516f, 0.022890f,
+ 0.093508f, -0.071840f, 0.362948f, -0.135245f, 0.659911f, -0.321413f,
+ 0.193118f, -0.795001f, -0.218311f, 0.024862f, 0.206172f, -0.832878f,
+ -0.255670f, 0.343402f, -0.275211f, -0.898363f, -0.025172f, 0.158565f,
+ 0.171347f, -0.127518f, -0.215156f, -0.159198f, 0.250355f, -0.132452f,
+ 0.061254f, -0.097544f, -0.223246f, 0.013183f, 0.239468f, 0.259017f,
+ -0.217739f, -0.032263f, 0.123755f, -0.701777f, 0.150049f, -0.555293f,
+ 0.062430f, -0.260304f, 0.494894f, -0.168702f, -0.134829f, -0.113989f,
+ 0.150092f, -0.060248f, 0.115711f, -0.277202f, 0.499811f, 0.417116f,
+ 0.191081f, -0.376432f, -0.321092f, 0.033992f, 0.057193f, 0.127077f,
+ -0.009042f, 0.014443f, 0.142808f, -0.124349f, 0.213087f, -0.381686f,
+ 0.129726f, -0.038396f,
+};
+
+static const float av1_early_term_after_split_nn_bias_32_layer0[] = {
+ -0.107171f, 0.060848f, -0.069480f, -0.121982f, 0.037637f, -0.291839f,
+ 0.102257f, -0.065889f, -0.032452f, 0.034171f, -0.073984f, -0.005236f,
+ 0.218820f, 0.132123f, -0.089621f, -0.067679f, 0.049368f, 0.329444f,
+ -0.184729f, 0.031702f, 0.009735f, -0.039964f, -0.018024f, -0.073031f,
+ -0.030166f, -0.191037f, -0.074862f, -0.076548f, 0.076537f, 0.216609f,
+ -0.078358f, -0.007740f,
+};
+
+static const float av1_early_term_after_split_nn_weights_32_layer1[] = {
+ 0.047869f, -0.231773f, -0.185663f, 0.460676f, -0.208182f, 0.590555f,
+ -0.622627f, 0.279377f, 0.351681f, 0.633504f, 1.069884f, 0.332449f,
+ -0.457703f, -0.435817f, -0.028853f, 0.327490f, -0.282469f, -0.975792f,
+ -0.062975f, -0.147187f, 0.348340f, -1.207116f, 0.516159f, -1.509626f,
+ -0.805072f, 0.522999f, 0.143671f, 0.304246f, -0.360720f, -0.612472f,
+ 0.260045f, -0.223243f,
+};
+
+static const float av1_early_term_after_split_nn_bias_32_layer1[] = {
+ -0.07571174f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_32 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ HIDDEN_NODES,
+ },
+ {
+ av1_early_term_after_split_nn_weights_32_layer0,
+ av1_early_term_after_split_nn_weights_32_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_32_layer0,
+ av1_early_term_after_split_nn_bias_32_layer1,
+ },
+};
+
+static const float av1_early_term_after_split_nn_weights_16_layer0[] = {
+ -0.113798f, 0.053357f, -0.037947f, -0.477171f, 0.276517f, -0.349252f,
+ -0.177284f, 0.189597f, 0.141744f, 0.230207f, -0.328104f, 0.074328f,
+ 0.247717f, 0.233533f, 0.145167f, 0.018029f, -0.398725f, -0.226199f,
+ -0.309724f, 0.125279f, 0.194759f, 0.025531f, 0.349714f, -0.273944f,
+ 0.186871f, 0.181735f, -0.520614f, -0.264076f, 0.308207f, 0.157438f,
+ -0.137791f, -0.054582f, 0.125879f, 0.796218f, -0.897562f, 0.885439f,
+ 0.381640f, 0.106625f, -2.027456f, 0.000874f, 0.179581f, 0.013287f,
+ -2.329439f, -0.163169f, -0.136191f, 0.320108f, -2.318779f, -0.196722f,
+ -0.295721f, 0.203658f, -0.182275f, 0.615941f, 0.015762f, 0.257181f,
+ -0.115297f, 0.295774f, -0.026144f, -0.022686f, -0.219423f, -0.042861f,
+ 0.207647f, -0.057791f, 0.201671f, -0.169569f, 0.291492f, -0.994991f,
+ 0.137473f, 0.230948f, 0.505626f, -1.065860f, 0.275225f, -0.250861f,
+ 0.519466f, -1.217242f, -0.087384f, 0.053441f, 0.030729f, -1.702304f,
+ -0.034635f, 0.010177f, -0.035422f, -0.749979f, 0.355499f, 0.408166f,
+ -0.086883f, 0.017203f, 0.195706f, -0.218056f, -0.029153f, 0.367335f,
+ -0.061732f, -0.241068f, 0.078496f, -0.370346f, -0.124223f, -0.172708f,
+ 0.037971f, 0.038875f, -0.282489f, -0.266323f, -0.210864f, 0.214714f,
+ 0.234695f, -0.045625f, 0.015357f, -0.007464f, -0.362003f, -0.113465f,
+ 0.145141f, 0.238470f, -0.202664f, -0.286587f, -0.347112f, 0.054501f,
+ -0.190290f, -0.283256f, 0.062179f, 0.041165f, -0.006935f, -0.220351f,
+ -0.088800f, 0.220924f, -0.200982f, 0.058493f, -0.225175f, 0.057175f,
+ -0.618187f, 0.761023f, -0.743774f, -0.500599f, -0.584999f, 1.545211f,
+ 0.123055f, -0.106848f, -0.353057f, 1.552187f, 0.174104f, 0.068060f,
+ -0.449859f, 1.254299f, -0.161716f, -0.060630f, -0.230721f, 0.165976f,
+ -0.101582f, -0.422415f, 0.110384f, -0.130098f, 0.104428f, 0.083518f,
+ 0.031626f, 0.083048f, 0.158877f, 0.173340f, 0.063962f, 0.427845f,
+ 0.663268f, 0.376996f, 0.146435f, -0.091329f, 0.443447f, 0.518432f,
+ -0.182777f, -0.091313f, 0.331229f, 0.532604f, -0.187001f, 0.054774f,
+ 0.298068f, 0.502295f, -0.362378f, 0.054283f, 0.292806f, 0.168901f,
+ -0.214787f, 0.025637f, 0.458009f, -0.322714f, -0.264059f, 0.140313f,
+ -0.102696f, -0.431208f, -0.134450f, -0.545415f, 0.253851f, -0.009061f,
+ -0.050681f, 0.108681f, 0.043272f, -1.073133f, 0.206410f, 0.469576f,
+ 0.291494f, -2.021244f, -0.001183f, -0.067542f, 0.364907f, -2.470543f,
+ 0.049147f, -0.018868f, 0.658500f, -2.531048f, 0.275433f, -0.034224f,
+ -0.171386f, 0.096369f, 0.728069f, 0.272332f, 0.222255f, -0.030426f,
+ 0.026994f, 0.208928f, -0.173943f, -0.227581f, -0.214798f, 0.079341f,
+ 0.032344f, -0.253575f, -0.044353f, -0.239265f, -0.055852f, -0.162582f,
+ -0.086592f, 0.066487f, 0.337353f, -0.168704f, 0.015702f, 0.022607f,
+ 0.286647f, 0.218106f, 0.193319f, -0.358714f, 0.030796f, 0.007646f,
+ -0.045617f, 0.165007f, -0.284641f, -0.291812f, 0.207544f, 0.082823f,
+ -0.141907f, -0.331336f, -0.052908f, 0.120716f, 0.202521f, 0.232782f,
+ -0.348141f, -0.017332f, 1.191126f, -0.391987f, -0.154537f, -0.206551f,
+ -2.378690f, 0.057918f, -0.328183f, 2.151556f, 0.238803f, 0.164880f,
+ -0.480039f, 1.616200f, 0.260243f, 0.083704f, -0.174461f, 1.804634f,
+ 0.194810f, 0.223837f, 0.550107f, -0.068171f, -0.293435f, -0.186770f,
+ -0.364846f, 0.127181f, 0.105556f, -0.016202f, 0.278403f, -0.344995f,
+ -0.009761f, -0.082555f, 0.046731f, -0.301452f, 0.604259f, 0.055895f,
+ 0.049862f, 0.314249f, -0.305811f, -0.112937f, 0.658787f, -0.549288f,
+ -0.307567f, -0.460650f, -0.840643f, 0.082576f, 0.373711f, 0.138318f,
+ 0.336901f, 0.284984f, -0.281400f, 0.408210f, -0.449858f, 0.461054f,
+ 0.227629f, -0.131705f, 0.301769f, -0.278540f, 0.189290f, -0.269041f,
+ 0.111350f, -0.300257f, 0.436858f, -0.265920f, -0.211938f, 0.272631f,
+ 0.206291f, 0.253273f, -0.229776f, -0.031112f, -0.171183f, -0.109676f,
+ -0.202390f, -0.068857f, 0.182125f, -0.140523f, -0.308742f, -0.045840f,
+ 0.256545f, -0.262405f, 0.225951f, -0.287463f, -0.189203f, -0.055552f,
+ -0.052448f, -0.242839f, -0.278877f, 0.140920f, -0.175755f, 0.215402f,
+ -0.248841f, -0.264080f, -0.178303f, 0.147777f, 0.049460f, -0.279877f,
+ -0.539725f, -0.004622f, 0.182874f, 0.338814f, 0.265974f, 0.249851f,
+ -0.141154f, 0.157228f, -0.090972f, 0.179444f, 0.305255f, 0.127788f,
+ 0.123270f, 0.355320f, 0.076797f, 0.263495f, 0.235965f, -0.133816f,
+ 0.243624f, 0.227062f, -0.213629f, 0.002075f, 0.061203f, -0.077820f,
+ -0.008807f, -0.247324f, -0.051464f, -0.191894f, -0.238713f, -0.389526f,
+ -0.274248f, 0.053950f, -0.225750f, -0.367097f, -0.122391f, 0.181212f,
+ -0.411824f, -0.084241f, -0.302288f, 0.077860f, -0.187443f, -0.300262f,
+ 0.083156f, -0.392461f, -0.332320f, -0.346474f, 0.140658f, -0.283656f,
+ 0.120714f, -0.056577f, -0.280968f, 0.017795f, -0.024686f, 0.073113f,
+ -0.346637f, 0.082567f, -0.036556f, -0.369730f, 0.081225f, -0.005211f,
+ 0.144886f, -0.003544f, 0.178307f, -0.366035f, -0.063887f, -0.191767f,
+ 0.105835f, -0.273978f, -0.266532f, -0.023984f, 0.039166f, 0.065848f,
+ -0.026802f, -0.268923f, 0.189659f, 0.086300f, 0.030718f, 0.216565f,
+ -0.130025f, -0.215687f, 0.146341f, -0.286438f, -0.394226f, -0.181509f,
+ -0.005612f, 0.186040f, 0.133491f, 0.032096f, -0.261609f, 0.074007f,
+ -0.042929f, -0.234479f, 0.189704f, 0.088395f, -0.003671f, -0.125055f,
+ -0.252418f, -0.086387f, 0.111197f, -0.297071f, -0.018793f, -0.031902f,
+ -0.333191f, -0.186279f, 0.039868f, 0.091419f, -0.264438f, -0.216150f,
+ -0.212550f, 0.203412f, -0.113028f, -0.197169f, -0.346771f, 0.086066f,
+ 0.091443f, -0.128507f, -0.007281f, -0.118389f, 0.003370f, -0.338661f,
+ 0.026739f, -0.063571f, -0.281567f, -0.166824f, 0.167455f, 0.216173f,
+ 0.199163f, 0.256314f, -0.222679f, 0.040282f, -0.154808f, -0.133943f,
+ -0.270163f, -0.357398f, 0.260373f, 0.176950f, -0.125162f, -0.085050f,
+ 0.226376f, -0.124585f, -0.324804f, 0.035536f, -0.133600f, 0.173450f,
+ 0.068107f, -0.337442f, 0.169629f, 0.047223f, 0.057878f, 0.055555f,
+ -0.317449f, -0.103768f, 0.080899f, -0.194759f, -1.137593f, 0.508999f,
+ 0.045372f, 1.746454f, 1.250347f, -0.342930f, -0.127821f, -0.220175f,
+ -0.417649f, -0.480595f, 0.071902f, 0.050231f, -0.562554f, -0.677866f,
+ -0.121416f, -0.247558f, -0.483876f, -0.504157f, 1.731953f, 0.572936f,
+ 0.047325f, 0.050619f, 0.112611f, -0.035393f, 0.052585f, -0.071076f,
+ -0.015798f, -0.050228f, -0.142875f, 0.189329f, 0.048833f, 0.503633f,
+ 0.249588f, 0.175492f, -0.137664f, -0.018533f, 0.288453f, -0.025644f,
+ 0.079131f, 0.195096f, -0.154039f, -0.104220f, -0.224072f, 0.095946f,
+ -0.208424f, 0.214745f, 0.056468f, 0.182603f, 0.341784f, -0.134664f,
+ -0.194050f, 0.058532f, -0.107336f, -0.087783f, -0.238795f, -0.387212f,
+ 0.049055f, -0.127417f, -0.299919f, -0.094371f, -0.011735f, -0.264753f,
+ 0.407375f, -0.462654f, -0.609488f, 0.027742f, -0.985512f, -0.109154f,
+ -0.423276f, 2.347960f, 0.129240f, 0.187610f, -0.057081f, 2.424892f,
+ 0.087666f, 0.106716f, -0.039379f, 2.764866f, 0.113309f, 0.028196f,
+ -0.582789f, 0.335385f, -0.538029f, -0.477337f, -0.114207f, 0.178829f,
+ 0.006276f, 0.123179f, 0.095101f, 0.139898f, -0.372074f, -0.111010f,
+ 0.136330f, 0.272900f, 0.126737f, -0.097808f, -0.363697f, 0.108665f,
+ -0.227749f, -0.083421f, 1.714677f, 0.451943f, 0.107931f, -0.392281f,
+ 1.615846f, 0.022307f, -0.247011f, 0.257703f, 1.039134f, 0.537789f,
+ 0.022177f, -0.271532f, 0.351350f, -0.399205f, -0.240534f, -0.315399f,
+ 0.026928f, -0.005618f, 0.053179f, -0.010277f, 0.000501f, 0.040896f,
+ -0.109160f, 0.018282f, 0.003887f, 0.199599f, 0.095349f, -0.337284f,
+ 0.169929f, -0.109409f, -0.166983f, 0.059908f, -0.226574f, -0.120114f,
+ 0.077329f, -0.333133f, -0.220936f, 0.114309f, -0.233965f, -0.281551f,
+ 0.042948f, 0.100940f, 0.116037f, -0.313122f, 0.215149f, -0.309057f,
+ -0.341052f, -0.294417f, -0.179722f, 0.010795f, 0.192053f, -0.275261f,
+ -0.033077f, 0.117348f, 0.090206f, 0.781573f, 0.602456f, -0.220296f,
+ 0.172159f, 0.758513f, 0.157910f, -0.217897f, -0.372659f, 0.031935f,
+ 0.791463f, 0.267195f, 0.931593f, -0.057349f, 0.405512f, -0.058512f,
+ -0.641663f, -0.076592f, 0.550227f, -0.024094f, 0.048218f, -0.289971f,
+ 0.180940f, 0.167533f, 0.052711f, -0.360726f, 0.019210f, -0.488879f,
+ 0.380498f, 0.151608f, -0.276895f, -0.596554f, 0.106076f, -0.245833f,
+ -0.048783f, 0.073823f, 0.098780f, 0.000211f, 0.113958f, -0.068964f,
+ -0.265533f, -0.185457f, 0.175586f, -0.163621f, -0.204919f, 0.145802f,
+ -0.163421f, 0.129576f, -0.153486f, -0.105573f, 0.067289f, -0.213120f,
+ -0.286103f, 0.249543f, -0.044970f, -0.170464f, -0.105501f, -0.094765f,
+ -0.050734f, -0.369468f, 0.180020f, -0.363328f, -0.151654f, -0.262550f,
+ -0.424503f, 0.829032f, -0.559452f, 0.506837f, 0.143823f, 0.276660f,
+ -1.808608f, -0.259517f, -0.053945f, 0.035676f, -1.842195f, -0.065960f,
+ -0.069285f, 0.462022f, -2.319453f, -0.370299f, 0.183329f, -0.146412f,
+ -0.563875f, 0.305068f, 0.480904f, 0.044319f, -0.016098f, 0.168516f,
+ 0.114874f, -0.097621f, -0.030373f, 0.177700f, 0.181591f, -0.146003f,
+ -0.330853f, -0.259200f, 0.779319f, -1.517524f, 0.178781f, 0.135451f,
+ 0.088784f, -2.076089f, 0.628717f, -0.048685f, 0.281327f, -2.341596f,
+ 0.422171f, 0.006135f, 0.367096f, -1.663118f, 0.365253f, -0.072884f,
+ -0.197620f, -0.688634f, 0.477354f, 0.395841f, -0.098505f, 0.208709f,
+ -0.027523f, 0.127119f, 0.106274f, 0.114424f, -0.122877f, -0.087245f,
+ 0.086923f, -0.527398f, -0.342062f, -0.764662f, 0.713094f, -0.626453f,
+ -0.081454f, -0.087683f, 0.885047f, 0.323440f, -0.018579f, -0.217166f,
+ 1.617984f, -0.159038f, 0.265991f, -0.390313f, 1.933182f, -0.032431f,
+ -0.057513f, -0.300841f, 0.461248f, -0.072147f, -0.287052f, -0.078056f,
+ 0.011734f, 0.044013f, 0.177174f, 0.093400f, 0.028819f, 0.193686f,
+ -0.224853f, 0.268321f, -0.075059f, 0.074526f, -0.015618f, 0.165615f,
+ -0.276780f, -0.063908f, -0.369264f, -0.171497f, -0.173624f, -0.130743f,
+ -0.224625f, -0.124980f, -0.104482f, 0.076864f, -0.009631f, -0.164682f,
+ 0.150480f, -0.111880f, -0.260425f, 0.086234f, -0.176936f, -0.136771f,
+ -0.168867f, -0.405626f, -0.288716f, -0.128950f, -0.207327f, 0.015581f,
+ -0.109061f, -0.098970f, 0.090792f, -0.109623f, 0.349851f, 0.266341f,
+ -0.088602f, -0.108071f, 0.082519f, 0.472650f, -1.838758f, 0.456694f,
+ 0.119927f, 0.461077f, -2.860022f, 0.231495f, 0.235771f, 0.256424f,
+ -1.938516f, -0.188202f, -0.000832f, -0.518206f, 0.194644f, 0.505510f,
+ 0.615657f, 0.193760f, 0.224600f, 0.265732f, -0.121553f, -0.354597f,
+ -0.242414f, -0.276639f, -0.057591f, 0.026369f, -0.261148f, -0.356155f,
+ -0.149178f, -0.353566f, -0.340835f, -0.141776f, 0.076535f, 0.221299f,
+ -0.108857f, -0.156514f, 0.050901f, 0.058541f, -0.077141f, 0.071515f,
+ -0.333283f, -0.181489f, -0.212900f, -0.224698f, -0.174693f, -0.178665f,
+ -0.143374f, -0.091811f, 0.165161f, 0.060156f, -0.086103f, -0.039031f,
+ -0.377759f, -0.370533f, 0.074431f, 0.064192f, 0.186576f, 0.447858f,
+ -0.082260f, -0.020268f, -0.123089f, -0.402017f, 0.080500f, 0.176286f,
+ 2.850013f, 0.019385f, -0.225361f, -0.235315f, 1.654694f, -0.073978f,
+ -0.341412f, -1.187575f, 2.815900f, -0.228063f, -0.174547f, 0.623825f,
+ -0.010676f, 0.157189f, 0.111879f, -0.198965f, 0.051851f, 0.158396f,
+ 0.045194f, 0.293531f, -0.246714f, -0.351493f, 0.026954f, 0.076233f,
+ 0.420367f, 0.168154f, -0.131450f, 0.134487f, -0.288851f, -0.134553f,
+ 0.014902f, 0.756381f, 0.277713f, 0.190080f, -0.020869f, 1.446672f,
+ 0.029792f, -0.025927f, 0.060640f, 0.559864f, 0.422229f, 0.198459f,
+ 0.036167f, 0.029432f, 0.001882f, 0.038480f, -0.160528f, -0.288855f,
+ -0.310886f, 0.291296f, 0.190558f, -0.182816f, -0.002252f, 0.073101f,
+ -0.172245f, -0.305980f, 0.112492f, -0.422839f, -0.295999f, -0.078160f,
+ -0.173405f, -0.032819f, 0.373774f, -0.715223f, 0.018911f, 0.131753f,
+ -0.237364f, -0.128499f, -0.228406f, 0.341619f, 0.343552f, -0.521581f,
+ -0.263790f, 0.362502f, -0.018450f, 0.054233f, 0.183068f, 0.382772f,
+ 0.188811f, -0.627287f, 0.040399f, -0.487338f, -0.192591f, 0.247426f,
+ 0.154372f, -0.483994f,
+};
+
+static const float av1_early_term_after_split_nn_bias_16_layer0[] = {
+ -0.173976f, 0.305495f, 0.250981f, -0.067127f, -0.313100f, 0.242464f,
+ 0.315196f, -0.056052f, -0.241227f, -0.253308f, -0.002697f, 0.003687f,
+ -0.124421f, -0.090383f, -0.070366f, -0.064074f, -0.056115f, 0.123313f,
+ -0.239698f, -0.182082f, -0.065296f, 0.021503f, -0.036787f, 0.311861f,
+ 0.118135f, -0.320456f, -0.110719f, 0.220692f, -0.071727f, -0.088226f,
+ -0.110874f, -0.111671f,
+};
+
+static const float av1_early_term_after_split_nn_weights_16_layer1[] = {
+ -0.338573f, 0.398159f, 0.314774f, -0.037448f, -0.271950f, -0.774991f,
+ 0.950901f, -0.225380f, -1.841906f, -0.350379f, -0.079350f, 0.383148f,
+ -0.183676f, -0.313132f, -0.340820f, -0.309401f, -1.050540f, -0.432267f,
+ -0.657195f, 0.927632f, -0.040150f, 0.578920f, 0.212301f, 0.292495f,
+ 0.563590f, -0.205735f, 0.195877f, 0.582122f, -0.217860f, 1.613379f,
+ 0.313278f, -0.555802f,
+};
+
+static const float av1_early_term_after_split_nn_bias_16_layer1[] = {
+ 0.16553f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_16 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ HIDDEN_NODES,
+ },
+ {
+ av1_early_term_after_split_nn_weights_16_layer0,
+ av1_early_term_after_split_nn_weights_16_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_16_layer0,
+ av1_early_term_after_split_nn_bias_16_layer1,
+ },
+};
+
+static const float av1_early_term_after_split_nn_weights_8_layer0[] = {
+ -0.719472f, 0.305806f, 0.855829f, 0.100094f, 0.412517f, 1.254673f,
+ 1.552105f, -5.890773f, -0.089957f, -0.016736f, 1.418074f, -5.393506f,
+ -0.028214f, 0.117758f, 1.479209f, -5.299794f, 0.171585f, -0.084182f,
+ -0.162105f, 0.388577f, -0.044319f, -0.025861f, 0.251782f, -0.181462f,
+ -0.101545f, -0.079999f, -0.033014f, -0.191627f, -0.032802f, -0.053404f,
+ 0.038038f, -0.119492f, 0.049104f, -0.344384f, -0.354513f, 0.036977f,
+ 0.017513f, -0.004025f, -0.163212f, -0.261999f, 0.146575f, 0.207541f,
+ 0.130365f, -0.252127f, 0.097419f, -0.231057f, -0.309421f, 0.347866f,
+ -0.064670f, -0.283171f, -0.244193f, -0.193323f, -0.226954f, -0.276194f,
+ -0.233553f, 0.156354f, -0.184009f, 0.344289f, -0.308058f, -0.205202f,
+ -0.325068f, 0.183820f, -0.361667f, -0.069559f, -0.121834f, -0.038357f,
+ -0.210043f, -0.266129f, 0.003188f, 0.074902f, -0.328843f, 0.293679f,
+ -0.234698f, -0.428268f, -0.308772f, -0.136538f, -0.008384f, -0.078227f,
+ 0.166074f, -0.262899f, 0.102114f, -0.323420f, 0.057064f, -0.203318f,
+ -0.397413f, -0.317324f, -0.307093f, 0.020574f, -0.188627f, 0.132529f,
+ 0.118992f, -0.487387f, -0.282975f, 0.573231f, -0.266071f, 0.125140f,
+ -0.970034f, 1.424008f, -0.487366f, -0.196415f, 3.680273f, -0.008407f,
+ 0.081109f, -0.187479f, 3.876021f, 0.159168f, 0.111721f, -0.337423f,
+ 3.901760f, 0.261268f, -0.245555f, -0.187632f, -0.324298f, 0.167234f,
+ 0.170986f, -0.473055f, 0.087016f, -0.003469f, 0.051035f, 0.251794f,
+ 0.153549f, 0.217609f, -0.326870f, -0.175511f, 0.637341f, -0.694837f,
+ -0.873487f, -0.186614f, -1.089884f, -0.607316f, -0.523519f, 5.256331f,
+ 0.071414f, 0.215265f, -0.835999f, 5.735746f, 0.300101f, 0.089626f,
+ -0.450261f, 5.608051f, 0.190491f, 0.110220f, -0.595360f, -0.446324f,
+ 0.311380f, 0.268812f, -0.339656f, -0.008708f, 0.011111f, -0.027557f,
+ 0.171534f, 0.000676f, 0.227232f, 0.033993f, 0.146684f, 0.094817f,
+ -0.175381f, -0.211927f, -0.362471f, 0.168834f, 0.264149f, -0.350538f,
+ -0.463249f, -0.288105f, 0.347155f, 0.183231f, -0.229732f, -0.252202f,
+ -0.218074f, -0.008769f, -0.156103f, 0.181233f, -0.354736f, 0.263270f,
+ -0.106636f, 0.081057f, 0.060634f, -0.046887f, 0.050468f, 0.071259f,
+ 0.221287f, 0.199071f, -0.180185f, -0.406902f, -0.239351f, -0.034957f,
+ 0.369140f, 0.864600f, 0.233798f, 0.423612f, -0.468918f, 0.976987f,
+ 0.691198f, -1.597908f, 0.102926f, 0.305546f, 0.391196f, -3.909059f,
+ 0.333635f, 0.311561f, 0.738886f, -4.002001f, 0.236394f, -0.233141f,
+ 0.263342f, 0.679898f, 0.136233f, 0.254743f, -0.367571f, 0.066412f,
+ 0.001606f, -0.059542f, 0.051726f, -0.347145f, -0.045501f, -0.313847f,
+ -0.021952f, 1.386316f, -0.579139f, -1.275844f, -0.003493f, -1.716577f,
+ 0.250209f, 0.192086f, 4.177055f, 0.351835f, 0.338177f, 0.140163f,
+ 4.099592f, 0.321866f, -0.128153f, -0.360414f, 4.350767f, 0.025943f,
+ -0.116740f, -0.664107f, -0.064558f, -0.039553f, -0.208186f, -0.678774f,
+ 0.149441f, -0.019823f, 0.012759f, 0.404442f, -0.108881f, 0.067974f,
+ -0.188278f, 0.136327f, 0.109927f, -0.179270f, -0.272342f, 0.018064f,
+ -0.304216f, -0.469470f, 0.109310f, -0.326214f, 0.061909f, -0.278997f,
+ -0.352329f, -0.333770f, -0.186522f, -0.328567f, -0.206211f, -0.008804f,
+ 0.042441f, -0.126699f, -0.420399f, -0.033842f, 0.016773f, -0.273789f,
+ 0.081928f, -0.191552f, -0.179533f, -0.263070f, -0.471807f, 0.062601f,
+ -0.232576f, 0.082955f, -0.490080f, 0.073820f, -0.090384f, 0.035781f,
+ -0.158880f, -0.506793f, -0.069132f, 0.047602f, -0.349640f, -0.058389f,
+ -0.017387f, -0.194636f, -0.457227f, -0.143105f, 0.222045f, -0.548909f,
+ -0.131561f, 0.247196f, -0.207923f, 0.133056f, -0.509854f, -0.193685f,
+ -0.181327f, -0.242442f, 0.091821f, 0.114430f, -0.375233f, -0.015254f,
+ -0.336632f, -0.060279f, -0.169169f, -0.429914f, -0.036563f, -0.400560f,
+ -0.076332f, -0.186232f, -0.268491f, 0.075561f, -0.389082f, -0.077435f,
+ 0.352562f, -0.020086f, -0.338181f, -0.404629f, 0.254983f, 0.150477f,
+ -0.265903f, 0.003341f, 0.099969f, -0.211964f, -0.129372f, -0.166366f,
+ 0.327712f, -0.276234f, 0.140675f, -0.433677f, -0.163050f, -0.143578f,
+ -0.397840f, -0.422130f, -0.293835f, -0.075362f, -0.468375f, 1.021238f,
+ 1.394155f, -0.922486f, -1.350222f, 2.030201f, 0.057717f, 0.227650f,
+ -0.193179f, 0.037224f, 0.065555f, 0.020558f, -0.059205f, -0.023690f,
+ -0.008718f, 0.095976f, -0.549587f, -0.321164f, -0.243728f, 1.344381f,
+ -1.254107f, 0.294244f, -0.154737f, -0.152597f, 0.342419f, 0.301883f,
+ 0.069866f, -0.327766f, 0.209323f, -0.364913f, -0.005530f, -0.558972f,
+ 0.057684f, -0.309357f, -0.283325f, -0.278445f, -0.420115f, -0.418457f,
+ -0.391481f, -0.418460f, -0.003897f, -0.023744f, -0.312330f, -0.366213f,
+ 0.269628f, -0.274877f, -0.189988f, -0.419555f, -0.034033f, 0.192874f,
+ -0.135487f, -0.326108f, -0.039019f, 0.185029f, -0.264883f, -0.563447f,
+ -0.163532f, -0.447652f, -0.141851f, 0.001714f, -0.193184f, 0.032609f,
+ -0.112883f, 0.074599f, 0.490665f, 0.434764f, 0.021652f, -0.219618f,
+ 0.743267f, 0.147195f, -0.303479f, -0.097674f, 0.195813f, 0.704007f,
+ -1.290851f, 0.119701f, 0.224065f, 0.260246f, -0.580657f, -0.096201f,
+ -0.333214f, -0.586689f, 0.567178f, 0.157340f, -0.043184f, 0.194358f,
+ -0.026506f, -0.339894f, -0.571803f, -0.234828f, 0.147054f, -0.564178f,
+ -0.156933f, -0.366055f, -0.691687f, -0.187501f, 0.215834f, -0.346106f,
+ -0.256892f, 0.110915f, -0.337464f, -0.341474f, -0.216113f, 0.249445f,
+ -0.070175f, -0.412141f, 0.153458f, -0.081280f, 0.164669f, -0.356396f,
+ -0.294971f, -0.165121f, -0.133585f, -0.071467f, 0.295147f, -0.253233f,
+ -0.213833f, -0.343416f, -0.474344f, -0.304000f, -0.341379f, -0.331456f,
+ -0.393952f, -0.508004f, -0.569518f, -0.509864f, 0.121961f, 0.011957f,
+ 0.000498f, -0.201969f, -0.407195f, -0.414375f, -0.295846f, 0.247492f,
+ 0.124249f, -0.550804f, -0.420397f, -0.123462f, 0.333292f, -0.240230f,
+ -0.025604f, 0.337536f, -0.295006f, -0.272614f, -0.496850f, -0.278521f,
+ 0.234591f, -0.052775f, -0.014052f, -0.260078f, -0.279128f, -0.036385f,
+ 0.008714f, -0.064018f, -0.124873f, -0.334014f,
+};
+
+static const float av1_early_term_after_split_nn_bias_8_layer0[] = {
+ 1.202379f, -0.117005f, -0.135527f, -0.262255f, -0.443658f, -0.078981f,
+ 0.615653f, -0.124482f, -0.227768f, -0.227014f, -0.135898f, 0.143216f,
+ -0.225995f, 0.370877f, -0.214821f, -0.227752f,
+};
+
+static const float av1_early_term_after_split_nn_weights_8_layer1[] = {
+ 0.376594f, 0.266703f, -0.039847f, 1.680142f, -0.879939f, 0.286806f,
+ -0.378223f, -0.405295f, -0.021107f, 0.039188f, 0.259308f, 0.193091f,
+ 0.077994f, -0.269141f, 0.011180f, -0.019262f,
+};
+
+static const float av1_early_term_after_split_nn_bias_8_layer1[] = {
+ -1.29585564f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_8 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ 16,
+ },
+ {
+ av1_early_term_after_split_nn_weights_8_layer0,
+ av1_early_term_after_split_nn_weights_8_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_8_layer0,
+ av1_early_term_after_split_nn_bias_8_layer1,
+ },
+};
+#undef FEATURES
+#undef HIDDEN_NODES
+
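+// Illustration only (not part of the upstream header): a minimal sketch,
+// under the assumption that the NN_CONFIG tables above describe a single
+// fully-connected hidden layer with a ReLU activation followed by one linear
+// output node, with layer-0 weights laid out node-major (each hidden node
+// owns a contiguous run of num_features weights). The encoder proper
+// evaluates these tables through its ML helpers; the function below exists
+// purely to document the assumed data layout.
+static inline float nn_forward_sketch(const float *features, int num_features,
+                                      const float *weights0,
+                                      const float *bias0, int num_hidden,
+                                      const float *weights1, float bias1) {
+  float out = bias1;
+  for (int node = 0; node < num_hidden; ++node) {
+    // Dot product of the inputs with this hidden node's weight row.
+    float val = bias0[node];
+    for (int f = 0; f < num_features; ++f)
+      val += weights0[node * num_features + f] * features[f];
+    if (val < 0.0f) val = 0.0f;   // Assumed ReLU on the hidden layer.
+    out += weights1[node] * val;  // Linear single-output layer.
+  }
+  return out;
+}
+// Example (hypothetical): scoring the 64x64 early-termination model with a
+// feature vector f of length 31:
+//   float score = nn_forward_sketch(
+//       f, 31, av1_early_term_after_split_nn_weights_64_layer0,
+//       av1_early_term_after_split_nn_bias_64_layer0, 32,
+//       av1_early_term_after_split_nn_weights_64_layer1,
+//       av1_early_term_after_split_nn_bias_64_layer1[0]);
+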
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/partition_search.c b/third_party/aom/av1/encoder/partition_search.c
new file mode 100644
index 0000000000..1c17b09ee1
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_search.c
@@ -0,0 +1,6263 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+
+#include "aom_dsp/txfm_common.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/av1_ml_partition_models.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define COLLECT_MOTION_SEARCH_FEATURE_SB 0
+
+void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
+ part_sf->partition_search_type = SEARCH_PARTITION;
+ part_sf->less_rectangular_check_level = 0;
+ part_sf->use_square_partition_only_threshold = BLOCK_128X128;
+ part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ part_sf->default_max_partition_size = BLOCK_LARGEST;
+ part_sf->default_min_partition_size = BLOCK_4X4;
+ part_sf->adjust_var_based_rd_partitioning = 0;
+ part_sf->max_intra_bsize = BLOCK_LARGEST;
+ // This setting only takes effect when partition_search_type is set
+ // to FIXED_PARTITION.
+ part_sf->fixed_partition_size = BLOCK_16X16;
+  // Partition search breakout thresholds.
+ part_sf->partition_search_breakout_dist_thr = 0;
+ part_sf->partition_search_breakout_rate_thr = 0;
+ part_sf->prune_ext_partition_types_search_level = 0;
+ part_sf->prune_part4_search = 0;
+ part_sf->ml_prune_partition = 0;
+ part_sf->ml_early_term_after_part_split_level = 0;
+ for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
+ part_sf->ml_partition_search_breakout_thresh[i] =
+ -1; // -1 means not enabled.
+ }
+ part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0;
+ part_sf->simple_motion_search_split = 0;
+ part_sf->simple_motion_search_prune_rect = 0;
+ part_sf->simple_motion_search_early_term_none = 0;
+ part_sf->simple_motion_search_reduce_search_steps = 0;
+ part_sf->intra_cnn_based_part_prune_level = 0;
+ part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+ part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+ part_sf->ext_part_eval_based_on_cur_best = 0;
+ part_sf->prune_ext_part_using_split_info = 0;
+ part_sf->prune_rectangular_split_based_on_qidx = 0;
+ part_sf->early_term_after_none_split = 0;
+ part_sf->ml_predict_breakout_level = 0;
+ part_sf->prune_sub_8x8_partition_level = 0;
+ part_sf->simple_motion_search_rect_split = 0;
+ part_sf->reuse_prev_rd_results_for_part_ab = 0;
+ part_sf->reuse_best_prediction_for_part_ab = 0;
+ part_sf->use_best_rd_for_pruning = 0;
+ part_sf->skip_non_sq_part_based_on_none = 0;
+}
+
+// Reset speed features that work for the baseline encoding but block the
+// external partition search.
+void av1_reset_sf_for_ext_part(AV1_COMP *const cpi) {
+ cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions = 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// If the input |features| is NULL, write the tpl stats to a file for each
+// super block. Otherwise, store the tpl stats in |features|.
+// The tpl stats are computed in units of tpl_bsize_1d (16x16).
+// When writing to a text file:
+// The first row contains super block position, super block size,
+// tpl unit length, number of units in the super block.
+// The second row contains the intra prediction cost for each unit.
+// The third row contains the inter prediction cost for each unit.
+// The fourth row contains the motion-compensated dependency cost for each unit.
+static void collect_tpl_stats_sb(const AV1_COMP *const cpi,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col,
+ aom_partition_features_t *features) {
+ const AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+ return;
+ }
+
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  // If the tpl stats have not been established, return early.
+ if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+ if (features != NULL) features->sb_features.tpl_features.available = 0;
+ return;
+ }
+
+ const int tpl_stride = tpl_frame->stride;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+ const int mi_width =
+ AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+ const int mi_height =
+ AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
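+  // Ceiling division: a partially covered tpl unit at the right/bottom edge
+  // still counts as a full step.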
+ const int col_steps = (mi_width / step) + ((mi_width % step) > 0);
+ const int row_steps = (mi_height / step) + ((mi_height % step) > 0);
+ const int num_blocks = col_steps * row_steps;
+
+ if (features == NULL) {
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/tpl_feature_sb%d",
+ cpi->oxcf.partition_info_path, cpi->sb_counter);
+ FILE *pfile = fopen(filename, "w");
+ fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+ tpl_data->tpl_bsize_1d, num_blocks);
+ int count = 0;
+ for (int row = 0; row < mi_height; row += step) {
+ for (int col = 0; col < mi_width; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+ tpl_data->tpl_stats_block_mis_log2)];
+ fprintf(pfile, "%.0f", (double)this_stats->intra_cost);
+ if (count < num_blocks - 1) fprintf(pfile, ",");
+ ++count;
+ }
+ }
+ fprintf(pfile, "\n");
+ count = 0;
+ for (int row = 0; row < mi_height; row += step) {
+ for (int col = 0; col < mi_width; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+ tpl_data->tpl_stats_block_mis_log2)];
+ fprintf(pfile, "%.0f", (double)this_stats->inter_cost);
+ if (count < num_blocks - 1) fprintf(pfile, ",");
+ ++count;
+ }
+ }
+ fprintf(pfile, "\n");
+ count = 0;
+ for (int row = 0; row < mi_height; row += step) {
+ for (int col = 0; col < mi_width; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+ tpl_data->tpl_stats_block_mis_log2)];
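+        // Fold the unit's motion-compensated dependency rate and distortion
+        // into a single cost via RDCOST.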
+ const int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ fprintf(pfile, "%.0f", (double)mc_dep_delta);
+ if (count < num_blocks - 1) fprintf(pfile, ",");
+ ++count;
+ }
+ }
+ fclose(pfile);
+ } else {
+ features->sb_features.tpl_features.available = 1;
+ features->sb_features.tpl_features.tpl_unit_length = tpl_data->tpl_bsize_1d;
+ features->sb_features.tpl_features.num_units = num_blocks;
+ int count = 0;
+ for (int row = 0; row < mi_height; row += step) {
+ for (int col = 0; col < mi_width; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+ tpl_data->tpl_stats_block_mis_log2)];
+ const int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ features->sb_features.tpl_features.intra_cost[count] =
+ this_stats->intra_cost;
+ features->sb_features.tpl_features.inter_cost[count] =
+ this_stats->inter_cost;
+ features->sb_features.tpl_features.mc_dep_cost[count] = mc_dep_delta;
+ ++count;
+ }
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
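+// Recursively walks the variable TX-size partition tree of an inter block,
+// updating the txfm_partition CDF (and, under CONFIG_ENTROPY_STATS, the raw
+// counts) at each split/no-split decision and refreshing the above/left
+// transform-size contexts along the way.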
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col,
+ uint8_t allow_update_cdf) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, mbmi->bsize,
+ tx_size);
+ const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+ assert(tx_size > TX_4X4);
+
+ if (depth == MAX_VARTX_DEPTH) {
+    // Max depth reached: fix the transform size here without updating any
+    // counts or CDFs.
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ if (tx_size == plane_tx_size) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->txfm_partition[ctx][0];
+#endif
+ if (allow_update_cdf)
+ update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->txfm_partition[ctx][1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
+ ++x->txfm_search_info.txb_split_count;
+
+ if (sub_txs == TX_4X4) {
+ mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+ return;
+ }
+
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ int offsetr = row;
+ int offsetc = col;
+
+ update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+ blk_col + offsetc, allow_update_cdf);
+ }
+ }
+ }
+}
+
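+// Entry point for the recursion above: tiles the plane block with the
+// largest allowed variable transform size and updates the counts for each
+// max-size unit.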
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+ BLOCK_SIZE plane_bsize,
+ FRAME_COUNTS *td_counts,
+ uint8_t allow_update_cdf) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+ for (int idy = 0; idy < mi_height; idy += bh) {
+ for (int idx = 0; idx < mi_width; idx += bw) {
+ update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
+ allow_update_cdf);
+ }
+ }
+}
+
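+// Mirrors the tree walk in update_txfm_count(), but only records the chosen
+// transform sizes and refreshes the above/left contexts; no counts or CDFs
+// are updated.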
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+ int blk_col) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+
+ } else {
+ if (tx_size == TX_8X8) {
+ mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+ return;
+ }
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+ for (int row = 0; row < row_end; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += bsw) {
+ const int offsetc = blk_col + col;
+ set_txfm_context(xd, sub_txs, offsetr, offsetc);
+ }
+ }
+ }
+}
+
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) {
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+ for (int idy = 0; idy < mi_height; idy += bh) {
+ for (int idx = 0; idx < mi_width; idx += bw) {
+ set_txfm_context(xd, max_tx_size, idy, idx);
+ }
+ }
+}
+
+static void update_zeromv_cnt(const AV1_COMP *const cpi,
+ const MB_MODE_INFO *const mi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ if (mi->ref_frame[0] != LAST_FRAME || !is_inter_block(mi) ||
+ mi->segment_id > CR_SEGMENT_ID_BOOST2) {
+ return;
+ }
+ const AV1_COMMON *const cm = &cpi->common;
+ const MV mv = mi->mv[0].as_mv;
+ const int bw = mi_size_wide[bsize] >> 1;
+ const int bh = mi_size_high[bsize] >> 1;
+ const int xmis = AOMMIN((cm->mi_params.mi_cols - mi_col) >> 1, bw);
+ const int ymis = AOMMIN((cm->mi_params.mi_rows - mi_row) >> 1, bh);
+ const int block_index =
+ (mi_row >> 1) * (cm->mi_params.mi_cols >> 1) + (mi_col >> 1);
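+  // Walk the 8x8 units covered by the block: near-zero motion (|mv| < 10 in
+  // 1/8-pel units, i.e. under 1.25 pixels) increments the saturating counter,
+  // anything larger resets it.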
+ for (int y = 0; y < ymis; y++) {
+ for (int x = 0; x < xmis; x++) {
+      // consec_zero_mv is stored at 8x8-block granularity.
+ const int map_offset = block_index + y * (cm->mi_params.mi_cols >> 1) + x;
+ if (abs(mv.row) < 10 && abs(mv.col) < 10) {
+ if (cpi->consec_zero_mv[map_offset] < 255)
+ cpi->consec_zero_mv[map_offset]++;
+ } else {
+ cpi->consec_zero_mv[map_offset] = 0;
+ }
+ }
+ }
+}
+
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TokenExtra **t, RUN_TYPE dry_run,
+ BLOCK_SIZE bsize, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO **mi_4x4 = xd->mi;
+ MB_MODE_INFO *mbmi = mi_4x4[0];
+ const int seg_skip =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ const int mis = cm->mi_params.mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int is_inter = is_inter_block(mbmi);
+
+ // Initialize tx_mode and tx_size_search_method
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ set_tx_size_search_method(
+ cm, &cpi->winner_mode_params, txfm_params,
+ cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ if (!is_inter) {
+ xd->cfl.store_y = store_cfl_required(cm, xd);
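+    // Start from skip_txfm = 1; coding a non-zero coefficient in any plane
+    // clears it.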
+ mbmi->skip_txfm = 1;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
+ cpi->optimize_seg_arr[mbmi->segment_id]);
+ }
+
+    // If there is at least one lossless segment, force the skip for intra
+    // blocks to be 0, to keep write_segment_id() from changing the
+    // segment_id.
+ if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
+ cpi->enc_seg.has_lossless_segment)
+ mbmi->skip_txfm = 0;
+
+ xd->cfl.store_y = 0;
+ if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+ for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
+ if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+ if (!dry_run) {
+ av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size,
+ PALETTE_MAP, tile_data->allow_update_cdf,
+ td->counts);
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+ *rate +=
+ av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP);
+ }
+ }
+ }
+ }
+
+ av1_update_intra_mb_txb_context(cpi, td, dry_run, bsize,
+ tile_data->allow_update_cdf);
+ } else {
+ int ref;
+ const int is_compound = has_second_ref(mbmi);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const YV12_BUFFER_CONFIG *cfg =
+ get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]);
+ assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
+ av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+ xd->block_ref_scale_factors[ref], num_planes);
+ }
+    // The predicted samples of an inter mode (for the luma plane) cannot be
+    // reused if the nonrd_check_partition_split speed feature is enabled,
+    // since in such cases the buffer may not contain the predicted samples of
+    // the best mode.
+ const int start_plane =
+ (x->reuse_inter_pred && (!cpi->sf.rt_sf.nonrd_check_partition_split) &&
+ cm->seq_params->bit_depth == AOM_BITS_8)
+ ? 1
+ : 0;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ start_plane, av1_num_planes(cm) - 1);
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ assert(cpi->oxcf.motion_mode_cfg.enable_obmc);
+ av1_build_obmc_inter_predictors_sb(cm, xd);
+ }
+
+#if CONFIG_MISMATCH_DEBUG
+ if (dry_run == OUTPUT_ENABLED) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ int pixel_c, pixel_r;
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+ pd->subsampling_x, pd->subsampling_y);
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ mismatch_record_block_pre(pd->dst.buf, pd->dst.stride,
+ cm->current_frame.order_hint, plane, pixel_c,
+ pixel_r, pd->width, pd->height,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+ }
+#else
+ (void)num_planes;
+#endif
+
+ av1_encode_sb(cpi, x, bsize, dry_run);
+ av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate,
+ tile_data->allow_update_cdf);
+ }
+
+ if (!dry_run) {
+ if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1;
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ !xd->lossless[mbmi->segment_id] && mbmi->bsize > BLOCK_4X4 &&
+ !(is_inter && (mbmi->skip_txfm || seg_skip))) {
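+      // The transform size was signalled for this block: update the
+      // corresponding counts/CDFs (recursive txfm partition for inter,
+      // depth-coded size for intra).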
+ if (is_inter) {
+ tx_partition_count_update(cm, x, bsize, td->counts,
+ tile_data->allow_update_cdf);
+ } else {
+ if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
+ ++x->txfm_search_info.txb_split_count;
+ if (block_signals_txsize(bsize)) {
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+ const int depth = tx_size_to_depth(mbmi->tx_size, bsize);
+ const int max_depths = bsize_to_max_depth(bsize);
+
+ if (tile_data->allow_update_cdf)
+ update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+ depth, max_depths + 1);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth];
+#endif
+ }
+ }
+ assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
+ } else {
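+      // The transform size is not signalled: derive it (lossless forces
+      // TX_4X4) and copy it to every mi unit covered by the block.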
+ int i, j;
+ TX_SIZE intra_tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter) {
+ if (xd->lossless[mbmi->segment_id]) {
+ intra_tx_size = TX_4X4;
+ } else {
+ intra_tx_size =
+ tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+ }
+ } else {
+ intra_tx_size = mbmi->tx_size;
+ }
+
+ const int cols = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_width);
+ const int rows = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_height);
+ for (j = 0; j < rows; j++) {
+ for (i = 0; i < cols; i++) mi_4x4[mis * j + i]->tx_size = intra_tx_size;
+ }
+
+ if (intra_tx_size != max_txsize_rect_lookup[bsize])
+ ++x->txfm_search_info.txb_split_count;
+ }
+ }
+
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ block_signals_txsize(mbmi->bsize) && is_inter &&
+ !(mbmi->skip_txfm || seg_skip) && !xd->lossless[mbmi->segment_id]) {
+ if (dry_run) tx_partition_set_contexts(cm, xd, bsize);
+ } else {
+ TX_SIZE tx_size = mbmi->tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter) {
+ if (xd->lossless[mbmi->segment_id]) {
+ tx_size = TX_4X4;
+ } else {
+ tx_size = tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+ }
+ } else {
+ tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
+ }
+ mbmi->tx_size = tx_size;
+ set_txfm_ctxs(tx_size, xd->width, xd->height,
+ (mbmi->skip_txfm || seg_skip) && is_inter_block(mbmi), xd);
+ }
+
+ if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) {
+ cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
+ }
+ if (!dry_run) {
+ if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->svc.temporal_layer_id == 0 &&
+ cpi->sf.rt_sf.use_temporal_noise_estimate &&
+ (!cpi->ppi->use_svc ||
+ (cpi->ppi->use_svc &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)))
+ update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize);
+ }
+}
+
+static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ AQ_MODE aq_mode, MB_MODE_INFO *mbmi) {
+ x->rdmult = cpi->rd.RDMULT;
+
+ if (aq_mode != NO_AQ) {
+ assert(mbmi != NULL);
+ if (aq_mode == VARIANCE_AQ) {
+ if (cpi->vaq_refresh) {
+ const int energy = bsize <= BLOCK_16X16
+ ? x->mb_energy
+ : av1_log_block_var(cpi, x, bsize);
+ mbmi->segment_id = energy;
+ }
+ x->rdmult = set_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == COMPLEXITY_AQ) {
+ x->rdmult = set_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+ // If segment is boosted, use rdmult for that segment.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+ x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+ }
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->common.delta_q_info.delta_q_present_flag &&
+ !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ x->rdmult = av1_get_cb_rdmult(cpi, x, bsize, mi_row, mi_col);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM) {
+ av1_set_ssim_rdmult(cpi, &x->errorperbit, bsize, mi_row, mi_col,
+ &x->rdmult);
+ }
+#if CONFIG_SALIENCY_MAP
+ else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP) {
+ av1_set_saliency_map_vmaf_rdmult(cpi, &x->errorperbit,
+ cpi->common.seq_params->sb_size, mi_row,
+ mi_col, &x->rdmult);
+ }
+#endif
+#if CONFIG_TUNE_VMAF
+ else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+ }
+#endif
+#if CONFIG_TUNE_BUTTERAUGLI
+ else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ av1_set_butteraugli_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+ }
+#endif
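+  // intra_sb_rdmult_modifier is a Q7 fixed-point scale factor, hence the
+  // right shift by 7.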
+ if (cpi->oxcf.mode == ALLINTRA) {
+ x->rdmult = (int)(((int64_t)x->rdmult * x->intra_sb_rdmult_modifier) >> 7);
+ }
+
+ // Check to make sure that the adjustments above have not caused the
+ // rd multiplier to be truncated to 0.
+ x->rdmult = (x->rdmult > 0) ? x->rdmult : 1;
+}
+
+void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+
+ set_entropy_context(xd, mi_row, mi_col, num_planes);
+ xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Set up destination pointers.
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+ num_planes);
+
+ // Set up limit values for MV components.
+  // MVs beyond the range do not produce new/different prediction blocks.
+ av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+ mi_width, cpi->oxcf.border_in_pixels);
+
+ set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+ cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+
+ // Set up source buffers.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+
+  // Required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs().
+ xd->tile = *tile;
+}
+
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+
+ av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+ // Setup segment ID.
+ mbmi = xd->mi[0];
+ mbmi->segment_id = 0;
+ if (seg->enabled) {
+    if (!cpi->vaq_refresh) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+ mbmi->segment_id =
+ map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0;
+ }
+ av1_init_plane_quantizers(cpi, x, mbmi->segment_id, 0);
+ }
+#ifndef NDEBUG
+ x->last_set_offsets_loc.mi_row = mi_row;
+ x->last_set_offsets_loc.mi_col = mi_col;
+ x->last_set_offsets_loc.bsize = bsize;
+#endif // NDEBUG
+}
+
+/*!\brief Hybrid intra mode search.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This is the top-level function for mode search on intra frames in the
+ * non-RD optimized case. Depending on the speed feature settings and the
+ * block size, it calls either the non-RD or the RD-optimized intra mode
+ * search.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the data for
+ * the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold a snapshot of the coding context
+ * during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+
+static AOM_INLINE void hybrid_intra_mode_search(AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ RD_STATS *rd_cost,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ int use_rdopt = 0;
+ const int hybrid_intra_pickmode = cpi->sf.rt_sf.hybrid_intra_pickmode;
+ // Use rd pick for intra mode search based on block size and variance.
+ if (hybrid_intra_pickmode && bsize < BLOCK_16X16) {
+ unsigned int var_thresh[3] = { 0, 101, 201 };
+ assert(hybrid_intra_pickmode <= 3);
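+    // Higher hybrid_intra_pickmode levels demand higher source variance
+    // before switching from the non-RD to the RD intra search.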
+ if (x->source_variance >= var_thresh[hybrid_intra_pickmode - 1])
+ use_rdopt = 1;
+ }
+
+ if (use_rdopt)
+ av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+ else
+ av1_nonrd_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+}
+
+// For real-time/allintra row-mt enabled multi-threaded encoding with the cost
+// update frequency set to COST_UPD_TILE/COST_UPD_OFF, the tile context is not
+// updated at the superblock level. Thus, the encoding of the top-right
+// superblock need not be complete before the tile context is updated. However,
+// when encoding a block whose right edge is also the superblock edge, intra
+// and inter mode evaluation (ref mv list population) does require the encoding
+// of the top-right superblock to be complete. So we delay the thread wait
+// until the data from the top-right superblock region is actually needed.
+static AOM_INLINE void wait_for_top_right_sb(
+ AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync,
+ TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ const int sb_size_in_mi = mi_size_wide[sb_size];
+ const int bw_in_mi = mi_size_wide[bsize];
+ const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1);
+ const int blk_col_in_sb = mi_col & (sb_size_in_mi - 1);
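+  // The block touches the top-right superblock region only if it sits in the
+  // top row of its superblock and its right edge reaches the superblock
+  // boundary.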
+ const int top_right_block_in_sb =
+ (blk_row_in_sb == 0) && (blk_col_in_sb + bw_in_mi >= sb_size_in_mi);
+
+  // Don't wait if the block is not the top-right block in the superblock.
+ if (!top_right_block_in_sb) return;
+
+ // Wait for the top-right superblock to finish encoding.
+ const int sb_row_in_tile =
+ (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2;
+ const int sb_col_in_tile =
+ (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2;
+
+ enc_row_mt->sync_read_ptr(row_mt_sync, sb_row_in_tile, sb_col_in_tile);
+}
+
+/*!\brief Interface for AV1 mode search for an individual coding block
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level interface that
+ * directs the encoder to the proper mode search function, among these
+ * implemented for inter/intra + rd/non-rd + non-skip segment/skip segment.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding
+ * \param[in] x Pointer to structure holding all the data for
+ * the current macroblock
+ * \param[in] mi_row Row coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] rd_cost Pointer to structure holding rate and distortion
+ * stats for the current block
+ * \param[in] partition Partition mode of the parent block
+ * \param[in] bsize Current block size
+ * \param[in] ctx Pointer to structure holding coding contexts and
+ * chosen modes for the current block
+ * \param[in] best_rd Upper bound of rd cost of a valid partition
+ *
+ * \remark Nothing is returned. Instead, the chosen modes and contexts
+ * necessary for reconstruction are stored in ctx, and the rate-distortion
+ * stats are stored in rd_cost. If no valid mode leads to rd_cost <= best_rd,
+ * the status is signalled by an INT64_MAX rd_cost->rdcost.
+ */
+static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, PARTITION_TYPE partition,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ RD_STATS best_rd) {
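+  // A negative best_rd.rdcost indicates the partition has already been
+  // pruned; mark the stats invalid and return early.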
+ if (cpi->sf.part_sf.use_best_rd_for_pruning && best_rd.rdcost < 0) {
+ ctx->rd_stats.rdcost = INT64_MAX;
+ ctx->rd_stats.skip_txfm = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+
+ av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+
+ if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab &&
+ ctx->rd_mode_is_ready) {
+ assert(ctx->mic.bsize == bsize);
+ assert(ctx->mic.partition == partition);
+ rd_cost->rate = ctx->rd_stats.rate;
+ rd_cost->dist = ctx->rd_stats.dist;
+ rd_cost->rdcost = ctx->rd_stats.rdcost;
+ return;
+ }
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ int i;
+
+ // This is only needed for real time/allintra row-mt enabled multi-threaded
+ // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+ wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+ &tile_data->tile_info, cm->seq_params->sb_size,
+ cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_sb_modes_time);
+#endif
+
+ mbmi = xd->mi[0];
+ mbmi->bsize = bsize;
+ mbmi->partition = partition;
+
+#if CONFIG_RD_DEBUG
+ mbmi->mi_row = mi_row;
+ mbmi->mi_col = mi_col;
+#endif
+
+ // Sets up the tx_type_map buffer in MACROBLOCKD.
+ xd->tx_type_map = txfm_info->tx_type_map_;
+ xd->tx_type_map_stride = mi_size_wide[bsize];
+
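+  // Point the per-plane coefficient buffers at the scratch storage owned by
+  // this PICK_MODE_CONTEXT so the winning mode's coefficients survive the
+  // search.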
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ p[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+
+ ctx->skippable = 0;
+  // Set to zero to make sure we do not use the previously encoded frame's
+  // stats.
+ mbmi->skip_txfm = 0;
+ // Reset skip mode flag.
+ mbmi->skip_mode = 0;
+
+ x->source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+ // Set error per bit for current rdmult
+ av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+ av1_rd_cost_update(x->rdmult, &best_rd);
+
+  // Setting best_rd.rdcost to INT64_MAX keeps the encoder from using any
+  // previous rdcost information in the following mode search.
+  // Disabling the feature may yield some coding gain at the cost of encoder
+  // slowdown.
+ if (!cpi->sf.part_sf.use_best_rd_for_pruning) {
+ av1_invalid_rd_stats(&best_rd);
+ }
+
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+ av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+ } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx, best_rd.rdcost);
+ } else {
+ av1_rd_pick_inter_mode(cpi, tile_data, x, rd_cost, bsize, ctx,
+ best_rd.rdcost);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+ }
+
+  // Examine the resulting rate and, for AQ mode 2 (COMPLEXITY_AQ), make a
+  // segment choice.
+ if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ &&
+ bsize >= BLOCK_16X16) {
+ av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+ }
+
+ x->rdmult = orig_rdmult;
+
+  // TODO(jingning) The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handling.
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
+
+ ctx->rd_stats.rate = rd_cost->rate;
+ ctx->rd_stats.dist = rd_cost->dist;
+ ctx->rd_stats.rdcost = rd_cost->rdcost;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_sb_modes_time);
+#endif
+}
+
+static void update_stats(const AV1_COMMON *const cm, ThreadData *td) {
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+
+ if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active &&
+ is_comp_ref_allowed(bsize)) {
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+#if CONFIG_ENTROPY_STATS
+ td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
+#endif
+ update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
+ }
+
+ if (!mbmi->skip_mode && !seg_ref_active) {
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+#if CONFIG_ENTROPY_STATS
+ td->counts->skip_txfm[skip_ctx][mbmi->skip_txfm]++;
+#endif
+ update_cdf(fc->skip_txfm_cdfs[skip_ctx], mbmi->skip_txfm, 2);
+ }
+
+#if CONFIG_ENTROPY_STATS
+ // delta quant applies to both intra and inter
+ const int super_block_upper_left =
+ ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+ ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0);
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ if (delta_q_info->delta_q_present_flag &&
+ (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) &&
+ super_block_upper_left) {
+ const int dq = (mbmi->current_qindex - xd->current_base_qindex) /
+ delta_q_info->delta_q_res;
+ const int absdq = abs(dq);
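+    // |dq| is coded as truncated unary: one "1" bin per step, terminated by
+    // a "0" bin when absdq < DELTA_Q_SMALL.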
+ for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
+ td->counts->delta_q[i][1]++;
+ }
+ if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
+ if (delta_q_info->delta_lf_present_flag) {
+ if (delta_q_info->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
+ delta_q_info->delta_lf_res;
+ const int abs_delta_lf = abs(delta_lf);
+ for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf_multi[lf_id][i][1]++;
+ }
+ if (abs_delta_lf < DELTA_LF_SMALL)
+ td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
+ }
+ } else {
+ const int delta_lf =
+ (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+ delta_q_info->delta_lf_res;
+ const int abs_delta_lf = abs(delta_lf);
+ for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf[i][1]++;
+ }
+ if (abs_delta_lf < DELTA_LF_SMALL)
+ td->counts->delta_lf[abs_delta_lf][0]++;
+ }
+ }
+ }
+#endif
+
+ if (!is_inter_block(mbmi)) {
+ av1_sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
+ frame_is_intra_only(cm));
+ }
+
+ if (av1_allow_intrabc(cm)) {
+ const int is_intrabc = is_intrabc_block(mbmi);
+ update_cdf(fc->intrabc_cdf, is_intrabc, 2);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->intrabc[is_intrabc];
+#endif // CONFIG_ENTROPY_STATS
+ if (is_intrabc) {
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ const int_mv dv_ref = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+ av1_update_mv_stats(&mbmi->mv[0].as_mv, &dv_ref.as_mv, &fc->ndvc,
+ MV_SUBPEL_NONE);
+ }
+ }
+
+ if (frame_is_intra_only(cm) || mbmi->skip_mode) return;
+
+ FRAME_COUNTS *const counts = td->counts;
+ const int inter_block = is_inter_block(mbmi);
+
+ if (!seg_ref_active) {
+#if CONFIG_ENTROPY_STATS
+ counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+#endif
+ update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
+ inter_block, 2);
+    // If the segment reference feature is enabled, only a single reference
+    // frame is allowed for the segment, so exclude it from the reference
+    // frame counts used to work out probabilities.
+ if (inter_block) {
+ const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+ if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
+ if (is_comp_ref_allowed(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->comp_inter[av1_get_reference_mode_context(xd)]
+ [has_second_ref(mbmi)]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2);
+ }
+ }
+
+ if (has_second_ref(mbmi)) {
+ const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+ ? UNIDIR_COMP_REFERENCE
+ : BIDIR_COMP_REFERENCE;
+ update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
+ COMP_REFERENCE_TYPES);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
+ [comp_ref_type]++;
+#endif // CONFIG_ENTROPY_STATS
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ const int bit = (ref0 == BWDREF_FRAME);
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts
+ ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit) {
+ const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
+ [bit1]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (bit1) {
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
+ ref1 == GOLDEN_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2]
+ [ref1 == GOLDEN_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ } else {
+ const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+ update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit) {
+ update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
+ [ref0 == LAST2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
+ [ref0 == GOLDEN_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
+ [ref1 == ALTREF_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (ref1 != ALTREF_FRAME) {
+ update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
+ ref1 == ALTREF2_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
+ [ref1 == ALTREF2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ } else {
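+        // Single reference: signalled with a binary tree whose first bin
+        // separates {LAST, LAST2, LAST3, GOLDEN} from {BWDREF, ALTREF2,
+        // ALTREF}.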
+ const int bit = (ref0 >= BWDREF_FRAME);
+ update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (bit) {
+ assert(ref0 <= ALTREF_FRAME);
+ update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+ [ref0 == ALTREF_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (ref0 != ALTREF_FRAME) {
+ update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
+ ref0 == ALTREF2_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
+ [ref0 == ALTREF2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ } else {
+ const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+ update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit1) {
+ update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
+ [ref0 != LAST_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
+ [ref0 != LAST3_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ }
+
+ if (cm->seq_params->enable_interintra_compound &&
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+#if CONFIG_ENTROPY_STATS
+ counts->interintra[bsize_group][1]++;
+#endif
+ update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+#endif
+ update_cdf(fc->interintra_mode_cdf[bsize_group],
+ mbmi->interintra_mode, INTERINTRA_MODES);
+ if (av1_is_wedge_used(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+#endif
+ update_cdf(fc->wedge_interintra_cdf[bsize],
+ mbmi->use_wedge_interintra, 2);
+ if (mbmi->use_wedge_interintra) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
+#endif
+ update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index,
+ 16);
+ }
+ }
+ } else {
+#if CONFIG_ENTROPY_STATS
+ counts->interintra[bsize_group][0]++;
+#endif
+ update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
+ }
+ }
+
+ const MOTION_MODE motion_allowed =
+ cm->features.switchable_motion_mode
+ ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->features.allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+ if (mbmi->ref_frame[1] != INTRA_FRAME) {
+ if (motion_allowed == WARPED_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+ counts->motion_mode[bsize][mbmi->motion_mode]++;
+#endif
+ update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
+ MOTION_MODES);
+ } else if (motion_allowed == OBMC_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+ counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+#endif
+ update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2);
+ }
+ }
+
+ if (has_second_ref(mbmi)) {
+ assert(current_frame->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params->enable_masked_compound;
+ if (masked_compound_used) {
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
+#endif
+ update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
+ mbmi->comp_group_idx, 2);
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
+#endif
+ update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx,
+ 2);
+ } else {
+ assert(masked_compound_used);
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->compound_type[bsize][mbmi->interinter_comp.type -
+ COMPOUND_WEDGE];
+#endif
+ update_cdf(fc->compound_type_cdf[bsize],
+ mbmi->interinter_comp.type - COMPOUND_WEDGE,
+ MASKED_COMPOUND_TYPES);
+ }
+ }
+ }
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
+#endif
+ update_cdf(fc->wedge_idx_cdf[bsize],
+ mbmi->interinter_comp.wedge_index, 16);
+ }
+ }
+ }
+ }
+
+ if (inter_block && cm->features.interp_filter == SWITCHABLE &&
+ av1_is_interp_needed(xd)) {
+ update_filter_type_cdf(xd, mbmi, cm->seq_params->enable_dual_filter);
+ }
+ if (inter_block &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+ if (has_second_ref(mbmi)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+#endif
+ update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
+ INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
+ } else {
+ av1_update_inter_mode_stats(fc, counts, mode, mode_ctx);
+ }
+
+ const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
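+    // The DRL (dynamic reference list) index is coded as a series of binary
+    // decisions: for each candidate slot, signal whether the chosen
+    // ref_mv_idx lies beyond it.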
+ if (new_mv) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ for (int idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ const uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+#endif
+ if (mbmi->ref_mv_idx == idx) break;
+ }
+ }
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ for (int idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ const uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+#endif
+ if (mbmi->ref_mv_idx == idx - 1) break;
+ }
+ }
+ }
+ if (have_newmv_in_inter_mode(mbmi->mode)) {
+ const int allow_hp = cm->features.cur_frame_force_integer_mv
+ ? MV_SUBPEL_NONE
+ : cm->features.allow_high_precision_mv;
+ if (new_mv) {
+ for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+ allow_hp);
+ }
+ } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) {
+ const int ref = 1;
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+ allow_hp);
+ } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) {
+ const int ref = 0;
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+ allow_hp);
+ }
+ }
+ }
+}
+
+/*!\brief Reconstructs an individual coding block
+ *
+ * \ingroup partition_search
+ * Reconstructs an individual coding block by applying the chosen modes stored
+ * in ctx; it also updates the mode counts and entropy models.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during encoding
+ * \param[in] td Pointer to thread data
+ * \param[in] tp Pointer to the starting token
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
+ * \param[in] bsize Current block size
+ * \param[in] partition Partition mode of the parent block
+ * \param[in] ctx Pointer to structure holding coding contexts and the
+ * chosen modes for the current block
+ * \param[in] rate Pointer to the total rate for the current block
+ *
+ * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * will be updated in the pixel buffers in td->mb.e_mbd. Also, the chosen modes
+ * will be stored in the MB_MODE_INFO buffer td->mb.e_mbd.mi[0].
+ */
+static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TokenExtra **tp, int mi_row, int mi_col,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PARTITION_TYPE partition, PICK_MODE_CONTEXT *const ctx,
+ int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int subsampling_x = cm->seq_params->subsampling_x;
+ const int subsampling_y = cm->seq_params->subsampling_y;
+
+ av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ const int origin_mult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ mbmi->partition = partition;
+ av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+
+ if (!dry_run) {
+ set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+ x->cb_offset[PLANE_TYPE_UV]);
+ assert(x->cb_offset[PLANE_TYPE_Y] <
+ (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]));
+ assert(x->cb_offset[PLANE_TYPE_UV] <
+ ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >>
+ (subsampling_x + subsampling_y)));
+ }
+
+ encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+
+ if (!dry_run) {
+ update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
+ if (bsize == cpi->common.seq_params->sb_size && mbmi->skip_txfm == 1 &&
+ cm->delta_q_info.delta_lf_present_flag) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+ mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
+ mbmi->delta_lf_from_base = xd->delta_lf_from_base;
+ }
+ if (has_second_ref(mbmi)) {
+ if (mbmi->compound_idx == 0 ||
+ mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+ mbmi->comp_group_idx = 0;
+ else
+ mbmi->comp_group_idx = 1;
+ }
+
+ // delta quant applies to both intra and inter
+ const int super_block_upper_left =
+ ((mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+ ((mi_col & (cm->seq_params->mib_size - 1)) == 0);
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ if (delta_q_info->delta_q_present_flag &&
+ (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) &&
+ super_block_upper_left) {
+ xd->current_base_qindex = mbmi->current_qindex;
+ if (delta_q_info->delta_lf_present_flag) {
+ if (delta_q_info->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+ }
+ } else {
+ xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+ }
+ }
+ }
+
+ RD_COUNTS *rdc = &td->rd_counts;
+ if (mbmi->skip_mode) {
+ assert(!frame_is_intra_only(cm));
+ rdc->skip_mode_used_flag = 1;
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+ assert(has_second_ref(mbmi));
+ rdc->compound_ref_used_flag = 1;
+ }
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ } else {
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+ if (!seg_ref_active) {
+        // If the segment reference feature is enabled, only a single
+        // reference frame is allowed for the segment, so exclude it from the
+        // reference frame counts used to work out probabilities.
+ if (is_inter_block(mbmi)) {
+ av1_collect_neighbors_ref_counts(xd);
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+ if (has_second_ref(mbmi)) {
+ // This flag is also updated for 4x4 blocks
+ rdc->compound_ref_used_flag = 1;
+ }
+ }
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ }
+ }
+ }
+
+ if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+
+ // Gather obmc and warped motion count to update the probability.
+ if ((cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) ||
+ (cm->features.allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) {
+ const int inter_block = is_inter_block(mbmi);
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+ if (!seg_ref_active && inter_block) {
+ const MOTION_MODE motion_allowed =
+ cm->features.switchable_motion_mode
+ ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->features.allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+
+ if (mbmi->ref_frame[1] != INTRA_FRAME) {
+ if (motion_allowed >= OBMC_CAUSAL) {
+ td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+ }
+ if (motion_allowed == WARPED_CAUSAL) {
+ td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++;
+ }
+ }
+ }
+ }
+ }
+ // TODO(Ravi/Remya): Move this copy function to a better logical place
+ // This function will copy the best mode information from block
+ // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
+ // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
+ // bitstream preparation.
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+ x->rdmult = origin_mult;
+}
+
+/*!\brief Reconstructs a partition (may contain multiple coding blocks)
+ *
+ * \ingroup partition_search
+ * Reconstructs a sub-partition of the superblock by applying the chosen modes
+ * and partition trees stored in pc_tree.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] td Pointer to thread data
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during encoding
+ * \param[in] tp Pointer to the starting token
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
+ * \param[in] bsize Current block size
+ * \param[in] pc_tree Pointer to the PC_TREE node storing the picked
+ * partitions and mode info for the current block
+ * \param[in] rate Pointer to the total rate for the current block
+ *
+ * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * will be updated in the pixel buffers in td->mb.e_mbd.
+ */
+static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree, int *rate) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int is_partition_root = bsize >= BLOCK_8X8;
+ const int ctx = is_partition_root
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : -1;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+#if !CONFIG_REALTIME_ONLY
+ int quarter_step = mi_size_wide[bsize] / 4;
+ int i;
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+ if (subsize == BLOCK_INVALID) return;
+
+ if (!dry_run && ctx >= 0) {
+ const int has_rows = (mi_row + hbs) < mi_params->mi_rows;
+ const int has_cols = (mi_col + hbs) < mi_params->mi_cols;
+
+ if (has_rows && has_cols) {
+#if CONFIG_ENTROPY_STATS
+ td->counts->partition[ctx][partition]++;
+#endif
+
+ if (tile_data->allow_update_cdf) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ update_cdf(fc->partition_cdf[ctx], partition,
+ partition_cdf_length(bsize));
+ }
+ }
+ }
+
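+  // Encode each sub-block according to the picked partition type; the AB and
+  // 4-way partition types are only compiled in for non-realtime builds.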
+ switch (partition) {
+ case PARTITION_NONE:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->none, rate);
+ break;
+ case PARTITION_VERT:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->vertical[0], rate);
+ if (mi_col + hbs < mi_params->mi_cols) {
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, pc_tree->vertical[1], rate);
+ }
+ break;
+ case PARTITION_HORZ:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->horizontal[0], rate);
+ if (mi_row + hbs < mi_params->mi_rows) {
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, pc_tree->horizontal[1], rate);
+ }
+ break;
+ case PARTITION_SPLIT:
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+ pc_tree->split[0], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ pc_tree->split[1], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ pc_tree->split[2], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ subsize, pc_tree->split[3], rate);
+ break;
+
+#if !CONFIG_REALTIME_ONLY
+ case PARTITION_HORZ_A:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+ partition, pc_tree->horizontala[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, pc_tree->horizontala[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, pc_tree->horizontala[2], rate);
+ break;
+ case PARTITION_HORZ_B:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->horizontalb[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, pc_tree->horizontalb[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ bsize2, partition, pc_tree->horizontalb[2], rate);
+ break;
+ case PARTITION_VERT_A:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+ partition, pc_tree->verticala[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, pc_tree->verticala[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, pc_tree->verticala[2], rate);
+
+ break;
+ case PARTITION_VERT_B:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->verticalb[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, pc_tree->verticalb[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ bsize2, partition, pc_tree->verticalb[2], rate);
+ break;
+ case PARTITION_HORZ_4:
+ for (i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
+
+ encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->horizontal4[i], rate);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
+ encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
+ partition, pc_tree->vertical4[i], rate);
+ }
+ break;
+#endif
+ default: assert(0 && "Invalid partition type."); break;
+ }
+
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+static AOM_INLINE int is_adjust_var_based_part_enabled(
+ AV1_COMMON *const cm, const PARTITION_SPEED_FEATURES *const part_sf,
+ BLOCK_SIZE bsize) {
+ if (part_sf->partition_search_type != VAR_BASED_PARTITION) return 0;
+ if (part_sf->adjust_var_based_rd_partitioning == 0 ||
+ part_sf->adjust_var_based_rd_partitioning > 2)
+ return 0;
+
+ if (bsize <= BLOCK_32X32) return 1;
+ if (part_sf->adjust_var_based_rd_partitioning == 2) {
+ const int is_larger_qindex = cm->quant_params.base_qindex > 190;
+ const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
+ return is_360p_or_larger && is_larger_qindex && bsize == BLOCK_64X64;
+ }
+ return 0;
+}
+
+/*!\brief AV1 block partition search (partition estimation and partial search).
+ *
+ * \ingroup partition_search
+ * Encode the block by applying pre-calculated partition patterns that are
+ * represented by coding block sizes stored in the mbmi array. Minor partition
+ * adjustments are tested and applied if they lead to lower rd costs. The
+ * partition types are limited to a basic set: none, horz, vert, and split.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] td Pointer to thread data
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during encoding
+ * \param[in] mib Array representing MB_MODE_INFO pointers for mi
+ * blocks starting from the first pixel of the current
+ * block
+ * \param[in] tp Pointer to the starting token
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] bsize Current block size
+ * \param[in] rate Pointer to the final rate for encoding the current
+ * block
+ * \param[in] dist Pointer to the final distortion of the current block
+ * \param[in] do_recon Whether the reconstruction function needs to be run,
+ * either for finalizing a superblock or providing a
+ * reference for future sub-partitions
+ * \param[in] pc_tree Pointer to the PC_TREE node holding the picked
+ * partitions and mode info for the current block
+ *
+ * \remark Nothing is returned. The pc_tree struct is modified to store the
+ * picked partition and modes. The rate and dist are also updated with those
+ * corresponding to the best partition found.
+ */
+void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+ MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate,
+ int64_t *dist, int do_recon, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+ const PARTITION_TYPE partition =
+ (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc;
+ BLOCK_SIZE bs_type = mib[0]->bsize;
+ int use_partition_none = 0;
+ x->try_merge_partition = 0;
+
+ if (pc_tree->none == NULL) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ PICK_MODE_CONTEXT *ctx_none = pc_tree->none;
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+ // In rt mode, currently the min partition size is BLOCK_8X8.
+ assert(bsize >= cpi->sf.part_sf.default_min_partition_size);
+
+ av1_invalid_rd_stats(&last_part_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_invalid_rd_stats(&chosen_rdc);
+ av1_invalid_rd_stats(&invalid_rdc);
+
+ pc_tree->partitioning = partition;
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+ }
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
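+  // For variance-based partitioning, re-evaluate PARTITION_NONE as a merge
+  // of the pre-computed partition before encoding it as-is.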
+ if (partition != PARTITION_NONE &&
+ is_adjust_var_based_part_enabled(cm, &cpi->sf.part_sf, bsize) &&
+ (mi_row + hbs < mi_params->mi_rows &&
+ mi_col + hbs < mi_params->mi_cols)) {
+ assert(bsize > cpi->sf.part_sf.default_min_partition_size);
+ mib[0]->bsize = bsize;
+ pc_tree->partitioning = PARTITION_NONE;
+ x->try_merge_partition = 1;
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, PARTITION_NONE,
+ bsize, ctx_none, invalid_rdc);
+
+ if (none_rdc.rate < INT_MAX) {
+ none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ }
+
+ // Try to skip split partition evaluation based on none partition
+ // characteristics.
+ if (none_rdc.rate < INT_MAX && none_rdc.skip_txfm == 1) {
+ use_partition_none = 1;
+ }
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ mib[0]->bsize = bs_type;
+ pc_tree->partitioning = partition;
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+ switch (partition) {
+ case PARTITION_NONE:
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_NONE, bsize, ctx_none, invalid_rdc);
+ break;
+ case PARTITION_HORZ:
+ if (use_partition_none) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ pc_tree->horizontal[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->horizontal[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_HORZ, subsize, pc_tree->horizontal[0],
+ invalid_rdc);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_row + hbs < mi_params->mi_rows) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_h = pc_tree->horizontal[0];
+ av1_init_rd_stats(&tmp_rdc);
+ av1_update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+ NULL);
+ pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+ PARTITION_HORZ, subsize, pc_tree->horizontal[1],
+ invalid_rdc);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_VERT:
+ if (use_partition_none) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ pc_tree->vertical[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->vertical[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_col + hbs < mi_params->mi_cols) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_v = pc_tree->vertical[0];
+ av1_init_rd_stats(&tmp_rdc);
+ av1_update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+ NULL);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+ PARTITION_VERT, subsize,
+ pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (use_partition_none) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+
+ last_part_rdc.rate = 0;
+ last_part_rdc.dist = 0;
+ last_part_rdc.rdcost = 0;
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
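+        // Sub-block i covers quadrant (row i >> 1, col i & 1) of the parent;
+        // x_idx/y_idx are the corresponding offsets in mi units, and jj/ii
+        // index the same quadrant within the mib grid.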
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ RD_STATS tmp_rdc;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+
+ av1_init_rd_stats(&tmp_rdc);
+ av1_rd_use_partition(
+ cpi, td, tile_data,
+ mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, i != (SUB_PARTITIONS_SPLIT - 1), pc_tree->split[i]);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ }
+ break;
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");
+ default: assert(0); break;
+ }
+
+ if (last_part_rdc.rate < INT_MAX) {
+ last_part_rdc.rate += mode_costs->partition_cost[pl][partition];
+ last_part_rdc.rdcost =
+ RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
+ }
+
+ if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
+ cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) &&
+ partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
+ (mi_row + bs < mi_params->mi_rows ||
+ mi_row + hbs == mi_params->mi_rows) &&
+ (mi_col + bs < mi_params->mi_cols ||
+ mi_col + hbs == mi_params->mi_cols)) {
+ BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ chosen_rdc.rate = 0;
+ chosen_rdc.dist = 0;
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ // Split partition.
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ RD_STATS tmp_rdc;
+
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ if (pc_tree->split[i]->none == NULL)
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, split_subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+ PARTITION_SPLIT, split_subsize, pc_tree->split[i]->none,
+ invalid_rdc);
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&chosen_rdc);
+ break;
+ }
+
+ chosen_rdc.rate += tmp_rdc.rate;
+ chosen_rdc.dist += tmp_rdc.dist;
+
+ if (i != SUB_PARTITIONS_SPLIT - 1)
+ encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+ OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
+
+ chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ }
+ if (chosen_rdc.rate < INT_MAX) {
+ chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
+ }
+ }
+
+  // If last_part is better, set the partitioning to that.
+ if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+ mib[0]->bsize = bs_type;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+
+ chosen_rdc = last_part_rdc;
+ }
+  // If none was better, set the partitioning to that.
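+  // A small bias (rdcost >> 9, roughly 0.2% of the cost) is applied in favor
+  // of PARTITION_NONE so that near-ties keep the larger block.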
+ if (none_rdc.rdcost < INT64_MAX &&
+ none_rdc.rdcost - (none_rdc.rdcost >> 9) < chosen_rdc.rdcost) {
+ mib[0]->bsize = bsize;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ chosen_rdc = none_rdc;
+ }
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ // We must have chosen a partitioning and encoding or we'll fail later on.
+ // No other opportunities for success.
+ if (bsize == cm->seq_params->sb_size)
+ assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_time);
+#endif
+ if (do_recon) {
+ if (bsize == cm->seq_params->sb_size) {
+ // NOTE: To get estimate for rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_time);
+#endif
+
+ *rate = chosen_rdc.rate;
+ *dist = chosen_rdc.dist;
+ x->rdmult = orig_rdmult;
+}
+
+static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TokenExtra **tp, int mi_row,
+ int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PARTITION_TYPE partition,
+ PICK_MODE_CONTEXT *const ctx, int *rate) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
+#endif
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ const int origin_mult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ mbmi->partition = partition;
+ av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+ const int subsampling_x = cpi->common.seq_params->subsampling_x;
+ const int subsampling_y = cpi->common.seq_params->subsampling_y;
+ if (!dry_run) {
+ set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+ x->cb_offset[PLANE_TYPE_UV]);
+ assert(x->cb_offset[PLANE_TYPE_Y] <
+ (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]));
+ assert(x->cb_offset[PLANE_TYPE_UV] <
+ ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >>
+ (subsampling_x + subsampling_y)));
+ }
+
+ encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+ if (!dry_run) {
+ update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
+ if (has_second_ref(mbmi)) {
+ if (mbmi->compound_idx == 0 ||
+ mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+ mbmi->comp_group_idx = 0;
+ else
+ mbmi->comp_group_idx = 1;
+ mbmi->compound_idx = 1;
+ }
+ RD_COUNTS *const rdc = &td->rd_counts;
+ if (mbmi->skip_mode) {
+ assert(!frame_is_intra_only(cm));
+ rdc->skip_mode_used_flag = 1;
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+ has_second_ref(mbmi)) {
+ rdc->compound_ref_used_flag = 1;
+ }
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ } else {
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+ if (!seg_ref_active) {
+        // If the segment reference feature is enabled, only a single
+        // reference frame is allowed for the segment, so exclude it from
+        // the reference frame counts used to work out probabilities.
+ if (is_inter_block(mbmi)) {
+ av1_collect_neighbors_ref_counts(xd);
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+ has_second_ref(mbmi)) {
+ // This flag is also updated for 4x4 blocks
+ rdc->compound_ref_used_flag = 1;
+ }
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ }
+ }
+ }
+ if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY &&
+ (mbmi->mode == NEWMV || mbmi->mode < INTRA_MODE_END)) {
+ int32_t blocks = mi_size_high[bsize] * mi_size_wide[bsize];
+ rdc->newmv_or_intra_blocks += blocks;
+ }
+ if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+ }
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm &&
+ !cpi->rc.rtc_external_ratectrl && cm->seg.enabled)
+ av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize, dry_run);
+ // TODO(Ravi/Remya): Move this copy function to a better logical place
+ // This function will copy the best mode information from block
+ // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
+ // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
+ // bitstream preparation.
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+ x->rdmult = origin_mult;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
+#endif
+}
+
+static int get_force_zeromv_skip_flag_for_blk(const AV1_COMP *cpi,
+ const MACROBLOCK *x,
+ BLOCK_SIZE bsize) {
+ // Force zero MV skip based on SB level decision
+ if (x->force_zeromv_skip_for_sb < 2) return x->force_zeromv_skip_for_sb;
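+  // A value of 2 means the superblock-level decision was deferred;
+  // re-evaluate at block level below using per-plane zero-mv SAD thresholds.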
+
+  // For blocks whose size equals the superblock size, the decision has
+  // already been made at the superblock level, so the zeromv-skip decision
+  // is skipped here.
+ const AV1_COMMON *const cm = &cpi->common;
+ if (bsize == cm->seq_params->sb_size) return 0;
+
+ const int num_planes = av1_num_planes(cm);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const unsigned int thresh_exit_part_y =
+ cpi->zeromv_skip_thresh_exit_part[bsize];
+ const unsigned int thresh_exit_part_uv =
+ CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y);
+ const unsigned int thresh_exit_part[MAX_MB_PLANE] = { thresh_exit_part_y,
+ thresh_exit_part_uv,
+ thresh_exit_part_uv };
+ const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, LAST_FRAME);
+
+ struct buf_2d yv12_mb[MAX_MB_PLANE];
+ av1_setup_pred_block(xd, yv12_mb, yv12, sf, sf, num_planes);
+
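+  // Compare the source block against the co-located LAST_FRAME block (i.e.,
+  // the zero-mv prediction). Force zeromv-skip only if every plane's SAD is
+  // below its threshold.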
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride);
+ assert(plane < MAX_MB_PLANE);
+ if (plane_sad >= thresh_exit_part[plane]) return 0;
+ }
+ return 1;
+}
+
+/*!\brief Top level function to pick block mode for non-RD optimized case
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level function that is
+ * used for non-RD optimized mode search (controlled by
+ * \c cpi->sf.rt_sf.use_nonrd_pick_mode). Depending on frame type it calls
+ * inter/skip/hybrid-intra mode search functions
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding
+ * \param[in] x Pointer to structure holding all the data for
+ * the current macroblock
+ * \param[in] mi_row Row coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] rd_cost Pointer to structure holding rate and distortion
+ * stats for the current block
+ * \param[in] bsize Current block size
+ * \param[in] ctx Pointer to structure holding coding contexts and
+ * chosen modes for the current block
+ *
+ * \remark Nothing is returned. Instead, the chosen modes and contexts necessary
+ * for reconstruction are stored in ctx, and the rate-distortion stats are
+ * stored in rd_cost. If no valid mode leading to rd_cost <= best_rd is found,
+ * this is signalled by an INT64_MAX rd_cost->rdcost.
+ */
+static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ // For nonrd mode, av1_set_offsets is already called at the superblock level
+ // in encode_nonrd_sb when we determine the partitioning.
+ if (bsize != cpi->common.seq_params->sb_size ||
+ cpi->sf.rt_sf.nonrd_check_partition_split == 1) {
+ av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+ }
+ assert(x->last_set_offsets_loc.mi_row == mi_row &&
+ x->last_set_offsets_loc.mi_col == mi_col &&
+ x->last_set_offsets_loc.bsize == bsize);
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int i;
+
+ // This is only needed for real time/allintra row-mt enabled multi-threaded
+ // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+ wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+ &tile_data->tile_info, cm->seq_params->sb_size,
+ cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+ // Sets up the tx_type_map buffer in MACROBLOCKD.
+ xd->tx_type_map = txfm_info->tx_type_map_;
+ xd->tx_type_map_stride = mi_size_wide[bsize];
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ p[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+
+ x->force_zeromv_skip_for_blk =
+ get_force_zeromv_skip_flag_for_blk(cpi, x, bsize);
+
+  // Source variance may already be computed at the superblock level, so there
+  // is no need to recompute it unless bsize < sb_size or source_variance has
+  // not been set yet.
+ if (!x->force_zeromv_skip_for_blk &&
+ (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size))
+ x->source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+ // Set error per bit for current rdmult
+ av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+ hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+ } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ RD_STATS invalid_rd;
+ av1_invalid_rd_stats(&invalid_rd);
+ // TODO(kyslov): add av1_nonrd_pick_inter_mode_sb_seg_skip
+ av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx,
+ invalid_rd.rdcost);
+ } else {
+ av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+ }
+ if (cpi->sf.rt_sf.skip_cdef_sb) {
+    // cdef_strength is initialized to 1, which means skip_cdef, and is
+    // updated here. Check to see if skipping cdef is allowed.
+ const int allow_cdef_skipping =
+ cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad &&
+ !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]);
+
+ // Find the corresponding 64x64 block. It'll be the 128x128 block if that's
+ // the block size.
+ const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64;
+ const int mi_col_sb = mi_col - mi_col % MI_SIZE_64X64;
+ MB_MODE_INFO **mi_sb =
+ cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb);
+ // Do not skip if intra or new mv is picked, or color sensitivity is set.
+ // Never skip on slide/scene change.
+ if (cpi->sf.rt_sf.skip_cdef_sb >= 2) {
+ mi_sb[0]->cdef_strength =
+ mi_sb[0]->cdef_strength &&
+ (allow_cdef_skipping || x->source_variance == 0);
+ } else {
+ mi_sb[0]->cdef_strength =
+ mi_sb[0]->cdef_strength && allow_cdef_skipping &&
+ !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV);
+ }
+ // Store in the pickmode context.
+ ctx->mic.cdef_strength = mi_sb[0]->cdef_strength;
+ }
+ x->rdmult = orig_rdmult;
+ ctx->rd_stats.rate = rd_cost->rate;
+ ctx->rd_stats.dist = rd_cost->dist;
+ ctx->rd_stats.rdcost = rd_cost->rdcost;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+}
+
+static int try_split_partition(AV1_COMP *const cpi, ThreadData *const td,
+ TileDataEnc *const tile_data,
+ TileInfo *const tile_info, TokenExtra **tp,
+ MACROBLOCK *const x, MACROBLOCKD *const xd,
+ const CommonModeInfoParams *const mi_params,
+ const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize, const int pl,
+ PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int hbs = mi_size_wide[bsize] / 2;
+ if (mi_row + mi_size_high[bsize] >= mi_params->mi_rows ||
+ mi_col + mi_size_wide[bsize] >= mi_params->mi_cols)
+ return 0;
+ if (bsize <= BLOCK_8X8 || frame_is_intra_only(cm)) return 0;
+ if (x->content_state_sb.source_sad_nonrd <= kLowSad) return 0;
+
+ // Do not try split partition when the source sad is small, or
+ // the prediction residual is small.
+ const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, LAST_FRAME);
+ const int num_planes = av1_num_planes(cm);
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+ av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf, num_planes);
+ int block_sad = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride);
+ block_sad += plane_sad;
+ }
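+  // Normalize the SAD accumulated over all planes by the luma pixel count to
+  // obtain an approximate per-pixel prediction error.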
+ const int blk_pix = block_size_wide[bsize] * block_size_high[bsize];
+ const int block_avg_sad = block_sad / blk_pix;
+ // TODO(chengchen): find a proper threshold. It might change according to
+ // q as well.
+ const int threshold = 25;
+ if (block_avg_sad < threshold) return 0;
+
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS split_rdc, none_rdc;
+ av1_invalid_rd_stats(&split_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Calculate rdcost for none partition
+ pc_tree->partitioning = PARTITION_NONE;
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ if (!pc_tree->none) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->none);
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+ pc_tree->none);
+ none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+ // Calculate rdcost for split partition
+ pc_tree->partitioning = PARTITION_SPLIT;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ av1_init_rd_stats(&split_rdc);
+ split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ if (subsize >= BLOCK_8X8) {
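+    // Each of the four sub-blocks also signals its own PARTITION_NONE token.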
+ split_rdc.rate += (mode_costs->partition_cost[pl][PARTITION_NONE] * 4);
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (!pc_tree->split[i]) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ }
+ pc_tree->split[i]->index = i;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ RD_STATS block_rdc;
+ av1_invalid_rd_stats(&block_rdc);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->split[i]->none);
+ }
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &block_rdc, subsize, pc_tree->split[i]->none);
+ split_rdc.rate += block_rdc.rate;
+ split_rdc.dist += block_rdc.dist;
+ av1_rd_cost_update(x->rdmult, &split_rdc);
+ if (none_rdc.rdcost < split_rdc.rdcost) break;
+ if (i != SUB_PARTITIONS_SPLIT - 1)
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+ subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
+ }
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+ split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+ const int split = split_rdc.rdcost < none_rdc.rdcost;
+
+ return split;
+}
+
+// Returns whether SPLIT partitions should be evaluated.
+static bool calc_do_split_flag(const AV1_COMP *cpi, const MACROBLOCK *x,
+ const PC_TREE *pc_tree, const RD_STATS *none_rdc,
+ const CommonModeInfoParams *mi_params,
+ int mi_row, int mi_col, int hbs,
+ BLOCK_SIZE bsize, PARTITION_TYPE partition) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_larger_qindex = cm->quant_params.base_qindex > 100;
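+  // base_qindex lies in [0, 255]; values above 100 are treated as mid-to-high
+  // quantizers here.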
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ bool do_split =
+ (cpi->sf.rt_sf.nonrd_check_partition_merge_mode == 3)
+ ? (bsize <= BLOCK_32X32 || (is_larger_qindex && bsize <= BLOCK_64X64))
+ : true;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN ||
+ cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) ||
+ !none_rdc->skip_txfm)
+ return do_split;
+
+ const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize);
+
+  // When model based skip is not used (i.e., use_model_yrd_large = 0),
+  // skip_txfm has been populated based on the Hadamard transform, and the
+  // skip_txfm flag is more reliable. Hence SPLIT evaluation is disabled at
+  // all quantizers for 8x8 and 16x16 blocks.
+  // When model based skip is used (i.e., use_model_yrd_large = 1), skip_txfm
+  // may not be reliable. Hence SPLIT evaluation is disabled only at lower
+  // quantizers for blocks >= 32x32.
+ if ((!use_model_yrd_large) || (!is_larger_qindex)) return false;
+
+  // Use residual statistics to decide if the SPLIT partition should be
+  // evaluated for 32x32 blocks. The pruning logic is skipped for larger block
+  // sizes to avoid visual artifacts.
+ if (pc_tree->none->mic.mode == NEWMV && bsize == BLOCK_32X32 && do_split) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ assert(subsize < BLOCK_SIZES_ALL);
+ double min_per_pixel_error = DBL_MAX;
+ double max_per_pixel_error = 0.;
+ int i;
+ for (i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ const int x_idx = (i & 1) * hbs;
+ const int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols)) {
+ break;
+ }
+
+ // Populate the appropriate buffer pointers.
+ // Pass scale factors as NULL as the base pointer of the block would have
+ // been calculated appropriately.
+ struct buf_2d src_split_buf_2d, pred_split_buf_2d;
+ const struct buf_2d *src_none_buf_2d = &x->plane[AOM_PLANE_Y].src;
+ setup_pred_plane(&src_split_buf_2d, subsize, src_none_buf_2d->buf,
+ src_none_buf_2d->width, src_none_buf_2d->height,
+ src_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0);
+ const struct buf_2d *pred_none_buf_2d = &xd->plane[AOM_PLANE_Y].dst;
+ setup_pred_plane(&pred_split_buf_2d, subsize, pred_none_buf_2d->buf,
+ pred_none_buf_2d->width, pred_none_buf_2d->height,
+ pred_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0);
+
+ unsigned int curr_uint_mse;
+ const unsigned int curr_uint_var = cpi->ppi->fn_ptr[subsize].vf(
+ src_split_buf_2d.buf, src_split_buf_2d.stride, pred_split_buf_2d.buf,
+ pred_split_buf_2d.stride, &curr_uint_mse);
+ const double curr_per_pixel_error =
+ sqrt((double)curr_uint_var / block_size_wide[subsize] /
+ block_size_high[subsize]);
+ if (curr_per_pixel_error < min_per_pixel_error)
+ min_per_pixel_error = curr_per_pixel_error;
+ if (curr_per_pixel_error > max_per_pixel_error)
+ max_per_pixel_error = curr_per_pixel_error;
+ }
+
+ // Prune based on residual statistics only if all the sub-partitions are
+ // valid.
+ if (i == SUB_PARTITIONS_SPLIT) {
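+      // A small spread in per-pixel error across the four quadrants suggests
+      // a homogeneous block, where SPLIT is unlikely to help.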
+ if (max_per_pixel_error - min_per_pixel_error <= 1.5) do_split = false;
+ }
+ }
+
+ return do_split;
+}
+
+static void try_merge(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TokenExtra **tp, const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize, PC_TREE *const pc_tree,
+ const PARTITION_TYPE partition, const BLOCK_SIZE subsize,
+ const int pl) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int num_planes = av1_num_planes(cm);
+ // Only square blocks from 8x8 to 128x128 are supported
+ assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ bool do_split = false;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS split_rdc, none_rdc;
+ av1_invalid_rd_stats(&split_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ pc_tree->partitioning = PARTITION_NONE;
+ if (!pc_tree->none) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->none);
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+ pc_tree->none);
+ none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
+ none_rdc.skip_txfm != 1 || pc_tree->none->mic.mode == NEWMV) {
+ do_split = calc_do_split_flag(cpi, x, pc_tree, &none_rdc, mi_params, mi_row,
+ mi_col, hbs, bsize, partition);
+ if (do_split) {
+ av1_init_rd_stats(&split_rdc);
+ split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ RD_STATS block_rdc;
+ av1_invalid_rd_stats(&block_rdc);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->split[i]->none);
+ }
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &block_rdc, subsize, pc_tree->split[i]->none);
+        // TODO(yunqingwang): The rate here does not include the cost of
+        // signaling the PARTITION_NONE token in the sub-blocks.
+ split_rdc.rate += block_rdc.rate;
+ split_rdc.dist += block_rdc.dist;
+
+ av1_rd_cost_update(x->rdmult, &split_rdc);
+
+ if (none_rdc.rdcost < split_rdc.rdcost) {
+ break;
+ }
+
+ if (i != SUB_PARTITIONS_SPLIT - 1)
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx,
+ 1, subsize, PARTITION_NONE, pc_tree->split[i]->none,
+ NULL);
+ }
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+ }
+ }
+
+ if (none_rdc.rdcost < split_rdc.rdcost) {
+    /* Predicted samples cannot be reused for PARTITION_NONE since the same
+     * buffer is used to store the reconstructed samples of the
+     * PARTITION_SPLIT block. */
+ if (do_split) x->reuse_inter_pred = false;
+
+ mib[0]->bsize = bsize;
+ pc_tree->partitioning = PARTITION_NONE;
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+ pc_tree->none, NULL);
+ } else {
+ mib[0]->bsize = subsize;
+ pc_tree->partitioning = PARTITION_SPLIT;
+    /* Predicted samples cannot be reused for PARTITION_SPLIT since the same
+     * buffer is used to write the reconstructed samples. */
+ // TODO(Cherma): Store and reuse predicted samples generated by
+ // encode_b_nonrd() in DRY_RUN_NORMAL mode.
+ x->reuse_inter_pred = false;
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+
+ // Note: We don't reset pc_tree->split[i]->none here because it
+ // could contain results from the additional check. Instead, it is
+ // reset before we enter the nonrd_check_partition_merge_mode
+ // condition.
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
+ }
+ }
+}
+
+// Evaluate if the sub-partitions can be merged directly into a large partition
+// without calculating the RD cost.
+static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ const PARTITION_TYPE partition =
+ (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ MB_MODE_INFO **b0 = mib;
+ MB_MODE_INFO **b1 = mib + hbs;
+ MB_MODE_INFO **b2 = mib + hbs * mi_params->mi_stride;
+ MB_MODE_INFO **b3 = mib + hbs * mi_params->mi_stride + hbs;
+
+ // Check if the following conditions are met. This can be updated
+ // later with more support added.
+ const int further_split = b0[0]->bsize < subsize || b1[0]->bsize < subsize ||
+ b2[0]->bsize < subsize || b3[0]->bsize < subsize;
+ if (further_split) return;
+
+ const int no_skip = !b0[0]->skip_txfm || !b1[0]->skip_txfm ||
+ !b2[0]->skip_txfm || !b3[0]->skip_txfm;
+ if (no_skip) return;
+
+ const int compound = (b0[0]->ref_frame[1] != b1[0]->ref_frame[1] ||
+ b0[0]->ref_frame[1] != b2[0]->ref_frame[1] ||
+ b0[0]->ref_frame[1] != b3[0]->ref_frame[1] ||
+ b0[0]->ref_frame[1] > NONE_FRAME);
+ if (compound) return;
+
+ // Intra modes aren't considered here.
+ const int different_ref = (b0[0]->ref_frame[0] != b1[0]->ref_frame[0] ||
+ b0[0]->ref_frame[0] != b2[0]->ref_frame[0] ||
+ b0[0]->ref_frame[0] != b3[0]->ref_frame[0] ||
+ b0[0]->ref_frame[0] <= INTRA_FRAME);
+ if (different_ref) return;
+
+ const int different_mode =
+ (b0[0]->mode != b1[0]->mode || b0[0]->mode != b2[0]->mode ||
+ b0[0]->mode != b3[0]->mode);
+ if (different_mode) return;
+
+ const int unsupported_mode =
+ (b0[0]->mode != NEARESTMV && b0[0]->mode != GLOBALMV);
+ if (unsupported_mode) return;
+
+ const int different_mv = (b0[0]->mv[0].as_int != b1[0]->mv[0].as_int ||
+ b0[0]->mv[0].as_int != b2[0]->mv[0].as_int ||
+ b0[0]->mv[0].as_int != b3[0]->mv[0].as_int);
+ if (different_mv) return;
+
+ const int unsupported_motion_mode =
+ (b0[0]->motion_mode != b1[0]->motion_mode ||
+ b0[0]->motion_mode != b2[0]->motion_mode ||
+ b0[0]->motion_mode != b3[0]->motion_mode ||
+ b0[0]->motion_mode != SIMPLE_TRANSLATION);
+ if (unsupported_motion_mode) return;
+
+  const int different_filter =
+      (b0[0]->interp_filters.as_int != b1[0]->interp_filters.as_int ||
+       b0[0]->interp_filters.as_int != b2[0]->interp_filters.as_int ||
+       b0[0]->interp_filters.as_int != b3[0]->interp_filters.as_int);
+  if (different_filter) return;
+
+ const int different_seg = (b0[0]->segment_id != b1[0]->segment_id ||
+ b0[0]->segment_id != b2[0]->segment_id ||
+ b0[0]->segment_id != b3[0]->segment_id);
+ if (different_seg) return;
+
+ // Evaluate the ref_mv.
+ MB_MODE_INFO **this_mi = mib;
+ BLOCK_SIZE orig_bsize = this_mi[0]->bsize;
+ const PARTITION_TYPE orig_partition = this_mi[0]->partition;
+
+ this_mi[0]->bsize = bsize;
+ this_mi[0]->partition = PARTITION_NONE;
+ this_mi[0]->skip_txfm = 1;
+
+ // TODO(yunqing): functions called below can be optimized by
+ // removing unrelated operations.
+ av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+ mi_col, bsize);
+
+ const MV_REFERENCE_FRAME ref_frame = this_mi[0]->ref_frame[0];
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ int force_skip_low_temp_var = 0;
+ int skip_pred_mv = 0;
+ bool use_scaled_ref;
+
+ for (int i = 0; i < MB_MODE_COUNT; ++i) {
+ for (int j = 0; j < REF_FRAMES; ++j) {
+ frame_mv[i][j].as_int = INVALID_MV;
+ }
+ }
+ av1_copy(x->color_sensitivity, x->color_sensitivity_sb);
+ skip_pred_mv = (x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
+
+ find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, bsize,
+ force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref);
+
+ int continue_merging = 1;
+ if (frame_mv[NEARESTMV][ref_frame].as_mv.row != b0[0]->mv[0].as_mv.row ||
+ frame_mv[NEARESTMV][ref_frame].as_mv.col != b0[0]->mv[0].as_mv.col)
+ continue_merging = 0;
+
+ if (!continue_merging) {
+ this_mi[0]->bsize = orig_bsize;
+ this_mi[0]->partition = orig_partition;
+
+ // TODO(yunqing): Store the results and restore here instead of
+ // calling find_predictors() again.
+ av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+ mi_col, this_mi[0]->bsize);
+ find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, this_mi[0]->bsize,
+ force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref);
+ } else {
+ struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+ const int is_scaled = av1_is_scaled(sf);
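+    // MVs are in 1/8-pel units: a luma MV is sub-pel unless it is a multiple
+    // of 8, and with chroma subsampled by 2 an integer-pel chroma MV requires
+    // a multiple of 16.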
+ const int is_y_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 8) ||
+ (abs(this_mi[0]->mv[0].as_mv.col) % 8);
+ const int is_uv_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 16) ||
+ (abs(this_mi[0]->mv[0].as_mv.col) % 16);
+
+ if (cpi->ppi->use_svc || is_scaled || is_y_subpel_mv || is_uv_subpel_mv) {
+ const int num_planes = av1_num_planes(cm);
+ set_ref_ptrs(cm, xd, ref_frame, this_mi[0]->ref_frame[1]);
+ const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);
+ av1_setup_pre_planes(xd, 0, cfg, mi_row, mi_col,
+ xd->block_ref_scale_factors[0], num_planes);
+
+ if (!cpi->ppi->use_svc && !is_scaled && !is_y_subpel_mv) {
+ assert(is_uv_subpel_mv == 1);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 1,
+ num_planes - 1);
+ } else {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ num_planes - 1);
+ }
+ }
+
+ // Copy out mbmi_ext information.
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = x->mbmi_ext_frame;
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(
+ mbmi_ext_frame, mbmi_ext, av1_ref_frame_type(this_mi[0]->ref_frame));
+
+ const BLOCK_SIZE this_subsize =
+ get_partition_subsize(bsize, this_mi[0]->partition);
+ // Update partition contexts.
+ update_ext_partition_context(xd, mi_row, mi_col, this_subsize, bsize,
+ this_mi[0]->partition);
+
+ const int num_planes = av1_num_planes(cm);
+ av1_reset_entropy_context(xd, bsize, num_planes);
+
+ // Note: use x->txfm_search_params.tx_mode_search_type instead of
+ // cm->features.tx_mode here.
+ TX_SIZE tx_size =
+ tx_size_from_tx_mode(bsize, x->txfm_search_params.tx_mode_search_type);
+ if (xd->lossless[this_mi[0]->segment_id]) tx_size = TX_4X4;
+ this_mi[0]->tx_size = tx_size;
+ memset(this_mi[0]->inter_tx_size, this_mi[0]->tx_size,
+ sizeof(this_mi[0]->inter_tx_size));
+
+ // Update txfm contexts.
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ set_txfm_ctxs(this_mi[0]->tx_size, xd->width, xd->height,
+ this_mi[0]->skip_txfm && is_inter_block(this_mi[0]), xd);
+
+ // Update mi for this partition block.
+ for (int y = 0; y < bs; y++) {
+ for (int x_idx = 0; x_idx < bs; x_idx++) {
+ this_mi[x_idx + y * mi_params->mi_stride] = this_mi[0];
+ }
+ }
+ }
+}
+
+/*!\brief AV1 block partition application (minimal RD search).
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Encode the block by applying pre-calculated partition patterns that are
+ * represented by coding block sizes stored in the mbmi array. The only
+ * partition adjustment allowed is merging leaf split nodes if it leads to a
+ * lower rd cost. The partition types are limited to a basic set: none, horz,
+ * vert, and split. This function is only used in the real-time mode.
+ *
+ * \param[in] cpi       Top-level encoder structure
+ * \param[in] td        Pointer to thread data
+ * \param[in] tile_data Pointer to struct holding adaptive
+ *                      data/contexts/models for the tile during encoding
+ * \param[in] mib       Array representing MB_MODE_INFO pointers for mi
+ *                      blocks starting from the first pixel of the current
+ *                      block
+ * \param[in] tp        Pointer to the starting token
+ * \param[in] mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col    Column coordinate of the block in a step size of
+ *                      MI_SIZE
+ * \param[in] bsize     Current block size
+ * \param[in] pc_tree   Pointer to the PC_TREE node holding the picked
+ *                      partitions and mode info for the current block
+ *
+ * \remark Nothing is returned. The pc_tree struct is modified to store the
+ * picked partition and modes.
+ */
+void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TokenExtra **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ // Only square blocks from 8x8 to 128x128 are supported
+ assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ PARTITION_TYPE partition = (bsize >= BLOCK_8X8)
+ ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ assert(subsize <= BLOCK_LARGEST);
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+
+ RD_STATS dummy_cost;
+ av1_invalid_rd_stats(&dummy_cost);
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ x->reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd;
+
+ int change_none_to_split = 0;
+ if (partition == PARTITION_NONE &&
+ cpi->sf.rt_sf.nonrd_check_partition_split == 1) {
+ change_none_to_split =
+ try_split_partition(cpi, td, tile_data, tile_info, tp, x, xd, mi_params,
+ mi_row, mi_col, bsize, pl, pc_tree);
+ if (change_none_to_split) {
+ partition = PARTITION_SPLIT;
+ subsize = get_partition_subsize(bsize, partition);
+ assert(subsize <= BLOCK_LARGEST);
+ }
+ }
+
+ pc_tree->partitioning = partition;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ if (!pc_tree->none) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->none);
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, bsize,
+ pc_tree->none);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
+ partition, pc_tree->none, NULL);
+ break;
+ case PARTITION_VERT:
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (!pc_tree->vertical[i]) {
+ pc_tree->vertical[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->vertical[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->vertical[i]);
+ }
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+ subsize, pc_tree->vertical[0]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+ PARTITION_VERT, pc_tree->vertical[0], NULL);
+ if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) {
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col + hbs,
+ &dummy_cost, subsize, pc_tree->vertical[1]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize,
+ PARTITION_VERT, pc_tree->vertical[1], NULL);
+ }
+ break;
+ case PARTITION_HORZ:
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (!pc_tree->horizontal[i]) {
+ pc_tree->horizontal[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->horizontal[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->horizontal[i]);
+ }
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+ subsize, pc_tree->horizontal[0]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+ PARTITION_HORZ, pc_tree->horizontal[0], NULL);
+
+ if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) {
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + hbs, mi_col,
+ &dummy_cost, subsize, pc_tree->horizontal[1]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize,
+ PARTITION_HORZ, pc_tree->horizontal[1], NULL);
+ }
+ break;
+ case PARTITION_SPLIT:
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (!pc_tree->split[i]) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ }
+ pc_tree->split[i]->index = i;
+ }
+ if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode &&
+ av1_is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
+ !frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+ try_merge(cpi, td, tile_data, mib, tp, mi_row, mi_col, bsize, pc_tree,
+ partition, subsize, pl);
+ } else {
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+ av1_nonrd_use_partition(
+ cpi, td, tile_data,
+ mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]);
+ }
+
+ if (!change_none_to_split) {
+ // Note: Palette, cfl are not supported.
+ if (!frame_is_intra_only(cm) && !tile_data->allow_update_cdf &&
+ cpi->sf.rt_sf.partition_direct_merging &&
+ mode_costs->partition_cost[pl][PARTITION_NONE] <
+ mode_costs->partition_cost[pl][PARTITION_SPLIT] &&
+ (mi_row + bs <= mi_params->mi_rows) &&
+ (mi_col + bs <= mi_params->mi_cols)) {
+ direct_partition_merging(cpi, td, tile_data, mib, mi_row, mi_col,
+ bsize);
+ }
+ }
+ }
+ break;
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");
+ default: assert(0); break;
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Try searching for an encoding for the given subblock. Returns zero if the
+// rdcost is already too high (to tell the caller not to bother searching for
+// encodings of further subblocks).
+static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int is_last,
+ int mi_row, int mi_col, BLOCK_SIZE subsize,
+ RD_STATS best_rdcost, RD_STATS *sum_rdc,
+ PARTITION_TYPE partition,
+ PICK_MODE_CONTEXT *this_ctx) {
+ MACROBLOCK *const x = &td->mb;
+ const int orig_mult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL);
+
+ av1_rd_cost_update(x->rdmult, &best_rdcost);
+
+ RD_STATS rdcost_remaining;
+ av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining);
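+  // The rd budget remaining for this sub-block is the best cost so far minus
+  // what earlier sub-blocks have already consumed; pick_sb_modes() can then
+  // terminate early once the budget is exceeded.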
+ RD_STATS this_rdc;
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition,
+ subsize, this_ctx, rdcost_remaining);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc->rdcost = INT64_MAX;
+ } else {
+ sum_rdc->rate += this_rdc.rate;
+ sum_rdc->dist += this_rdc.dist;
+ av1_rd_cost_update(x->rdmult, sum_rdc);
+ }
+
+ if (sum_rdc->rdcost >= best_rdcost.rdcost) {
+ x->rdmult = orig_mult;
+ return 0;
+ }
+
+ if (!is_last) {
+ av1_update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
+ }
+
+ x->rdmult = orig_mult;
+ return 1;
+}
+
+// Tests an AB partition, and updates the encoder status, the pick mode
+// contexts, the best rdcost, and the best partition.
+static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ PC_TREE *pc_tree, RD_STATS *best_rdc,
+ int64_t *this_rdcost,
+ PICK_MODE_CONTEXT *ctxs[SUB_PARTITIONS_AB],
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PARTITION_TYPE partition,
+ const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
+ const int ab_mi_pos[SUB_PARTITIONS_AB][2],
+ const MB_MODE_INFO **mode_cache) {
+ MACROBLOCK *const x = &td->mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ RD_STATS sum_rdc;
+ av1_init_rd_stats(&sum_rdc);
+ sum_rdc.rate = x->mode_costs.partition_cost[pl][partition];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
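+  // Seed the rd cost with the partition signaling rate only; rate and
+  // distortion of the sub-blocks are accumulated in the loop below.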
+ // Loop over sub-partitions in AB partition type.
+ for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+ if (mode_cache && mode_cache[i]) {
+ x->use_mb_mode_cache = 1;
+ x->mb_mode_cache = mode_cache[i];
+ }
+ const int mode_search_success =
+ rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1,
+ ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i],
+ *best_rdc, &sum_rdc, partition, ctxs[i]);
+ x->use_mb_mode_cache = 0;
+ x->mb_mode_cache = NULL;
+ if (!mode_search_success) {
+ return false;
+ }
+ }
+
+ av1_rd_cost_update(x->rdmult, &sum_rdc);
+ *this_rdcost = sum_rdc.rdcost;
+ if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ *this_rdcost = sum_rdc.rdcost;
+ if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
+
+ *best_rdc = sum_rdc;
+ pc_tree->partitioning = partition;
+ return true;
+}
+
+#if CONFIG_COLLECT_PARTITION_STATS
+static void init_partition_block_timing_stats(
+ PartitionTimingStats *part_timing_stats) {
+ av1_zero(*part_timing_stats);
+}
+
+static INLINE void start_partition_block_timer(
+ PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type) {
+ assert(!part_timing_stats->timer_is_on);
+ part_timing_stats->partition_attempts[partition_type] += 1;
+ aom_usec_timer_start(&part_timing_stats->timer);
+ part_timing_stats->timer_is_on = 1;
+}
+
+static INLINE void end_partition_block_timer(
+ PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type,
+ int64_t rdcost) {
+ if (part_timing_stats->timer_is_on) {
+ aom_usec_timer_mark(&part_timing_stats->timer);
+ const int64_t time = aom_usec_timer_elapsed(&part_timing_stats->timer);
+ part_timing_stats->partition_times[partition_type] += time;
+ part_timing_stats->partition_rdcost[partition_type] = rdcost;
+ part_timing_stats->timer_is_on = 0;
+ }
+}
+static INLINE void print_partition_timing_stats_with_rdcost(
+ const PartitionTimingStats *part_timing_stats, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, FRAME_UPDATE_TYPE frame_update_type, int frame_number,
+ const RD_STATS *best_rdc, const char *filename) {
+ FILE *f = fopen(filename, "a");
+ fprintf(f, "%d,%d,%d,%d,%d,%d,%" PRId64 ",%" PRId64 ",", bsize, frame_number,
+ frame_update_type, mi_row, mi_col, best_rdc->rate, best_rdc->dist,
+ best_rdc->rdcost);
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ if (part_timing_stats->partition_rdcost[idx] == INT64_MAX) {
+ fprintf(f, "%d,", -1);
+ } else {
+ fprintf(f, "%" PRId64 ",", part_timing_stats->partition_rdcost[idx]);
+ }
+ }
+ fprintf(f, "\n");
+ fclose(f);
+}
+
+static INLINE void print_partition_timing_stats(
+ const PartitionTimingStats *part_timing_stats, int intra_only,
+ int show_frame, const BLOCK_SIZE bsize, const char *filename) {
+ FILE *f = fopen(filename, "a");
+ fprintf(f, "%d,%d,%d,", bsize, show_frame, intra_only);
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]);
+ }
+ fprintf(f, "\n");
+ fclose(f);
+}
+
+static INLINE void accumulate_partition_timing_stats(
+ FramePartitionTimingStats *fr_part_timing_stats,
+ const PartitionTimingStats *part_timing_stats, BLOCK_SIZE bsize) {
+ const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
+ int *agg_attempts = fr_part_timing_stats->partition_attempts[bsize_idx];
+ int *agg_decisions = fr_part_timing_stats->partition_decisions[bsize_idx];
+ int64_t *agg_times = fr_part_timing_stats->partition_times[bsize_idx];
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ agg_attempts[idx] += part_timing_stats->partition_attempts[idx];
+ agg_decisions[idx] += part_timing_stats->partition_decisions[idx];
+ agg_times[idx] += part_timing_stats->partition_times[idx];
+ }
+}
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+// Initialize state variables of partition search used in
+// av1_rd_pick_partition().
+static void init_partition_search_state_params(
+ MACROBLOCK *x, AV1_COMP *const cpi, PartitionSearchState *part_search_state,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+
+ // Initialization of block size related parameters.
+ blk_params->mi_step = mi_size_wide[bsize] / 2;
+ blk_params->mi_row = mi_row;
+ blk_params->mi_col = mi_col;
+ blk_params->mi_row_edge = mi_row + blk_params->mi_step;
+ blk_params->mi_col_edge = mi_col + blk_params->mi_step;
+ blk_params->width = block_size_wide[bsize];
+ blk_params->min_partition_size_1d =
+ block_size_wide[x->sb_enc.min_partition_size];
+ blk_params->subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ blk_params->split_bsize2 = blk_params->subsize;
+ blk_params->bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+ blk_params->bsize = bsize;
+
+  // Check if the partition corresponds to an edge block.
+ blk_params->has_rows = (blk_params->mi_row_edge < mi_params->mi_rows);
+ blk_params->has_cols = (blk_params->mi_col_edge < mi_params->mi_cols);
+
+ // Update intra partitioning related info.
+ part_search_state->intra_part_info = &x->part_search_info;
+ // Prepare for segmentation CNN-based partitioning for intra-frame.
+ if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+ part_search_state->intra_part_info->quad_tree_idx = 0;
+ part_search_state->intra_part_info->cnn_output_valid = 0;
+ }
+
+ // Set partition plane context index.
+ part_search_state->pl_ctx_idx =
+ blk_params->bsize_at_least_8x8
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+
+ // Partition cost buffer update
+ ModeCosts *mode_costs = &x->mode_costs;
+ part_search_state->partition_cost =
+ mode_costs->partition_cost[part_search_state->pl_ctx_idx];
+
+ // Initialize HORZ and VERT win flags as true for all split partitions.
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ part_search_state->split_part_rect_win[i].rect_part_win[HORZ] = true;
+ part_search_state->split_part_rect_win[i].rect_part_win[VERT] = true;
+ }
+
+ // Initialize the rd cost.
+ av1_init_rd_stats(&part_search_state->this_rdc);
+
+ // Initialize RD costs for partition types to 0.
+ part_search_state->none_rd = 0;
+ av1_zero(part_search_state->split_rd);
+ av1_zero(part_search_state->rect_part_rd);
+
+ // Initialize SPLIT partition to be not ready.
+ av1_zero(part_search_state->is_split_ctx_is_ready);
+ // Initialize HORZ and VERT partitions to be not ready.
+ av1_zero(part_search_state->is_rect_ctx_is_ready);
+
+ // Chroma subsampling.
+ part_search_state->ss_x = x->e_mbd.plane[1].subsampling_x;
+ part_search_state->ss_y = x->e_mbd.plane[1].subsampling_y;
+
+ // Initialize partition search flags to defaults.
+ part_search_state->terminate_partition_search = 0;
+ part_search_state->do_square_split = blk_params->bsize_at_least_8x8;
+ part_search_state->do_rectangular_split =
+ cpi->oxcf.part_cfg.enable_rect_partitions &&
+ blk_params->bsize_at_least_8x8;
+ av1_zero(part_search_state->prune_rect_part);
+
+ // Initialize allowed partition types for the partition block.
+ part_search_state->partition_none_allowed =
+ av1_blk_has_rows_and_cols(blk_params);
+ part_search_state->partition_rect_allowed[HORZ] =
+ part_search_state->do_rectangular_split && blk_params->has_cols &&
+ get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+ part_search_state->partition_rect_allowed[VERT] =
+ part_search_state->do_rectangular_split && blk_params->has_rows &&
+ get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+
+ // Reset the flag indicating whether a partition leading to an rd cost lower
+ // than the bound best_rdc has been found.
+ part_search_state->found_best_partition = false;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ init_partition_block_timing_stats(&part_search_state->part_timing_stats);
+#endif // CONFIG_COLLECT_PARTITION_STATS
+}
+
+// Override partition cost buffer for the edge blocks.
+static void set_partition_cost_for_edge_blk(
+ AV1_COMMON const *cm, PartitionSearchState *part_search_state) {
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ assert(blk_params.bsize_at_least_8x8 && part_search_state->pl_ctx_idx >= 0);
+ const aom_cdf_prob *partition_cdf =
+ cm->fc->partition_cdf[part_search_state->pl_ctx_idx];
+ const int max_cost = av1_cost_symbol(0);
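+ // av1_cost_symbol(0) corresponds to the smallest representable symbol
+ // probability and hence the maximum symbol cost; initializing all entries
+ // to it disallows partition types that cannot be signaled at this edge.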
+ for (PARTITION_TYPE i = 0; i < PARTITION_TYPES; ++i)
+ part_search_state->tmp_partition_cost[i] = max_cost;
+ if (blk_params.has_cols) {
+ // At the bottom, the two possibilities are HORZ and SPLIT.
+ aom_cdf_prob bot_cdf[2];
+ partition_gather_vert_alike(bot_cdf, partition_cdf, blk_params.bsize);
+ static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, bot_cdf,
+ bot_inv_map);
+ } else if (blk_params.has_rows) {
+ // At the right, the two possibilities are VERT and SPLIT.
+ aom_cdf_prob rhs_cdf[2];
+ partition_gather_horz_alike(rhs_cdf, partition_cdf, blk_params.bsize);
+ static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, rhs_cdf,
+ rhs_inv_map);
+ } else {
+ // At the bottom right, we always split.
+ part_search_state->tmp_partition_cost[PARTITION_SPLIT] = 0;
+ }
+ // Override the partition cost buffer.
+ part_search_state->partition_cost = part_search_state->tmp_partition_cost;
+}
+
+// Reset the partition search state flags when
+// must_find_valid_partition is equal to 1.
+static AOM_INLINE void reset_part_limitations(
+ AV1_COMP *const cpi, PartitionSearchState *part_search_state) {
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const int is_rect_part_allowed =
+ blk_params.bsize_at_least_8x8 &&
+ cpi->oxcf.part_cfg.enable_rect_partitions &&
+ (blk_params.width > blk_params.min_partition_size_1d);
+ part_search_state->do_square_split =
+ blk_params.bsize_at_least_8x8 &&
+ (blk_params.width > blk_params.min_partition_size_1d);
+ part_search_state->partition_none_allowed =
+ av1_blk_has_rows_and_cols(&blk_params) &&
+ (blk_params.width >= blk_params.min_partition_size_1d);
+ part_search_state->partition_rect_allowed[HORZ] =
+ blk_params.has_cols && is_rect_part_allowed &&
+ get_plane_block_size(
+ get_partition_subsize(blk_params.bsize, PARTITION_HORZ),
+ part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID;
+ part_search_state->partition_rect_allowed[VERT] =
+ blk_params.has_rows && is_rect_part_allowed &&
+ get_plane_block_size(
+ get_partition_subsize(blk_params.bsize, PARTITION_VERT),
+ part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID;
+ part_search_state->terminate_partition_search = 0;
+}
+
+ // Rectangular partition evaluation at the sub-block level.
+static void rd_pick_rect_partition(AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x,
+ PICK_MODE_CONTEXT *cur_partition_ctx,
+ PartitionSearchState *part_search_state,
+ RD_STATS *best_rdc, const int idx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PARTITION_TYPE partition_type) {
+ // Subtract the rd cost accumulated so far from the best rd cost to obtain
+ // the remaining rd budget for this sub-block.
+ RD_STATS best_remain_rdcost;
+ av1_rd_stats_subtraction(x->rdmult, best_rdc, &part_search_state->sum_rdc,
+ &best_remain_rdcost);
+
+ // Obtain the best mode for the partition sub-block.
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &part_search_state->this_rdc,
+ partition_type, bsize, cur_partition_ctx, best_remain_rdcost);
+ av1_rd_cost_update(x->rdmult, &part_search_state->this_rdc);
+
+ // Update the partition rd cost with the current sub-block rd.
+ if (part_search_state->this_rdc.rate == INT_MAX) {
+ part_search_state->sum_rdc.rdcost = INT64_MAX;
+ } else {
+ part_search_state->sum_rdc.rate += part_search_state->this_rdc.rate;
+ part_search_state->sum_rdc.dist += part_search_state->this_rdc.dist;
+ av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+ }
+ const RECT_PART_TYPE rect_part =
+ partition_type == PARTITION_HORZ ? HORZ : VERT;
+ part_search_state->rect_part_rd[rect_part][idx] =
+ part_search_state->this_rdc.rdcost;
+}
+
+typedef int (*active_edge_info)(const AV1_COMP *cpi, int mi_col, int mi_step);
+
+// Checks if HORZ / VERT partition search is allowed.
+static AOM_INLINE int is_rect_part_allowed(
+ const AV1_COMP *cpi, const PartitionSearchState *part_search_state,
+ const active_edge_info *active_edge, RECT_PART_TYPE rect_part,
+ const int mi_pos) {
+ const PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
+ const int is_part_allowed =
+ (!part_search_state->terminate_partition_search &&
+ part_search_state->partition_rect_allowed[rect_part] &&
+ !part_search_state->prune_rect_part[rect_part] &&
+ (part_search_state->do_rectangular_split ||
+ active_edge[rect_part](cpi, mi_pos, blk_params->mi_step)));
+ return is_part_allowed;
+}
+
+static void rectangular_partition_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+ RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ RD_RECT_PART_WIN_INFO *rect_part_win_info, const RECT_PART_TYPE start_type,
+ const RECT_PART_TYPE end_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS *sum_rdc = &part_search_state->sum_rdc;
+ const int rect_partition_type[NUM_RECT_PARTS] = { PARTITION_HORZ,
+ PARTITION_VERT };
+
+ // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][0]: mi_row position of
+ // HORZ and VERT partition types.
+ // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][1]: mi_col position of
+ // HORZ and VERT partition types.
+ const int mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][2] = {
+ { { blk_params.mi_row, blk_params.mi_col },
+ { blk_params.mi_row_edge, blk_params.mi_col } },
+ { { blk_params.mi_row, blk_params.mi_col },
+ { blk_params.mi_row, blk_params.mi_col_edge } }
+ };
+
+ // Initialize active edge_type function pointer
+ // for HORZ and VERT partition types.
+ active_edge_info active_edge_type[NUM_RECT_PARTS] = { av1_active_h_edge,
+ av1_active_v_edge };
+
+ // Indicates, for HORZ and VERT partition types, whether the second
+ // sub-block lies within the frame (i.e., the block is not an edge block).
+ const int is_not_edge_block[NUM_RECT_PARTS] = { blk_params.has_rows,
+ blk_params.has_cols };
+
+ // Initialize pc tree context for HORZ and VERT partition types.
+ PICK_MODE_CONTEXT **cur_ctx[NUM_RECT_PARTS][SUB_PARTITIONS_RECT] = {
+ { &pc_tree->horizontal[0], &pc_tree->horizontal[1] },
+ { &pc_tree->vertical[0], &pc_tree->vertical[1] }
+ };
+
+ // Loop over rectangular partition types.
+ for (RECT_PART_TYPE i = start_type; i <= end_type; i++) {
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part_search_state->partition_rect_allowed[i]));
+
+ // Check if the HORZ / VERT partition search is to be performed.
+ if (!is_rect_part_allowed(cpi, part_search_state, active_edge_type, i,
+ mi_pos_rect[i][0][i]))
+ continue;
+
+ // Sub-partition idx.
+ int sub_part_idx = 0;
+ PARTITION_TYPE partition_type = rect_partition_type[i];
+ blk_params.subsize =
+ get_partition_subsize(blk_params.bsize, partition_type);
+ assert(blk_params.subsize <= BLOCK_LARGEST);
+ av1_init_rd_stats(sum_rdc);
+ for (int j = 0; j < SUB_PARTITIONS_RECT; j++) {
+ if (cur_ctx[i][j][0] == NULL) {
+ cur_ctx[i][j][0] =
+ av1_alloc_pmc(cpi, blk_params.subsize, &td->shared_coeff_buf);
+ if (!cur_ctx[i][j][0])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ }
+ sum_rdc->rate = part_search_state->partition_cost[partition_type];
+ sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, 0);
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_rdc->rdcost - sum_rdc->rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, partition_type);
+ }
+#endif
+
+ // First sub-partition evaluation in HORZ / VERT partition type.
+ rd_pick_rect_partition(
+ cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state,
+ best_rdc, 0, mi_pos_rect[i][sub_part_idx][0],
+ mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
+
+ // Start of second sub-partition evaluation.
+ // Evaluate the second sub-partition if the accumulated cost is still less
+ // than the best cost and the block is not at a frame edge.
+ if (sum_rdc->rdcost < best_rdc->rdcost && is_not_edge_block[i]) {
+ const MB_MODE_INFO *const mbmi = &cur_ctx[i][sub_part_idx][0]->mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted.
+ if (pmi->palette_size[PLANE_TYPE_Y] == 0 &&
+ pmi->palette_size[PLANE_TYPE_UV] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED)
+ part_search_state->is_rect_ctx_is_ready[i] = 1;
+ }
+ av1_update_state(cpi, td, cur_ctx[i][sub_part_idx][0], blk_params.mi_row,
+ blk_params.mi_col, blk_params.subsize, DRY_RUN_NORMAL);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL,
+ blk_params.subsize, NULL);
+
+ // Second sub-partition evaluation in HORZ / VERT partition type.
+ sub_part_idx = 1;
+ rd_pick_rect_partition(
+ cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state,
+ best_rdc, 1, mi_pos_rect[i][sub_part_idx][0],
+ mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
+ }
+ // Update HORZ / VERT best partition.
+ if (sum_rdc->rdcost < best_rdc->rdcost) {
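+ // Recompute the exact rd cost from the accumulated rate and distortion
+ // before the final comparison against the best rd cost.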
+ sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, sum_rdc->dist);
+ if (sum_rdc->rdcost < best_rdc->rdcost) {
+ *best_rdc = *sum_rdc;
+ part_search_state->found_best_partition = true;
+ pc_tree->partitioning = partition_type;
+ }
+ } else {
+ // Update HORZ / VERT win flag.
+ if (rect_part_win_info != NULL)
+ rect_part_win_info->rect_part_win[i] = false;
+ }
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (part_timing_stats->timer_is_on) {
+ end_partition_block_timer(part_timing_stats, partition_type,
+ sum_rdc->rdcost);
+ }
+#endif
+ av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+ blk_params.bsize, av1_num_planes(cm));
+ }
+}
+
+// AB partition type evaluation.
+static void rd_pick_ab_part(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PC_TREE *pc_tree, PICK_MODE_CONTEXT *dst_ctxs[SUB_PARTITIONS_AB],
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
+ const int ab_mi_pos[SUB_PARTITIONS_AB][2], const PARTITION_TYPE part_type,
+ const MB_MODE_INFO **mode_cache) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ int64_t this_rdcost = 0;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ {
+ RD_STATS tmp_sum_rdc;
+ av1_init_rd_stats(&tmp_sum_rdc);
+ tmp_sum_rdc.rate = part_search_state->partition_cost[part_type];
+ tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+ if (best_rdc->rdcost - tmp_sum_rdc.rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, part_type);
+ }
+ }
+#endif
+
+ // Test this partition and update the best partition.
+ const bool find_best_ab_part = rd_test_partition3(
+ cpi, td, tile_data, tp, pc_tree, best_rdc, &this_rdcost, dst_ctxs, mi_row,
+ mi_col, bsize, part_type, ab_subsize, ab_mi_pos, mode_cache);
+ part_search_state->found_best_partition |= find_best_ab_part;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (part_timing_stats->timer_is_on) {
+ if (!find_best_ab_part) this_rdcost = INT64_MAX;
+ end_partition_block_timer(part_timing_stats, part_type, this_rdcost);
+ }
+#endif
+ av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// Set mode search context.
+static AOM_INLINE void set_mode_search_ctx(
+ PC_TREE *pc_tree, const int is_ctx_ready[NUM_AB_PARTS][2],
+ PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2]) {
+ mode_srch_ctx[HORZ_B][0] = &pc_tree->horizontal[0];
+ mode_srch_ctx[VERT_B][0] = &pc_tree->vertical[0];
+
+ if (is_ctx_ready[HORZ_A][0])
+ mode_srch_ctx[HORZ_A][0] = &pc_tree->split[0]->none;
+
+ if (is_ctx_ready[VERT_A][0])
+ mode_srch_ctx[VERT_A][0] = &pc_tree->split[0]->none;
+
+ if (is_ctx_ready[HORZ_A][1])
+ mode_srch_ctx[HORZ_A][1] = &pc_tree->split[1]->none;
+}
+
+static AOM_INLINE void copy_partition_mode_from_mode_context(
+ const MB_MODE_INFO **dst_mode, const PICK_MODE_CONTEXT *ctx) {
+ if (ctx && ctx->rd_stats.rate < INT_MAX) {
+ *dst_mode = &ctx->mic;
+ } else {
+ *dst_mode = NULL;
+ }
+}
+
+static AOM_INLINE void copy_partition_mode_from_pc_tree(
+ const MB_MODE_INFO **dst_mode, const PC_TREE *pc_tree) {
+ if (pc_tree) {
+ copy_partition_mode_from_mode_context(dst_mode, pc_tree->none);
+ } else {
+ *dst_mode = NULL;
+ }
+}
+
+static AOM_INLINE void set_mode_cache_for_partition_ab(
+ const MB_MODE_INFO **mode_cache, const PC_TREE *pc_tree,
+ AB_PART_TYPE ab_part_type) {
+ switch (ab_part_type) {
+ case HORZ_A:
+ copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+ copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+ copy_partition_mode_from_mode_context(&mode_cache[2],
+ pc_tree->horizontal[1]);
+ break;
+ case HORZ_B:
+ copy_partition_mode_from_mode_context(&mode_cache[0],
+ pc_tree->horizontal[0]);
+ copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+ copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+ break;
+ case VERT_A:
+ copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+ copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+ copy_partition_mode_from_mode_context(&mode_cache[2],
+ pc_tree->vertical[1]);
+ break;
+ case VERT_B:
+ copy_partition_mode_from_mode_context(&mode_cache[0],
+ pc_tree->vertical[0]);
+ copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+ copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+ break;
+ default: assert(0 && "Invalid ab partition type!\n");
+ }
+}
+
+ // AB partition type search.
+static void ab_partitions_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PC_TREE *pc_tree, PartitionSearchState *part_search_state,
+ RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info,
+ int pb_source_variance, int ext_partition_allowed,
+ const AB_PART_TYPE start_type, const AB_PART_TYPE end_type) {
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ if (part_search_state->terminate_partition_search) {
+ return;
+ }
+
+ int ab_partitions_allowed[NUM_AB_PARTS];
+ // Prune AB partitions
+ av1_prune_ab_partitions(cpi, x, pc_tree, pb_source_variance, best_rdc->rdcost,
+ rect_part_win_info, ext_partition_allowed,
+ part_search_state, ab_partitions_allowed);
+
+ // Flags to indicate whether the mode search is done.
+ const int is_ctx_ready[NUM_AB_PARTS][2] = {
+ { part_search_state->is_split_ctx_is_ready[0],
+ part_search_state->is_split_ctx_is_ready[1] },
+ { part_search_state->is_rect_ctx_is_ready[HORZ], 0 },
+ { part_search_state->is_split_ctx_is_ready[0], 0 },
+ { part_search_state->is_rect_ctx_is_ready[VERT], 0 }
+ };
+
+ // Current partition context.
+ PICK_MODE_CONTEXT **cur_part_ctxs[NUM_AB_PARTS] = { pc_tree->horizontala,
+ pc_tree->horizontalb,
+ pc_tree->verticala,
+ pc_tree->verticalb };
+
+ // Context of already evaluated partition types.
+ PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2];
+ // Set context of already evaluated partition types.
+ set_mode_search_ctx(pc_tree, is_ctx_ready, mode_srch_ctx);
+
+ // Array of sub-partition size of AB partition types.
+ const BLOCK_SIZE ab_subsize[NUM_AB_PARTS][SUB_PARTITIONS_AB] = {
+ { blk_params.split_bsize2, blk_params.split_bsize2,
+ get_partition_subsize(bsize, PARTITION_HORZ_A) },
+ { get_partition_subsize(bsize, PARTITION_HORZ_B), blk_params.split_bsize2,
+ blk_params.split_bsize2 },
+ { blk_params.split_bsize2, blk_params.split_bsize2,
+ get_partition_subsize(bsize, PARTITION_VERT_A) },
+ { get_partition_subsize(bsize, PARTITION_VERT_B), blk_params.split_bsize2,
+ blk_params.split_bsize2 }
+ };
+
+ // Array of mi_row, mi_col positions corresponding to each sub-partition of
+ // the AB partition types.
+ const int ab_mi_pos[NUM_AB_PARTS][SUB_PARTITIONS_AB][2] = {
+ { { mi_row, mi_col },
+ { mi_row, blk_params.mi_col_edge },
+ { blk_params.mi_row_edge, mi_col } },
+ { { mi_row, mi_col },
+ { blk_params.mi_row_edge, mi_col },
+ { blk_params.mi_row_edge, blk_params.mi_col_edge } },
+ { { mi_row, mi_col },
+ { blk_params.mi_row_edge, mi_col },
+ { mi_row, blk_params.mi_col_edge } },
+ { { mi_row, mi_col },
+ { mi_row, blk_params.mi_col_edge },
+ { blk_params.mi_row_edge, blk_params.mi_col_edge } }
+ };
+
+ // Loop over AB partition types.
+ for (AB_PART_TYPE ab_part_type = start_type; ab_part_type <= end_type;
+ ab_part_type++) {
+ const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A;
+
+ // Check if the AB partition search is to be performed.
+ if (!ab_partitions_allowed[ab_part_type]) {
+ continue;
+ }
+
+ blk_params.subsize = get_partition_subsize(bsize, part_type);
+ for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+ // Set AB partition context.
+ cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc(
+ cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf);
+ if (!cur_part_ctxs[ab_part_type][i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ // Set mode as not ready.
+ cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
+ }
+
+ if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab) {
+ // We can copy directly the mode search results if we have already
+ // searched the current block and the contexts match.
+ if (is_ctx_ready[ab_part_type][0]) {
+ av1_copy_tree_context(cur_part_ctxs[ab_part_type][0],
+ mode_srch_ctx[ab_part_type][0][0]);
+ cur_part_ctxs[ab_part_type][0]->mic.partition = part_type;
+ cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1;
+ if (is_ctx_ready[ab_part_type][1]) {
+ av1_copy_tree_context(cur_part_ctxs[ab_part_type][1],
+ mode_srch_ctx[ab_part_type][1][0]);
+ cur_part_ctxs[ab_part_type][1]->mic.partition = part_type;
+ cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1;
+ }
+ }
+ }
+
+ // Even if the contexts don't match, we can still speed up by reusing the
+ // previous prediction mode.
+ const MB_MODE_INFO *mode_cache[3] = { NULL, NULL, NULL };
+ if (cpi->sf.part_sf.reuse_best_prediction_for_part_ab) {
+ set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type);
+ }
+
+ // Evaluation of AB partition type.
+ rd_pick_ab_part(cpi, td, tile_data, tp, x, x_ctx, pc_tree,
+ cur_part_ctxs[ab_part_type], part_search_state, best_rdc,
+ ab_subsize[ab_part_type], ab_mi_pos[ab_part_type],
+ part_type, mode_cache);
+ }
+}
+
+// Set mi positions for HORZ4 / VERT4 sub-block partitions.
+static void set_mi_pos_partition4(const int inc_step[NUM_PART4_TYPES],
+ int mi_pos[SUB_PARTITIONS_PART4][2],
+ const int mi_row, const int mi_col) {
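+ // Only one of inc_step[HORZ4] / inc_step[VERT4] is nonzero, so the four
+ // sub-blocks advance along a single axis. E.g., for PARTITION_HORZ_4 on
+ // BLOCK_16X16 the caller sets inc_step = { 1, 0 } in mi units, giving rows
+ // mi_row .. mi_row + 3 at a fixed mi_col.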
+ for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; i++) {
+ mi_pos[i][0] = mi_row + i * inc_step[HORZ4];
+ mi_pos[i][1] = mi_col + i * inc_step[VERT4];
+ }
+}
+
+// Set context and RD cost for HORZ4 / VERT4 partition types.
+static void set_4_part_ctx_and_rdcost(
+ MACROBLOCK *x, const AV1_COMP *const cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
+ PartitionSearchState *part_search_state, PARTITION_TYPE partition_type,
+ BLOCK_SIZE bsize) {
+ // Initialize sum_rdc RD cost structure.
+ av1_init_rd_stats(&part_search_state->sum_rdc);
+ const int subsize = get_partition_subsize(bsize, partition_type);
+ part_search_state->sum_rdc.rate =
+ part_search_state->partition_cost[partition_type];
+ part_search_state->sum_rdc.rdcost =
+ RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0);
+ for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ cur_part_ctx[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!cur_part_ctx[i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+}
+
+// Partition search of HORZ4 / VERT4 partition types.
+static void rd_pick_4partition(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PC_TREE *pc_tree, PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ const int inc_step[NUM_PART4_TYPES], PARTITION_TYPE partition_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ // Frame boundaries in mi units, used to validate the HORZ4 / VERT4
+ // sub-block positions.
+ int mi_pos_check[NUM_PART4_TYPES] = { cm->mi_params.mi_rows,
+ cm->mi_params.mi_cols };
+ const PART4_TYPES part4_idx = (partition_type != PARTITION_HORZ_4);
+ int mi_pos[SUB_PARTITIONS_PART4][2];
+
+ blk_params.subsize = get_partition_subsize(blk_params.bsize, partition_type);
+ // Set partition context and RD cost.
+ set_4_part_ctx_and_rdcost(x, cpi, td, cur_part_ctx, part_search_state,
+ partition_type, blk_params.bsize);
+ // Set mi positions for sub-block sizes.
+ set_mi_pos_partition4(inc_step, mi_pos, blk_params.mi_row, blk_params.mi_col);
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_rdc->rdcost - part_search_state->sum_rdc.rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, partition_type);
+ }
+#endif
+ // Loop over sub-block partitions.
+ for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ if (i > 0 && mi_pos[i][part4_idx] >= mi_pos_check[part4_idx]) break;
+
+ // Sub-block evaluation of Horz4 / Vert4 partition type.
+ cur_part_ctx[i]->rd_mode_is_ready = 0;
+ if (!rd_try_subblock(
+ cpi, td, tile_data, tp, (i == SUB_PARTITIONS_PART4 - 1),
+ mi_pos[i][0], mi_pos[i][1], blk_params.subsize, *best_rdc,
+ &part_search_state->sum_rdc, partition_type, cur_part_ctx[i])) {
+ av1_invalid_rd_stats(&part_search_state->sum_rdc);
+ break;
+ }
+ }
+
+ // Calculate the total cost and update the best partition.
+ av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+ if (part_search_state->sum_rdc.rdcost < best_rdc->rdcost) {
+ *best_rdc = part_search_state->sum_rdc;
+ part_search_state->found_best_partition = true;
+ pc_tree->partitioning = partition_type;
+ }
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (part_timing_stats->timer_is_on) {
+ end_partition_block_timer(part_timing_stats, partition_type,
+ part_search_state->sum_rdc.rdcost);
+ }
+#endif
+ av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+ blk_params.bsize, av1_num_planes(cm));
+}
+
+// Do not evaluate extended partitions if NONE partition is skippable.
+static INLINE int prune_ext_part_none_skippable(
+ PICK_MODE_CONTEXT *part_none, int must_find_valid_partition,
+ int skip_non_sq_part_based_on_none, BLOCK_SIZE bsize) {
+ if ((skip_non_sq_part_based_on_none >= 1) && (part_none != NULL)) {
+ if (part_none->skippable && !must_find_valid_partition &&
+ bsize >= BLOCK_16X16) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+ // Decide whether AB partition search is allowed.
+static int allow_ab_partition_search(PartitionSearchState *part_search_state,
+ PARTITION_SPEED_FEATURES *part_sf,
+ PARTITION_TYPE curr_best_part,
+ int must_find_valid_partition,
+ int prune_ext_part_state,
+ int64_t best_rdcost) {
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ // Do not prune if there is no valid partition
+ if (best_rdcost == INT64_MAX) return 1;
+
+ // Determine the bsize threshold above which AB partitions are evaluated.
+ BLOCK_SIZE ab_bsize_thresh = part_sf->ext_partition_eval_thresh;
+ if (part_sf->ext_part_eval_based_on_cur_best && !must_find_valid_partition &&
+ !(curr_best_part == PARTITION_HORZ || curr_best_part == PARTITION_VERT))
+ ab_bsize_thresh = BLOCK_128X128;
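+ // BLOCK_128X128 is the largest block size, so the "bsize > ab_bsize_thresh"
+ // check below can never pass, effectively disabling AB partitions.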
+
+ // AB partitions are only allowed for square block sizes BLOCK_16X16 or
+ // higher, so ab_bsize_thresh must be large enough to exclude BLOCK_4X4 and
+ // BLOCK_8X8.
+ assert(ab_bsize_thresh >= BLOCK_8X8);
+
+ int ab_partition_allowed =
+ part_search_state->do_rectangular_split && bsize > ab_bsize_thresh &&
+ av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+ return ab_partition_allowed;
+}
+
+// Prune 4-way partitions based on the number of horz/vert wins
+// in the current block and sub-blocks in PARTITION_SPLIT.
+static void prune_4_partition_using_split_info(
+ AV1_COMP *const cpi, MACROBLOCK *x, PartitionSearchState *part_search_state,
+ int part4_search_allowed[NUM_PART4_TYPES]) {
+ PART4_TYPES cur_part[NUM_PART4_TYPES] = { HORZ4, VERT4 };
+ // Count of child blocks in which HORZ or VERT partition has won
+ int num_child_rect_win[NUM_RECT_PARTS] = { 0, 0 };
+ // Prune HORZ4/VERT4 partitions based on the number of HORZ/VERT winners
+ // among the split partitions.
+ // Conservative pruning for high quantizers.
+ const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
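+ // E.g., with MAXQ = 255: qindex = 255 gives a threshold of 1 (prune unless
+ // at least one sub-block picked the rectangular type), while qindex = 0
+ // gives min(4, 3) = 3.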
+
+ for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) {
+ if (!(cpi->sf.part_sf.prune_ext_part_using_split_info &&
+ part4_search_allowed[cur_part[i]]))
+ continue;
+ // Loop over the PARTITION_SPLIT sub-blocks and gather their
+ // rectangular-partition winner info.
+ for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; idx++)
+ num_child_rect_win[i] +=
+ (part_search_state->split_part_rect_win[idx].rect_part_win[i]) ? 1
+ : 0;
+ if (num_child_rect_win[i] < num_win_thresh) {
+ part4_search_allowed[cur_part[i]] = 0;
+ }
+ }
+}
+
+// Prune 4-way partition search.
+static void prune_4_way_partition_search(
+ AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ int pb_source_variance, int prune_ext_part_state,
+ int part4_search_allowed[NUM_PART4_TYPES]) {
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ // Do not prune if there is no valid partition
+ if (best_rdc->rdcost == INT64_MAX) return;
+
+ // Determine the bsize threshold above which 4-way partitions are evaluated.
+ BLOCK_SIZE part4_bsize_thresh = cpi->sf.part_sf.ext_partition_eval_thresh;
+ if (cpi->sf.part_sf.ext_part_eval_based_on_cur_best &&
+ !x->must_find_valid_partition && pc_tree->partitioning == PARTITION_NONE)
+ part4_bsize_thresh = BLOCK_128X128;
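+ // As with AB partitions, a threshold of BLOCK_128X128 means the
+ // "bsize > part4_bsize_thresh" check below can never pass.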
+
+ // 4-way partitions are only allowed for BLOCK_16X16, BLOCK_32X32, and
+ // BLOCK_64X64, so part4_bsize_thresh must be large enough to exclude
+ // BLOCK_4X4 and BLOCK_8X8.
+ assert(part4_bsize_thresh >= BLOCK_8X8);
+
+ bool partition4_allowed =
+ part_search_state->do_rectangular_split && bsize > part4_bsize_thresh &&
+ av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+ // Disable 4-way partition search when the block width is smaller than the
+ // minimum partition width shifted left by the prune_part4_search level.
+ if (blk_params.width < (blk_params.min_partition_size_1d
+ << cpi->sf.part_sf.prune_part4_search)) {
+ part4_search_allowed[HORZ4] = 0;
+ part4_search_allowed[VERT4] = 0;
+ return;
+ }
+
+ PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4,
+ PARTITION_VERT_4 };
+ const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+ // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+ // PARTITION_VERT_4 for this block. Restrict it further here: 1:4
+ // partitions must be enabled, and since 128x32 / 32x128 blocks are not
+ // allowed, bsize must not be BLOCK_128X128.
+ partition4_allowed &=
+ part_cfg->enable_1to4_partitions && bsize != BLOCK_128X128;
+
+ for (PART4_TYPES i = HORZ4; i < NUM_PART4_TYPES; i++) {
+ part4_search_allowed[i] =
+ partition4_allowed && part_search_state->partition_rect_allowed[i] &&
+ get_plane_block_size(get_partition_subsize(bsize, cur_part[i]),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+ }
+ // Prune out 4-way partitions based on the current best partition.
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
+ part4_search_allowed[HORZ4] &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_HORZ_A ||
+ pc_tree->partitioning == PARTITION_HORZ_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
+ part4_search_allowed[VERT4] &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_VERT_A ||
+ pc_tree->partitioning == PARTITION_VERT_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
+ }
+
+ // Prune out some 4-way partitions using a DNN that takes the rd costs of
+ // sub-blocks from the basic partition types.
+ if (cpi->sf.part_sf.ml_prune_partition && partition4_allowed &&
+ part_search_state->partition_rect_allowed[HORZ] &&
+ part_search_state->partition_rect_allowed[VERT]) {
+ av1_ml_prune_4_partition(cpi, x, pc_tree->partitioning, best_rdc->rdcost,
+ part_search_state, part4_search_allowed,
+ pb_source_variance);
+ }
+
+ // Prune out 4-way partitions based on the number of HORZ/VERT wins in the
+ // current block and its PARTITION_SPLIT sub-blocks.
+ prune_4_partition_using_split_info(cpi, x, part_search_state,
+ part4_search_allowed);
+}
+
+// Set params needed for PARTITION_NONE search.
+static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td,
+ MACROBLOCK *x, PC_TREE *pc_tree,
+ PartitionSearchState *part_search_state,
+ RD_STATS *best_remain_rdcost,
+ RD_STATS *best_rdc, int *pt_cost) {
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS partition_rdcost;
+ // Set PARTITION_NONE context.
+ if (pc_tree->none == NULL)
+ pc_tree->none = av1_alloc_pmc(cpi, blk_params.bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+
+ // Set PARTITION_NONE type cost.
+ if (part_search_state->partition_none_allowed) {
+ if (blk_params.bsize_at_least_8x8) {
+ *pt_cost = part_search_state->partition_cost[PARTITION_NONE] < INT_MAX
+ ? part_search_state->partition_cost[PARTITION_NONE]
+ : 0;
+ }
+
+ // Initialize the RD stats structure.
+ av1_init_rd_stats(&partition_rdcost);
+ partition_rdcost.rate = *pt_cost;
+ av1_rd_cost_update(x->rdmult, &partition_rdcost);
+ av1_rd_stats_subtraction(x->rdmult, best_rdc, &partition_rdcost,
+ best_remain_rdcost);
+ }
+}
+
+// Skip other partitions based on PARTITION_NONE rd cost.
+static void prune_partitions_after_none(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PICK_MODE_CONTEXT *ctx_none,
+ PartitionSearchState *part_search_state,
+ RD_STATS *best_rdc,
+ unsigned int *pb_source_variance) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS *this_rdc = &part_search_state->this_rdc;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ if (!frame_is_intra_only(cm) &&
+ (part_search_state->do_square_split ||
+ part_search_state->do_rectangular_split) &&
+ !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+ const int use_ml_based_breakout =
+ bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
+ bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1;
+ if (use_ml_based_breakout) {
+ av1_ml_predict_breakout(cpi, x, this_rdc, *pb_source_variance, xd->bd,
+ part_search_state);
+ }
+
+ // Adjust dist breakout threshold according to the partition size.
+ const int64_t dist_breakout_thr =
+ cpi->sf.part_sf.partition_search_breakout_dist_thr >>
+ ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
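+ // E.g., with MAX_SB_SIZE_LOG2 = 7 the shift is 0 for BLOCK_128X128 and 2
+ // for BLOCK_64X64, so smaller partitions use proportionally smaller
+ // distortion breakout thresholds.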
+ const int rate_breakout_thr =
+ cpi->sf.part_sf.partition_search_breakout_rate_thr *
+ num_pels_log2_lookup[bsize];
+ // If all y, u, v transform blocks in this partition are skippable,
+ // and the dist & rate are within the thresholds, the partition
+ // search is terminated for the current branch of the partition search
+ // tree. The dist & rate thresholds are set to 0 at speed 0 to
+ // disable the early termination at that speed.
+ if (best_rdc->dist < dist_breakout_thr &&
+ best_rdc->rate < rate_breakout_thr) {
+ part_search_state->do_square_split = 0;
+ part_search_state->do_rectangular_split = 0;
+ }
+ }
+
+ // Early termination: using simple_motion_search features and the rate,
+ // distortion, and rd cost of PARTITION_NONE, a DNN decides whether to
+ // terminate the partition search early at PARTITION_NONE.
+ if (cpi->sf.part_sf.simple_motion_search_early_term_none && cm->show_frame &&
+ !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 &&
+ av1_blk_has_rows_and_cols(&blk_params) && this_rdc->rdcost < INT64_MAX &&
+ this_rdc->rdcost >= 0 && this_rdc->rate < INT_MAX &&
+ this_rdc->rate >= 0 &&
+ (part_search_state->do_square_split ||
+ part_search_state->do_rectangular_split)) {
+ av1_simple_motion_search_early_term_none(cpi, x, sms_tree, this_rdc,
+ part_search_state);
+ }
+}
+
+// Decide early termination and rectangular partition pruning
+// based on PARTITION_NONE and PARTITION_SPLIT costs.
+static void prune_partitions_after_split(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ int64_t part_none_rd, int64_t part_split_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ // Early termination: using the rd costs of PARTITION_NONE and subblocks
+ // from PARTITION_SPLIT to determine an early breakout.
+ if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+ !frame_is_intra_only(cm) &&
+ !part_search_state->terminate_partition_search &&
+ part_search_state->do_rectangular_split &&
+ (part_search_state->partition_rect_allowed[HORZ] ||
+ part_search_state->partition_rect_allowed[VERT])) {
+ av1_ml_early_term_after_split(
+ cpi, x, sms_tree, best_rdc->rdcost, part_none_rd, part_split_rd,
+ part_search_state->split_rd, part_search_state);
+ }
+
+ // Use the rd costs of PARTITION_NONE and subblocks from PARTITION_SPLIT
+ // to prune out rectangular partitions in some directions.
+ if (!cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+ cpi->sf.part_sf.ml_prune_partition && !frame_is_intra_only(cm) &&
+ (part_search_state->partition_rect_allowed[HORZ] ||
+ part_search_state->partition_rect_allowed[VERT]) &&
+ !(part_search_state->prune_rect_part[HORZ] ||
+ part_search_state->prune_rect_part[VERT]) &&
+ !part_search_state->terminate_partition_search) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm),
+ bsize);
+ av1_ml_prune_rect_partition(cpi, x, best_rdc->rdcost,
+ part_search_state->none_rd,
+ part_search_state->split_rd, part_search_state);
+ }
+}
+
+ // Returns true if either the left or the top neighbor block is larger than
+ // the current block; false otherwise.
+static AOM_INLINE bool is_neighbor_blk_larger_than_cur_blk(
+ const MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+ const int cur_blk_area = (block_size_high[bsize] * block_size_wide[bsize]);
+ if (xd->left_available) {
+ const BLOCK_SIZE left_bsize = xd->left_mbmi->bsize;
+ if (block_size_high[left_bsize] * block_size_wide[left_bsize] >
+ cur_blk_area)
+ return true;
+ }
+
+ if (xd->up_available) {
+ const BLOCK_SIZE above_bsize = xd->above_mbmi->bsize;
+ if (block_size_high[above_bsize] * block_size_wide[above_bsize] >
+ cur_blk_area)
+ return true;
+ }
+ return false;
+}
+
+static AOM_INLINE void prune_rect_part_using_none_pred_mode(
+ const MACROBLOCKD *xd, PartitionSearchState *part_state,
+ PREDICTION_MODE mode, BLOCK_SIZE bsize) {
+ if (mode == DC_PRED || mode == SMOOTH_PRED) {
+ // If the prediction mode of the NONE partition is either DC_PRED or
+ // SMOOTH_PRED, the current block likely has low variation. In this case,
+ // HORZ and VERT partitions are pruned if at least one of the left and top
+ // neighbor blocks is larger than the current block.
+ if (is_neighbor_blk_larger_than_cur_blk(xd, bsize)) {
+ part_state->prune_rect_part[HORZ] = 1;
+ part_state->prune_rect_part[VERT] = 1;
+ }
+ } else if (mode == D67_PRED || mode == V_PRED || mode == D113_PRED) {
+ // If the prediction mode chosen by NONE partition is close to 90 degrees,
+ // it implies a dominant vertical pattern, and the chance of choosing a
+ // vertical rectangular partition is high. Hence, horizontal partition is
+ // pruned in these cases.
+ part_state->prune_rect_part[HORZ] = 1;
+ } else if (mode == D157_PRED || mode == H_PRED || mode == D203_PRED) {
+ // If the prediction mode chosen by NONE partition is close to 180 degrees,
+ // it implies a dominant horizontal pattern, and the chance of choosing a
+ // horizontal rectangular partition is high. Hence, vertical partition is
+ // pruned in these cases.
+ part_state->prune_rect_part[VERT] = 1;
+ }
+}
+
+// PARTITION_NONE search.
+static void none_partition_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MACROBLOCK *x,
+ PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ unsigned int *pb_source_variance, int64_t *none_rd, int64_t *part_none_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS *this_rdc = &part_search_state->this_rdc;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ if (part_search_state->terminate_partition_search ||
+ !part_search_state->partition_none_allowed)
+ return;
+
+ int pt_cost = 0;
+ RD_STATS best_remain_rdcost;
+ av1_invalid_rd_stats(&best_remain_rdcost);
+
+ // Set PARTITION_NONE context and cost.
+ set_none_partition_params(cpi, td, x, pc_tree, part_search_state,
+ &best_remain_rdcost, best_rdc, &pt_cost);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Timer start for partition None.
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_remain_rdcost.rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, PARTITION_NONE);
+ }
+#endif
+ // PARTITION_NONE evaluation and cost update.
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, PARTITION_NONE,
+ bsize, pc_tree->none, best_remain_rdcost);
+
+ av1_rd_cost_update(x->rdmult, this_rdc);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Timer end for partition None.
+ if (part_timing_stats->timer_is_on) {
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ if (this_rdc->rate != INT_MAX) {
+ tmp_rdc.rate = this_rdc->rate;
+ tmp_rdc.dist = this_rdc->dist;
+ tmp_rdc.rdcost = this_rdc->rdcost;
+ if (blk_params.bsize_at_least_8x8) {
+ tmp_rdc.rate += pt_cost;
+ tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
+ }
+ }
+ end_partition_block_timer(part_timing_stats, PARTITION_NONE,
+ tmp_rdc.rdcost);
+ }
+#endif
+ *pb_source_variance = x->source_variance;
+ if (none_rd) *none_rd = this_rdc->rdcost;
+ part_search_state->none_rd = this_rdc->rdcost;
+ if (this_rdc->rate != INT_MAX) {
+ // Record picked ref frame to prune ref frames for other partition types.
+ if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
+ const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame);
+ av1_update_picked_ref_frames_mask(
+ x, ref_type, bsize, cm->seq_params->mib_size, mi_row, mi_col);
+ }
+
+ // Calculate the total cost and update the best partition.
+ if (blk_params.bsize_at_least_8x8) {
+ this_rdc->rate += pt_cost;
+ this_rdc->rdcost = RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist);
+ }
+ *part_none_rd = this_rdc->rdcost;
+ if (this_rdc->rdcost < best_rdc->rdcost) {
+ *best_rdc = *this_rdc;
+ part_search_state->found_best_partition = true;
+ if (blk_params.bsize_at_least_8x8) {
+ pc_tree->partitioning = PARTITION_NONE;
+ }
+
+ // Disable split and rectangular partition search
+ // based on PARTITION_NONE cost.
+ prune_partitions_after_none(cpi, x, sms_tree, pc_tree->none,
+ part_search_state, best_rdc,
+ pb_source_variance);
+ }
+
+ if (cpi->sf.part_sf.prune_rect_part_using_none_pred_mode)
+ prune_rect_part_using_none_pred_mode(&x->e_mbd, part_search_state,
+ pc_tree->none->mic.mode, bsize);
+ }
+ av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// PARTITION_SPLIT search.
+static void split_partition_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+ SIMPLE_MOTION_DATA_TREE *sms_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ SB_MULTI_PASS_MODE multi_pass_mode, int64_t *part_split_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+ RD_STATS sum_rdc = part_search_state->sum_rdc;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ // Check if partition split is allowed.
+ if (part_search_state->terminate_partition_search ||
+ !part_search_state->do_square_split)
+ return;
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (pc_tree->split[i] == NULL)
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+
+ // Initialization of this partition RD stats.
+ av1_init_rd_stats(&sum_rdc);
+ sum_rdc.rate = part_search_state->partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+ int idx;
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_rdc->rdcost - sum_rdc.rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, PARTITION_SPLIT);
+ }
+#endif
+ // Recursive partition search on 4 sub-blocks.
+ for (idx = 0; idx < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc->rdcost;
+ ++idx) {
+ const int x_idx = (idx & 1) * blk_params.mi_step;
+ const int y_idx = (idx >> 1) * blk_params.mi_step;
+
+ if (mi_row + y_idx >= mi_params->mi_rows ||
+ mi_col + x_idx >= mi_params->mi_cols)
+ continue;
+
+ pc_tree->split[idx]->index = idx;
+ int64_t *p_split_rd = &part_search_state->split_rd[idx];
+ RD_STATS best_remain_rdcost;
+ av1_rd_stats_subtraction(x->rdmult, best_rdc, &sum_rdc,
+ &best_remain_rdcost);
+
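+ // The implicit quadtree numbering assigns children 4 * k + 1 .. 4 * k + 4
+ // to the node with index k, as reflected in the quad_tree_idx update below.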
+ int curr_quad_tree_idx = 0;
+ if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+ curr_quad_tree_idx = part_search_state->intra_part_info->quad_tree_idx;
+ part_search_state->intra_part_info->quad_tree_idx =
+ 4 * curr_quad_tree_idx + idx + 1;
+ }
+ // Split partition evaluation at the corresponding idx.
+ // If the RD cost exceeds the best cost, do not evaluate the remaining
+ // split sub-partitions.
+ SIMPLE_MOTION_DATA_TREE *const sms_tree_split =
+ (sms_tree == NULL) ? NULL : sms_tree->split[idx];
+ if (!av1_rd_pick_partition(
+ cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &part_search_state->this_rdc, best_remain_rdcost,
+ pc_tree->split[idx], sms_tree_split, p_split_rd, multi_pass_mode,
+ &part_search_state->split_part_rect_win[idx])) {
+ av1_invalid_rd_stats(&sum_rdc);
+ break;
+ }
+ if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+ part_search_state->intra_part_info->quad_tree_idx = curr_quad_tree_idx;
+ }
+
+ sum_rdc.rate += part_search_state->this_rdc.rate;
+ sum_rdc.dist += part_search_state->this_rdc.dist;
+ av1_rd_cost_update(x->rdmult, &sum_rdc);
+
+ // Set split ctx as ready for use.
+ if (idx <= 1 && (bsize <= BLOCK_8X8 ||
+ pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
+ const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none->mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted.
+ if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED)
+ part_search_state->is_split_ctx_is_ready[idx] = 1;
+ }
+ }
+ }
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (part_timing_stats->timer_is_on) {
+ end_partition_block_timer(part_timing_stats, PARTITION_SPLIT,
+ sum_rdc.rdcost);
+ }
+#endif
+ const int reached_last_index = (idx == SUB_PARTITIONS_SPLIT);
+
+ // Calculate the total cost and update the best partition.
+ *part_split_rd = sum_rdc.rdcost;
+ if (reached_last_index && sum_rdc.rdcost < best_rdc->rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+ *best_rdc = sum_rdc;
+ part_search_state->found_best_partition = true;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) {
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split.
+ if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
+ const int partition_none_valid = part_search_state->none_rd > 0;
+ const int partition_none_better =
+ part_search_state->none_rd < sum_rdc.rdcost;
+ part_search_state->do_rectangular_split &=
+ !(partition_none_valid && partition_none_better);
+ }
+ }
+ // Restore the context in the following cases:
+ // 1) The current block size is not larger than the maximum partition size,
+ // since a dry-run encode happens for these blocks.
+ // 2) The current block size equals the superblock size, since the final
+ // encode happens for this block.
+ if (bsize <= x->sb_enc.max_partition_size || bsize == cm->seq_params->sb_size)
+ av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+ // The max number of nodes in the partition tree.
+ // The number of leaf nodes is (128x128) / (4x4) = 1024.
+ // The number of all possible parent nodes is 1 + 2 + ... + 512 = 1023,
+ // so at most 1024 + 1023 = 2047 nodes are needed, within NUM_NODES = 2048.
+#define NUM_NODES 2048
+
+static void write_partition_tree(AV1_COMP *const cpi,
+ const PC_TREE *const pc_tree,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col) {
+ (void)mi_row;
+ (void)mi_col;
+ const char *path = cpi->oxcf.partition_info_path;
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
+ cpi->sb_counter, 0);
+ FILE *pfile = fopen(filename, "w");
+ if (pfile == NULL) return;
+ fprintf(pfile, "%d", bsize);
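+ // The file holds a single text line: the bsize, the total node count, the
+ // number of configs (always 1 here), then the partitioning of each node in
+ // BFS order.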
+
+ // Write partition types in BFS order.
+ const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+ int q_idx = 0;
+ int last_idx = 1;
+ int num_nodes = 1;
+
+ // First traversal to count the total number of tree nodes.
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ const PC_TREE *node = tree_node_queue[q_idx];
+ if (node->partitioning == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ num_nodes += 4;
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+ const int num_leafs = last_idx;  // Total node count, despite the name.
+ fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1);
+
+ // Write partitions for each node.
+ q_idx = 0;
+ last_idx = 1;
+ num_nodes = 1;
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ const PC_TREE *node = tree_node_queue[q_idx];
+ fprintf(pfile, ",%d", node->partitioning);
+ if (node->partitioning == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ num_nodes += 4;
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+ fprintf(pfile, "\n");
+
+ fclose(pfile);
+}
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+static void verify_write_partition_tree(const AV1_COMP *const cpi,
+ const PC_TREE *const pc_tree,
+ const BLOCK_SIZE bsize,
+ const int config_id, const int mi_row,
+ const int mi_col) {
+ (void)mi_row;
+ (void)mi_col;
+ const char *path = cpi->oxcf.partition_info_path;
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/verify_partition_tree_sb%d_c%d",
+ path, cpi->sb_counter, config_id);
+ FILE *pfile = fopen(filename, "w");
+ if (pfile == NULL) return;
+ fprintf(pfile, "%d", bsize);
+
+ // Write partition types in BFS order.
+ const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+ int q_idx = 0;
+ int last_idx = 1;
+ int num_nodes = 1;
+
+ // First traversal to count the total number of tree nodes.
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ const PC_TREE *node = tree_node_queue[q_idx];
+ if (node != NULL && node->partitioning == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ num_nodes += 4;
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+ const int num_leafs = last_idx;  // Total node count, despite the name.
+ fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1);
+
+ // Write partitions for each node.
+ q_idx = 0;
+ last_idx = 1;
+ num_nodes = 1;
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ const PC_TREE *node = tree_node_queue[q_idx];
+ if (node != NULL) { // suppress warning
+ fprintf(pfile, ",%d", node->partitioning);
+ if (node->partitioning == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ num_nodes += 4;
+ }
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+ fprintf(pfile, "\n");
+
+ fclose(pfile);
+}
+
+static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree,
+ struct aom_internal_error_info *error_info,
+ const int config_id) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const char *path = cpi->oxcf.partition_info_path;
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
+ cpi->sb_counter, config_id);
+ FILE *pfile = fopen(filename, "r");
+ if (pfile == NULL) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR, "Can't find input file: %s.",
+ filename);
+ }
+
+ int read_bsize;
+ int num_nodes;
+ int num_configs;
+ if (fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs) != 3) {
+ fclose(pfile);
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Failed to parse partition tree file: %s.", filename);
+ }
+ assert(read_bsize == cpi->common.seq_params->sb_size);
+ BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize;
+ assert(bsize == pc_tree->block_size);
+
+ PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+ int last_idx = 1;
+ int q_idx = 0;
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ int partitioning = PARTITION_NONE;
+ if (fscanf(pfile, ",%d", &partitioning) != 1) break;
+ assert(partitioning >= PARTITION_NONE &&
+ partitioning < EXT_PARTITION_TYPES);
+ PC_TREE *node = tree_node_queue[q_idx];
+ if (node != NULL) {
+ node->partitioning = partitioning;
+ bsize = node->block_size;
+ }
+ if (partitioning == PARTITION_SPLIT) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int i = 0; i < 4; ++i) {
+ if (node != NULL) { // Suppress warning
+ node->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!node->split[i])
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ node->split[i]->index = i;
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ }
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+ fclose(pfile);
+
+ return num_configs;
+}
+
+static RD_STATS rd_search_for_fixed_partition(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col,
+ const BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ RD_STATS best_rdc;
+ av1_invalid_rd_stats(&best_rdc);
+ int sum_subblock_rate = 0;
+ int64_t sum_subblock_dist = 0;
+ PartitionSearchState part_search_state;
+ init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
+ bsize);
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c).
+ PartitionBlkParams blk_params = part_search_state.part_blk_params;
+ if (!av1_blk_has_rows_and_cols(&blk_params))
+ set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+ // Set the context.
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ assert(bsize < BLOCK_SIZES_ALL);
+ unsigned int pb_source_variance = UINT_MAX;
+ int64_t part_none_rd = INT64_MAX;
+ int64_t none_rd = INT64_MAX;
+ int inc_step[NUM_PART4_TYPES] = { 0 };
+ if (partition == PARTITION_HORZ_4) inc_step[HORZ4] = mi_size_high[bsize] / 4;
+ if (partition == PARTITION_VERT_4) inc_step[VERT4] = mi_size_wide[bsize] / 4;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+ &part_search_state, &best_rdc, &pb_source_variance,
+ &none_rd, &part_none_rd);
+ break;
+ case PARTITION_HORZ:
+ rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+ &part_search_state, &best_rdc, NULL, HORZ,
+ HORZ);
+ break;
+ case PARTITION_VERT:
+ rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+ &part_search_state, &best_rdc, NULL, VERT,
+ VERT);
+ break;
+ case PARTITION_HORZ_A:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, HORZ_A, HORZ_A);
+ break;
+ case PARTITION_HORZ_B:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, HORZ_B, HORZ_B);
+ break;
+ case PARTITION_VERT_A:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, VERT_A, VERT_A);
+ break;
+ case PARTITION_VERT_B:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, VERT_B, VERT_B);
+ break;
+ case PARTITION_HORZ_4:
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->horizontal4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_HORZ_4);
+ break;
+ case PARTITION_VERT_4:
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->vertical4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_VERT_4);
+ break;
+ case PARTITION_SPLIT:
+ for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; ++idx) {
+ const BLOCK_SIZE subsize =
+ get_partition_subsize(bsize, PARTITION_SPLIT);
+ assert(subsize < BLOCK_SIZES_ALL);
+ const int next_mi_row =
+ idx < 2 ? mi_row : mi_row + mi_size_high[subsize];
+ const int next_mi_col =
+ idx % 2 == 0 ? mi_col : mi_col + mi_size_wide[subsize];
+ if (next_mi_row >= cm->mi_params.mi_rows ||
+ next_mi_col >= cm->mi_params.mi_cols) {
+ continue;
+ }
+ const RD_STATS subblock_rdc = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_tree->split[idx], next_mi_row,
+ next_mi_col, subsize, pc_tree->split[idx]);
+ sum_subblock_rate += subblock_rdc.rate;
+ sum_subblock_dist += subblock_rdc.dist;
+ }
+ best_rdc.rate = sum_subblock_rate;
+ best_rdc.rate += part_search_state.partition_cost[PARTITION_SPLIT];
+ best_rdc.dist = sum_subblock_dist;
+ best_rdc.rdcost = RDCOST(x->rdmult, best_rdc.rate, best_rdc.dist);
+ break;
+ default:
+ assert(0 && "invalid partition type.");
+ aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid partition type.");
+ }
+ // Note: it is necessary to restore context information.
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (bsize != cm->seq_params->sb_size) {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ x->rdmult = orig_rdmult;
+
+ return best_rdc;
+}
+
+static void prepare_sb_features_before_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row,
+ int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features) {
+ av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+ bsize, features);
+ collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, features);
+}
+
+static void update_partition_stats(const RD_STATS *const this_rdcost,
+ aom_partition_stats_t *stats) {
+ stats->rate = this_rdcost->rate;
+ stats->dist = this_rdcost->dist;
+ stats->rdcost = this_rdcost->rdcost;
+}
+
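+// Builds a PC_TREE from the breadth-first list of per-node partition
+// decisions provided by the external model, allocating child nodes for
+// every PARTITION_SPLIT decision.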
+static void build_pc_tree_from_part_decision(
+ const aom_partition_decision_t *partition_decision,
+ const BLOCK_SIZE this_bsize, PC_TREE *pc_tree,
+ struct aom_internal_error_info *error_info) {
+ BLOCK_SIZE bsize = this_bsize;
+ int num_nodes = partition_decision->num_nodes;
+ PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+ int last_idx = 1;
+ int q_idx = 0;
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ const int partitioning = partition_decision->partition_decision[q_idx];
+ assert(partitioning >= PARTITION_NONE &&
+ partitioning < EXT_PARTITION_TYPES);
+ PC_TREE *node = tree_node_queue[q_idx];
+ if (node != NULL) {
+ node->partitioning = partitioning;
+ bsize = node->block_size;
+ }
+ if (partitioning == PARTITION_SPLIT) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int i = 0; i < 4; ++i) {
+ if (node != NULL) { // Suppress warning
+ node->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!node->split[i])
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ node->split[i]->index = i;
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ }
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+}
+
+// The ML model needs to provide the whole decision tree for the superblock.
+static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data,
+ TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ int mi_row, int mi_col,
+ const BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ struct aom_internal_error_info *error_info = x->e_mbd.error_info;
+ aom_partition_features_t features;
+ prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+ &features);
+ features.mi_row = mi_row;
+ features.mi_col = mi_col;
+ features.frame_width = cpi->frame_info.frame_width;
+ features.frame_height = cpi->frame_info.frame_height;
+ features.block_size = bsize;
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // RD mode search (dry run) for a valid partition decision from the ML model.
+ aom_partition_decision_t partition_decision;
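+ // Feedback loop with the external model: fetch a decision, evaluate its
+ // rd cost with a dry-run search, and report the stats back until the
+ // model marks a decision as final.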
+ do {
+ const bool valid_decision = av1_ext_part_get_partition_decision(
+ ext_part_controller, &partition_decision);
+ if (!valid_decision) return false;
+
+ // First, take the easy approach: require the ML model to provide
+ // partition decisions for the whole superblock.
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ build_pc_tree_from_part_decision(&partition_decision, bsize, td->pc_root,
+ error_info);
+
+ const RD_STATS this_rdcost = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
+ aom_partition_stats_t stats;
+ update_partition_stats(&this_rdcost, &stats);
+ av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+ if (!partition_decision.is_final_decision) {
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ }
+ } while (!partition_decision.is_final_decision);
+
+ // Encode with the selected mode and partition.
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ td->pc_root, NULL);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+
+ return true;
+}
+
+// Use a bitmask to represent the valid partition types for the current
+// block. A "1" bit means the corresponding partition type is valid.
+// The least significant bit represents "PARTITION_NONE", and the most
+// significant bit represents "PARTITION_VERT_4", following the enum
+// order of PARTITION_TYPE in "enums.h".
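+// For example, a mask of 0b0000001111 means that only PARTITION_NONE,
+// PARTITION_HORZ, PARTITION_VERT and PARTITION_SPLIT are valid choices.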
+static int get_valid_partition_types(
+ const AV1_COMP *const cpi,
+ const PartitionSearchState *const part_search_state,
+ const BLOCK_SIZE bsize) {
+ const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ int valid_types = 0;
+ // PARTITION_NONE
+ valid_types |= (part_search_state->partition_none_allowed << 0);
+ // PARTITION_HORZ
+ valid_types |= (part_search_state->partition_rect_allowed[HORZ] << 1);
+ // PARTITION_VERT
+ valid_types |= (part_search_state->partition_rect_allowed[VERT] << 2);
+ // PARTITION_SPLIT
+ valid_types |= (part_search_state->do_square_split << 3);
+ // PARTITION_HORZ_A
+ const int ext_partition_allowed = part_search_state->do_rectangular_split &&
+ av1_blk_has_rows_and_cols(&blk_params);
+ const int horzab_partition_allowed =
+ ext_partition_allowed && part_cfg->enable_ab_partitions &&
+ part_search_state->partition_rect_allowed[HORZ];
+ valid_types |= (horzab_partition_allowed << 4);
+ // PARTITION_HORZ_B
+ valid_types |= (horzab_partition_allowed << 5);
+ // PARTITION_VERT_A
+ const int vertab_partition_allowed =
+ ext_partition_allowed && part_cfg->enable_ab_partitions &&
+ part_search_state->partition_rect_allowed[VERT];
+ valid_types |= (vertab_partition_allowed << 6);
+ // PARTITION_VERT_B
+ valid_types |= (vertab_partition_allowed << 7);
+ // PARTITION_HORZ_4
+ const int partition4_allowed = part_cfg->enable_1to4_partitions &&
+ ext_partition_allowed &&
+ bsize != BLOCK_128X128;
+ const int horz4_allowed =
+ partition4_allowed && part_search_state->partition_rect_allowed[HORZ] &&
+ get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+ valid_types |= (horz4_allowed << 8);
+ // PARTITION_VERT_4
+ const int vert4_allowed =
+ partition4_allowed && part_search_state->partition_rect_allowed[VERT] &&
+ get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+ valid_types |= (vert4_allowed << 9);
+
+ return valid_types;
+}
+
+static void prepare_tpl_stats_block(const AV1_COMP *const cpi,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int64_t *intra_cost,
+ int64_t *inter_cost, int64_t *mc_dep_cost) {
+ const AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+ return;
+ }
+
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ // Return early if the TPL stats are not ready.
+ if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+ return;
+ }
+
+ const int tpl_stride = tpl_frame->stride;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+ const int mi_width =
+ AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+ const int mi_height =
+ AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+
+ int64_t sum_intra_cost = 0;
+ int64_t sum_inter_cost = 0;
+ int64_t sum_mc_dep_cost = 0;
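+ // Walk the tpl stats grid covering this block; each entry spans
+ // (1 << tpl_stats_block_mis_log2) mi units in each dimension.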
+ for (int row = 0; row < mi_height; row += step) {
+ for (int col = 0; col < mi_width; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+ tpl_data->tpl_stats_block_mis_log2)];
+ sum_intra_cost += this_stats->intra_cost;
+ sum_inter_cost += this_stats->inter_cost;
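+ // mc_dep_rate/mc_dep_dist capture how much later frames depend on this
+ // block; fold them into a single rd cost using the frame-level rdmult.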
+ const int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ sum_mc_dep_cost += mc_dep_delta;
+ }
+ }
+
+ *intra_cost = sum_intra_cost;
+ *inter_cost = sum_inter_cost;
+ *mc_dep_cost = sum_mc_dep_cost;
+}
+
+static bool recursive_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ PC_TREE *pc_tree, int mi_row, int mi_col,
+ const BLOCK_SIZE bsize, RD_STATS *this_rdcost) {
+ const AV1_COMMON *const cm = &cpi->common;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) {
+ return false;
+ }
+ aom_partition_decision_t partition_decision;
+ do {
+ PartitionSearchState part_search_state;
+ // Initialization of state variables used in partition search.
+ // TODO(chengchen): check if there are hidden conditions that don't allow
+ // all possible partition types.
+ init_partition_search_state_params(x, cpi, &part_search_state, mi_row,
+ mi_col, bsize);
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c).
+ PartitionBlkParams blk_params = part_search_state.part_blk_params;
+ if (!av1_blk_has_rows_and_cols(&blk_params))
+ set_partition_cost_for_edge_blk(cm, &part_search_state);
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ const int valid_partition_types =
+ get_valid_partition_types(cpi, &part_search_state, bsize);
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int qindex = av1_get_qindex(&cm->seg, xd->mi[0]->segment_id,
+ cm->quant_params.base_qindex);
+ // RD multiplier
+ const int rdmult = x->rdmult;
+ // pyramid level
+ const int pyramid_level =
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+ x->rdmult = orig_rdmult;
+ // Neighbor information
+ const int has_above = !!xd->above_mbmi;
+ const int has_left = !!xd->left_mbmi;
+ const BLOCK_SIZE above_bsize =
+ has_above ? xd->above_mbmi->bsize : BLOCK_INVALID;
+ const BLOCK_SIZE left_bsize =
+ has_left ? xd->left_mbmi->bsize : BLOCK_INVALID;
+ const int above_block_width =
+ above_bsize == BLOCK_INVALID ? -1 : block_size_wide[above_bsize];
+ const int above_block_height =
+ above_bsize == BLOCK_INVALID ? -1 : block_size_high[above_bsize];
+ const int left_block_width =
+ left_bsize == BLOCK_INVALID ? -1 : block_size_wide[left_bsize];
+ const int left_block_height =
+ left_bsize == BLOCK_INVALID ? -1 : block_size_high[left_bsize];
+ // Prepare simple motion search stats as features
+ unsigned int block_sse = -1;
+ unsigned int block_var = -1;
+ unsigned int sub_block_sse[4] = { -1, -1, -1, -1 };
+ unsigned int sub_block_var[4] = { -1, -1, -1, -1 };
+ unsigned int horz_block_sse[2] = { -1, -1 };
+ unsigned int horz_block_var[2] = { -1, -1 };
+ unsigned int vert_block_sse[2] = { -1, -1 };
+ unsigned int vert_block_var[2] = { -1, -1 };
+ av1_prepare_motion_search_features_block(
+ cpi, td, tile_data, mi_row, mi_col, bsize, valid_partition_types,
+ &block_sse, &block_var, sub_block_sse, sub_block_var, horz_block_sse,
+ horz_block_var, vert_block_sse, vert_block_var);
+ // Prepare tpl stats for the current block as features
+ int64_t tpl_intra_cost = -1;
+ int64_t tpl_inter_cost = -1;
+ int64_t tpl_mc_dep_cost = -1;
+ prepare_tpl_stats_block(cpi, bsize, mi_row, mi_col, &tpl_intra_cost,
+ &tpl_inter_cost, &tpl_mc_dep_cost);
+
+ aom_partition_features_t features;
+ features.mi_row = mi_row;
+ features.mi_col = mi_col;
+ features.frame_width = cpi->frame_info.frame_width;
+ features.frame_height = cpi->frame_info.frame_height;
+ features.block_size = bsize;
+ features.valid_partition_types = valid_partition_types;
+ features.update_type = update_type;
+ features.qindex = qindex;
+ features.rdmult = rdmult;
+ features.pyramid_level = pyramid_level;
+ features.has_above_block = has_above;
+ features.above_block_width = above_block_width;
+ features.above_block_height = above_block_height;
+ features.has_left_block = has_left;
+ features.left_block_width = left_block_width;
+ features.left_block_height = left_block_height;
+ features.block_sse = block_sse;
+ features.block_var = block_var;
+ for (int i = 0; i < 4; ++i) {
+ features.sub_block_sse[i] = sub_block_sse[i];
+ features.sub_block_var[i] = sub_block_var[i];
+ }
+ for (int i = 0; i < 2; ++i) {
+ features.horz_block_sse[i] = horz_block_sse[i];
+ features.horz_block_var[i] = horz_block_var[i];
+ features.vert_block_sse[i] = vert_block_sse[i];
+ features.vert_block_var[i] = vert_block_var[i];
+ }
+ features.tpl_intra_cost = tpl_intra_cost;
+ features.tpl_inter_cost = tpl_inter_cost;
+ features.tpl_mc_dep_cost = tpl_mc_dep_cost;
+ av1_ext_part_send_features(ext_part_controller, &features);
+ const bool valid_decision = av1_ext_part_get_partition_decision(
+ ext_part_controller, &partition_decision);
+ if (!valid_decision) return false;
+ pc_tree->partitioning = partition_decision.current_decision;
+
+ av1_init_rd_stats(this_rdcost);
+ if (partition_decision.current_decision == PARTITION_SPLIT) {
+ assert(block_size_wide[bsize] >= 8 && block_size_high[bsize] >= 8);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ RD_STATS split_rdc[SUB_PARTITIONS_SPLIT];
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ av1_init_rd_stats(&split_rdc[i]);
+ if (pc_tree->split[i] == NULL)
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+ const int orig_rdmult_tmp = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ // TODO(chengchen): check boundary conditions
+ // top-left
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[0],
+ mi_row, mi_col, subsize, &split_rdc[0]);
+ // top-right
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[1],
+ mi_row, mi_col + mi_size_wide[subsize], subsize,
+ &split_rdc[1]);
+ // bottom-left
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[2],
+ mi_row + mi_size_high[subsize], mi_col, subsize,
+ &split_rdc[2]);
+ // bottom-right
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[3],
+ mi_row + mi_size_high[subsize],
+ mi_col + mi_size_wide[subsize], subsize,
+ &split_rdc[3]);
+ this_rdcost->rate += part_search_state.partition_cost[PARTITION_SPLIT];
+ // Note: x->rdmult here may differ from the rdmult used inside each
+ // sub-block search, so the accumulated rd cost can be slightly skewed.
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ this_rdcost->rate += split_rdc[i].rate;
+ this_rdcost->dist += split_rdc[i].dist;
+ av1_rd_cost_update(x->rdmult, this_rdcost);
+ }
+ x->rdmult = orig_rdmult_tmp;
+ } else {
+ *this_rdcost = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
+ }
+
+ aom_partition_stats_t stats;
+ update_partition_stats(this_rdcost, &stats);
+ av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+ if (!partition_decision.is_final_decision) {
+ if (partition_decision.current_decision == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {
+ if (pc_tree->split[i] != NULL) {
+ av1_free_pc_tree_recursive(pc_tree->split[i], av1_num_planes(cm), 0,
+ 0,
+ cpi->sf.part_sf.partition_search_type);
+ pc_tree->split[i] = NULL;
+ }
+ }
+ }
+ }
+ } while (!partition_decision.is_final_decision);
+
+ return true;
+}
+
+// The ML model only needs to make decisions for the current block each time.
+static bool ml_partition_search_partial(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ int mi_row, int mi_col,
+ const BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ aom_partition_features_t features;
+ prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+ &features);
+ features.mi_row = mi_row;
+ features.mi_col = mi_col;
+ features.frame_width = cpi->frame_info.frame_width;
+ features.frame_height = cpi->frame_info.frame_height;
+ features.block_size = bsize;
+ av1_ext_part_send_features(ext_part_controller, &features);
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+
+ RD_STATS rdcost;
+ const bool valid_partition =
+ recursive_partition(cpi, td, tile_data, tp, sms_root, td->pc_root, mi_row,
+ mi_col, bsize, &rdcost);
+ if (!valid_partition) {
+ return false;
+ }
+
+ // Encode with the selected mode and partition.
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ td->pc_root, NULL);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+
+ return true;
+}
+
+bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
+ int mi_col, const BLOCK_SIZE bsize,
+ RD_STATS *best_rd_cost) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cpi->ext_part_controller.ready) {
+ bool valid_search = true;
+ const aom_ext_part_decision_mode_t decision_mode =
+ av1_get_ext_part_decision_mode(&cpi->ext_part_controller);
+ if (decision_mode == AOM_EXT_PART_WHOLE_TREE) {
+ valid_search = ml_partition_search_whole_tree(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize);
+ } else if (decision_mode == AOM_EXT_PART_RECURSIVE) {
+ valid_search = ml_partition_search_partial(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize);
+ } else {
+ assert(0 && "Unknown decision mode.");
+ return false;
+ }
+ if (!valid_search) {
+ aom_internal_error(
+ cm->error, AOM_CODEC_ERROR,
+ "Invalid search from ML model, partition search failed");
+ }
+ return true;
+ }
+
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int best_idx = 0;
+ int64_t min_rdcost = INT64_MAX;
+ int num_configs;
+ int i = 0;
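+ // Try each partition configuration read from file in turn, tracking the
+ // one with the smallest rd cost; num_configs is known after the first
+ // read_partition_tree() call.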
+ do {
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ num_configs = read_partition_tree(cpi, td->pc_root, xd->error_info, i);
+ if (num_configs <= 0) {
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ aom_internal_error(xd->error_info, AOM_CODEC_ERROR, "Invalid configs.");
+ }
+ verify_write_partition_tree(cpi, td->pc_root, bsize, i, mi_row, mi_col);
+ if (i == 0) {
+ AOM_CHECK_MEM_ERROR(xd->error_info, x->rdcost,
+ aom_calloc(num_configs, sizeof(*x->rdcost)));
+ }
+ // Encode the block with the given partition tree. Get rdcost and encoding
+ // time.
+ x->rdcost[i] = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
+
+ if (x->rdcost[i].rdcost < min_rdcost) {
+ min_rdcost = x->rdcost[i].rdcost;
+ best_idx = i;
+ *best_rd_cost = x->rdcost[i];
+ }
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ ++i;
+ } while (i < num_configs);
+
+ aom_free(x->rdcost);
+ x->rdcost = NULL;
+ // Encode with the partition configuration with the smallest rdcost.
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ read_partition_tree(cpi, td->pc_root, xd->error_info, best_idx);
+ rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row,
+ mi_col, bsize, td->pc_root);
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ td->pc_root, NULL);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ ++cpi->sb_counter;
+
+ return true;
+}
+#endif // CONFIG_PARTITION_SEARCH_ORDER
+
+static AOM_INLINE bool should_do_dry_run_encode_for_current_block(
+ BLOCK_SIZE sb_size, BLOCK_SIZE max_partition_size, int curr_block_index,
+ BLOCK_SIZE bsize) {
+ if (bsize > max_partition_size) return false;
+
+ // Enable the dry-run reconstruction of the 4th sub-block only if its
+ // parent block's dry-run reconstruction is skipped. If max_partition_size
+ // equals the immediate split of the superblock, avoid reconstructing the
+ // 4th sub-block, as this data is not consumed.
+ if (curr_block_index != 3) return true;
+
+ const BLOCK_SIZE sub_sb_size =
+ get_partition_subsize(sb_size, PARTITION_SPLIT);
+ return bsize == max_partition_size && sub_sb_size != max_partition_size;
+}
+
+static void log_sub_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+ double *var_min, double *var_max) {
+ // This function returns the minimum and maximum log variances of the 4x4
+ // sub-blocks in the current block.
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int is_hbd = is_cur_buf_hbd(xd);
+ const int right_overflow =
+ (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+ const int bottom_overflow =
+ (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+ const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+ const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+ // Initialize minimum variance to a large value and maximum variance to 0.
+ double min_var_4x4 = (double)INT_MAX;
+ double max_var_4x4 = 0.0;
+
+ for (int i = 0; i < bh; i += MI_SIZE) {
+ for (int j = 0; j < bw; j += MI_SIZE) {
+ // Calculate the 4x4 sub-block variance.
+ const int var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+ x->plane[0].src.buf + (i * x->plane[0].src.stride) + j,
+ x->plane[0].src.stride, is_hbd);
+
+ // Record min and max for over-arching block
+ min_var_4x4 = AOMMIN(min_var_4x4, var);
+ max_var_4x4 = AOMMAX(max_var_4x4, var);
+ }
+ }
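+ // Divide by 16 (the pixel count of a 4x4 block), presumably to get a
+ // per-pixel variance, and apply log1p() so zero variance maps to zero.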
+ *var_min = log1p(min_var_4x4 / 16.0);
+ *var_max = log1p(max_var_4x4 / 16.0);
+}
+
+static AOM_INLINE void set_sms_tree_partitioning(
+ SIMPLE_MOTION_DATA_TREE *sms_tree, PARTITION_TYPE partition) {
+ if (sms_tree == NULL) return;
+ sms_tree->partitioning = partition;
+}
+
+/*!\brief AV1 block partition search (full search).
+*
+* \ingroup partition_search
+* \callgraph
+* Searches for the best partition pattern for a block based on the
+* rate-distortion cost, and returns a bool value to indicate whether a valid
+* partition pattern is found. The partition can recursively go down to the
+* smallest block size.
+*
+* \param[in] cpi Top-level encoder structure
+* \param[in] td Pointer to thread data
+* \param[in] tile_data Pointer to struct holding adaptive
+* data/contexts/models for the tile during encoding
+* \param[in] tp Pointer to the starting token
+* \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+* \param[in] mi_col Column coordinate of the block in a step size of MI_SIZE
+* \param[in] bsize Current block size
+* \param[in] rd_cost Pointer to the final rd cost of the block
+* \param[in] best_rdc Upper bound of rd cost of a valid partition
+* \param[in] pc_tree Pointer to the PC_TREE node storing the picked
+* partitions and mode info for the current block
+* \param[in] sms_tree Pointer to struct holding simple motion search data
+* for the current block
+* \param[in] none_rd Pointer to the rd cost in the case of not splitting
+* the current block
+* \param[in] multi_pass_mode SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS
+* \param[in] rect_part_win_info Pointer to struct storing whether horz/vert
+* partition outperforms previously tested partitions
+*
+* \return A bool value is returned indicating if a valid partition is found.
+* The pc_tree struct is modified to store the picked partition and modes.
+* The rd_cost struct is also updated with the RD stats corresponding to the
+* best partition found.
+*/
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+ RD_STATS best_rdc, PC_TREE *pc_tree,
+ SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+ SB_MULTI_PASS_MODE multi_pass_mode,
+ RD_RECT_PART_WIN_INFO *rect_part_win_info) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ const TokenExtra *const tp_orig = *tp;
+ PartitionSearchState part_search_state;
+
+ // Initialization of state variables used in partition search.
+ init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
+ bsize);
+ PartitionBlkParams blk_params = part_search_state.part_blk_params;
+
+ set_sms_tree_partitioning(sms_tree, PARTITION_NONE);
+ if (best_rdc.rdcost < 0) {
+ av1_invalid_rd_stats(rd_cost);
+ return part_search_state.found_best_partition;
+ }
+ if (bsize == cm->seq_params->sb_size) x->must_find_valid_partition = 0;
+
+ // Clear the optional none_rd output before the search begins.
+ if (none_rd) *none_rd = 0;
+ (void)*tp_orig;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Stats at the current quad tree
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state.part_timing_stats;
+ // Stats aggregated at frame level
+ FramePartitionTimingStats *fr_part_timing_stats = &cpi->partition_stats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c).
+ if (!av1_blk_has_rows_and_cols(&blk_params))
+ set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+ // Disable rectangular partitions for inner blocks when the current block is
+ // forced to only use square partitions.
+ if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
+ part_search_state.partition_rect_allowed[HORZ] &= !blk_params.has_rows;
+ part_search_state.partition_rect_allowed[VERT] &= !blk_params.has_cols;
+ }
+
+#ifndef NDEBUG
+ // Nothing should rely on the default value of this array (which is just
+ // leftover from encoding the previous block). Set it to a fixed pattern
+ // when debugging.
+ // bit 0, 1, 2 are blk_skip of each plane
+ // bit 4, 5, 6 are initialization checking of each plane
+ memset(x->txfm_search_info.blk_skip, 0x77,
+ sizeof(x->txfm_search_info.blk_skip));
+#endif // NDEBUG
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ // Set buffers and offsets.
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ if (cpi->oxcf.mode == ALLINTRA) {
+ if (bsize == cm->seq_params->sb_size) {
+ double var_min, var_max;
+ log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+ x->intra_sb_rdmult_modifier = 128;
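+ // 128 acts as the neutral scale here (presumably applied as a 7-bit
+ // fractional multiplier); it is reduced below for superblocks that mix
+ // very low and very high variance content.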
+ if ((var_min < 2.0) && (var_max > 4.0)) {
+ if ((var_max - var_min) > 8.0) {
+ x->intra_sb_rdmult_modifier -= 48;
+ } else {
+ x->intra_sb_rdmult_modifier -= (int)((var_max - var_min) * 6);
+ }
+ }
+ }
+ }
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+ // Apply simple motion search for the entire super block with fixed block
+ // size, e.g., 16x16, to collect features and write to files for the
+ // external ML model.
+ // TODO(chengchen): reduce motion search. This function is similar to
+ // av1_get_max_min_partition_features().
+ if (COLLECT_MOTION_SEARCH_FEATURE_SB && !frame_is_intra_only(cm) &&
+ bsize == cm->seq_params->sb_size) {
+ av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+ bsize, /*features=*/NULL);
+ collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, /*features=*/NULL);
+ }
+
+ // Update rd cost of the bound using the current multiplier.
+ av1_rd_cost_update(x->rdmult, &best_rdc);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+ // Set the context.
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_prune_partitions_time);
+#endif
+ // Pruning: before searching any partition type, using source and simple
+ // motion search results to prune out unlikely partitions.
+ av1_prune_partitions_before_search(cpi, x, sms_tree, &part_search_state);
+
+ // Pruning: eliminating partition types leading to coding block sizes outside
+ // the min and max bsize limitations set from the encoder.
+ av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_prune_partitions_time);
+#endif
+
+ // Partition search
+BEGIN_PARTITION_SEARCH:
+ // If a valid partition is required, usually when the first round cannot find
+ // a valid one under the cost limit after pruning, reset the limitations on
+ // partition types and intra cnn output.
+ if (x->must_find_valid_partition) {
+ reset_part_limitations(cpi, &part_search_state);
+ av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state);
+ // Invalidate intra cnn output for key frames.
+ if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+ part_search_state.intra_part_info->quad_tree_idx = 0;
+ part_search_state.intra_part_info->cnn_output_valid = 0;
+ }
+ }
+ // Partition block source pixel variance.
+ unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, none_partition_search_time);
+#endif
+
+ if (cpi->oxcf.mode == ALLINTRA) {
+ const bool bsize_at_least_16x16 = (bsize >= BLOCK_16X16);
+ const bool prune_rect_part_using_4x4_var_deviation =
+ (cpi->sf.part_sf.prune_rect_part_using_4x4_var_deviation &&
+ !x->must_find_valid_partition);
+
+ if (bsize_at_least_16x16 || prune_rect_part_using_4x4_var_deviation) {
+ double var_min, var_max;
+ log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+ // Further pruning, or in some cases reverse pruning, when allintra is set.
+ // This code helps visual quality, and in some cases metric quality, where
+ // the current block comprises at least one very low variance sub-block and
+ // at least one where the variance is much higher.
+ //
+ // The idea is that in such cases there is a danger of ringing and other
+ // visual artifacts bleeding from a high variance feature, such as an edge,
+ // into a very low variance region.
+ //
+ // The approach taken is to force a split to a smaller block size to try
+ // and separate out the low variance, well predicted blocks from the more
+ // complex ones, and to prevent ringing from propagating over a large
+ // region.
+ if (bsize_at_least_16x16 && (var_min < 0.272) &&
+ ((var_max - var_min) > 3.0)) {
+ part_search_state.partition_none_allowed = 0;
+ part_search_state.terminate_partition_search = 0;
+ part_search_state.do_square_split = 1;
+ } else if (prune_rect_part_using_4x4_var_deviation &&
+ (var_max - var_min < 3.0)) {
+ // Prune rectangular partitions if the variance deviation of 4x4
+ // sub-blocks within the block is less than a threshold (derived
+ // empirically).
+ part_search_state.do_rectangular_split = 0;
+ }
+ }
+ }
+
+ // PARTITION_NONE search stage.
+ int64_t part_none_rd = INT64_MAX;
+ none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+ &part_search_state, &best_rdc, &pb_source_variance,
+ none_rd, &part_none_rd);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, none_partition_search_time);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, split_partition_search_time);
+#endif
+ // PARTITION_SPLIT search stage.
+ int64_t part_split_rd = INT64_MAX;
+ split_partition_search(cpi, td, tile_data, tp, x, pc_tree, sms_tree, &x_ctx,
+ &part_search_state, &best_rdc, multi_pass_mode,
+ &part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, split_partition_search_time);
+#endif
+ // Terminate the partition search for child partitions when both the NONE
+ // and SPLIT partition rd costs are INT64_MAX.
+ if (cpi->sf.part_sf.early_term_after_none_split &&
+ part_none_rd == INT64_MAX && part_split_rd == INT64_MAX &&
+ !x->must_find_valid_partition && (bsize != cm->seq_params->sb_size)) {
+ part_search_state.terminate_partition_search = 1;
+ }
+
+ // Do not evaluate non-square partitions if NONE partition did not choose a
+ // newmv mode and is skippable.
+ if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 2) &&
+ (pc_tree->none != NULL)) {
+ if (x->qindex <= 200 && is_inter_mode(pc_tree->none->mic.mode) &&
+ !have_newmv_in_inter_mode(pc_tree->none->mic.mode) &&
+ pc_tree->none->skippable && !x->must_find_valid_partition &&
+ bsize >= BLOCK_16X16)
+ part_search_state.do_rectangular_split = 0;
+ }
+
+ // Prune partitions based on PARTITION_NONE and PARTITION_SPLIT.
+ prune_partitions_after_split(cpi, x, sms_tree, &part_search_state, &best_rdc,
+ part_none_rd, part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rectangular_partition_search_time);
+#endif
+ // Rectangular partitions search stage.
+ rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+ &part_search_state, &best_rdc,
+ rect_part_win_info, HORZ, VERT);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rectangular_partition_search_time);
+#endif
+
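+ // If none of the searches above computed the source variance, compute it
+ // now; the AB and 4-way partition stages below consume it.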
+ if (pb_source_variance == UINT_MAX) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+ pb_source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+ }
+
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part_search_state.do_rectangular_split));
+
+ const int prune_ext_part_state = prune_ext_part_none_skippable(
+ pc_tree->none, x->must_find_valid_partition,
+ cpi->sf.part_sf.skip_non_sq_part_based_on_none, bsize);
+
+ const int ab_partition_allowed = allow_ab_partition_search(
+ &part_search_state, &cpi->sf.part_sf, pc_tree->partitioning,
+ x->must_find_valid_partition, prune_ext_part_state, best_rdc.rdcost);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, ab_partitions_search_time);
+#endif
+ // AB partitions search stage.
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, rect_part_win_info,
+ pb_source_variance, ab_partition_allowed, HORZ_A,
+ VERT_B);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, ab_partitions_search_time);
+#endif
+
+ // 4-way partitions search stage.
+ int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 };
+ // Prune 4-way partition search.
+ prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc,
+ pb_source_variance, prune_ext_part_state,
+ part4_search_allowed);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_4partition_time);
+#endif
+ // PARTITION_HORZ_4
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part4_search_allowed[HORZ4]));
+ if (!part_search_state.terminate_partition_search &&
+ part4_search_allowed[HORZ4]) {
+ const int inc_step[NUM_PART4_TYPES] = { mi_size_high[blk_params.bsize] / 4,
+ 0 };
+ // Evaluation of Horz4 partition type.
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->horizontal4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_HORZ_4);
+ }
+
+ // PARTITION_VERT_4
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part4_search_allowed[VERT4]));
+ if (!part_search_state.terminate_partition_search &&
+ part4_search_allowed[VERT4] && blk_params.has_cols) {
+ const int inc_step[NUM_PART4_TYPES] = { 0, mi_size_wide[blk_params.bsize] /
+ 4 };
+ // Evaluation of Vert4 partition type.
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->vertical4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_VERT_4);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_4partition_time);
+#endif
+
+ if (bsize == cm->seq_params->sb_size &&
+ !part_search_state.found_best_partition) {
+ // Did not find a valid partition, go back and search again, with less
+ // constraint on which partition types to search.
+ x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS
+ fr_part_timing_stats->partition_redo += 1;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+ goto BEGIN_PARTITION_SEARCH;
+ }
+
+ // Store the final rd cost
+ *rd_cost = best_rdc;
+
+ // Also record the best partition in simple motion data tree because it is
+ // necessary for the related speed features.
+ set_sms_tree_partitioning(sms_tree, pc_tree->partitioning);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+ part_timing_stats->partition_decisions[pc_tree->partitioning] += 1;
+ }
+
+ // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
+ // prediction block.
+ print_partition_timing_stats_with_rdcost(
+ part_timing_stats, mi_row, mi_col, bsize,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+ cm->current_frame.frame_number, &best_rdc, "part_timing.csv");
+ const bool print_timing_stats = false;
+ if (print_timing_stats) {
+ print_partition_timing_stats(part_timing_stats, cm->show_frame,
+ frame_is_intra_only(cm), bsize,
+ "part_timing_data.csv");
+ }
+ // If CONFIG_COLLECT_PARTITION_STATS is 2, then we print out the stats for
+ // the whole clip. So we need to pass the information upstream to the encoder.
+ accumulate_partition_timing_stats(fr_part_timing_stats, part_timing_stats,
+ bsize);
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+ // Reset the PC_TREE deallocation flag.
+ int pc_tree_dealloc = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_time);
+#endif
+ if (part_search_state.found_best_partition) {
+ if (bsize == cm->seq_params->sb_size) {
+ // Encode the superblock.
+ const int emit_output = multi_pass_mode != SB_DRY_PASS;
+ const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
+
+ // Write partition tree to file. Not used by default.
+ if (COLLECT_MOTION_SEARCH_FEATURE_SB) {
+ write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col);
+ ++cpi->sb_counter;
+ }
+
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
+ pc_tree, NULL);
+ assert(pc_tree == td->pc_root);
+ // Dealloc the whole PC_TREE after a superblock is done.
+ av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ pc_tree = NULL;
+ td->pc_root = NULL;
+ pc_tree_dealloc = 1;
+ } else if (should_do_dry_run_encode_for_current_block(
+ cm->seq_params->sb_size, x->sb_enc.max_partition_size,
+ pc_tree->index, bsize)) {
+ // Encode the smaller blocks in DRY_RUN mode.
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_time);
+#endif
+
+ // If the tree still exists (non-superblock), deallocate most nodes, keeping
+ // only the nodes for the best partition and PARTITION_NONE.
+ if (pc_tree_dealloc == 0)
+ av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1,
+ cpi->sf.part_sf.partition_search_type);
+
+ if (bsize == cm->seq_params->sb_size) {
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+
+ // Restore the rd multiplier.
+ x->rdmult = orig_rdmult;
+ return part_search_state.found_best_partition;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef COLLECT_MOTION_SEARCH_FEATURE_SB
+
+#if CONFIG_RT_ML_PARTITIONING
+#define FEATURES 6
+#define LABELS 2
+static int ml_predict_var_partitioning(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const NN_CONFIG *nn_config = NULL;
+ const float *means = NULL;
+ const float *vars = NULL;
+ switch (bsize) {
+ case BLOCK_64X64:
+ nn_config = &av1_var_part_nnconfig_64;
+ means = av1_var_part_means_64;
+ vars = av1_var_part_vars_64;
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_var_part_nnconfig_32;
+ means = av1_var_part_means_32;
+ vars = av1_var_part_vars_32;
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_var_part_nnconfig_16;
+ means = av1_var_part_means_16;
+ vars = av1_var_part_vars_16;
+ break;
+ case BLOCK_8X8:
+ default: assert(0 && "Unexpected block size."); return -1;
+ }
+
+ if (!nn_config) return -1;
+
+ {
+ const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f;
+ float features[FEATURES] = { 0.0f };
+ const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+ cm->seq_params->bit_depth);
+ int feature_idx = 0;
+ float score[LABELS];
+
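+ // Each feature is standardized (z-score) with precomputed means and
+ // variances before being fed to the neural net.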
+ features[feature_idx] =
+ (log1pf((float)(dc_q * dc_q) / 256.0f) - means[feature_idx]) /
+ sqrtf(vars[feature_idx]);
+ feature_idx++;
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+ {
+ const int bs = block_size_wide[bsize];
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ const int sb_offset_row = 4 * (mi_row & 15);
+ const int sb_offset_col = 4 * (mi_col & 15);
+ const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+ const uint8_t *src = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const int pred_stride = 64;
+ unsigned int sse;
+ int i;
+ // Variance of whole block.
+ const unsigned int var =
+ cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+ const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+ features[feature_idx] =
+ (log1pf((float)var) - means[feature_idx]) / sqrtf(vars[feature_idx]);
+ feature_idx++;
+ for (i = 0; i < 4; ++i) {
+ const int x_idx = (i & 1) * bs / 2;
+ const int y_idx = (i >> 1) * bs / 2;
+ const int src_offset = y_idx * src_stride + x_idx;
+ const int pred_offset = y_idx * pred_stride + x_idx;
+ // Variance of quarter block.
+ const unsigned int sub_var =
+ cpi->ppi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+ pred + pred_offset, pred_stride, &sse);
+ const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+ features[feature_idx] =
+ (var_ratio - means[feature_idx]) / sqrtf(vars[feature_idx]);
+ feature_idx++;
+ }
+ }
+ // for (int i = 0; i<FEATURES; i++)
+ // printf("F_%d, %f; ", i, features[i]);
+ assert(feature_idx == FEATURES);
+ av1_nn_predict(features, nn_config, 1, score);
+ // printf("Score %f, thr %f ", (float)score[0], thresh);
+ if (score[0] > thresh) return PARTITION_SPLIT;
+ if (score[0] < -thresh) return PARTITION_NONE;
+ return -1;
+ }
+}
+#undef FEATURES
+#undef LABELS
+
+// Uncomment for collecting data for ML-based partitioning
+// #define _COLLECT_GROUND_TRUTH_
+
+#ifdef _COLLECT_GROUND_TRUTH_
+static int store_partition_data(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, PARTITION_TYPE part) {
+ AV1_COMMON *const cm = &cpi->common;
+ char fname[128];
+ switch (bsize) {
+ case BLOCK_64X64: sprintf(fname, "data_64x64.txt"); break;
+ case BLOCK_32X32: sprintf(fname, "data_32x32.txt"); break;
+ case BLOCK_16X16: sprintf(fname, "data_16x16.txt"); break;
+ case BLOCK_8X8: sprintf(fname, "data_8x8.txt"); break;
+ default: assert(0 && "Unexpected block size."); return -1;
+ }
+
+ float features[6]; // DC_Q, VAR, VAR_RATIO-0..3
+
+ FILE *f = fopen(fname, "a");
+
+ {
+ const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+ cm->seq_params->bit_depth);
+ int feature_idx = 0;
+
+ features[feature_idx++] = log1pf((float)(dc_q * dc_q) / 256.0f);
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+ {
+ const int bs = block_size_wide[bsize];
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ const int sb_offset_row = 4 * (mi_row & 15);
+ const int sb_offset_col = 4 * (mi_col & 15);
+ const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+ const uint8_t *src = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const int pred_stride = 64;
+ unsigned int sse;
+ int i;
+ // Variance of whole block.
+ /*
+ if (bs == 8)
+ {
+ int r, c;
+ printf("%d %d\n", mi_row, mi_col);
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ printf("%3d ",
+ src[r * src_stride + c] - pred[64 * r + c]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+ */
+ const unsigned int var =
+ cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+ const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+ features[feature_idx++] = log1pf((float)var);
+
+ fprintf(f, "%f,%f,", features[0], features[1]);
+ for (i = 0; i < 4; ++i) {
+ const int x_idx = (i & 1) * bs / 2;
+ const int y_idx = (i >> 1) * bs / 2;
+ const int src_offset = y_idx * src_stride + x_idx;
+ const int pred_offset = y_idx * pred_stride + x_idx;
+ // Variance of quarter block.
+ const unsigned int sub_var =
+ cpi->ppi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+ pred + pred_offset, pred_stride, &sse);
+ const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+ features[feature_idx++] = var_ratio;
+ fprintf(f, "%f,", var_ratio);
+ }
+
+ fprintf(f, "%d\n", part == PARTITION_NONE ? 0 : 1);
+ }
+
+ fclose(f);
+ return -1;
+ }
+}
+#endif
+
+static void duplicate_mode_info_in_sb(AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const int block_width =
+ AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+ const int block_height =
+ AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+ const int mi_stride = xd->mi_stride;
+ MB_MODE_INFO *const src_mi = xd->mi[0];
+ int i, j;
+
+ for (j = 0; j < block_height; ++j)
+ for (i = 0; i < block_width; ++i) xd->mi[j * mi_stride + i] = src_mi;
+}
+
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+ MB_MODE_INFO_EXT *const mbmi_ext,
+ const MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, uint8_t ref_frame_type) {
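+ // Note: the index inside sizeof() below is never evaluated; it only names
+ // one row of the 2D array so that sizeof() yields the full row size.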
+ memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+ sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+ memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+ sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+ mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+ mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+ memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
+ sizeof(mbmi_ext->global_mvs));
+}
+
+static void fill_mode_info_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int hbs = mi_size_wide[bsize] >> 1;
+ PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ assert(bsize >= BLOCK_8X8);
+
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+ return;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ set_mode_info_offsets(&cm->mi_params, &cpi->mbmi_ext_info, x, xd, mi_row,
+ mi_col);
+ *(xd->mi[0]) = pc_tree->none->mic;
+ copy_mbmi_ext_frame_to_mbmi_ext(
+ &x->mbmi_ext, &pc_tree->none->mbmi_ext_best, LAST_FRAME);
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+ break;
+ case PARTITION_SPLIT: {
+ fill_mode_info_sb(cpi, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+ fill_mode_info_sb(cpi, x, mi_row, mi_col + hbs, subsize,
+ pc_tree->split[1]);
+ fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col, subsize,
+ pc_tree->split[2]);
+ fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col + hbs, subsize,
+ pc_tree->split[3]);
+ break;
+ }
+ default: break;
+ }
+}
+
+void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost, int do_recon, int64_t best_rd,
+ PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int hbs = mi_size_wide[bsize] >> 1;
+ TokenExtra *tp_orig = *tp;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ RD_STATS this_rdc, best_rdc;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ int do_split = bsize > BLOCK_8X8;
+ // Override skipping rectangular partition operations for edge blocks
+ const int force_horz_split = (mi_row + 2 * hbs > cm->mi_params.mi_rows);
+ const int force_vert_split = (mi_col + 2 * hbs > cm->mi_params.mi_cols);
+
+ int partition_none_allowed = !force_horz_split && !force_vert_split;
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]); // Square partition only
+ assert(cm->seq_params->sb_size == BLOCK_64X64); // Small SB so far
+
+ (void)*tp_orig;
+
+ av1_invalid_rd_stats(&best_rdc);
+ best_rdc.rdcost = best_rd;
+#ifndef _COLLECT_GROUND_TRUTH_
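+ // Let the variance-based ML model prune the search: a confident
+ // PARTITION_NONE prediction disables the split search, and a confident
+ // PARTITION_SPLIT prediction disables the none search.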
+ if (partition_none_allowed && do_split) {
+ const int ml_predicted_partition =
+ ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col);
+ if (ml_predicted_partition == PARTITION_NONE) do_split = 0;
+ if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0;
+ }
+#endif
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ PICK_MODE_CONTEXT *ctx = pc_tree->none;
+
+// Set the "#if 0" below to 1 to use the RDO-based pick mode instead.
+#if 0
+ RD_STATS dummy;
+ av1_invalid_rd_stats(&dummy);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_NONE, bsize, ctx, dummy);
+#else
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize,
+ ctx);
+#endif
+ if (this_rdc.rate != INT_MAX) {
+ const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+ this_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ }
+ }
+ }
+
+ // PARTITION_SPLIT
+ if (do_split) {
+ RD_STATS sum_rdc;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ av1_init_rd_stats(&sum_rdc);
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ sum_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ for (int i = 0;
+ i < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
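+ // Sub-block i is at (row, col) = (i >> 1, i & 1) in half-block units.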
+ const int x_idx = (i & 1) * hbs;
+ const int y_idx = (i >> 1) * hbs;
+
+ if (mi_row + y_idx >= cm->mi_params.mi_rows ||
+ mi_col + x_idx >= cm->mi_params.mi_cols)
+ continue;
+ av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+ mi_col + x_idx, subsize, &this_rdc, i < 3,
+ best_rdc.rdcost - sum_rdc.rdcost,
+ pc_tree->split[i]);
+
+ if (this_rdc.rate == INT_MAX) {
+ av1_invalid_rd_stats(&sum_rdc);
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ }
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ }
+
+#ifdef _COLLECT_GROUND_TRUTH_
+ store_partition_data(cpi, x, bsize, mi_row, mi_col, pc_tree->partitioning);
+#endif
+
+ *rd_cost = best_rdc;
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+ if (best_rdc.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+
+ // update mode info array
+ fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree);
+
+ if (do_recon) {
+ if (bsize == cm->seq_params->sb_size) {
+ // NOTE: To get estimate for rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ if (bsize == BLOCK_64X64 && do_recon) {
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+}
+#endif // CONFIG_RT_ML_PARTITIONING
diff --git a/third_party/aom/av1/encoder/partition_search.h b/third_party/aom/av1/encoder/partition_search.h
new file mode 100644
index 0000000000..1b5d71b7da
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_search.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_SEARCH_H_
+#define AOM_AV1_ENCODER_PARTITION_SEARCH_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/tokenize.h"
+
+void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize);
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+ MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate,
+ int64_t *dist, int do_recon, PC_TREE *pc_tree);
+void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TokenExtra **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PC_TREE *pc_tree);
+#if CONFIG_RT_ML_PARTITIONING
+void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost, int do_recon, int64_t best_rd,
+ PC_TREE *pc_tree);
+#endif
+void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf);
+void av1_reset_sf_for_ext_part(AV1_COMP *const cpi);
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *best_rd_cost);
+#endif
+
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+ RD_STATS best_rdc, PC_TREE *pc_tree,
+ SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+ SB_MULTI_PASS_MODE multi_pass_mode,
+ RD_RECT_PART_WIN_INFO *rect_part_win_info);
+
+static AOM_INLINE void set_cb_offsets(uint16_t *cb_offset,
+ const uint16_t cb_offset_y,
+ const uint16_t cb_offset_uv) {
+ cb_offset[PLANE_TYPE_Y] = cb_offset_y;
+ cb_offset[PLANE_TYPE_UV] = cb_offset_uv;
+}
+
+static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize,
+ const int subsampling_x,
+ const int subsampling_y) {
+ x->cb_offset[PLANE_TYPE_Y] += block_size_wide[bsize] * block_size_high[bsize];
+ if (x->e_mbd.is_chroma_ref) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+ assert(plane_bsize != BLOCK_INVALID);
+ x->cb_offset[PLANE_TYPE_UV] +=
+ block_size_wide[plane_bsize] * block_size_high[plane_bsize];
+ }
+}
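+
+/* Illustrative sketch, not part of the upstream source: how the two helpers
+ * above combine for a BLOCK_16X16 coding block under 4:2:0 subsampling
+ * (subsampling_x == subsampling_y == 1), where get_plane_block_size() maps
+ * the chroma planes to BLOCK_8X8:
+ *
+ *   set_cb_offsets(x->cb_offset, 0, 0);      // start of the superblock
+ *   update_cb_offsets(x, BLOCK_16X16, 1, 1);
+ *   // x->cb_offset[PLANE_TYPE_Y]  == 16 * 16 == 256
+ *   // x->cb_offset[PLANE_TYPE_UV] ==  8 *  8 ==  64 (when is_chroma_ref)
+ */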
+
+#endif // AOM_AV1_ENCODER_PARTITION_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/partition_strategy.c b/third_party/aom/av1/encoder/partition_strategy.c
new file mode 100644
index 0000000000..ce06313579
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_strategy.c
@@ -0,0 +1,2573 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/thirdpass.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/reconinter.h"
+
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/partition_model_weights.h"
+#include "av1/encoder/partition_cnn_weights.h"
+#endif
+#include "av1/encoder/encoder.h"
+
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/rdopt.h"
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void simple_motion_search_prune_part_features(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+ int features_to_get);
+
+static bool ext_ml_model_decision_before_none(
+ AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT],
+ int *partition_none_allowed, int *partition_horz_allowed,
+ int *partition_vert_allowed, int *do_rectangular_split,
+ int *do_square_split);
+
+static bool ext_ml_model_decision_before_none_part2(
+ AV1_COMP *cpi,
+ const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART],
+ int *prune_horz, int *prune_vert);
+
+static bool ext_ml_model_decision_after_none(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_after_none, int *do_square_split,
+ int *do_rectangular_split);
+
+static bool ext_ml_model_decision_after_none_part2(
+ AV1_COMP *const cpi, const float *const features_terminate,
+ int *terminate_partition_search);
+
+static bool ext_ml_model_decision_after_split(
+ AV1_COMP *const cpi, const float *const features_terminate,
+ int *terminate_partition_search);
+
+static bool ext_ml_model_decision_after_split_part2(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_prune, int *prune_rect_part_horz,
+ int *prune_rect_part_vert);
+
+static bool ext_ml_model_decision_after_rect(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_after_rect, int *horza_partition_allowed,
+ int *horzb_partition_allowed, int *verta_partition_allowed,
+ int *vertb_partition_allowed);
+
+static bool ext_ml_model_decision_after_part_ab(
+ AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx,
+ int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+ int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+ int *const partition_vert4_allowed, unsigned int pb_source_variance,
+ int mi_row, int mi_col);
+
+static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) {
+ switch (bsize) {
+ case BLOCK_128X128: return 0;
+ case BLOCK_64X64: return 1;
+ case BLOCK_32X32: return 2;
+ case BLOCK_16X16: return 3;
+ case BLOCK_8X8: return 4;
+ default: assert(0 && "Invalid bsize"); return -1;
+ }
+}
+
+static char *get_feature_file_name(int id) {
+ static char *feature_file_names[] = {
+ "feature_before_partition_none",
+ "feature_before_partition_none_prune_rect",
+ "feature_after_partition_none_prune",
+ "feature_after_partition_none_terminate",
+ "feature_after_partition_split_terminate",
+ "feature_after_partition_split_prune_rect",
+ "feature_after_partition_rect",
+ "feature_after_partition_ab",
+ };
+
+ return feature_file_names[id];
+}
+
+static void write_features_to_file(const char *const path,
+ const bool is_test_mode,
+ const float *features,
+ const int feature_size, const int id,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col) {
+ if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return;
+
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/%s", path,
+ get_feature_file_name(id));
+ FILE *pfile = fopen(filename, "a");
+ if (pfile == NULL) return;
+ if (!is_test_mode) {
+ fprintf(pfile, "%d,%d,%d,%d,%d\n", id, (int)bsize, mi_row, mi_col,
+ feature_size);
+ }
+ for (int i = 0; i < feature_size; ++i) {
+ fprintf(pfile, "%.6f", features[i]);
+ if (i < feature_size - 1) fprintf(pfile, ",");
+ }
+ fprintf(pfile, "\n");
+ fclose(pfile);
+}
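+
+/* Illustrative sketch, not part of the upstream source: with
+ * WRITE_FEATURE_TO_FILE enabled and is_test_mode == false, each call appends
+ * an "id,bsize,mi_row,mi_col,feature_size" header row followed by the
+ * feature row, e.g. for a hypothetical 4-feature call on a 64X64 block:
+ *
+ *   0,12,0,0,4
+ *   3.421002,0.000000,7.102311,1.250000
+ *
+ * In test mode the header row is skipped, so the external model reads a bare
+ * CSV of floats.
+ */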
+
+// TODO(chiyotsai@google.com): This is very much a work in progress. We still
+// need to do the following:
+// -- add support for hdres
+// -- add support for pruning rectangular partitions
+// -- use reconstructed pixels instead of source pixels for padding
+// -- use chroma pixels in addition to luma pixels
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+ int quad_tree_idx,
+ int intra_cnn_based_part_prune_level,
+ PartitionSearchState *part_state) {
+ assert(cm->seq_params->sb_size >= BLOCK_64X64 &&
+ "Invalid sb_size for intra_cnn!");
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ const int bsize_idx = convert_bsize_to_idx(bsize);
+
+ if (bsize == BLOCK_128X128) {
+ return;
+ }
+
+ PartitionSearchInfo *part_info = &x->part_search_info;
+
+ // Precompute the CNN part and cache the result in MACROBLOCK
+ if (bsize == BLOCK_64X64 && !part_info->cnn_output_valid) {
+ const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
+
+ // Prepare the output
+ const CNN_THREAD_DATA thread_data = { .num_workers = 1, .workers = NULL };
+ const int num_outputs = 4;
+ const int output_dims[4] = { 1, 2, 4, 8 };
+ const int out_chs[4] = { CNN_BRANCH_0_OUT_CH, CNN_BRANCH_1_OUT_CH,
+ CNN_BRANCH_2_OUT_CH, CNN_BRANCH_3_OUT_CH };
+ float *output_buffer[CNN_TOT_OUT_CH];
+
+ float **cur_output_buf = output_buffer;
+ float *curr_buf_ptr = part_info->cnn_buffer;
+ for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+ const int num_chs = out_chs[output_idx];
+ const int ch_size = output_dims[output_idx] * output_dims[output_idx];
+ for (int ch = 0; ch < num_chs; ch++) {
+ cur_output_buf[ch] = curr_buf_ptr;
+ curr_buf_ptr += ch_size;
+ }
+ cur_output_buf += num_chs;
+ }
+
+ CNN_MULTI_OUT output = {
+ .num_outputs = 4,
+ .output_channels = out_chs,
+ .output_strides = output_dims,
+ .output_buffer = output_buffer,
+ };
+
+ // Prepare the input
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int bit_depth = xd->bd;
+ const int dc_q =
+ av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8);
+ part_info->log_q = log1pf((float)(dc_q * dc_q) / 256.0f);
+ part_info->log_q =
+ (part_info->log_q - av1_intra_mode_cnn_partition_mean[0]) /
+ av1_intra_mode_cnn_partition_std[0];
+
+ const int width = 65, height = 65,
+ stride = x->plane[AOM_PLANE_Y].src.stride;
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *image[1] = {
+ CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1
+ };
+
+ if (!av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
+ cnn_config, &thread_data,
+ bit_depth, &output)) {
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating CNN data");
+ return;
+ }
+ } else {
+ uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 };
+
+ if (!av1_cnn_predict_img_multi_out(image, width, height, stride,
+ cnn_config, &thread_data, &output)) {
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating CNN data");
+ return;
+ }
+ }
+
+ part_info->cnn_output_valid = 1;
+ }
+
+ if (!part_info->cnn_output_valid) {
+ return;
+ }
+
+ const NN_CONFIG *dnn_configs[5] = {
+ NULL,
+ &av1_intra_mode_cnn_partition_branch_0_dnn_config,
+ &av1_intra_mode_cnn_partition_branch_1_dnn_config,
+ &av1_intra_mode_cnn_partition_branch_2_dnn_config,
+ &av1_intra_mode_cnn_partition_branch_3_dnn_config,
+ };
+
+ const NN_CONFIG *dnn_config = dnn_configs[bsize_idx];
+
+ float dnn_features[100];
+ float logits[4] = { 0.0f };
+
+ const float *branch_0 = part_info->cnn_buffer;
+ const float *branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE;
+ const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE;
+ const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE;
+
+ if (bsize == BLOCK_64X64) {
+ int f_idx = 0;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_0_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_0[ch_idx];
+ }
+
+ const int spa_stride = 2 * 2;
+ for (int lin_idx = 0; lin_idx < spa_stride; lin_idx++) {
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride];
+ }
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else if (bsize == BLOCK_32X32) {
+ int f_idx = 0;
+ for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) {
+ dnn_features[f_idx++] = branch_0[idx];
+ }
+
+ const int curr_lin_idx = quad_to_linear_1[quad_tree_idx - 1];
+ const int spa_stride = 2 * 2;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride];
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else if (bsize == BLOCK_16X16) {
+ int f_idx = 0;
+ const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+ const int prev_lin_idx = quad_to_linear_1[prev_quad_idx - 1];
+ const int prev_spa_stride = 2 * 2;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_1[prev_lin_idx + ch_idx * prev_spa_stride];
+ }
+
+ const int curr_lin_idx = quad_to_linear_2[quad_tree_idx - 5];
+ const int spa_stride = 4 * 4;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride];
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else if (bsize == BLOCK_8X8) {
+ int f_idx = 0;
+ const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+ const int prev_lin_idx = quad_to_linear_2[prev_quad_idx - 5];
+ const int prev_spa_stride = 4 * 4;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_2[prev_lin_idx + ch_idx * prev_spa_stride];
+ }
+
+ const int curr_lin_idx = quad_to_linear_3[quad_tree_idx - 21];
+ const int spa_stride = 8 * 8;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride];
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else {
+ assert(0 && "Invalid bsize in intra_cnn partition");
+ }
+
+ // Make decision
+ av1_nn_predict(dnn_features, dnn_config, 1, logits);
+
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ float split_only_thresh = 100.0f, no_split_thresh = -100.0f;
+ if (is_720p_or_larger) {
+ split_only_thresh =
+ av1_intra_mode_cnn_partition_split_thresh_hdres[bsize_idx];
+ no_split_thresh =
+ av1_intra_mode_cnn_partition_no_split_thresh_hdres[bsize_idx];
+ } else if (is_480p_or_larger) {
+ split_only_thresh =
+ av1_intra_mode_cnn_partition_split_thresh_midres[bsize_idx];
+ no_split_thresh =
+ av1_intra_mode_cnn_partition_no_split_thresh_midres[bsize_idx];
+ } else {
+ split_only_thresh =
+ av1_intra_mode_cnn_partition_split_thresh_lowres[bsize_idx];
+ no_split_thresh =
+ av1_intra_mode_cnn_partition_no_split_thresh_lowres[bsize_idx];
+ }
+
+ if (logits[0] > split_only_thresh) {
+    // As screen content tends to choose larger partitions, do not prune
+    // PARTITION_NONE when intra_cnn_based_part_prune_level=1.
+ if (intra_cnn_based_part_prune_level != 1) {
+ part_state->partition_none_allowed = 0;
+ }
+ part_state->do_square_split = 1;
+ av1_disable_rect_partitions(part_state);
+ }
+
+ if (logits[0] < no_split_thresh) {
+ av1_disable_square_split_partition(part_state);
+ }
+}
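+
+/* Illustrative sketch, not part of the upstream source: the layout of
+ * part_info->cnn_buffer assembled above. Branch b contributes out_chs[b]
+ * channels of output_dims[b] * output_dims[b] floats, stored channel after
+ * channel, so the base pointers used by the DNN stage are:
+ *
+ *   branch_0 = cnn_buffer;                              // 1x1 x CH0 floats
+ *   branch_1 = branch_0 + 1 * 1 * CNN_BRANCH_0_OUT_CH;  // 2x2 x CH1 floats
+ *   branch_2 = branch_1 + 2 * 2 * CNN_BRANCH_1_OUT_CH;  // 4x4 x CH2 floats
+ *   branch_3 = branch_2 + 4 * 4 * CNN_BRANCH_2_OUT_CH;  // 8x8 x CH3 floats
+ *
+ * which assumes CNN_BRANCH_k_OUT_SIZE == dims_k * dims_k * out_chs_k.
+ */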
+
+static INLINE int get_simple_motion_search_prune_agg(int qindex,
+ int prune_level,
+ int is_rect_part) {
+ assert(prune_level < TOTAL_AGG_LVLS);
+ if (prune_level == NO_PRUNING) {
+ return -1;
+ }
+
+ // Aggressiveness value for SIMPLE_MOTION_SEARCH_PRUNE_LEVEL except
+ // QIDX_BASED_AGG_LVL
+ const int sms_prune_agg_levels[TOTAL_SIMPLE_AGG_LVLS] = { 0, 1, 2, 3 };
+ if (prune_level < TOTAL_SIMPLE_AGG_LVLS) {
+ return sms_prune_agg_levels[prune_level];
+ }
+
+ // Map the QIDX_BASED_AGG_LVL to corresponding aggressiveness value.
+ // Aggressive pruning for lower quantizers in non-boosted frames to prune
+ // rectangular partitions.
+ const int qband = is_rect_part ? (qindex <= 90 ? 1 : 0) : 0;
+ const int sms_prune_agg_qindex_based[2] = { 1, 2 };
+ return sms_prune_agg_qindex_based[qband];
+}
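+
+/* Illustrative sketch, not part of the upstream source: how the mapping
+ * above behaves, assuming QIDX_BASED_AGG_LVL is the one level beyond
+ * TOTAL_SIMPLE_AGG_LVLS:
+ *
+ *   get_simple_motion_search_prune_agg(q,   NO_PRUNING,         rect) -> -1
+ *   get_simple_motion_search_prune_agg(q,   2,                  rect) ->  2
+ *   get_simple_motion_search_prune_agg(80,  QIDX_BASED_AGG_LVL, 1)    ->  2
+ *   get_simple_motion_search_prune_agg(120, QIDX_BASED_AGG_LVL, 1)    ->  1
+ *   get_simple_motion_search_prune_agg(80,  QIDX_BASED_AGG_LVL, 0)    ->  1
+ */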
+
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_state) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ const int bsize_idx = convert_bsize_to_idx(bsize);
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ // res_idx is 0 for res < 480p, 1 for 480p, 2 for 720p+
+ const int res_idx = is_480p_or_larger + is_720p_or_larger;
+
+ assert(bsize_idx >= 0 && bsize_idx <= 4 &&
+ "Invalid bsize in simple_motion_search_based_split");
+
+ const float *ml_mean = av1_simple_motion_search_split_mean[bsize_idx];
+ const float *ml_std = av1_simple_motion_search_split_std[bsize_idx];
+ const NN_CONFIG *nn_config =
+ av1_simple_motion_search_split_nn_config[bsize_idx];
+
+ const int agg = get_simple_motion_search_prune_agg(
+ x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 0);
+ if (agg < 0) {
+ return;
+ }
+
+ const float split_only_thresh =
+ av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx];
+ const float no_split_thresh =
+ av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx];
+
+ float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f };
+ simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+ bsize, features,
+ FEATURE_SMS_SPLIT_MODEL_FLAG);
+
+ // Write features to file
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features,
+ FEATURE_SIZE_SMS_SPLIT, 0, bsize, mi_row, mi_col);
+
+  // Note: the features are intentionally left unnormalized here, to keep
+  // them consistent with all features collected and passed to the external
+  // model.
+ if (ext_ml_model_decision_before_none(
+ cpi, features, &part_state->partition_none_allowed,
+ &part_state->partition_rect_allowed[HORZ],
+ &part_state->partition_rect_allowed[VERT],
+ &part_state->do_rectangular_split, &part_state->do_square_split)) {
+ return;
+ }
+
+ for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) {
+ features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx];
+ }
+
+ float score = 0.0f;
+
+ av1_nn_predict(features, nn_config, 1, &score);
+
+ if (score > split_only_thresh) {
+ av1_set_square_split_only(part_state);
+ }
+
+ if (cpi->sf.part_sf.simple_motion_search_split >= 2 &&
+ score < no_split_thresh) {
+ av1_disable_square_split_partition(part_state);
+ }
+
+ // If the score is very low, prune rectangular split since it is unlikely to
+ // occur.
+ if (cpi->sf.part_sf.simple_motion_search_rect_split) {
+ const float scale = res_idx >= 2 ? 3.0f : 2.0f;
+ const float rect_split_thresh =
+ scale * av1_simple_motion_search_no_split_thresh
+ [cpi->sf.part_sf.simple_motion_search_rect_split][res_idx]
+ [bsize_idx];
+ if (score < rect_split_thresh) {
+ part_state->do_rectangular_split = 0;
+ }
+ }
+}
+
+// Given a list of ref frames in refs, performs simple_motion_search on each
+// of the refs and returns the ref with the smallest sse. Returns -1 if none
+// of the refs in the list are available. Also stores the best sse and var in
+// best_sse and best_var, respectively. If save_mv is 0, the start_mvs stored
+// in sms_tree are left untouched; if save_mv is 1, they are updated in
+// sms_tree and propagated down to its subtrees.
+static int simple_motion_search_get_best_ref(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ int mi_row, int mi_col, BLOCK_SIZE bsize, const int *const refs,
+ int num_refs, int use_subpixel, int save_mv, unsigned int *best_sse,
+ unsigned int *best_var) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int best_ref = -1;
+
+ if (mi_col >= cm->mi_params.mi_cols || mi_row >= cm->mi_params.mi_rows) {
+ // If the whole block is outside of the image, set the var and sse to 0.
+ *best_var = 0;
+ *best_sse = 0;
+
+ return best_ref;
+ }
+
+  // Otherwise, loop through the reference frames and find the one with the
+  // minimum SSE.
+ const int num_planes = 1;
+
+ *best_sse = INT_MAX;
+
+ for (int ref_idx = 0; ref_idx < num_refs; ref_idx++) {
+ const int ref = refs[ref_idx];
+
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
+ const FULLPEL_MV *start_mvs = sms_tree->start_mvs;
+ unsigned int curr_sse = 0, curr_var = 0;
+ const int_mv best_mv = av1_simple_motion_search_sse_var(
+ cpi, x, mi_row, mi_col, bsize, ref, start_mvs[ref], num_planes,
+ use_subpixel, &curr_sse, &curr_var);
+ if (curr_sse < *best_sse) {
+ *best_sse = curr_sse;
+ *best_var = curr_var;
+ best_ref = ref;
+ }
+
+ if (save_mv) {
+ sms_tree->start_mvs[ref].row = best_mv.as_mv.row / 8;
+ sms_tree->start_mvs[ref].col = best_mv.as_mv.col / 8;
+
+ if (bsize >= BLOCK_8X8) {
+ for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) {
+ // Propagate the new motion vectors to a lower level
+ SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx];
+ sub_tree->start_mvs[ref] = sms_tree->start_mvs[ref];
+ }
+ }
+ }
+ }
+ }
+
+ return best_ref;
+}
+
+// Collects features using simple_motion_search and stores them in features. The
+// features are also cached in SIMPLE_MOTION_DATA_TREE. By default, the features
+// collected are the sse and var from the subblocks flagged by features_to_get.
+// Furthermore, if features is not NULL, then 7 more features are appended to
+// the end of features:
+// - log(1.0 + dc_q ** 2)
+// - whether an above macroblock exists
+// - width of above macroblock
+// - height of above macroblock
+// - whether a left macroblock exists
+// - width of left macroblock
+// - height of left macroblock
+static AOM_INLINE void simple_motion_search_prune_part_features(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+ int features_to_get) {
+ const int w_mi = mi_size_wide[bsize];
+ const int h_mi = mi_size_high[bsize];
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+ assert(bsize >= BLOCK_8X8);
+ assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
+ cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+
+ // Setting up motion search
+ const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+ : LAST_FRAME };
+ const int num_refs = 1;
+ const int use_subpixel = 1;
+
+  // Do the whole block first to update the mv.
+ if (!sms_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) {
+ simple_motion_search_get_best_ref(cpi, x, sms_tree, mi_row, mi_col, bsize,
+ ref_list, num_refs, use_subpixel, 1,
+ &sms_tree->sms_none_feat[0],
+ &sms_tree->sms_none_feat[1]);
+ sms_tree->sms_none_valid = 1;
+ }
+
+ // Split subblocks
+ if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) {
+ const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
+ const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
+ SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx];
+
+ if (!sub_tree->sms_none_valid) {
+ simple_motion_search_get_best_ref(
+ cpi, x, sub_tree, sub_mi_row, sub_mi_col, subsize, ref_list,
+ num_refs, use_subpixel, 1, &sub_tree->sms_none_feat[0],
+ &sub_tree->sms_none_feat[1]);
+ sub_tree->sms_none_valid = 1;
+ }
+ }
+ }
+
+ // Rectangular subblocks
+ if (!sms_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) {
+ // Horz subblock
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) {
+ const int sub_mi_col = mi_col + 0;
+ const int sub_mi_row = mi_row + r_idx * h_mi / 2;
+
+ simple_motion_search_get_best_ref(
+ cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+ use_subpixel, 0, &sms_tree->sms_rect_feat[2 * r_idx],
+ &sms_tree->sms_rect_feat[2 * r_idx + 1]);
+ }
+
+ // Vert subblock
+ subsize = get_partition_subsize(bsize, PARTITION_VERT);
+ for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) {
+ const int sub_mi_col = mi_col + r_idx * w_mi / 2;
+ const int sub_mi_row = mi_row + 0;
+
+ simple_motion_search_get_best_ref(
+ cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+ use_subpixel, 0, &sms_tree->sms_rect_feat[4 + 2 * r_idx],
+ &sms_tree->sms_rect_feat[4 + 2 * r_idx + 1]);
+ }
+ sms_tree->sms_rect_valid = 1;
+ }
+
+ if (!features) return;
+
+ int f_idx = 0;
+ if (features_to_get & FEATURE_SMS_NONE_FLAG) {
+ for (int sub_idx = 0; sub_idx < 2; sub_idx++) {
+ features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[sub_idx]);
+ }
+ }
+
+ if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
+ for (int sub_idx = 0; sub_idx < SUB_PARTITIONS_SPLIT; sub_idx++) {
+ SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[sub_idx];
+ features[f_idx++] = log1pf((float)sub_tree->sms_none_feat[0]);
+ features[f_idx++] = log1pf((float)sub_tree->sms_none_feat[1]);
+ }
+ }
+
+ if (features_to_get & FEATURE_SMS_RECT_FLAG) {
+ for (int sub_idx = 0; sub_idx < 8; sub_idx++) {
+ features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[sub_idx]);
+ }
+ }
+
+ const MACROBLOCKD *xd = &x->e_mbd;
+ set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+ // Q_INDEX
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ features[f_idx++] = log1pf((float)(dc_q * dc_q) / 256.0f);
+
+ // Neighbor stuff
+ const int has_above = !!xd->above_mbmi;
+ const int has_left = !!xd->left_mbmi;
+ const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->bsize : bsize;
+ const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->bsize : bsize;
+ features[f_idx++] = (float)has_above;
+ features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
+ features[f_idx++] = (float)mi_size_high_log2[above_bsize];
+ features[f_idx++] = (float)has_left;
+ features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
+ features[f_idx++] = (float)mi_size_high_log2[left_bsize];
+}
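+
+/* Illustrative sketch, not part of the upstream source: the feature layout
+ * produced above when features_to_get enables all three feature groups
+ * (none, split, and rect), matching the comment before the function:
+ *
+ *   f[0..1]    log1p of the whole-block sse and var
+ *   f[2..9]    log1p of the sse and var of the 4 split subblocks
+ *   f[10..17]  log1p of the sse and var of the 2 horz + 2 vert subblocks
+ *   f[18]      log1p(dc_q^2 / 256)
+ *   f[19..21]  has_above, log2 width, log2 height of the above block
+ *   f[22..24]  has_left,  log2 width, log2 height of the left block
+ *
+ * i.e. 25 features in total, which presumably matches
+ * FEATURE_SIZE_SMS_PRUNE_PART.
+ */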
+
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_state) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ const int bsize_idx = convert_bsize_to_idx(bsize);
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  // res_idx is 0 for lowres, 1 for 480p, 2 for 720p+
+ const int res_idx = is_480p_or_larger + is_720p_or_larger;
+
+ // Get model parameters
+ const NN_CONFIG *nn_config =
+ av1_simple_motion_search_prune_rect_nn_config[bsize_idx];
+ const float *ml_mean = av1_simple_motion_search_prune_rect_mean[bsize_idx],
+ *ml_std = av1_simple_motion_search_prune_rect_std[bsize_idx];
+
+ const int agg = get_simple_motion_search_prune_agg(
+ x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 1);
+ if (agg < 0) {
+ return;
+ }
+
+ const float prune_thresh =
+ av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx];
+
+ // If there is no valid threshold, return immediately.
+ if (!nn_config || prune_thresh == 0.0f) {
+ return;
+ }
+
+ // Get features
+ float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
+ simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+ bsize, features,
+ FEATURE_SMS_PRUNE_PART_FLAG);
+
+  // Note: the features are intentionally left unnormalized here, to keep
+  // them consistent with all features collected and passed to the external
+  // model.
+ if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
+ !frame_is_intra_only(cm) &&
+ (part_state->partition_rect_allowed[HORZ] ||
+ part_state->partition_rect_allowed[VERT]) &&
+ bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
+ // Write features to file
+ write_features_to_file(
+ cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode,
+ features, FEATURE_SIZE_SMS_PRUNE_PART, 1, bsize, mi_row, mi_col);
+
+ if (ext_ml_model_decision_before_none_part2(
+ cpi, features, &part_state->prune_rect_part[HORZ],
+ &part_state->prune_rect_part[VERT])) {
+ return;
+ }
+ }
+
+ for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) {
+ features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+ }
+
+ // Get probabilities
+ float scores[EXT_PARTITION_TYPES] = { 0.0f },
+ probs[EXT_PARTITION_TYPES] = { 0.0f };
+ const int num_classes = (bsize == BLOCK_128X128 || bsize == BLOCK_8X8)
+ ? PARTITION_TYPES
+ : EXT_PARTITION_TYPES;
+
+ av1_nn_predict(features, nn_config, 1, scores);
+
+ av1_nn_softmax(scores, probs, num_classes);
+
+ // Determine if we should prune rectangular partitions.
+ if (probs[PARTITION_HORZ] <= prune_thresh) {
+ part_state->prune_rect_part[HORZ] = 1;
+ }
+ if (probs[PARTITION_VERT] <= prune_thresh) {
+ part_state->prune_rect_part[VERT] = 1;
+ }
+}
+
+// Early-terminates the partition search after PARTITION_NONE, using
+// simple_motion_search features and the rate, distortion, and rdcost of
+// PARTITION_NONE. This is only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ const RD_STATS *none_rdc, PartitionSearchState *part_state) {
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f };
+ simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+ bsize, features,
+ FEATURE_SMS_PRUNE_PART_FLAG);
+ int f_idx = FEATURE_SIZE_SMS_PRUNE_PART;
+
+ features[f_idx++] = log1pf((float)none_rdc->rate);
+ features[f_idx++] = log1pf((float)none_rdc->dist);
+ features[f_idx++] = log1pf((float)none_rdc->rdcost);
+
+ assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE);
+
+ const float *ml_mean = NULL;
+ const float *ml_std = NULL;
+ const float *ml_model = NULL;
+
+ if (bsize == BLOCK_128X128) {
+ ml_mean = av1_simple_motion_search_term_none_mean_128;
+ ml_std = av1_simple_motion_search_term_none_std_128;
+ ml_model = av1_simple_motion_search_term_none_model_128;
+ } else if (bsize == BLOCK_64X64) {
+ ml_mean = av1_simple_motion_search_term_none_mean_64;
+ ml_std = av1_simple_motion_search_term_none_std_64;
+ ml_model = av1_simple_motion_search_term_none_model_64;
+ } else if (bsize == BLOCK_32X32) {
+ ml_mean = av1_simple_motion_search_term_none_mean_32;
+ ml_std = av1_simple_motion_search_term_none_std_32;
+ ml_model = av1_simple_motion_search_term_none_model_32;
+ } else if (bsize == BLOCK_16X16) {
+ ml_mean = av1_simple_motion_search_term_none_mean_16;
+ ml_std = av1_simple_motion_search_term_none_std_16;
+ ml_model = av1_simple_motion_search_term_none_model_16;
+ } else {
+ assert(0 && "Unexpected block size in simple_motion_term_none");
+ }
+
+ // Write features to file
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features,
+ FEATURE_SIZE_SMS_TERM_NONE, 3, bsize, mi_row, mi_col);
+
+ if (ext_ml_model_decision_after_none_part2(
+ cpi, features, &part_state->terminate_partition_search)) {
+ return;
+ }
+
+ if (ml_model) {
+ float score = 0.0f;
+ for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) {
+ score +=
+ ml_model[f_idx] * (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+ }
+ score += ml_model[FEATURE_SIZE_SMS_TERM_NONE];
+
+ if (score >= 0.0f) {
+ part_state->terminate_partition_search = 1;
+ }
+ }
+}
+
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ float *features) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+  // Currently this only supports 128X128 SB size; it may be extended to
+  // 64X64 later.
+ assert(sb_size == BLOCK_128X128);
+
+ int f_idx = 0;
+
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ const float log_q_sq = log1pf((float)(dc_q * dc_q) / 256.0f);
+
+ // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb
+ float sum_mv_row_sq = 0;
+ float sum_mv_row = 0;
+ float min_abs_mv_row = FLT_MAX;
+ float max_abs_mv_row = 0;
+
+ float sum_mv_col_sq = 0;
+ float sum_mv_col = 0;
+ float min_abs_mv_col = FLT_MAX;
+ float max_abs_mv_col = 0;
+
+ float sum_log_sse_sq = 0;
+ float sum_log_sse = 0;
+ float min_log_sse = FLT_MAX;
+ float max_log_sse = 0;
+
+ const BLOCK_SIZE mb_size = BLOCK_16X16;
+ const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size];
+ const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size];
+ const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size];
+ const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size];
+
+ for (int mb_row = 0; mb_row < mb_rows; mb_row++)
+ for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+ const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2);
+ const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2);
+ unsigned int sse = 0;
+ unsigned int var = 0;
+ const FULLPEL_MV start_mv = kZeroFullMv;
+ const MV_REFERENCE_FRAME ref =
+ cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+ const int_mv best_mv = av1_simple_motion_search_sse_var(
+ cpi, x, this_mi_row, this_mi_col, mb_size, ref, start_mv, 1, 0, &sse,
+ &var);
+
+ const float mv_row = (float)(best_mv.as_mv.row / 8);
+ const float mv_col = (float)(best_mv.as_mv.col / 8);
+ const float log_sse = log1pf((float)sse);
+ const float abs_mv_row = fabsf(mv_row);
+ const float abs_mv_col = fabsf(mv_col);
+
+ sum_mv_row_sq += mv_row * mv_row;
+ sum_mv_row += mv_row;
+ sum_mv_col_sq += mv_col * mv_col;
+ sum_mv_col += mv_col;
+
+ if (abs_mv_row < min_abs_mv_row) min_abs_mv_row = abs_mv_row;
+ if (abs_mv_row > max_abs_mv_row) max_abs_mv_row = abs_mv_row;
+ if (abs_mv_col < min_abs_mv_col) min_abs_mv_col = abs_mv_col;
+ if (abs_mv_col > max_abs_mv_col) max_abs_mv_col = abs_mv_col;
+
+ sum_log_sse_sq += log_sse * log_sse;
+ sum_log_sse += log_sse;
+ if (log_sse < min_log_sse) min_log_sse = log_sse;
+ if (log_sse > max_log_sse) max_log_sse = log_sse;
+ }
+ const int blks = mb_rows * mb_cols;
+ const float avg_mv_row = sum_mv_row / (float)blks;
+ const float var_mv_row =
+ sum_mv_row_sq / (float)blks - avg_mv_row * avg_mv_row;
+
+ const float avg_mv_col = sum_mv_col / (float)blks;
+ const float var_mv_col =
+ sum_mv_col_sq / (float)blks - avg_mv_col * avg_mv_col;
+
+ const float avg_log_sse = sum_log_sse / (float)blks;
+ const float var_log_sse =
+ sum_log_sse_sq / (float)blks - avg_log_sse * avg_log_sse;
+
+ features[f_idx++] = avg_log_sse;
+ features[f_idx++] = avg_mv_col;
+ features[f_idx++] = avg_mv_row;
+ features[f_idx++] = log_q_sq;
+ features[f_idx++] = max_abs_mv_col;
+ features[f_idx++] = max_abs_mv_row;
+ features[f_idx++] = max_log_sse;
+ features[f_idx++] = min_abs_mv_col;
+ features[f_idx++] = min_abs_mv_row;
+ features[f_idx++] = min_log_sse;
+ features[f_idx++] = var_log_sse;
+ features[f_idx++] = var_mv_col;
+ features[f_idx++] = var_mv_row;
+
+ assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED);
+}
+
+// Convert result index to block size.
+// result idx       block size
+//     0            BLOCK_16X16
+//     1            BLOCK_32X32
+//     2            BLOCK_64X64
+//     3            BLOCK_128X128
+static BLOCK_SIZE get_block_size(int idx) {
+ return (BLOCK_SIZE)((idx + 2) * 3);
+}
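+
+/* Illustrative check, not part of the upstream source: the arithmetic above
+ * relies on the square sizes sitting 3 apart in the AV1 BLOCK_SIZE enum
+ * (BLOCK_16X16 == 6, BLOCK_32X32 == 9, BLOCK_64X64 == 12,
+ * BLOCK_128X128 == 15), so for example:
+ *
+ *   get_block_size(0) == (0 + 2) * 3 == 6  == BLOCK_16X16
+ *   get_block_size(3) == (3 + 2) * 3 == 15 == BLOCK_128X128
+ */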
+
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const float *features) {
+ float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
+ const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config;
+
+ assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
+ NOT_IN_USE);
+
+ av1_nn_predict(features, nn_config, 1, scores);
+
+ int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1;
+ if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
+ DIRECT_PRED) {
+ result = 0;
+ float max_score = scores[0];
+ for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) {
+ if (scores[i] > max_score) {
+ max_score = scores[i];
+ result = i;
+ }
+ }
+ return get_block_size(result);
+ }
+
+ float probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
+ av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED);
+
+ if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
+ RELAXED_PRED) {
+ for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
+ --result) {
+ if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) {
+ probs[result] += probs[result + 1];
+ }
+ if (probs[result] > 0.2) break;
+ }
+ } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
+ ADAPT_PRED) {
+ const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+    // TODO(debargha): x->source_variance is unavailable at this point, so
+    // compute it here. The redundant recomputation later can be removed.
+ const unsigned int source_variance = av1_get_perpixel_variance_facade(
+ cpi, &x->e_mbd, &x->plane[0].src, sb_size, AOM_PLANE_Y);
+ if (source_variance > 16) {
+ const double thresh = source_variance < 128 ? 0.05 : 0.1;
+ for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
+ --result) {
+ if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) {
+ probs[result] += probs[result + 1];
+ }
+ if (probs[result] > thresh) break;
+ }
+ }
+ }
+
+ return get_block_size(result);
+}
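+
+/* Illustrative sketch, not part of the upstream source: in RELAXED_PRED the
+ * loop above turns probs[] into a suffix sum while scanning from the largest
+ * class down, stopping at the first class whose cumulative probability
+ * exceeds 0.2. E.g. with hypothetical probs { 0.50, 0.25, 0.15, 0.10 }:
+ *
+ *   result 3: 0.10                <= 0.2, keep scanning
+ *   result 2: 0.15 + 0.10 == 0.25 >  0.2, stop
+ *
+ * giving get_block_size(2) == BLOCK_64X64, a deliberately larger-leaning
+ * choice than DIRECT_PRED, which would pick the argmax class 0 here.
+ */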
+
+// Get the minimum partition block width and height (in log scale) under a
+// SIMPLE_MOTION_DATA_TREE.
+static AOM_INLINE void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree,
+ int *min_bw, int *min_bh) {
+ if (!sms_tree) return;
+
+ const BLOCK_SIZE bsize = sms_tree->block_size;
+ if (bsize == BLOCK_4X4) {
+ *min_bw = 0;
+ *min_bh = 0;
+ return;
+ }
+
+ PARTITION_TYPE part_type = sms_tree->partitioning;
+ if (part_type == PARTITION_INVALID) return;
+
+ if (part_type == PARTITION_SPLIT) {
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ get_min_bsize(sms_tree->split[i], min_bw, min_bh);
+ }
+ } else {
+ if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B ||
+ part_type == PARTITION_VERT_A || part_type == PARTITION_VERT_B)
+ part_type = PARTITION_SPLIT;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, part_type);
+ if (subsize != BLOCK_INVALID) {
+ *min_bw = AOMMIN(*min_bw, mi_size_wide_log2[subsize]);
+ *min_bh = AOMMIN(*min_bh, mi_size_high_log2[subsize]);
+ }
+ }
+}
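+
+/* Illustrative sketch, not part of the upstream source: for a 64X64 node
+ * partitioned with PARTITION_SPLIT whose four 32X32 children all chose
+ * PARTITION_HORZ, the recursion above takes the HORZ subsize BLOCK_32X16,
+ * so (in 4-pel mi units):
+ *
+ *   int min_bw = MAX_SB_SIZE_LOG2, min_bh = MAX_SB_SIZE_LOG2;
+ *   get_min_bsize(sms_root, &min_bw, &min_bh);
+ *   // min_bw == mi_size_wide_log2[BLOCK_32X16] == 3 (8 mi wide)
+ *   // min_bh == mi_size_high_log2[BLOCK_32X16] == 2 (4 mi high)
+ */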
+
+static INLINE void add_rd_feature(int64_t rd, int64_t best_rd, float *features,
+ int *feature_idx) {
+ const int rd_valid = rd > 0 && rd < INT64_MAX;
+ const float rd_ratio = rd_valid ? (float)rd / best_rd : 1.0f;
+ features[(*feature_idx)++] = (float)rd_valid;
+ features[(*feature_idx)++] = rd_ratio;
+}
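+
+/* Illustrative sketch, not part of the upstream source: each call appends a
+ * validity flag and the rd / best_rd ratio (defaulting to 1.0 when rd is
+ * invalid), so with best_rd == 10000:
+ *
+ *   add_rd_feature(5000, 10000, features, &f_idx);      // appends 1.0, 0.5
+ *   add_rd_feature(INT64_MAX, 10000, features, &f_idx); // appends 0.0, 1.0
+ */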
+
+#define FEATURES 31
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ int64_t best_rd, int64_t part_none_rd,
+ int64_t part_split_rd,
+ int64_t *split_block_rd,
+ PartitionSearchState *part_state) {
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ if (best_rd <= 0 || best_rd == INT64_MAX ||
+ part_state->terminate_partition_search)
+ return;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ const NN_CONFIG *nn_config = NULL;
+ float thresh = -1e6;
+ switch (bsize) {
+ case BLOCK_128X128: break;
+ case BLOCK_64X64:
+ nn_config = &av1_early_term_after_split_nnconfig_64;
+ thresh = is_480p_or_larger ? -2.0f : -1.2f;
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_early_term_after_split_nnconfig_32;
+ thresh = is_480p_or_larger ? -2.6f : -2.3f;
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_early_term_after_split_nnconfig_16;
+ thresh = is_480p_or_larger ? -2.0f : -2.4f;
+ break;
+ case BLOCK_8X8:
+ nn_config = &av1_early_term_after_split_nnconfig_8;
+ thresh = is_480p_or_larger ? -1.0f : -1.4f;
+ break;
+ case BLOCK_4X4: break;
+ default:
+ assert(0 && "Invalid block size in av1_ml_early_term_after_split().");
+ break;
+ }
+ if (!nn_config) return;
+
+ // Use more conservative threshold for level 1.
+ if (cpi->sf.part_sf.ml_early_term_after_part_split_level < 2) thresh -= 0.3f;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ const int bs = block_size_wide[bsize];
+ int f_idx = 0;
+ float features[FEATURES] = { 0.0f };
+
+ features[f_idx++] = log1pf((float)dc_q / 4.0f);
+ features[f_idx++] = log1pf((float)best_rd / bs / bs / 1024.0f);
+
+ add_rd_feature(part_none_rd, best_rd, features, &f_idx);
+ add_rd_feature(part_split_rd, best_rd, features, &f_idx);
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ add_rd_feature(split_block_rd[i], best_rd, features, &f_idx);
+ int min_bw = MAX_SB_SIZE_LOG2;
+ int min_bh = MAX_SB_SIZE_LOG2;
+ get_min_bsize(sms_tree->split[i], &min_bw, &min_bh);
+ features[f_idx++] = (float)min_bw;
+ features[f_idx++] = (float)min_bh;
+ }
+
+ simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+ bsize, NULL,
+ FEATURE_SMS_PRUNE_PART_FLAG);
+
+ features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[1]);
+
+ features[f_idx++] = log1pf((float)sms_tree->split[0]->sms_none_feat[1]);
+ features[f_idx++] = log1pf((float)sms_tree->split[1]->sms_none_feat[1]);
+ features[f_idx++] = log1pf((float)sms_tree->split[2]->sms_none_feat[1]);
+ features[f_idx++] = log1pf((float)sms_tree->split[3]->sms_none_feat[1]);
+
+ features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[1]);
+ features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[3]);
+ features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[5]);
+ features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[7]);
+
+ assert(f_idx == FEATURES);
+
+ // Write features to file
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features, FEATURES,
+ 4, bsize, mi_row, mi_col);
+
+ if (ext_ml_model_decision_after_split(
+ cpi, features, &part_state->terminate_partition_search)) {
+ return;
+ }
+
+ float score = 0.0f;
+ av1_nn_predict(features, nn_config, 1, &score);
+  // The score is an indicator of confidence that we should NOT terminate.
+ if (score < thresh) {
+ part_state->terminate_partition_search = 1;
+ }
+}
+#undef FEATURES
+
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
+ int64_t best_rd, int64_t none_rd,
+ const int64_t *split_rd,
+ PartitionSearchState *part_state) {
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+ best_rd = AOMMAX(best_rd, 1);
+ const NN_CONFIG *nn_config = NULL;
+ const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f };
+ float cur_thresh = 0.0f;
+ switch (bsize) {
+ case BLOCK_8X8:
+ nn_config = &av1_rect_partition_nnconfig_8;
+ cur_thresh = prob_thresholds[0];
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_rect_partition_nnconfig_16;
+ cur_thresh = prob_thresholds[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_rect_partition_nnconfig_32;
+ cur_thresh = prob_thresholds[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &av1_rect_partition_nnconfig_64;
+ cur_thresh = prob_thresholds[3];
+ break;
+ case BLOCK_128X128:
+ nn_config = &av1_rect_partition_nnconfig_128;
+ cur_thresh = prob_thresholds[4];
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+
+ // 1. Compute input features
+ float features[9];
+
+ // RD cost ratios
+ for (int i = 0; i < 5; i++) features[i] = 1.0f;
+ if (none_rd > 0 && none_rd < 1000000000)
+ features[0] = (float)none_rd / (float)best_rd;
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ features[1 + i] = (float)split_rd[i] / (float)best_rd;
+ }
+
+ // Variance ratios
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int whole_block_variance;
+ whole_block_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+ whole_block_variance = AOMMAX(whole_block_variance, 1);
+
+ int split_variance[SUB_PARTITIONS_SPLIT];
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ struct buf_2d buf;
+ buf.stride = x->plane[0].src.stride;
+ const int bw = block_size_wide[bsize];
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ const int x_idx = (i & 1) * bw / 2;
+ const int y_idx = (i >> 1) * bw / 2;
+ buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
+ split_variance[i] =
+ av1_get_perpixel_variance_facade(cpi, xd, &buf, subsize, AOM_PLANE_Y);
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++)
+ features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
+
+ // Write features to file
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features,
+ /*feature_size=*/9, 5, bsize, mi_row, mi_col);
+
+ if (ext_ml_model_decision_after_split_part2(
+ &cpi->ext_part_controller, frame_is_intra_only(&cpi->common),
+ features, &part_state->prune_rect_part[HORZ],
+ &part_state->prune_rect_part[VERT])) {
+ return;
+ }
+
+ // 2. Do the prediction and prune 0-2 partitions based on their probabilities
+ float raw_scores[3] = { 0.0f };
+ av1_nn_predict(features, nn_config, 1, raw_scores);
+ float probs[3] = { 0.0f };
+ av1_nn_softmax(raw_scores, probs, 3);
+
+  // probs[0] is the probability that both rectangular partitions are worse
+  // than the current best_rd.
+ if (probs[1] <= cur_thresh) part_state->prune_rect_part[HORZ] = 1;
+ if (probs[2] <= cur_thresh) part_state->prune_rect_part[VERT] = 1;
+}
+
+// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+ int64_t best_rd,
+ PartitionSearchState *part_state,
+ int *ab_partitions_allowed) {
+ const PartitionBlkParams blk_params = part_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+ const NN_CONFIG *nn_config = NULL;
+ switch (bsize) {
+ case BLOCK_8X8: nn_config = NULL; break;
+ case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break;
+ case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break;
+ case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break;
+ case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+
+ // Generate features.
+ float features[10];
+ int feature_index = 0;
+ features[feature_index++] = (float)part_ctx;
+ features[feature_index++] = (float)var_ctx;
+ const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+ int sub_block_rdcost[8] = { 0 };
+ int rd_index = 0;
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ const int64_t *horz_rd = part_state->rect_part_rd[HORZ];
+ if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)horz_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ const int64_t *vert_rd = part_state->rect_part_rd[VERT];
+ if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)vert_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ const int64_t *split_rd = part_state->split_rd;
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)split_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 8; ++i) {
+ // Ratio between the sub-block RD and the whole-block RD.
+ float rd_ratio = 1.0f;
+ if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+ rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+ features[feature_index++] = rd_ratio;
+ }
+ assert(feature_index == 10);
+
+ // Write features to file
+ if (!frame_is_intra_only(&cpi->common)) {
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features,
+ /*feature_size=*/10, 6, bsize, mi_row, mi_col);
+ }
+
+ if (ext_ml_model_decision_after_rect(
+ &cpi->ext_part_controller, frame_is_intra_only(&cpi->common),
+ features, &ab_partitions_allowed[HORZ_A],
+ &ab_partitions_allowed[HORZ_B], &ab_partitions_allowed[VERT_A],
+ &ab_partitions_allowed[VERT_B])) {
+ return;
+ }
+
+ // Calculate scores using the NN model.
+ float score[16] = { 0.0f };
+ av1_nn_predict(features, nn_config, 1, score);
+ int int_score[16];
+ int max_score = -1000;
+ for (int i = 0; i < 16; ++i) {
+ int_score[i] = (int)(100 * score[i]);
+ max_score = AOMMAX(int_score[i], max_score);
+ }
+
+ // Make decisions based on the model scores.
+ int thresh = max_score;
+ switch (bsize) {
+ case BLOCK_16X16: thresh -= 150; break;
+ case BLOCK_32X32: thresh -= 100; break;
+ default: break;
+ }
+ av1_zero_array(ab_partitions_allowed, NUM_AB_PARTS);
+ for (int i = 0; i < 16; ++i) {
+ if (int_score[i] >= thresh) {
+ if ((i >> 0) & 1) ab_partitions_allowed[HORZ_A] = 1;
+ if ((i >> 1) & 1) ab_partitions_allowed[HORZ_B] = 1;
+ if ((i >> 2) & 1) ab_partitions_allowed[VERT_A] = 1;
+ if ((i >> 3) & 1) ab_partitions_allowed[VERT_B] = 1;
+ }
+ }
+}
+
+#define FEATURES 18
+#define LABELS 4
+// Use a ML model to predict if horz4 and vert4 should be considered.
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+ int part_ctx, int64_t best_rd,
+ PartitionSearchState *part_state,
+ int *part4_allowed,
+ unsigned int pb_source_variance) {
+ const PartitionBlkParams blk_params = part_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ int64_t(*rect_part_rd)[SUB_PARTITIONS_RECT] = part_state->rect_part_rd;
+ int64_t *split_rd = part_state->split_rd;
+ if (ext_ml_model_decision_after_part_ab(
+ cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd,
+ &part4_allowed[HORZ4], &part4_allowed[VERT4], pb_source_variance,
+ mi_row, mi_col))
+ return;
+
+ if (best_rd >= 1000000000) return;
+ int64_t *horz_rd = rect_part_rd[HORZ4];
+ int64_t *vert_rd = rect_part_rd[VERT4];
+ const NN_CONFIG *nn_config = NULL;
+ // 4-way partitions are only allowed for these three square block sizes.
+ switch (bsize) {
+ case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
+ case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break;
+ case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+
+ // Generate features.
+ float features[FEATURES];
+ int feature_index = 0;
+ features[feature_index++] = (float)part_ctx;
+ features[feature_index++] = (float)get_unsigned_bits(pb_source_variance);
+
+ const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+ int sub_block_rdcost[8] = { 0 };
+ int rd_index = 0;
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)horz_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)vert_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)split_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 8; ++i) {
+ // Ratio between the sub-block RD and the whole-block RD.
+ float rd_ratio = 1.0f;
+ if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+ rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+ features[feature_index++] = rd_ratio;
+ }
+
+ // Get variance of the 1:4 and 4:1 sub-blocks.
+ unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+ unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+ {
+ BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+ assert(horz_4_bs != BLOCK_INVALID);
+ assert(vert_4_bs != BLOCK_INVALID);
+
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+ av1_num_planes(&cpi->common), bsize);
+ const int src_stride = x->plane[0].src.stride;
+ uint8_t *src = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+
+ struct buf_2d horz_4_src, vert_4_src;
+ horz_4_src.stride = src_stride;
+ vert_4_src.stride = src_stride;
+
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
+ vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
+
+ horz_4_source_var[i] = av1_get_perpixel_variance_facade(
+ cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y);
+ vert_4_source_var[i] = av1_get_perpixel_variance_facade(
+ cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y);
+ }
+ }
+
+ const float denom = (float)(pb_source_variance + 1);
+ const float low_b = 0.1f;
+ const float high_b = 10.0f;
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ // Ratio between the 4:1 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features[feature_index++] = var_ratio;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 1:4 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features[feature_index++] = var_ratio;
+ }
+ assert(feature_index == FEATURES);
+
+ // Write features to file
+ if (!frame_is_intra_only(&cpi->common)) {
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features,
+ FEATURES, 7, bsize, mi_row, mi_col);
+ }
+
+ // Calculate scores using the NN model.
+ float score[LABELS] = { 0.0f };
+ av1_nn_predict(features, nn_config, 1, score);
+ int int_score[LABELS];
+ int max_score = -1000;
+ for (int i = 0; i < LABELS; ++i) {
+ int_score[i] = (int)(100 * score[i]);
+ max_score = AOMMAX(int_score[i], max_score);
+ }
+
+ // Make decisions based on the model scores.
+ int thresh = max_score;
+ switch (bsize) {
+ case BLOCK_16X16: thresh -= 500; break;
+ case BLOCK_32X32: thresh -= 500; break;
+ case BLOCK_64X64: thresh -= 200; break;
+ default: break;
+ }
+ av1_zero_array(part4_allowed, NUM_PART4_TYPES);
+ for (int i = 0; i < LABELS; ++i) {
+ if (int_score[i] >= thresh) {
+ if ((i >> 0) & 1) part4_allowed[HORZ4] = 1;
+ if ((i >> 1) & 1) part4_allowed[VERT4] = 1;
+ }
+ }
+}
+#undef FEATURES
+#undef LABELS
+
+#define FEATURES 4
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ unsigned int pb_source_variance, int bit_depth,
+ PartitionSearchState *part_state) {
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ const NN_CONFIG *nn_config = NULL;
+ int thresh = 0;
+ switch (bsize) {
+ case BLOCK_8X8:
+ nn_config = &av1_partition_breakout_nnconfig_8;
+ thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[0];
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_partition_breakout_nnconfig_16;
+ thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_partition_breakout_nnconfig_32;
+ thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &av1_partition_breakout_nnconfig_64;
+ thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[3];
+ break;
+ case BLOCK_128X128:
+ nn_config = &av1_partition_breakout_nnconfig_128;
+ thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[4];
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config || thresh < 0) return;
+
+ const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f };
+ thresh = (int)((float)thresh *
+ ml_predict_breakout_thresh_scale
+ [cpi->sf.part_sf.ml_predict_breakout_level - 1]);
+
+ // Generate feature values.
+ float features[FEATURES];
+ int feature_index = 0;
+
+ const int num_pels_log2 = num_pels_log2_lookup[bsize];
+ float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
+ rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+ rate_f;
+ features[feature_index++] = rate_f;
+
+ const float dist_f =
+ (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2);
+ features[feature_index++] = dist_f;
+
+ features[feature_index++] = (float)pb_source_variance;
+
+ const int dc_q = (int)x->plane[0].dequant_QTX[0] >> (bit_depth - 8);
+ features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
+ assert(feature_index == FEATURES);
+
+ // Write features to file
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features, FEATURES,
+ 2, bsize, mi_row, mi_col);
+
+ if (ext_ml_model_decision_after_none(&cpi->ext_part_controller,
+ frame_is_intra_only(&cpi->common),
+ features, &part_state->do_square_split,
+ &part_state->do_rectangular_split)) {
+ return;
+ }
+
+ // Calculate score using the NN model.
+ float score = 0.0f;
+ av1_nn_predict(features, nn_config, 1, &score);
+
+ // Make decision.
+ if ((int)(score * 100) >= thresh) {
+ part_state->do_square_split = 0;
+ part_state->do_rectangular_split = 0;
+ }
+}
+#undef FEATURES
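+
+/* Illustrative sketch, not part of the upstream source: the rate feature in
+ * av1_ml_predict_breakout() folds the Lagrange multiplier into the rate:
+ *
+ *   rate_f = rate * rdmult / (128 * 512 * (1 << num_pels_log2))
+ *
+ * For a hypothetical BLOCK_16X16 (num_pels_log2 == 8) with rate == 1200 and
+ * x->rdmult == 65536 this gives 1200 * 65536 / (65536 * 256) ~= 4.69, i.e.
+ * roughly an rd-weighted rate per pixel on a small, bounded scale.
+ */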
+
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ PartitionSearchState *part_state) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ if (cpi->third_pass_ctx) {
+ int mi_row = blk_params->mi_row;
+ int mi_col = blk_params->mi_col;
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, 0, cm->height, cm->width,
+ &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, 0, mi_row, mi_col, ratio_h, ratio_w);
+ BLOCK_SIZE third_pass_bsize =
+ av1_get_third_pass_adjusted_blk_size(this_mi, ratio_h, ratio_w);
+    // Check the actual partition of this block in the second pass.
+ PARTITION_TYPE third_pass_part =
+ av1_third_pass_get_sb_part_type(cpi->third_pass_ctx, this_mi);
+
+ int is_edge = (mi_row + mi_size_high[bsize] >= cm->mi_params.mi_rows) ||
+ (mi_col + mi_size_wide[bsize] >= cm->mi_params.mi_cols);
+
+ if (!is_edge && block_size_wide[bsize] >= 16) {
+      // If the second pass used a rectangular partition, do not search for a
+      // rectangular partition in the other direction.
+ if (third_pass_part != PARTITION_NONE) {
+ if (third_pass_part == PARTITION_HORZ ||
+ third_pass_part == PARTITION_HORZ_4 ||
+ third_pass_part == PARTITION_HORZ_A ||
+ third_pass_part == PARTITION_HORZ_B) {
+ part_state->partition_rect_allowed[VERT] = 0;
+ } else if (third_pass_part == PARTITION_VERT ||
+ third_pass_part == PARTITION_VERT_4 ||
+ third_pass_part == PARTITION_VERT_A ||
+ third_pass_part == PARTITION_VERT_B) {
+ part_state->partition_rect_allowed[HORZ] = 0;
+ }
+ }
+
+ int minSize = AOMMIN(block_size_wide[third_pass_bsize],
+ block_size_high[third_pass_bsize]);
+ int maxSize = AOMMAX(block_size_wide[third_pass_bsize],
+ block_size_high[third_pass_bsize]);
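+      // For example, if the second pass chose a 64x32 partition, minSize is
+      // 32 and maxSize is 64: a 4x4 block (< minSize / 4) terminates the
+      // search, an 8x8 block (< minSize / 2) either terminates or only keeps
+      // the current size, a 128x128 block (> maxSize) is forced to split,
+      // and widths from 32 to 64 are searched without pruning.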
+ if (block_size_wide[bsize] < minSize / 4) {
+ // Current partition is too small, just terminate
+ part_state->terminate_partition_search = 1;
+ return;
+ } else if (block_size_wide[bsize] < minSize / 2) {
+ if (third_pass_part != PARTITION_NONE) {
+          // Current partition is very small, and the second pass used a
+          // rectangular partition. Terminate the search here.
+ part_state->terminate_partition_search = 1;
+ return;
+ } else {
+          // Partition is small, but still worth checking; only disable
+          // further splits.
+ // TODO(any): check why this is not covered by the termination for <
+ // minSize/4.
+ av1_disable_square_split_partition(part_state);
+ av1_disable_rect_partitions(part_state);
+ return;
+ }
+ } else if (block_size_wide[bsize] > maxSize) {
+ // Partition is larger than in the second pass. Only allow split.
+ av1_set_square_split_only(part_state);
+ return;
+ } else if (block_size_wide[bsize] >= minSize &&
+ block_size_wide[bsize] <= maxSize) {
+ // Partition is within a range where it is very likely to find a good
+ // choice, so do not prune anything.
+ return;
+ }
+ }
+ }
+
+ // Prune rectangular partitions for larger blocks.
+ if (bsize > cpi->sf.part_sf.rect_partition_eval_thresh) {
+ part_state->do_rectangular_split = 0;
+ part_state->partition_rect_allowed[HORZ] = 0;
+ part_state->partition_rect_allowed[VERT] = 0;
+ }
+
+  // Prune rectangular, AB and 4-way partitions based on qindex and block
+  // size.
+ if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 1) {
+ if (bsize == BLOCK_8X8 && x->qindex < 35)
+ av1_disable_rect_partitions(part_state);
+
+ } else if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 2) {
+    // Enumeration distance between two successive square block sizes.
+ const int sqr_bsize_step = BLOCK_32X32 - BLOCK_16X16;
+ int max_bsize =
+ BLOCK_32X32 - (x->qindex * 3 / QINDEX_RANGE) * sqr_bsize_step;
+ max_bsize = AOMMAX(max_bsize, BLOCK_4X4);
+ const BLOCK_SIZE max_prune_bsize =
+ (BLOCK_SIZE)AOMMIN(max_bsize, BLOCK_32X32);
+
+    // Prune rectangular partitions:
+    // qidx 0 to 85: prune bsize below BLOCK_32X32
+    // qidx 86 to 170: prune bsize below BLOCK_16X16
+    // qidx 171 to 255: prune bsize below BLOCK_8X8
+ if (bsize < max_prune_bsize) {
+ av1_disable_rect_partitions(part_state);
+ }
+ }
+
+ if (cpi->sf.part_sf.prune_sub_8x8_partition_level && (bsize == BLOCK_8X8)) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int prune_sub_8x8;
+ if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 2) {
+ prune_sub_8x8 = 1;
+ } else {
+ assert(cpi->sf.part_sf.prune_sub_8x8_partition_level == 1);
+ // Prune if both neighbors are available and either is > BLOCK_8X8
+ prune_sub_8x8 = xd->left_available && xd->up_available &&
+ (xd->left_mbmi->bsize > BLOCK_8X8 ||
+ xd->above_mbmi->bsize > BLOCK_8X8);
+ }
+ if (prune_sub_8x8) {
+ av1_disable_all_splits(part_state);
+ }
+ }
+
+  // A CNN-based speed feature that prunes either the split or all non-split
+  // partitions in intra frame coding.
+ const int try_intra_cnn_based_part_prune =
+ frame_is_intra_only(cm) &&
+ cpi->sf.part_sf.intra_cnn_based_part_prune_level &&
+ cm->seq_params->sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
+ blk_params->bsize_at_least_8x8 &&
+ av1_is_whole_blk_in_frame(blk_params, mi_params);
+
+ if (try_intra_cnn_based_part_prune) {
+ av1_intra_mode_cnn_partition(
+ &cpi->common, x, x->part_search_info.quad_tree_idx,
+ cpi->sf.part_sf.intra_cnn_based_part_prune_level, part_state);
+ }
+
+  // Use simple motion search to prune out split or non-split partitions. This
+  // must be done before searching PARTITION_SPLIT so that the initial MVs are
+  // propagated to the smaller block sizes.
+ const int try_split_only =
+ cpi->sf.part_sf.simple_motion_search_split &&
+ part_state->do_square_split && blk_params->bsize_at_least_8x8 &&
+ av1_is_whole_blk_in_frame(blk_params, mi_params) &&
+ !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
+
+ if (try_split_only) {
+ av1_simple_motion_search_based_split(cpi, x, sms_tree, part_state);
+ }
+
+  // Use simple motion search to prune out rectangular partitions in one
+  // direction. The results are stored in prune_horz and prune_vert in order
+  // to bypass future related pruning checks if a pruning decision has been
+  // made.
+
+ // We want to search at least one partition mode, so don't prune if NONE and
+ // SPLIT are disabled.
+ const int non_rect_part_allowed =
+ part_state->do_square_split || part_state->partition_none_allowed;
+ // Only run the model if the partitions are not already pruned.
+ const int rect_part_allowed = part_state->do_rectangular_split &&
+ ((part_state->partition_rect_allowed[HORZ] &&
+ !part_state->prune_rect_part[HORZ]) ||
+ (part_state->partition_rect_allowed[VERT] &&
+ !part_state->prune_rect_part[VERT]));
+
+ const int try_prune_rect = cpi->sf.part_sf.simple_motion_search_prune_rect &&
+ !frame_is_intra_only(cm) &&
+ non_rect_part_allowed && rect_part_allowed &&
+ !av1_superres_scaled(cm);
+
+ if (try_prune_rect) {
+ av1_simple_motion_search_prune_rect(cpi, x, sms_tree, part_state);
+ }
+}
+
+#ifndef NDEBUG
+static AOM_INLINE int is_bsize_square(BLOCK_SIZE bsize) {
+ return block_size_wide[bsize] == block_size_high[bsize];
+}
+#endif // NDEBUG
+
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+ PartitionSearchState *part_state) {
+ assert(is_bsize_square(sb_enc->max_partition_size));
+ assert(is_bsize_square(sb_enc->min_partition_size));
+ assert(sb_enc->min_partition_size <= sb_enc->max_partition_size);
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+ assert(is_bsize_square(bsize));
+ const int max_partition_size_1d = block_size_wide[sb_enc->max_partition_size];
+ const int min_partition_size_1d = block_size_wide[sb_enc->min_partition_size];
+ const int bsize_1d = block_size_wide[bsize];
+ assert(min_partition_size_1d <= max_partition_size_1d);
+ const int is_le_min_sq_part = bsize_1d <= min_partition_size_1d;
+ const int is_gt_max_sq_part = bsize_1d > max_partition_size_1d;
+ if (is_gt_max_sq_part) {
+ // If current block size is larger than max, only allow split.
+ av1_set_square_split_only(part_state);
+ } else if (is_le_min_sq_part) {
+    // If the current block size is less than or equal to min, allow only
+    // PARTITION_NONE when the whole block fits in the frame; otherwise allow
+    // only split.
+ av1_disable_rect_partitions(part_state);
+
+    // Only disable square split when the current block is not at the picture
+    // boundary; otherwise, inherit the square split flag from the logic above.
+ if (av1_blk_has_rows_and_cols(blk_params)) {
+ part_state->do_square_split = 0;
+ }
+ part_state->partition_none_allowed = !(part_state->do_square_split);
+ }
+}
+
+// Decide whether to evaluate the AB partition specified by rect_part, based
+// on the results of the SPLIT and HORZ/VERT partition searches.
+static int evaluate_ab_partition_based_on_split(
+ const PC_TREE *pc_tree, PARTITION_TYPE rect_part,
+ const RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
+ int split_idx2) {
+ int num_win = 0;
+ // Threshold for number of winners
+ // Conservative pruning for high quantizers
+ const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
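+  // With integer arithmetic this evaluates to 3 for qindex below MAXQ / 2
+  // (all three sub-partitions must have "won") and to 0 for higher qindex,
+  // where the AB partition is always evaluated.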
+ int sub_part_win =
+ (rect_part_win_info == NULL) ? (pc_tree->partitioning == rect_part)
+ : (rect_part == PARTITION_HORZ) ? rect_part_win_info->rect_part_win[HORZ]
+ : rect_part_win_info->rect_part_win[VERT];
+ num_win += (sub_part_win) ? 1 : 0;
+ if (pc_tree->split[split_idx1]) {
+ num_win +=
+ (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0;
+ } else {
+ num_win += 1;
+ }
+ if (pc_tree->split[split_idx2]) {
+ num_win +=
+ (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 1 : 0;
+ } else {
+ num_win += 1;
+ }
+ if (num_win < num_win_thresh) {
+ return 0;
+ }
+ return 1;
+}
+
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x,
+ const PC_TREE *pc_tree, int pb_source_variance,
+ int64_t best_rdcost,
+ const RD_RECT_PART_WIN_INFO *rect_part_win_info,
+ bool ext_partition_allowed,
+ PartitionSearchState *part_state,
+ int *ab_partitions_allowed) {
+ int64_t *horz_rd = part_state->rect_part_rd[HORZ];
+ int64_t *vert_rd = part_state->rect_part_rd[VERT];
+ int64_t *split_rd = part_state->split_rd;
+ const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+ // The standard AB partitions are allowed initially if ext-partition-types are
+ // allowed.
+ int horzab_partition_allowed = ext_partition_allowed &&
+ part_cfg->enable_ab_partitions &&
+ part_state->partition_rect_allowed[HORZ];
+ int vertab_partition_allowed = ext_partition_allowed &&
+ part_cfg->enable_ab_partitions &&
+ part_state->partition_rect_allowed[VERT];
+
+  // Pruning: pruning out AB partitions in one main direction based on the
+  // current best partition and source variance.
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) {
+ // TODO(debargha,huisu@google.com): may need to tune the threshold for
+ // pb_source_variance.
+ horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ (pc_tree->partitioning == PARTITION_NONE &&
+ pb_source_variance < 32) ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ (pc_tree->partitioning == PARTITION_NONE &&
+ pb_source_variance < 32) ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ } else {
+ horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ }
+ horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0);
+ horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0);
+ vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0);
+ vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0);
+ split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0);
+ split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0);
+ split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0);
+ split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
+ }
+
+ // Pruning: pruning out horz_a or horz_b if the combined rdcost of its
+ // subblocks estimated from previous partitions is much higher than the best
+ // rd so far.
+ ab_partitions_allowed[HORZ_A] = horzab_partition_allowed;
+ ab_partitions_allowed[HORZ_B] = horzab_partition_allowed;
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+ const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
+ const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
+ switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+ case 1:
+ ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 14 < best_rdcost);
+ ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 14 < best_rdcost);
+ break;
+ case 2:
+ default:
+ ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 15 < best_rdcost);
+ ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 15 < best_rdcost);
+ break;
+ }
+ }
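+  // In other words, HORZ_A / HORZ_B stay enabled only while 14/16 (87.5%) of
+  // the estimated rdcost at level 1, or 15/16 (93.75%) at level 2, is still
+  // below the best rdcost found so far; the same thresholds apply to the
+  // vertical AB partitions below.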
+
+ // Pruning: pruning out vert_a or vert_b if the combined rdcost of its
+ // subblocks estimated from previous partitions is much higher than the best
+ // rd so far.
+ ab_partitions_allowed[VERT_A] = vertab_partition_allowed;
+ ab_partitions_allowed[VERT_B] = vertab_partition_allowed;
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+ const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
+ const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
+ switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+ case 1:
+ ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 14 < best_rdcost);
+ ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 14 < best_rdcost);
+ break;
+ case 2:
+ default:
+ ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 15 < best_rdcost);
+ ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 15 < best_rdcost);
+ break;
+ }
+ }
+
+ // Pruning: pruning out some ab partitions using a DNN taking rd costs of
+ // sub-blocks from previous basic partition types.
+ if (cpi->sf.part_sf.ml_prune_partition && ext_partition_allowed &&
+ part_state->partition_rect_allowed[HORZ] &&
+ part_state->partition_rect_allowed[VERT]) {
+ // TODO(huisu@google.com): x->source_variance may not be the current
+ // block's variance. The correct one to use is pb_source_variance. Need to
+ // re-train the model to fix it.
+ av1_ml_prune_ab_partition(cpi, pc_tree->partitioning,
+ get_unsigned_bits(x->source_variance),
+ best_rdcost, part_state, ab_partitions_allowed);
+ }
+
+ // Pruning: pruning AB partitions based on the number of horz/vert wins
+ // in the current block and sub-blocks in PARTITION_SPLIT.
+ if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+ ab_partitions_allowed[HORZ_A]) {
+ ab_partitions_allowed[HORZ_A] &= evaluate_ab_partition_based_on_split(
+ pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1);
+ }
+ if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+ ab_partitions_allowed[HORZ_B]) {
+ ab_partitions_allowed[HORZ_B] &= evaluate_ab_partition_based_on_split(
+ pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3);
+ }
+ if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+ ab_partitions_allowed[VERT_A]) {
+ ab_partitions_allowed[VERT_A] &= evaluate_ab_partition_based_on_split(
+ pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2);
+ }
+ if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+ ab_partitions_allowed[VERT_B]) {
+ ab_partitions_allowed[VERT_B] &= evaluate_ab_partition_based_on_split(
+ pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3);
+ }
+}
+
+// Prepare features for the external model: specifically, the features
+// available after the AB partitions have been searched.
+static void prepare_features_after_part_ab(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int part_ctx, int64_t best_rd,
+ int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+ int64_t split_rd[SUB_PARTITIONS_SPLIT], unsigned int pb_source_variance,
+ int mi_row, int mi_col, aom_partition_features_t *const features) {
+ int64_t *horz_rd = rect_part_rd[HORZ];
+ int64_t *vert_rd = rect_part_rd[VERT];
+
+ // Generate features.
+ int feature_index = 0;
+ features->after_part_ab.f[feature_index++] = (float)part_ctx;
+ features->after_part_ab.f[feature_index++] =
+ (float)get_unsigned_bits(pb_source_variance);
+
+ const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+ int sub_block_rdcost[8] = { 0 };
+ int rd_index = 0;
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)horz_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)vert_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)split_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 8; ++i) {
+ // Ratio between the sub-block RD and the whole-block RD.
+ float rd_ratio = 1.0f;
+ if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+ rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+ features->after_part_ab.f[feature_index++] = rd_ratio;
+ }
+
+ // 4-way partitions are only allowed for these three square block sizes.
+ assert(bsize == BLOCK_16X16 || bsize == BLOCK_32X32 || bsize == BLOCK_64X64);
+
+ // Get variance of the 1:4 and 4:1 sub-blocks.
+ unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+ unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+ {
+ BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+ assert(horz_4_bs != BLOCK_INVALID);
+ assert(vert_4_bs != BLOCK_INVALID);
+
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+ av1_num_planes(&cpi->common), bsize);
+ const int src_stride = x->plane[0].src.stride;
+ uint8_t *src = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+
+ struct buf_2d horz_4_src, vert_4_src;
+ horz_4_src.stride = src_stride;
+ vert_4_src.stride = src_stride;
+
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
+ vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
+
+ horz_4_source_var[i] = av1_get_perpixel_variance_facade(
+ cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y);
+ vert_4_source_var[i] = av1_get_perpixel_variance_facade(
+ cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y);
+ }
+ }
+
+ const float denom = (float)(pb_source_variance + 1);
+ const float low_b = 0.1f;
+ const float high_b = 10.0f;
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ // Ratio between the 4:1 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features->after_part_ab.f[feature_index++] = var_ratio;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 1:4 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features->after_part_ab.f[feature_index++] = var_ratio;
+ }
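+  // Feature layout: 1 (partition context) + 1 (variance bits) + 8 (sub-block
+  // rd ratios) + 4 (horz4 var ratios) + 4 (vert4 var ratios) == 18 features.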
+ assert(feature_index == 18);
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions before partition none. Specifically, these parameters:
+// partition_none_allowed
+// partition_horz_allowed
+// partition_vert_allowed
+// do_rectangular_split
+// do_square_split
+static bool ext_ml_model_decision_before_none(
+ AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT],
+ int *partition_none_allowed, int *partition_horz_allowed,
+ int *partition_vert_allowed, int *do_rectangular_split,
+ int *do_square_split) {
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ if (!ext_part_controller->ready) return false;
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE;
+ for (int i = 0; i < FEATURE_SIZE_SMS_SPLIT; ++i) {
+ features.before_part_none.f[i] = features_from_motion[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *partition_none_allowed = decision.partition_none_allowed;
+ *partition_horz_allowed = decision.partition_rect_allowed[HORZ];
+ *partition_vert_allowed = decision.partition_rect_allowed[VERT];
+ *do_rectangular_split = decision.do_rectangular_split;
+ *do_square_split = decision.do_square_split;
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions before partition none. Specifically, these parameters:
+// prune_horz
+// prune_vert
+static bool ext_ml_model_decision_before_none_part2(
+ AV1_COMP *cpi,
+ const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART],
+ int *prune_horz, int *prune_vert) {
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ if (!ext_part_controller->ready) return false;
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2;
+ for (int i = 0; i < FEATURE_SIZE_SMS_PRUNE_PART; ++i) {
+ features.before_part_none.f_part2[i] = features_from_motion[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *prune_horz = decision.prune_rect_part[HORZ];
+ *prune_vert = decision.prune_rect_part[VERT];
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// do_square_split
+// do_rectangular_split
+bool ext_ml_model_decision_after_none(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_after_none, int *do_square_split,
+ int *do_rectangular_split) {
+ if (!ext_part_controller->ready || is_intra_frame) return false;
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_NONE;
+ for (int i = 0; i < 4; ++i) {
+ features.after_part_none.f[i] = features_after_none[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *do_square_split = decision.do_square_split;
+ *do_rectangular_split = decision.do_rectangular_split;
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// terminate_partition_search
+bool ext_ml_model_decision_after_none_part2(
+ AV1_COMP *const cpi, const float *const features_terminate,
+ int *terminate_partition_search) {
+ AV1_COMMON *const cm = &cpi->common;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ if (!ext_part_controller->ready || frame_is_intra_only(cm)) return false;
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_NONE_PART2;
+ for (int i = 0; i < FEATURE_SIZE_SMS_TERM_NONE; ++i) {
+ features.after_part_none.f_terminate[i] = features_terminate[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *terminate_partition_search = decision.terminate_partition_search;
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after split partition. Specifically, these parameters:
+// terminate_partition_search
+bool ext_ml_model_decision_after_split(AV1_COMP *const cpi,
+ const float *const features_terminate,
+ int *terminate_partition_search) {
+ const AV1_COMMON *const cm = &cpi->common;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ if (frame_is_intra_only(cm) || !cpi->ext_part_controller.ready) {
+ return false;
+ }
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT;
+ for (int i = 0; i < 31; ++i) {
+ features.after_part_split.f_terminate[i] = features_terminate[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *terminate_partition_search = decision.terminate_partition_search;
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after split partition. Specifically, these parameters:
+// prune_rect_part[HORZ]
+// prune_rect_part[VERT]
+bool ext_ml_model_decision_after_split_part2(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_prune, int *prune_rect_part_horz,
+ int *prune_rect_part_vert) {
+ if (is_intra_frame || !ext_part_controller->ready) {
+ return false;
+ }
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2;
+ for (int i = 0; i < 9; ++i) {
+ features.after_part_split.f_prune_rect[i] = features_prune[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *prune_rect_part_horz = decision.prune_rect_part[0];
+ *prune_rect_part_vert = decision.prune_rect_part[1];
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after rectangular partition. Specifically, these parameters:
+// horza_partition_allowed
+// horzb_partition_allowed
+// verta_partition_allowed
+// vertb_partition_allowed
+static bool ext_ml_model_decision_after_rect(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_after_rect, int *horza_partition_allowed,
+ int *horzb_partition_allowed, int *verta_partition_allowed,
+ int *vertb_partition_allowed) {
+ if (is_intra_frame || !ext_part_controller->ready) return false;
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_RECT;
+ for (int i = 0; i < 10; ++i) {
+ features.after_part_rect.f[i] = features_after_rect[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *horza_partition_allowed = decision.horza_partition_allowed;
+ *horzb_partition_allowed = decision.horzb_partition_allowed;
+ *verta_partition_allowed = decision.verta_partition_allowed;
+ *vertb_partition_allowed = decision.vertb_partition_allowed;
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after AB partition. Specifically, these parameters:
+// partition_vert4_allowed
+// partition_horz4_allowed
+static bool ext_ml_model_decision_after_part_ab(
+ AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx,
+ int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+ int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+ int *const partition_vert4_allowed, unsigned int pb_source_variance,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+
+ if (!frame_is_intra_only(cm) && ext_part_controller->ready) {
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_AB;
+ prepare_features_after_part_ab(cpi, x, bsize, part_ctx, best_rd,
+ rect_part_rd, split_rd, pb_source_variance,
+ mi_row, mi_col, &features);
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *partition_horz4_allowed = decision.partition_horz4_allowed;
+ *partition_vert4_allowed = decision.partition_vert4_allowed;
+
+ return true;
+ }
+
+ return false;
+}
+
+// This function resembles av1_setup_sms_tree() in context_tree.c, but with a
+// different function signature.
+static SIMPLE_MOTION_DATA_TREE *setup_sms_tree(
+ AV1_COMP *const cpi, SIMPLE_MOTION_DATA_TREE *sms_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int stat_generation_stage = is_stat_generation_stage(cpi);
+ const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+ const int tree_nodes =
+ av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+ int sms_tree_index = 0;
+ SIMPLE_MOTION_DATA_TREE *this_sms;
+ int square_index = 1;
+ int nodes;
+ this_sms = &sms_tree[0];
+
+ if (!stat_generation_stage) {
+ const int leaf_factor = is_sb_size_128 ? 4 : 1;
+ const int leaf_nodes = 256 * leaf_factor;
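+    // For a 128x128 superblock this gives 1024 leaf nodes and the loop below
+    // adds 256 + 64 + 16 + 4 + 1 internal nodes, 1365 in total; a 64x64
+    // superblock ends up with 256 + 85 == 341 nodes.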
+
+ // Sets up all the leaf nodes in the tree.
+ for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) {
+ SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+ tree->block_size = square[0];
+ }
+
+    // Each node has 4 child nodes; fill each block_size level of the tree
+    // from the leaves up to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (int i = 0; i < nodes; ++i) {
+ SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+ tree->block_size = square[square_index];
+ for (int j = 0; j < 4; j++) tree->split[j] = this_sms++;
+ ++sms_tree_index;
+ }
+ ++square_index;
+ }
+ } else {
+ // Allocation for firstpass/LAP stage
+ // TODO(Mufaddal): refactor square_index to use a common block_size macro
+ // from firstpass.c
+ SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+ square_index = 2;
+ tree->block_size = square[square_index];
+ }
+
+ // Set up the root node for the largest superblock size
+ return &sms_tree[tree_nodes - 1];
+}
+
+static void write_motion_feature_to_file(
+ const char *const path, const int sb_counter, const unsigned int *block_sse,
+ const unsigned int *block_var, const int num_blocks, const BLOCK_SIZE bsize,
+ const BLOCK_SIZE fixed_block_size, const int mi_row, const int mi_col) {
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/motion_search_feature_sb%d", path,
+ sb_counter);
+  FILE *pfile = fopen(filename, "w");
+  if (pfile == NULL) return;
+ fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+ block_size_wide[fixed_block_size], num_blocks);
+ for (int i = 0; i < num_blocks; ++i) {
+ fprintf(pfile, "%d", block_sse[i]);
+ if (i < num_blocks - 1) fprintf(pfile, ",");
+ }
+ fprintf(pfile, "\n");
+ for (int i = 0; i < num_blocks; ++i) {
+ fprintf(pfile, "%d", block_var[i]);
+ if (i < num_blocks - 1) fprintf(pfile, ",");
+ }
+ fprintf(pfile, "\n");
+ fclose(pfile);
+}
+
+void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data,
+ const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize,
+ aom_partition_features_t *features) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (frame_is_intra_only(cm)) return;
+
+ MACROBLOCK *const x = &td->mb;
+ const BLOCK_SIZE fixed_block_size = BLOCK_16X16;
+ const int col_step = mi_size_wide[fixed_block_size];
+ const int row_step = mi_size_high[fixed_block_size];
+ SIMPLE_MOTION_DATA_TREE *sms_tree = NULL;
+ const int stat_generation_stage = is_stat_generation_stage(cpi);
+ const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+ const int tree_nodes =
+ av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+ CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree)));
+ SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize);
+ av1_init_simple_motion_search_mvs_for_sb(cpi, NULL, x, sms_root, mi_row,
+ mi_col);
+ av1_reset_simple_motion_tree_partition(sms_root, bsize);
+ const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+ : LAST_FRAME };
+ const int mi_width =
+ AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+ const int mi_height =
+ AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+ const int col_steps = (mi_width / col_step) + ((mi_width % col_step) > 0);
+ const int row_steps = (mi_height / row_step) + ((mi_height % row_step) > 0);
+ const int num_blocks = col_steps * row_steps;
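+  // E.g. a full 64x64 superblock spans 16x16 mi units; with BLOCK_16X16
+  // feature blocks (4 mi units each) this yields col_steps == row_steps == 4
+  // and num_blocks == 16.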
+ unsigned int *block_sse = aom_calloc(num_blocks, sizeof(*block_sse));
+ unsigned int *block_var = aom_calloc(num_blocks, sizeof(*block_var));
+ if (!(block_sse && block_var)) {
+ aom_free(sms_tree);
+ aom_free(block_sse);
+ aom_free(block_var);
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating block_sse & block_var");
+ }
+ int idx = 0;
+
+ for (int row = mi_row;
+ row < AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows);
+ row += row_step) {
+ for (int col = mi_col;
+ col < AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols);
+ col += col_step) {
+ simple_motion_search_get_best_ref(
+ cpi, x, sms_root, row, col, fixed_block_size, ref_list,
+ /*num_refs=*/1, /*use_subpixel=*/1,
+ /*save_mv=*/1, &block_sse[idx], &block_var[idx]);
+ ++idx;
+ }
+ }
+ if (features == NULL) {
+ write_motion_feature_to_file(cpi->oxcf.partition_info_path, cpi->sb_counter,
+ block_sse, block_var, idx, bsize,
+ fixed_block_size, mi_row, mi_col);
+ } else {
+ features->sb_features.motion_features.unit_length =
+ block_size_wide[fixed_block_size];
+ features->sb_features.motion_features.num_units = idx;
+ for (int i = 0; i < idx; ++i) {
+ features->sb_features.motion_features.block_sse[i] = block_sse[i];
+ features->sb_features.motion_features.block_var[i] = block_var[i];
+ }
+ }
+
+ aom_free(block_sse);
+ aom_free(block_var);
+ aom_free(sms_tree);
+}
+
+void av1_prepare_motion_search_features_block(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ const int mi_row, const int mi_col, const BLOCK_SIZE bsize,
+ const int valid_partition_types, unsigned int *block_sse,
+ unsigned int *block_var, unsigned int sub_block_sse[4],
+ unsigned int sub_block_var[4], unsigned int horz_block_sse[2],
+ unsigned int horz_block_var[2], unsigned int vert_block_sse[2],
+ unsigned int vert_block_var[2]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (frame_is_intra_only(cm)) return;
+ MACROBLOCK *const x = &td->mb;
+ SIMPLE_MOTION_DATA_TREE *sms_tree = NULL;
+ const int stat_generation_stage = is_stat_generation_stage(cpi);
+ const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+ const int tree_nodes =
+ av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+ CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree)));
+ SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize);
+ av1_reset_simple_motion_tree_partition(sms_root, bsize);
+ const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+ : LAST_FRAME };
+ const int sub_mi_width = mi_size_wide[bsize] / 2;
+ const int sub_mi_height = sub_mi_width;
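+  // The input bsize is assumed to be square, so the sub-block height in mi
+  // units equals the sub-block width.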
+ simple_motion_search_get_best_ref(
+ cpi, x, sms_root, mi_row, mi_col, bsize, ref_list, /*num_refs=*/1,
+ /*use_subpixel=*/1, /*save_mv=*/1, block_sse, block_var);
+ // Split to 4 sub blocks.
+ if (valid_partition_types & (1 << PARTITION_SPLIT)) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int i = 0; i < 4; ++i) {
+ const int row = mi_row + (i >> 1) * sub_mi_height;
+ const int col = mi_col + (i & 1) * sub_mi_width;
+ simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize,
+ ref_list, /*num_refs=*/1,
+ /*use_subpixel=*/1, /*save_mv=*/1,
+ &sub_block_sse[i], &sub_block_var[i]);
+ }
+ }
+ // Horizontal split
+ if (valid_partition_types & (1 << PARTITION_HORZ)) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ for (int i = 0; i < 2; ++i) {
+ const int row = mi_row + (i & 1) * sub_mi_height;
+ const int col = mi_col;
+ simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize,
+ ref_list, /*num_refs=*/1,
+ /*use_subpixel=*/1, /*save_mv=*/1,
+ &horz_block_sse[i], &horz_block_var[i]);
+ }
+ }
+ // Vertical split
+ if (valid_partition_types & (1 << PARTITION_VERT)) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+ for (int i = 0; i < 2; ++i) {
+ const int row = mi_row;
+ const int col = mi_col + (i & 1) * sub_mi_width;
+ simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize,
+ ref_list, /*num_refs=*/1,
+ /*use_subpixel=*/1, /*save_mv=*/1,
+ &vert_block_sse[i], &vert_block_var[i]);
+ }
+ }
+
+ aom_free(sms_tree);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void init_simple_motion_search_mvs(
+ SIMPLE_MOTION_DATA_TREE *sms_tree, const FULLPEL_MV *start_mvs) {
+ memcpy(sms_tree->start_mvs, start_mvs, sizeof(sms_tree->start_mvs));
+ av1_zero(sms_tree->sms_none_feat);
+ av1_zero(sms_tree->sms_rect_feat);
+ av1_zero(sms_tree->sms_none_valid);
+ av1_zero(sms_tree->sms_rect_valid);
+
+ if (sms_tree->block_size >= BLOCK_8X8) {
+ init_simple_motion_search_mvs(sms_tree->split[0], start_mvs);
+ init_simple_motion_search_mvs(sms_tree->split[1], start_mvs);
+ init_simple_motion_search_mvs(sms_tree->split[2], start_mvs);
+ init_simple_motion_search_mvs(sms_tree->split[3], start_mvs);
+ }
+}
+
+void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi,
+ const TileInfo *tile_info,
+ MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ int mi_row, int mi_col) {
+ // Use the NEARESTMV of the sb as the start mv
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FULLPEL_MV ref_mvs[REF_FRAMES];
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ av1_zero(ref_mvs);
+ // If tile_info is NULL, assume that the offsets have already been set.
+ if (tile_info) {
+ av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col,
+ sb_size);
+ }
+
+ MB_MODE_INFO_EXT mbmi_ext;
+ const int ref_frame =
+ cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+ av1_find_mv_refs(cm, xd, xd->mi[0], ref_frame, mbmi_ext.ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext.global_mvs,
+ mbmi_ext.mode_context);
+ if (mbmi_ext.ref_mv_count[ref_frame] > 0) {
+ ref_mvs[ref_frame] =
+ get_fullmv_from_mv(&xd->ref_mv_stack[ref_frame][0].this_mv.as_mv);
+ } else {
+ ref_mvs[ref_frame] =
+ get_fullmv_from_mv(&mbmi_ext.global_mvs[ref_frame].as_mv);
+ }
+
+ init_simple_motion_search_mvs(sms_root, ref_mvs);
+}
diff --git a/third_party/aom/av1/encoder/partition_strategy.h b/third_party/aom/av1/encoder/partition_strategy.h
new file mode 100644
index 0000000000..84683f5fd4
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_strategy.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+#define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encoder.h"
+
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+ int label_idx,
+ int intra_cnn_based_part_prune_level,
+ PartitionSearchState *part_state);
+
+// Performs a simple_motion_search with a single reference frame and extracts
+// the variance of the residue. The features are then used to determine
+// whether we want to go straight to splitting without trying PARTITION_NONE.
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_state);
+
+// Performs a simple_motion_search with two reference frames and extracts
+// the variance of the residue. The features are then used to determine
+// whether we want to prune some partitions.
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_state);
+
+#if !CONFIG_REALTIME_ONLY
+// Early terminates PARTITION_NONE using simple_motion_search features and the
+// rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ const RD_STATS *none_rdc,
+ PartitionSearchState *part_state);
+
+// Get the features for selecting the max and min partition size. Currently
+// this performs simple_motion_search on 16X16 subblocks of the current
+// superblock, and then extracts the SSE and motion vector statistics as
+// features.
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ float *features);
+
+// Predict the maximum BLOCK_SIZE to be used to encode the current superblock.
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const float *features);
+
+// Attempts an early termination after PARTITION_SPLIT.
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ int64_t best_rd, int64_t part_none_rd,
+ int64_t part_split_rd,
+ int64_t *split_block_rd,
+ PartitionSearchState *part_state);
+
+// Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and
+// PARTITION_VERT.
+// TODO(chiyotsai@google.com): Currently this model does not use q value and has
+// no information about rectangular partitions. Preliminary experiments suggest
+// that we can get better performance by adding in q_index and rectangular
+// sse/var from SMS. We should retrain and tune this model later.
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
+ int64_t best_rd, int64_t none_rd,
+ const int64_t *split_rd,
+ PartitionSearchState *part_state);
+
+// Use an ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+ int64_t best_rd,
+ PartitionSearchState *part_state,
+ int *ab_partitions_allowed);
+
+// Use an ML model to predict if horz4 and vert4 should be considered.
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+ int part_ctx, int64_t best_rd,
+ PartitionSearchState *part_state,
+ int *part4_allowed,
+ unsigned int pb_source_variance);
+
+// ML-based partition search breakout after PARTITION_NONE.
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ unsigned int pb_source_variance, int bit_depth,
+ PartitionSearchState *part_state);
+
+// The first round of partition pruning, done before any partition has been
+// tested. The decisions are written into part_state and passed back to the
+// partition search function.
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ PartitionSearchState *part_state);
+
+// Prune out partitions that lead to coding block sizes outside the min and max
+// bsizes set by the encoder. Max and min square partition levels are defined as
+// the partition nodes that the recursive function rd_pick_partition() can
+// reach. To implement this: only PARTITION_NONE is allowed if the current
+// node is at or below min_partition_size, and only PARTITION_SPLIT is allowed
+// if the current node exceeds max_partition_size.
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+ PartitionSearchState *part_state);
+
+// Prune out AB partitions based on rd decisions made from testing the
+// basic partitions.
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x,
+ const PC_TREE *pc_tree, int pb_source_variance,
+ int64_t best_rdcost,
+ const RD_RECT_PART_WIN_INFO *rect_part_win_info,
+ bool ext_partition_allowed,
+ PartitionSearchState *part_state,
+ int *ab_partitions_allowed);
+
+void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data,
+ const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize,
+ aom_partition_features_t *features);
+void av1_prepare_motion_search_features_block(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ const int mi_row, const int mi_col, const BLOCK_SIZE bsize,
+ const int valid_partition_types, unsigned int *block_sse,
+ unsigned int *block_var, unsigned int sub_block_sse[4],
+ unsigned int sub_block_var[4], unsigned int horz_block_sse[2],
+ unsigned int horz_block_var[2], unsigned int vert_block_sse[2],
+ unsigned int vert_block_var[2]);
+#endif // !CONFIG_REALTIME_ONLY
+
+// A simplified version of set_offsets meant to be used for
+// simple_motion_search.
+static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+
+ // Set up destination pointers.
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+ num_planes);
+
+  // Set up limit values for MV components.
+  // MVs beyond this range do not produce a new or different prediction block.
+ av1_set_mv_limits(mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+ mi_width, cpi->oxcf.border_in_pixels);
+
+ set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+ xd->mi_row = mi_row;
+ xd->mi_col = mi_col;
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+ xd->mb_to_bottom_edge =
+ GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+ xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+ xd->mb_to_right_edge =
+ GET_MV_SUBPEL((mi_params->mi_cols - mi_width - mi_col) * MI_SIZE);
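+  // Distances are in 1/8th-pel units: e.g. a block at mi_row == 4 has
+  // mb_to_top_edge == -GET_MV_SUBPEL(4 * MI_SIZE) == -(16 * 8) == -128.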
+
+ // Set up source buffers.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+}
+
+void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi,
+ const TileInfo *tile_info,
+ MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ int mi_row, int mi_col);
+
+static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params,
+ int mi_row, int mi_col, BLOCK_SIZE sb_size) {
+ const int sb_mi_wide = mi_size_wide[sb_size];
+ const int sb_mi_high = mi_size_high[sb_size];
+
+ return (mi_row + sb_mi_high) <= mi_params->mi_rows &&
+ (mi_col + sb_mi_wide) <= mi_params->mi_cols;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Do not use this criterion for screen content videos, since screen content
+// videos can often find good predictors and the largest block size is likely
+// to be used.
+static INLINE int use_auto_max_partition(const AV1_COMP *const cpi,
+ BLOCK_SIZE sb_size, int mi_row,
+ int mi_col) {
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const AV1_COMMON *const cm = &cpi->common;
+ return !frame_is_intra_only(cm) && !cpi->use_screen_content_tools &&
+ cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
+ NOT_IN_USE &&
+ sb_size == BLOCK_128X128 &&
+ is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) &&
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] !=
+ OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] !=
+ INTNL_OVERLAY_UPDATE;
+}
+
+static BLOCK_SIZE dim_to_size(int dim) {
+ switch (dim) {
+ case 4: return BLOCK_4X4;
+ case 8: return BLOCK_8X8;
+ case 16: return BLOCK_16X16;
+ case 32: return BLOCK_32X32;
+ case 64: return BLOCK_64X64;
+ case 128: return BLOCK_128X128;
+ default: assert(0); return 0;
+ }
+}
+
+static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc,
+ AV1_COMP *cpi, MACROBLOCK *x,
+ const SPEED_FEATURES *sf,
+ BLOCK_SIZE sb_size,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+
+ sb_enc->max_partition_size =
+ AOMMIN(sf->part_sf.default_max_partition_size,
+ dim_to_size(cpi->oxcf.part_cfg.max_partition_size));
+ sb_enc->min_partition_size =
+ AOMMAX(sf->part_sf.default_min_partition_size,
+ dim_to_size(cpi->oxcf.part_cfg.min_partition_size));
+ sb_enc->max_partition_size =
+ AOMMIN(sb_enc->max_partition_size, cm->seq_params->sb_size);
+ sb_enc->min_partition_size =
+ AOMMIN(sb_enc->min_partition_size, cm->seq_params->sb_size);
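+  // E.g. if the speed-feature default is BLOCK_128X128 but the encoder config
+  // caps max_partition_size at 64, dim_to_size(64) == BLOCK_64X64 wins the
+  // AOMMIN above; both bounds are then further capped by the sequence
+  // superblock size.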
+
+ if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
+ float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+
+ av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
+ sb_enc->max_partition_size =
+ AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features),
+ sb_enc->max_partition_size),
+ sb_enc->min_partition_size);
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+#endif // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
diff --git a/third_party/aom/av1/encoder/pass2_strategy.c b/third_party/aom/av1/encoder/pass2_strategy.c
new file mode 100644
index 0000000000..a9442ffc1a
--- /dev/null
+++ b/third_party/aom/av1/encoder/pass2_strategy.c
@@ -0,0 +1,4488 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\defgroup gf_group_algo Golden Frame Group
+ * \ingroup high_level_algo
+ * Algorithms regarding determining the length of GF groups and defining GF
+ * group structures.
+ * @{
+ */
+/*! @} - end defgroup gf_group_algo */
+
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_mem/aom_mem.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/encode_strategy.h"
+
+#define DEFAULT_KF_BOOST 2300
+#define DEFAULT_GF_BOOST 2000
+#define GROUP_ADAPTIVE_MAXQ 1
+
+static void init_gf_stats(GF_GROUP_STATS *gf_stats);
+static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+ int is_final_pass);
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other zero-energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame) {
+ const double active_pct =
+ 1.0 -
+ ((this_frame->intra_skip_pct / 2) +
+ ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows));
+ return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified error used in distributing bits between easier and
+// harder frames.
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err_new(const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *total_stats,
+ const FIRSTPASS_STATS *this_stats,
+ int vbrbias, double modified_error_min,
+ double modified_error_max) {
+ if (total_stats == NULL) {
+ return 0;
+ }
+ const double av_weight = total_stats->weight / total_stats->count;
+ const double av_err =
+ (total_stats->coded_error * av_weight) / total_stats->count;
+ double modified_error =
+ av_err * pow(this_stats->coded_error * this_stats->weight /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ vbrbias / 100.0);
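+  // I.e. modified_error = av_err * (err * weight / av_err) ^ (vbrbias / 100):
+  // with vbrbias == 100 each frame keeps its own weighted coded error, while
+  // smaller vbrbias values pull every frame's error toward the average.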
+
+  // Correction for active area. Frames with a reduced active area
+  // (e.g. due to formatting bars) have a higher error per MB for the
+  // remaining active MBs. The correction here assumes that coding
+  // 0.5N blocks of complexity 2X is a little easier than coding N
+  // blocks of complexity X.
+ modified_error *=
+ pow(calculate_active_area(frame_info, this_stats), ACT_AREA_CORRECTION);
+
+ return fclamp(modified_error, modified_error_min, modified_error_max);
+}
+
+static double calculate_modified_err(const FRAME_INFO *frame_info,
+ const TWO_PASS *twopass,
+ const AV1EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame) {
+ const FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
+ return calculate_modified_err_new(
+ frame_info, total_stats, this_frame, oxcf->rc_cfg.vbrbias,
+ twopass->modified_error_min, twopass->modified_error_max);
+}
+
+// Resets the first pass stats read position to the given location.
+static void reset_fpf_position(TWO_PASS_FRAME *p_frame,
+ const FIRSTPASS_STATS *position) {
+ p_frame->stats_in = position;
+}
+
+static int input_stats(TWO_PASS *p, TWO_PASS_FRAME *p_frame,
+ FIRSTPASS_STATS *fps) {
+ if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
+
+ *fps = *p_frame->stats_in;
+ ++p_frame->stats_in;
+ return 1;
+}
+
+static int input_stats_lap(TWO_PASS *p, TWO_PASS_FRAME *p_frame,
+ FIRSTPASS_STATS *fps) {
+ if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
+
+ *fps = *p_frame->stats_in;
+  /* Move old stats[0] out to make room for the next frame's stats */
+ memmove(p->frame_stats_arr[0], p->frame_stats_arr[1],
+ (p->stats_buf_ctx->stats_in_end - p_frame->stats_in - 1) *
+ sizeof(FIRSTPASS_STATS));
+ p->stats_buf_ctx->stats_in_end--;
+ return 1;
+}
+
+// Read frame stats at an offset from the current position.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p,
+ const TWO_PASS_FRAME *p_frame,
+ int offset) {
+ if ((offset >= 0 &&
+ p_frame->stats_in + offset >= p->stats_buf_ctx->stats_in_end) ||
+ (offset < 0 &&
+ p_frame->stats_in + offset < p->stats_buf_ctx->stats_in_start)) {
+ return NULL;
+ }
+
+ return &p_frame->stats_in[offset];
+}
+
+// This function returns the maximum target rate per frame.
+static int frame_max_bits(const RATE_CONTROL *rc,
+ const AV1EncoderConfig *oxcf) {
+ int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
+ (int64_t)oxcf->rc_cfg.vbrmax_section) /
+ 100;
+ if (max_bits < 0)
+ max_bits = 0;
+ else if (max_bits > rc->max_frame_bandwidth)
+ max_bits = rc->max_frame_bandwidth;
+
+ return (int)max_bits;
+}
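+
+// Worked example (illustrative numbers): with avg_frame_bandwidth ==
+// 100000 bits and vbrmax_section == 150, the per-frame cap is 150000 bits,
+// further clamped to rc->max_frame_bandwidth.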
+
+static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75,
+ 0.80, 0.85, 0.90,
+ 0.95, 0.95, 0.95 };
+#define ERR_DIVISOR 96.0
+static double calc_correction_factor(double err_per_mb, int q) {
+ const double error_term = err_per_mb / ERR_DIVISOR;
+ const int index = q >> 5;
+ // Adjustment to power term based on qindex
+ const double power_term =
+ q_pow_term[index] +
+ (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0);
+ assert(error_term >= 0.0);
+ return fclamp(pow(error_term, power_term), 0.05, 5.0);
+}
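+
+// Worked example (illustrative numbers): for q == 48, index == 1 and
+// q % 32 == 16, so the interpolated power term is
+//   0.70 + ((0.75 - 0.70) * 16) / 32.0 == 0.725,
+// halfway between the table entries of the two neighboring qindex bands.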
+
+// Based on recent history, adjust expectations of bits per macroblock.
+static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+ double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0);
+ double rate_err_factor = 1.0;
+ const double adj_limit = AOMMAX(0.2, (double)(100 - rate_err_tol) / 200.0);
+ const double min_fac = 1.0 - adj_limit;
+ const double max_fac = 1.0 + adj_limit;
+
+ if (cpi->third_pass_ctx && cpi->third_pass_ctx->frame_info_count > 0) {
+ int64_t actual_bits = 0;
+ int64_t target_bits = 0;
+ double factor = 0.0;
+ int count = 0;
+ for (int i = 0; i < cpi->third_pass_ctx->frame_info_count; i++) {
+ actual_bits += cpi->third_pass_ctx->frame_info[i].actual_bits;
+ target_bits += cpi->third_pass_ctx->frame_info[i].bits_allocated;
+ factor += cpi->third_pass_ctx->frame_info[i].bpm_factor;
+ count++;
+ }
+
+ if (count == 0) {
+ factor = 1.0;
+ } else {
+ factor /= (double)count;
+ }
+
+ factor *= (double)actual_bits / DOUBLE_DIVIDE_CHECK((double)target_bits);
+
+ if ((twopass->bpm_factor <= 1 && factor < twopass->bpm_factor) ||
+ (twopass->bpm_factor >= 1 && factor > twopass->bpm_factor)) {
+ twopass->bpm_factor = factor;
+ twopass->bpm_factor =
+ AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+ }
+ }
+
+ int err_estimate = p_rc->rate_error_estimate;
+ int64_t bits_left = twopass->bits_left;
+ int64_t total_actual_bits = p_rc->total_actual_bits;
+ int64_t bits_off_target = p_rc->vbr_bits_off_target;
+ double rolling_arf_group_actual_bits =
+ (double)twopass->rolling_arf_group_actual_bits;
+ double rolling_arf_group_target_bits =
+ (double)twopass->rolling_arf_group_target_bits;
+
+#if CONFIG_FPMT_TEST
+ const int is_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 ? 1 : 0;
+ const int simulate_parallel_frame =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE
+ ? is_parallel_frame
+ : 0;
+ total_actual_bits = simulate_parallel_frame ? p_rc->temp_total_actual_bits
+ : p_rc->total_actual_bits;
+ bits_off_target = simulate_parallel_frame ? p_rc->temp_vbr_bits_off_target
+ : p_rc->vbr_bits_off_target;
+ bits_left =
+ simulate_parallel_frame ? p_rc->temp_bits_left : twopass->bits_left;
+ rolling_arf_group_target_bits =
+ (double)(simulate_parallel_frame
+ ? p_rc->temp_rolling_arf_group_target_bits
+ : twopass->rolling_arf_group_target_bits);
+ rolling_arf_group_actual_bits =
+ (double)(simulate_parallel_frame
+ ? p_rc->temp_rolling_arf_group_actual_bits
+ : twopass->rolling_arf_group_actual_bits);
+ err_estimate = simulate_parallel_frame ? p_rc->temp_rate_error_estimate
+ : p_rc->rate_error_estimate;
+#endif
+
+ if (p_rc->bits_off_target && total_actual_bits > 0) {
+ if (cpi->ppi->lap_enabled) {
+ rate_err_factor = rolling_arf_group_actual_bits /
+ DOUBLE_DIVIDE_CHECK(rolling_arf_group_target_bits);
+ } else {
+ rate_err_factor = 1.0 - ((double)(bits_off_target) /
+ AOMMAX(total_actual_bits, bits_left));
+ }
+
+ // The adjustment is damped if this is one pass with lookahead processing
+ // (as there are only ever a few frames of data) and for all but the first
+ // GOP in normal two-pass coding.
+ if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) {
+ rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac);
+ }
+ rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor));
+ }
+
+ // Is the rate control trending in the right direction? Only make
+ // an adjustment if things are getting worse.
+ if ((rate_err_factor < 1.0 && err_estimate >= 0) ||
+ (rate_err_factor > 1.0 && err_estimate <= 0)) {
+ twopass->bpm_factor *= rate_err_factor;
+ if (rate_err_tol >= 100) {
+ twopass->bpm_factor =
+ AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+ } else {
+ twopass->bpm_factor = AOMMAX(0.1, AOMMIN(10.0, twopass->bpm_factor));
+ }
+ }
+}
+
+static int qbpm_enumerator(int rate_err_tol) {
+ return 1200000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75);
+}
+
+// Similar to the find_qindex_by_rate() function in ratectrl.c, but includes
+// calculation of a correction_factor.
+static int find_qindex_by_rate_with_correction(
+ int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb,
+ double group_weight_factor, int rate_err_tol, int best_qindex,
+ int worst_qindex) {
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const double mid_factor = calc_correction_factor(error_per_mb, mid);
+ const double q = av1_convert_qindex_to_q(mid, bit_depth);
+ const int enumerator = qbpm_enumerator(rate_err_tol);
+ const int mid_bits_per_mb =
+ (int)((enumerator * mid_factor * group_weight_factor) / q);
+
+ if (mid_bits_per_mb > desired_bits_per_mb) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ return low;
+}
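+
+// Note: the binary search relies on the estimated bits per mb being
+// non-increasing in qindex (the enumerator / q term dominates), so the
+// loop converges on the lowest qindex whose estimate no longer exceeds
+// desired_bits_per_mb.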
+
+/*!\brief Choose a target maximum Q for a group of frames
+ *
+ * \ingroup rate_control
+ *
+ * This function is used to estimate a suitable maximum Q for a
+ * group of frames. Initially it is called to get a crude estimate
+ * for the whole clip. It is then called for each ARF/GF group to get
+ * a revised estimate for that group.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] av_frame_err The average per frame coded error score
+ * for frames making up this section/group.
+ * \param[in] inactive_zone Used to mask off / ignore part of the
+ * frame. The most common use case is where
+ * a wide format video (e.g. 16:9) is
+ * letter-boxed into a more square format.
+ * Here we want to ignore the bands at the
+ * top and bottom.
+ * \param[in] av_target_bandwidth The target bits per frame
+ *
+ * \return The maximum Q for frames in the group.
+ */
+static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err,
+ double inactive_zone,
+ int av_target_bandwidth) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ inactive_zone = fclamp(inactive_zone, 0.0, 0.9999);
+
+ if (av_target_bandwidth <= 0) {
+ return rc->worst_quality; // Highest value allowed
+ } else {
+ const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.mi_params.MBs;
+ const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+ const double av_err_per_mb = av_frame_err / (1.0 - inactive_zone);
+ const int target_norm_bits_per_mb =
+ (int)((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs;
+ int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
+
+ // Update bpm correction factor based on previous GOP rate error.
+ twopass_update_bpm_factor(cpi, rate_err_tol);
+
+ // Try and pick a max Q that will be high enough to encode the
+ // content at the given rate.
+ int q = find_qindex_by_rate_with_correction(
+ target_norm_bits_per_mb, cpi->common.seq_params->bit_depth,
+ av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol,
+ rc->best_quality, rc->worst_quality);
+
+ // Restriction on active max q for constrained quality mode.
+ if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level);
+ return q;
+ }
+}
+
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.01
+#define NCOUNT_FRAME_II_THRESH 5.0
+#define LOW_CODED_ERR_PER_MB 0.01
+
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It compares the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame to the boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const FIRSTPASS_STATS *frame) {
+ double sr_diff = (frame->sr_coded_error - frame->coded_error);
+ double sr_decay = 1.0;
+ double modified_pct_inter;
+ double modified_pcnt_intra;
+
+ modified_pct_inter = frame->pcnt_inter;
+ if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+ ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+ (double)NCOUNT_FRAME_II_THRESH)) {
+ modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+ }
+ modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+ if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
+ double sr_diff_part = ((sr_diff * 0.25) / frame->intra_error);
+ sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
+ }
+ return AOMMAX(sr_decay, DEFAULT_DECAY_LIMIT);
+}
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) {
+ const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
+ double sr_decay = get_sr_decay_rate(frame);
+ return AOMMIN(sr_decay, zero_motion_pct);
+}
+
+#define DEFAULT_ZM_FACTOR 0.5
+static double get_prediction_decay_rate(const FIRSTPASS_STATS *frame_stats) {
+ const double sr_decay_rate = get_sr_decay_rate(frame_stats);
+ double zero_motion_factor =
+ DEFAULT_ZM_FACTOR * (frame_stats->pcnt_inter - frame_stats->pcnt_motion);
+
+ // Clamp value to the range 0.0 to 1.0.
+ // This should happen anyway if input values are sensibly clamped, but is
+ // checked here just in case.
+ if (zero_motion_factor > 1.0)
+ zero_motion_factor = 1.0;
+ else if (zero_motion_factor < 0.0)
+ zero_motion_factor = 0.0;
+
+ return AOMMAX(zero_motion_factor,
+ (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
+}
+
+// Function to test for a condition where a complex transition is followed
+// by a static section, for example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(const FIRSTPASS_INFO *firstpass_info,
+ int next_stats_index,
+ const int min_gf_interval,
+ const int frame_interval,
+ const int still_interval,
+ const double loop_decay_rate,
+ const double last_decay_rate) {
+ // Break clause to detect very still sections after motion
+ // For example a static image after a fade or other transition
+ // instead of a clean scene cut.
+ if (frame_interval > min_gf_interval && loop_decay_rate >= 0.999 &&
+ last_decay_rate < 0.9) {
+ int stats_left =
+ av1_firstpass_info_future_count(firstpass_info, next_stats_index);
+ if (stats_left >= still_interval) {
+ int j;
+ // Look ahead a few frames to see if the static condition persists...
+ for (j = 0; j < still_interval; ++j) {
+ const FIRSTPASS_STATS *stats =
+ av1_firstpass_info_peek(firstpass_info, next_stats_index + j);
+ if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+ }
+ // Only if it does do we signal a transition to still.
+ return j == still_interval;
+ }
+ }
+ return 0;
+}
+
+// This function detects a flash through the high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static int detect_flash(const TWO_PASS *twopass,
+ const TWO_PASS_FRAME *twopass_frame, const int offset) {
+ const FIRSTPASS_STATS *const next_frame =
+ read_frame_stats(twopass, twopass_frame, offset);
+
+ // What we are looking for here is a situation where there is a
+ // brief break in prediction (such as a flash) but subsequent frames
+ // are reasonably well predicted by an earlier (pre flash) frame.
+ // The recovery after a flash is indicated by a high pcnt_second_ref
+ // compared to pcnt_inter.
+ return next_frame != NULL &&
+ next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
+ next_frame->pcnt_second_ref >= 0.5;
+}
+
+// Update the motion-related elements of the GF/ARF boost calculation.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+ GF_GROUP_STATS *gf_stats, double f_w,
+ double f_h) {
+ const double pct = stats->pcnt_motion;
+
+ // Accumulate Motion In/Out of frame stats.
+ gf_stats->this_frame_mv_in_out = stats->mv_in_out_count * pct;
+ gf_stats->mv_in_out_accumulator += gf_stats->this_frame_mv_in_out;
+ gf_stats->abs_mv_in_out_accumulator += fabs(gf_stats->this_frame_mv_in_out);
+
+ // Accumulate a measure of how uniform (or conversely how random) the motion
+ // field is (a ratio of abs(mv) / mv).
+ if (pct > 0.05) {
+ const double mvr_ratio =
+ fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+ const double mvc_ratio =
+ fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+
+ gf_stats->mv_ratio_accumulator +=
+ pct *
+ (mvr_ratio < stats->mvr_abs * f_h ? mvr_ratio : stats->mvr_abs * f_h);
+ gf_stats->mv_ratio_accumulator +=
+ pct *
+ (mvc_ratio < stats->mvc_abs * f_w ? mvc_ratio : stats->mvc_abs * f_w);
+ }
+}
+
+static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats,
+ const double mod_frame_err,
+ GF_GROUP_STATS *gf_stats) {
+ gf_stats->gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_stats->gf_group_raw_error += stats->coded_error;
+#endif
+ gf_stats->gf_group_skip_pct += stats->intra_skip_pct;
+ gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows;
+}
+
+static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats,
+ const int flash_detected,
+ const int frames_since_key,
+ const int cur_idx,
+ GF_GROUP_STATS *gf_stats, int f_w,
+ int f_h) {
+ accumulate_frame_motion_stats(stats, gf_stats, f_w, f_h);
+ // Sum up the metric values of the current gf group.
+ gf_stats->avg_sr_coded_error += stats->sr_coded_error;
+ gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref;
+ gf_stats->avg_new_mv_count += stats->new_mv_count;
+ gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy;
+ if (fabs(stats->raw_error_stdev) > 0.000001) {
+ gf_stats->non_zero_stdev_count++;
+ gf_stats->avg_raw_err_stdev += stats->raw_error_stdev;
+ }
+
+ // Accumulate the effect of prediction quality decay
+ if (!flash_detected) {
+ gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate;
+ gf_stats->loop_decay_rate = get_prediction_decay_rate(stats);
+
+ gf_stats->decay_accumulator =
+ gf_stats->decay_accumulator * gf_stats->loop_decay_rate;
+
+ // Monitor for static sections.
+ if ((frames_since_key + cur_idx - 1) > 1) {
+ gf_stats->zero_motion_accumulator = AOMMIN(
+ gf_stats->zero_motion_accumulator, get_zero_motion_factor(stats));
+ }
+ }
+}
+
+static void average_gf_stats(const int total_frame, GF_GROUP_STATS *gf_stats) {
+ if (total_frame) {
+ gf_stats->avg_sr_coded_error /= total_frame;
+ gf_stats->avg_pcnt_second_ref /= total_frame;
+ gf_stats->avg_new_mv_count /= total_frame;
+ gf_stats->avg_wavelet_energy /= total_frame;
+ }
+
+ if (gf_stats->non_zero_stdev_count)
+ gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count;
+}
+
+#define BOOST_FACTOR 12.5
+static double baseline_err_per_mb(const FRAME_INFO *frame_info) {
+ unsigned int screen_area = frame_info->frame_height * frame_info->frame_width;
+
+ // Use a different error per mb factor for calculating boost for
+ // different formats.
+ if (screen_area <= 640 * 360) {
+ return 500.0;
+ } else {
+ return 1000.0;
+ }
+}
+
+static double calc_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
+ const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame,
+ double this_frame_mv_in_out, double max_boost) {
+ double frame_boost;
+ const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
+ frame_info->bit_depth);
+ const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
+ const double active_area = calculate_active_area(frame_info, this_frame);
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
+ this_frame->intra_error * active_area) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+ frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+ // Increase boost for frames where new data is coming into the frame
+ // (e.g. zoom out). Slightly reduce boost if there is a net balance of
+ // motion out of the frame (zoom in). The range for this_frame_mv_in_out
+ // is -1.0 to +1.0.
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+ // In the extreme case the boost is halved.
+ else
+ frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+ return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
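+
+// Worked example (illustrative numbers): with this_frame_mv_in_out == +1.0
+// (all motion into the frame) the boost becomes frame_boost * 3.0, while
+// with -1.0 (all motion out of the frame) it becomes frame_boost * 0.5,
+// matching the "boost is halved" note above, before the max_boost cap is
+// applied.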
+
+static double calc_kf_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
+ const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame,
+ double *sr_accumulator, double max_boost) {
+ double frame_boost;
+ const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
+ frame_info->bit_depth);
+ const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00);
+ const double active_area = calculate_active_area(frame_info, this_frame);
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
+ this_frame->intra_error * active_area) /
+ DOUBLE_DIVIDE_CHECK(
+ (this_frame->coded_error + *sr_accumulator) * active_area);
+
+ // Update the accumulator for second ref error difference.
+ // This is intended to give an indication of how much the coded error is
+ // increasing over time.
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
+ *sr_accumulator = AOMMAX(0.0, *sr_accumulator);
+
+ // Q correction and scaling
+ // The 40.0 value here is an experimentally derived baseline minimum.
+ // This value is in line with the minimum per frame boost in the alt_ref
+ // boost calculation.
+ frame_boost = ((frame_boost + 40.0) * boost_q_correction);
+
+ return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static int get_projected_gfu_boost(const PRIMARY_RATE_CONTROL *p_rc,
+ int gfu_boost, int frames_to_project,
+ int num_stats_used_for_gfu_boost) {
+ /*
+ * If frames_to_project is equal to num_stats_used_for_gfu_boost,
+ * it means that gfu_boost was calculated over frames_to_project to
+ * begin with (i.e. all required stats were available), hence return
+ * the original boost.
+ */
+ if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost;
+
+ double min_boost_factor = sqrt(p_rc->baseline_gf_interval);
+ // Get the current tpl factor (number of frames = frames_to_project).
+ double tpl_factor = av1_get_gfu_boost_projection_factor(
+ min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project);
+ // Get the tpl factor when number of frames = num_stats_used_for_prior_boost.
+ double tpl_factor_num_stats = av1_get_gfu_boost_projection_factor(
+ min_boost_factor, MAX_GFUBOOST_FACTOR, num_stats_used_for_gfu_boost);
+ int projected_gfu_boost =
+ (int)rint((tpl_factor * gfu_boost) / tpl_factor_num_stats);
+ return projected_gfu_boost;
+}
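+
+// Example (illustrative numbers): if gfu_boost was accumulated from only 8
+// stats but the group spans 16 frames, the boost is rescaled by the ratio
+// of the projection factors for 16 and 8 frames, extrapolating the
+// observed per-frame boost to the frames whose stats were unavailable.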
+
+#define GF_MAX_BOOST 90.0
+#define GF_MIN_BOOST 50
+#define MIN_DECAY_FACTOR 0.01
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+ const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ int offset, int f_frames, int b_frames,
+ int *num_fpstats_used, int *num_fpstats_required,
+ int project_gfu_boost) {
+ int i;
+ GF_GROUP_STATS gf_stats;
+ init_gf_stats(&gf_stats);
+ double boost_score = (double)NORMAL_BOOST;
+ int arf_boost;
+ int flash_detected = 0;
+ if (num_fpstats_used) *num_fpstats_used = 0;
+
+ // Search forward from the proposed arf/next gf position.
+ for (i = 0; i < f_frames; ++i) {
+ const FIRSTPASS_STATS *this_frame =
+ read_frame_stats(twopass, twopass_frame, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion-related elements of the boost calculation.
+ accumulate_frame_motion_stats(this_frame, &gf_stats,
+ frame_info->frame_width,
+ frame_info->frame_height);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+ detect_flash(twopass, twopass_frame, i + offset + 1);
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
+ gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : gf_stats.decay_accumulator;
+ }
+
+ boost_score +=
+ gf_stats.decay_accumulator *
+ calc_frame_boost(p_rc, frame_info, this_frame,
+ gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+ if (num_fpstats_used) (*num_fpstats_used)++;
+ }
+
+ arf_boost = (int)boost_score;
+
+ // Reset for backward looking loop.
+ boost_score = 0.0;
+ init_gf_stats(&gf_stats);
+ // Search backward towards last gf position.
+ for (i = -1; i >= -b_frames; --i) {
+ const FIRSTPASS_STATS *this_frame =
+ read_frame_stats(twopass, twopass_frame, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion-related elements of the boost calculation.
+ accumulate_frame_motion_stats(this_frame, &gf_stats,
+ frame_info->frame_width,
+ frame_info->frame_height);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+ detect_flash(twopass, twopass_frame, i + offset + 1);
+
+ // Cumulative effect of prediction quality decay.
+ if (!flash_detected) {
+ gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
+ gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : gf_stats.decay_accumulator;
+ }
+
+ boost_score +=
+ gf_stats.decay_accumulator *
+ calc_frame_boost(p_rc, frame_info, this_frame,
+ gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+ if (num_fpstats_used) (*num_fpstats_used)++;
+ }
+ arf_boost += (int)boost_score;
+
+ if (project_gfu_boost) {
+ assert(num_fpstats_required != NULL);
+ assert(num_fpstats_used != NULL);
+ *num_fpstats_required = f_frames + b_frames;
+ arf_boost = get_projected_gfu_boost(p_rc, arf_boost, *num_fpstats_required,
+ *num_fpstats_used);
+ }
+
+ if (arf_boost < ((b_frames + f_frames) * GF_MIN_BOOST))
+ arf_boost = ((b_frames + f_frames) * GF_MIN_BOOST);
+
+ return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+ const FIRSTPASS_STATS *end,
+ int section_length) {
+ const FIRSTPASS_STATS *s = begin;
+ double intra_error = 0.0;
+ double coded_error = 0.0;
+ int i = 0;
+
+ while (s < end && i < section_length) {
+ intra_error += s->intra_error;
+ coded_error += s->coded_error;
+ ++s;
+ ++i;
+ }
+
+ return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
+}
+
+/*!\brief Calculates the bit target for this GF/ARF group
+ *
+ * \ingroup rate_control
+ *
+ * Calculates the total bits to allocate in this GF/ARF group.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] gf_group_err Cumulative coded error score for the
+ * frames making up this group.
+ *
+ * \return The target total number of bits for this GF/ARF group.
+ */
+static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
+ double gf_group_err) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const TWO_PASS *const twopass = &cpi->ppi->twopass;
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+ int64_t total_group_bits;
+
+ // Calculate the bits to be allocated to the group as a whole.
+ if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+ total_group_bits = (int64_t)(twopass->kf_group_bits *
+ (gf_group_err / twopass->kf_group_error_left));
+ } else {
+ total_group_bits = 0;
+ }
+
+ // Clamp odd edge cases.
+ total_group_bits = (total_group_bits < 0) ? 0
+ : (total_group_bits > twopass->kf_group_bits)
+ ? twopass->kf_group_bits
+ : total_group_bits;
+
+ // Clip based on user supplied data rate variability limit.
+ if (total_group_bits > (int64_t)max_bits * p_rc->baseline_gf_interval)
+ total_group_bits = (int64_t)max_bits * p_rc->baseline_gf_interval;
+
+ return total_group_bits;
+}
+
+// Calculate the number of bits to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count, int boost,
+ int64_t total_group_bits) {
+ int allocation_chunks;
+
+ // return 0 for invalid inputs (could arise e.g. through rounding errors)
+ if (!boost || (total_group_bits <= 0)) return 0;
+
+ if (frame_count <= 0) return (int)(AOMMIN(total_group_bits, INT_MAX));
+
+ allocation_chunks = (frame_count * 100) + boost;
+
+ // Prevent overflow.
+ if (boost > 1023) {
+ int divisor = boost >> 10;
+ boost /= divisor;
+ allocation_chunks /= divisor;
+ }
+
+ // Calculate the number of extra bits for use in the boosted frame or frames.
+ return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
+ 0);
+}
+
+// Calculate the boost factor based on the number of bits assigned, i.e. the
+// inverse of calculate_boost_bits().
+static int calculate_boost_factor(int frame_count, int bits,
+ int64_t total_group_bits) {
+ return (int)(100.0 * frame_count * bits / (total_group_bits - bits));
+}
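+
+// Derivation: calculate_boost_bits() gives
+//   bits = boost * total_group_bits / (frame_count * 100 + boost),
+// and solving for boost yields
+//   boost = 100 * frame_count * bits / (total_group_bits - bits),
+// which is the expression used above.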
+
+// Reduce the number of bits assigned to keyframe or arf if necessary, to
+// prevent bitrate spikes that may break level constraints.
+// frame_type: 0: keyframe; 1: arf.
+static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi,
+ RATE_CONTROL *const rc,
+ int bits_assigned,
+ int64_t group_bits,
+ int frame_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const int temporal_layer_id = cm->temporal_layer_id;
+ const int spatial_layer_id = cm->spatial_layer_id;
+ for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1;
+ ++index) {
+ if (!is_in_operating_point(seq_params->operating_point_idc[index],
+ temporal_layer_id, spatial_layer_id)) {
+ continue;
+ }
+
+ const AV1_LEVEL target_level =
+ cpi->ppi->level_params.target_seq_level_idx[index];
+ if (target_level >= SEQ_LEVELS) continue;
+
+ assert(is_valid_seq_level_idx(target_level));
+
+ const double level_bitrate_limit = av1_get_max_bitrate_for_level(
+ target_level, seq_params->tier[0], seq_params->profile);
+ const int target_bits_per_frame =
+ (int)(level_bitrate_limit / cpi->framerate);
+ if (frame_type == 0) {
+ // Maximum bits for keyframe is 8 times the target_bits_per_frame.
+ const int level_enforced_max_kf_bits = target_bits_per_frame * 8;
+ if (bits_assigned > level_enforced_max_kf_bits) {
+ const int frames = rc->frames_to_key - 1;
+ p_rc->kf_boost = calculate_boost_factor(
+ frames, level_enforced_max_kf_bits, group_bits);
+ bits_assigned =
+ calculate_boost_bits(frames, p_rc->kf_boost, group_bits);
+ }
+ } else if (frame_type == 1) {
+ // Maximum bits for arf is 4 times the target_bits_per_frame.
+ const int level_enforced_max_arf_bits = target_bits_per_frame * 4;
+ if (bits_assigned > level_enforced_max_arf_bits) {
+ p_rc->gfu_boost =
+ calculate_boost_factor(p_rc->baseline_gf_interval,
+ level_enforced_max_arf_bits, group_bits);
+ bits_assigned = calculate_boost_bits(p_rc->baseline_gf_interval,
+ p_rc->gfu_boost, group_bits);
+ }
+ } else {
+ assert(0);
+ }
+ }
+
+ return bits_assigned;
+}
+
+// Allocate bits to each frame in a GF / ARF group
+double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60,
+ 0.60, 1.0, 1.0 };
+static void allocate_gf_group_bits(GF_GROUP *gf_group,
+ PRIMARY_RATE_CONTROL *const p_rc,
+ RATE_CONTROL *const rc,
+ int64_t gf_group_bits, int gf_arf_bits,
+ int key_frame, int use_arf) {
+ int64_t total_group_bits = gf_group_bits;
+ int base_frame_bits;
+ const int gf_group_size = gf_group->size;
+ int layer_frames[MAX_ARF_LAYERS + 1] = { 0 };
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ // === [frame_index == 0] ===
+ int frame_index = !!key_frame;
+
+ // Subtract the extra bits set aside for ARF frames from the Group Total
+ if (use_arf) total_group_bits -= gf_arf_bits;
+
+ int num_frames =
+ AOMMAX(1, p_rc->baseline_gf_interval - (rc->frames_since_key == 0));
+ base_frame_bits = (int)(total_group_bits / num_frames);
+
+ // Check the number of frames in each layer in case we have a
+ // non-standard group length.
+ int max_arf_layer = gf_group->max_layer_depth - 1;
+ for (int idx = frame_index; idx < gf_group_size; ++idx) {
+ if ((gf_group->update_type[idx] == ARF_UPDATE) ||
+ (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) {
+ layer_frames[gf_group->layer_depth[idx]]++;
+ }
+ }
+
+ // Allocate extra bits to each ARF layer
+ int i;
+ int layer_extra_bits[MAX_ARF_LAYERS + 1] = { 0 };
+ assert(max_arf_layer <= MAX_ARF_LAYERS);
+ for (i = 1; i <= max_arf_layer; ++i) {
+ double fraction = (i == max_arf_layer) ? 1.0 : layer_fraction[i];
+ layer_extra_bits[i] =
+ (int)((gf_arf_bits * fraction) / AOMMAX(1, layer_frames[i]));
+ gf_arf_bits -= (int)(gf_arf_bits * fraction);
+ }
+
+ // Now combine ARF layer and baseline bits to give total bits for each frame.
+ int arf_extra_bits;
+ for (int idx = frame_index; idx < gf_group_size; ++idx) {
+ switch (gf_group->update_type[idx]) {
+ case ARF_UPDATE:
+ case INTNL_ARF_UPDATE:
+ arf_extra_bits = layer_extra_bits[gf_group->layer_depth[idx]];
+ gf_group->bit_allocation[idx] = base_frame_bits + arf_extra_bits;
+ break;
+ case INTNL_OVERLAY_UPDATE:
+ case OVERLAY_UPDATE: gf_group->bit_allocation[idx] = 0; break;
+ default: gf_group->bit_allocation[idx] = base_frame_bits; break;
+ }
+ }
+
+ // Set the frame following the current GOP to a 0 bit allocation. For ARF
+ // groups, this next frame will be an overlay frame, which is the first
+ // frame of the next GOP. For a GF group, the next GOP will overwrite the
+ // rate allocation. Setting this frame to use 0 bits (out of the current
+ // GOP budget) simplifies the logic in reference frame management.
+ if (gf_group_size < MAX_STATIC_GF_GROUP_LENGTH)
+ gf_group->bit_allocation[gf_group_size] = 0;
+}
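+
+// Worked example (illustrative numbers): with max_arf_layer == 2 and
+// gf_arf_bits == 1000, layer 1 frames split 1000 * 0.70 == 700 extra bits
+// among themselves and the remaining 300 all go to layer 2 (the last layer
+// always takes fraction 1.0), on top of the base_frame_bits that every
+// coded frame receives.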
+
+// Returns true if the KF group and GF group are both almost completely static.
+static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion,
+ int is_lap_enabled) {
+ if (is_lap_enabled) {
+ /*
+ * When LAP is enabled, kf_zero_motion is not reliable, so use a strict
+ * constraint on gf_zero_motion.
+ */
+ return (gf_zero_motion >= 0.999);
+ } else {
+ return (gf_zero_motion >= 0.995) &&
+ (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+ }
+}
+
+#define ARF_ABS_ZOOM_THRESH 4.4
+static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start,
+ int flash_detected, int active_max_gf_interval,
+ int active_min_gf_interval,
+ GF_GROUP_STATS *gf_stats) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ AV1_COMMON *const cm = &cpi->common;
+ // Motion breakout threshold for loop below depends on image size.
+ const double mv_ratio_accumulator_thresh = (cm->height + cm->width) / 4.0;
+
+ if (!flash_detected) {
+ // Break clause to detect very still sections after motion. For example,
+ // a static image after a fade or other transition.
+
+ // TODO(angiebird): This is a temporary change, we will avoid using
+ // twopass_frame.stats_in in the follow-up CL
+ int index = (int)(cpi->twopass_frame.stats_in -
+ twopass->stats_buf_ctx->stats_in_start);
+ if (detect_transition_to_still(&twopass->firstpass_info, index,
+ rc->min_gf_interval, frame_index - cur_start,
+ 5, gf_stats->loop_decay_rate,
+ gf_stats->last_loop_decay_rate)) {
+ return 1;
+ }
+ }
+
+ // Some conditions to breakout after min interval.
+ if (frame_index - cur_start >= active_min_gf_interval &&
+ // If possible, don't break very close to a kf.
+ (rc->frames_to_key - frame_index >= rc->min_gf_interval) &&
+ ((frame_index - cur_start) & 0x01) && !flash_detected &&
+ (gf_stats->mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
+ gf_stats->abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
+ return 1;
+ }
+
+ // If almost totally static, we will not use the max GF length later,
+ // so we can continue for more frames.
+ if (((frame_index - cur_start) >= active_max_gf_interval + 1) &&
+ !is_almost_static(gf_stats->zero_motion_accumulator,
+ twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) {
+ return 1;
+ }
+ return 0;
+}
+
+static int is_shorter_gf_interval_better(
+ AV1_COMP *cpi, const EncodeFrameParams *frame_params) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method;
+ int shorten_gf_interval;
+
+ av1_tpl_preload_rc_estimate(cpi, frame_params);
+
+ if (gop_length_decision_method == 2) {
+ // GF group length is decided based on GF boost and tpl stats of ARFs from
+ // base layer, (base+1) layer.
+ shorten_gf_interval =
+ (p_rc->gfu_boost <
+ p_rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) &&
+ !av1_tpl_setup_stats(cpi, 3, frame_params);
+ } else {
+ int do_complete_tpl = 1;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ int is_temporal_filter_enabled =
+ (rc->frames_since_key > 0 && gf_group->arf_index > -1);
+
+ if (gop_length_decision_method == 1) {
+ // Check if tpl stats of ARFs from base layer, (base+1) layer,
+ // (base+2) layer can decide the GF group length.
+ int gop_length_eval = av1_tpl_setup_stats(cpi, 2, frame_params);
+
+ if (gop_length_eval != 2) {
+ do_complete_tpl = 0;
+ shorten_gf_interval = !gop_length_eval;
+ }
+ }
+
+ if (do_complete_tpl) {
+ // Decide GF group length based on complete tpl stats.
+ shorten_gf_interval = !av1_tpl_setup_stats(cpi, 1, frame_params);
+ // TPL stats are reused when the ARF is temporally filtered and the GF
+ // interval is not shortened.
+ if (is_temporal_filter_enabled && !shorten_gf_interval) {
+ cpi->skip_tpl_setup_stats = 1;
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ assert(cpi->gf_frame_index == 0);
+ av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+ gf_group,
+ cpi->common.seq_params->bit_depth);
+#endif // CONFIG_BITRATE_ACCURACY
+ }
+ }
+ }
+ return shorten_gf_interval;
+}
+
+#define MIN_SHRINK_LEN 6 // the minimum length of gf if we are shrinking
+#define SMOOTH_FILT_LEN 7
+#define HALF_FILT_LEN (SMOOTH_FILT_LEN / 2)
+#define WINDOW_SIZE 7
+#define HALF_WIN (WINDOW_SIZE / 2)
+// A 7-tap Gaussian smoothing filter.
+const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383,
+ 0.242, 0.061, 0.006 };
+
+// Smooth-filter intra_error and coded_error in the firstpass stats.
+// If stats[i].is_flash == 1, the ith element should not be used in the
+// filtering.
+static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx,
+ int last_idx, double *filt_intra_err,
+ double *filt_coded_err) {
+ int i, j;
+ for (i = start_idx; i <= last_idx; i++) {
+ double total_wt = 0;
+ for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+ if (stats[idx].is_flash) continue;
+
+ filt_intra_err[i] +=
+ smooth_filt[j + HALF_FILT_LEN] * stats[idx].intra_error;
+ total_wt += smooth_filt[j + HALF_FILT_LEN];
+ }
+ if (total_wt > 0.01) {
+ filt_intra_err[i] /= total_wt;
+ } else {
+ filt_intra_err[i] = stats[i].intra_error;
+ }
+ }
+ for (i = start_idx; i <= last_idx; i++) {
+ double total_wt = 0;
+ for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+ // Coded error involves idx and idx - 1.
+ if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
+
+ filt_coded_err[i] +=
+ smooth_filt[j + HALF_FILT_LEN] * stats[idx].coded_error;
+ total_wt += smooth_filt[j + HALF_FILT_LEN];
+ }
+ if (total_wt > 0.01) {
+ filt_coded_err[i] /= total_wt;
+ } else {
+ filt_coded_err[i] = stats[i].coded_error;
+ }
+ }
+}
+
+// Calculate gradient
+static void get_gradient(const double *values, int start, int last,
+ double *grad) {
+ if (start == last) {
+ grad[start] = 0;
+ return;
+ }
+ for (int i = start; i <= last; i++) {
+ int prev = AOMMAX(i - 1, start);
+ int next = AOMMIN(i + 1, last);
+ grad[i] = (values[next] - values[prev]) / (next - prev);
+ }
+}
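+
+// In the interior this is a central difference, e.g. for start < i < last
+//   grad[i] = (values[i + 1] - values[i - 1]) / 2,
+// degrading to a one-sided difference at the two endpoints.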
+
+static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start,
+ int first, int last) {
+ // Identify unstable areas caused by scenecuts.
+ // Find the maximum coded-error ratio within the preceding and following
+ // neighborhoods of each frame. If only one frame yields a huge coded
+ // error, it is likely a scenecut.
+ double this_ratio, max_prev_ratio, max_next_ratio, max_prev_coded,
+ max_next_coded;
+
+ if (last - first == 0) return -1;
+
+ for (int i = first; i <= last; i++) {
+ if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash))
+ continue;
+ double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01);
+ this_ratio = stats_start[i].coded_error / temp_intra;
+ // Find the max ratio in the preceding neighborhood.
+ max_prev_ratio = 0;
+ max_prev_coded = 0;
+ for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) {
+ if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+ continue;
+ temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+ double temp_ratio = stats_start[j].coded_error / temp_intra;
+ if (temp_ratio > max_prev_ratio) {
+ max_prev_ratio = temp_ratio;
+ }
+ if (stats_start[j].coded_error > max_prev_coded) {
+ max_prev_coded = stats_start[j].coded_error;
+ }
+ }
+ // Find the max ratio in the following neighborhood.
+ max_next_ratio = 0;
+ max_next_coded = 0;
+ for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) {
+ if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+ continue;
+ temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+ double temp_ratio = stats_start[j].coded_error / temp_intra;
+ if (temp_ratio > max_next_ratio) {
+ max_next_ratio = temp_ratio;
+ }
+ if (stats_start[j].coded_error > max_next_coded) {
+ max_next_coded = stats_start[j].coded_error;
+ }
+ }
+
+ if (max_prev_ratio < 0.001 && max_next_ratio < 0.001) {
+ // the ratios are very small, only check a small fixed threshold
+ if (this_ratio < 0.02) continue;
+ } else {
+ // check if this frame has a larger ratio than the neighborhood
+ double max_sr = stats_start[i].sr_coded_error;
+ if (i < last) max_sr = AOMMAX(max_sr, stats_start[i + 1].sr_coded_error);
+ double max_sr_fr_ratio =
+ max_sr / AOMMAX(stats_start[i].coded_error, 0.01);
+
+ if (max_sr_fr_ratio > 1.2) continue;
+ if (this_ratio < 2 * AOMMAX(max_prev_ratio, max_next_ratio) &&
+ stats_start[i].coded_error <
+ 2 * AOMMAX(max_prev_coded, max_next_coded)) {
+ continue;
+ }
+ }
+ return i;
+ }
+ return -1;
+}
+
+// Remove the region with index next_region.
+// The merge parameter: 0: merge with previous; 1: merge with next; 2:
+// merge with both, taking the type from the previous region if possible.
+// After removal, next_region will be the index of the next region.
+static void remove_region(int merge, REGIONS *regions, int *num_regions,
+ int *next_region) {
+ int k = *next_region;
+ assert(k < *num_regions);
+ if (*num_regions == 1) {
+ *num_regions = 0;
+ return;
+ }
+ if (k == 0) {
+ merge = 1;
+ } else if (k == *num_regions - 1) {
+ merge = 0;
+ }
+ int num_merge = (merge == 2) ? 2 : 1;
+ switch (merge) {
+ case 0:
+ regions[k - 1].last = regions[k].last;
+ *next_region = k;
+ break;
+ case 1:
+ regions[k + 1].start = regions[k].start;
+ *next_region = k + 1;
+ break;
+ case 2:
+ regions[k - 1].last = regions[k + 1].last;
+ *next_region = k;
+ break;
+ default: assert(0);
+ }
+ *num_regions -= num_merge;
+ for (k = *next_region - (merge == 1); k < *num_regions; k++) {
+ regions[k] = regions[k + num_merge];
+ }
+}
+
+// Insert a region at cur_region_idx. Both start and last should lie inside
+// the current region. After insertion, cur_region_idx will point to the
+// last region split from the original region.
+static void insert_region(int start, int last, REGION_TYPES type,
+ REGIONS *regions, int *num_regions,
+ int *cur_region_idx) {
+ int k = *cur_region_idx;
+ REGION_TYPES this_region_type = regions[k].type;
+ int this_region_last = regions[k].last;
+ int num_add = (start != regions[k].start) + (last != regions[k].last);
+ // move the following regions further to the back
+ for (int r = *num_regions - 1; r > k; r--) {
+ regions[r + num_add] = regions[r];
+ }
+ *num_regions += num_add;
+ if (start > regions[k].start) {
+ regions[k].last = start - 1;
+ k++;
+ regions[k].start = start;
+ }
+ regions[k].type = type;
+ if (last < this_region_last) {
+ regions[k].last = last;
+ k++;
+ regions[k].start = last + 1;
+ regions[k].last = this_region_last;
+ regions[k].type = this_region_type;
+ } else {
+ regions[k].last = this_region_last;
+ }
+ *cur_region_idx = k;
+}
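+
+// Example (illustrative): inserting [5, 7] as BLENDING_REGION into a
+// region spanning [3, 10] splits it into [3, 4] (old type), [5, 7]
+// (BLENDING_REGION) and [8, 10] (old type); cur_region_idx then points at
+// the [8, 10] piece and num_regions grows by 2.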
+
+// Get the average of stats inside a region.
+static void analyze_region(const FIRSTPASS_STATS *stats, int k,
+ REGIONS *regions) {
+ int i;
+ // Reset all running averages, including avg_noise_var, which is also
+ // accumulated below and would otherwise carry stale values across
+ // repeated calls for the same region.
+ regions[k].avg_cor_coeff = 0;
+ regions[k].avg_sr_fr_ratio = 0;
+ regions[k].avg_intra_err = 0;
+ regions[k].avg_coded_err = 0;
+ regions[k].avg_noise_var = 0;
+
+ int check_first_sr = (k != 0);
+
+ for (i = regions[k].start; i <= regions[k].last; i++) {
+ if (i > regions[k].start || check_first_sr) {
+ double num_frames =
+ (double)(regions[k].last - regions[k].start + check_first_sr);
+ double max_coded_error =
+ AOMMAX(stats[i].coded_error, stats[i - 1].coded_error);
+ double this_ratio =
+ stats[i].sr_coded_error / AOMMAX(max_coded_error, 0.001);
+ regions[k].avg_sr_fr_ratio += this_ratio / num_frames;
+ }
+
+ regions[k].avg_intra_err +=
+ stats[i].intra_error / (double)(regions[k].last - regions[k].start + 1);
+ regions[k].avg_coded_err +=
+ stats[i].coded_error / (double)(regions[k].last - regions[k].start + 1);
+
+ regions[k].avg_cor_coeff +=
+ AOMMAX(stats[i].cor_coeff, 0.001) /
+ (double)(regions[k].last - regions[k].start + 1);
+ regions[k].avg_noise_var +=
+ AOMMAX(stats[i].noise_var, 0.001) /
+ (double)(regions[k].last - regions[k].start + 1);
+ }
+}
+
+// Calculate the regions stats of every region.
+static void get_region_stats(const FIRSTPASS_STATS *stats, REGIONS *regions,
+ int num_regions) {
+ for (int k = 0; k < num_regions; k++) {
+ analyze_region(stats, k, regions);
+ }
+}
+
+// Find tentative stable regions
+static int find_stable_regions(const FIRSTPASS_STATS *stats,
+ const double *grad_coded, int this_start,
+ int this_last, REGIONS *regions) {
+ int i, j, k = 0;
+ regions[k].start = this_start;
+ for (i = this_start; i <= this_last; i++) {
+ // Check mean and variance of stats in a window
+ double mean_intra = 0.001, var_intra = 0.001;
+ double mean_coded = 0.001, var_coded = 0.001;
+ int count = 0;
+ for (j = -HALF_WIN; j <= HALF_WIN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, this_start), this_last);
+ if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
+ mean_intra += stats[idx].intra_error;
+ var_intra += stats[idx].intra_error * stats[idx].intra_error;
+ mean_coded += stats[idx].coded_error;
+ var_coded += stats[idx].coded_error * stats[idx].coded_error;
+ count++;
+ }
+
+ REGION_TYPES cur_type;
+ if (count > 0) {
+ mean_intra /= (double)count;
+ var_intra /= (double)count;
+ mean_coded /= (double)count;
+ var_coded /= (double)count;
+ int is_intra_stable = (var_intra / (mean_intra * mean_intra) < 1.03);
+ int is_coded_stable = (var_coded / (mean_coded * mean_coded) < 1.04 &&
+ fabs(grad_coded[i]) / mean_coded < 0.05) ||
+ mean_coded / mean_intra < 0.05;
+ int is_coded_small = mean_coded < 0.5 * mean_intra;
+ cur_type = (is_intra_stable && is_coded_stable && is_coded_small)
+ ? STABLE_REGION
+ : HIGH_VAR_REGION;
+ } else {
+ cur_type = HIGH_VAR_REGION;
+ }
+
+ // mark a new region if type changes
+ if (i == regions[k].start) {
+ // first frame in the region
+ regions[k].type = cur_type;
+ } else if (cur_type != regions[k].type) {
+ // Append a new region
+ regions[k].last = i - 1;
+ regions[k + 1].start = i;
+ regions[k + 1].type = cur_type;
+ k++;
+ }
+ }
+ regions[k].last = this_last;
+ return k + 1;
+}
+
+// Clean up regions that should be removed or merged.
+static void cleanup_regions(REGIONS *regions, int *num_regions) {
+ int k = 0;
+ while (k < *num_regions) {
+ if ((k > 0 && regions[k - 1].type == regions[k].type &&
+ regions[k].type != SCENECUT_REGION) ||
+ regions[k].last < regions[k].start) {
+ remove_region(0, regions, num_regions, &k);
+ } else {
+ k++;
+ }
+ }
+}
+
+// Remove regions of the given type that are shorter than length,
+// merging each with its neighboring regions.
+static void remove_short_regions(REGIONS *regions, int *num_regions,
+ REGION_TYPES type, int length) {
+ int k = 0;
+ while (k < *num_regions && (*num_regions) > 1) {
+ if ((regions[k].last - regions[k].start + 1 < length &&
+ regions[k].type == type)) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ } else {
+ k++;
+ }
+ }
+ cleanup_regions(regions, num_regions);
+}
+
+static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats,
+ REGIONS *regions, int *num_regions) {
+ int i, j, k;
+ // Remove regions that are too short. Likely noise.
+ remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+
+ get_region_stats(stats, regions, *num_regions);
+
+ // Adjust region boundaries. The thresholds are empirically obtained, but
+ // overall the performance is not very sensitive to small changes to them.
+ for (k = 0; k < *num_regions; k++) {
+ if (regions[k].type == STABLE_REGION) continue;
+ if (k > 0) {
+ // Adjust previous boundary.
+ // First find the average intra/coded error in the previous
+ // neighborhood.
+ double avg_intra_err = 0;
+ const int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1,
+ regions[k - 1].start + 1);
+ const int lasti = regions[k - 1].last;
+ int counti = 0;
+ for (i = starti; i <= lasti; i++) {
+ avg_intra_err += stats[i].intra_error;
+ counti++;
+ }
+ if (counti > 0) {
+ avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+ int count_coded = 0, count_grad = 0;
+ for (j = lasti + 1; j <= regions[k].last; j++) {
+ const int intra_close =
+ fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+ const int coded_small = stats[j].coded_error / avg_intra_err < 0.1;
+ const int coeff_close = stats[j].cor_coeff > 0.995;
+ if (!coeff_close || !coded_small) count_coded--;
+ if (intra_close && count_coded >= 0 && count_grad >= 0) {
+ // this frame probably belongs to the previous stable region
+ regions[k - 1].last = j;
+ regions[k].start = j + 1;
+ } else {
+ break;
+ }
+ }
+ }
+ } // if k > 0
+ if (k < *num_regions - 1) {
+ // Adjust next boundary.
+ // First find the average intra/coded error in the next neighborhood.
+ double avg_intra_err = 0;
+ const int starti = regions[k + 1].start;
+ const int lasti = AOMMIN(regions[k + 1].last - 1,
+ regions[k + 1].start + WINDOW_SIZE - 1);
+ int counti = 0;
+ for (i = starti; i <= lasti; i++) {
+ avg_intra_err += stats[i].intra_error;
+ counti++;
+ }
+ if (counti > 0) {
+ avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+ // At the boundary, coded error is large, but the frame is still stable.
+ int count_coded = 1, count_grad = 1;
+ for (j = starti - 1; j >= regions[k].start; j--) {
+ const int intra_close =
+ fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+ const int coded_small =
+ stats[j + 1].coded_error / avg_intra_err < 0.1;
+ const int coeff_close = stats[j].cor_coeff > 0.995;
+ if (!coeff_close || !coded_small) count_coded--;
+ if (intra_close && count_coded >= 0 && count_grad >= 0) {
+ // this frame probably belongs to the next stable region
+ regions[k + 1].start = j;
+ regions[k].last = j - 1;
+ } else {
+ break;
+ }
+ }
+ }
+ } // if k < *num_regions - 1
+ } // end of loop over all regions
+
+ cleanup_regions(regions, num_regions);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+ get_region_stats(stats, regions, *num_regions);
+
+ // If a stable region has higher error than its neighboring high-variance
+ // regions, or a lower average correlation, it should be merged with them.
+ k = 0;
+ while (k < *num_regions && (*num_regions) > 1) {
+ if (regions[k].type == STABLE_REGION &&
+ (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
+ ((k > 0 && // previous regions
+ (regions[k].avg_coded_err > regions[k - 1].avg_coded_err * 1.01 ||
+ regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff * 0.999)) &&
+ (k < *num_regions - 1 && // next region
+ (regions[k].avg_coded_err > regions[k + 1].avg_coded_err * 1.01 ||
+ regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff * 0.999)))) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ } else if (regions[k].type == HIGH_VAR_REGION &&
+ (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
+ ((k > 0 && // previous regions
+ (regions[k].avg_coded_err <
+ regions[k - 1].avg_coded_err * 0.99 ||
+ regions[k].avg_cor_coeff >
+ regions[k - 1].avg_cor_coeff * 1.001)) &&
+ (k < *num_regions - 1 && // next region
+ (regions[k].avg_coded_err <
+ regions[k + 1].avg_coded_err * 0.99 ||
+ regions[k].avg_cor_coeff >
+ regions[k + 1].avg_cor_coeff * 1.001)))) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ } else {
+ k++;
+ }
+ }
+
+ remove_short_regions(regions, num_regions, STABLE_REGION, WINDOW_SIZE);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+}
+
+// Identify blending regions.
+static void find_blending_regions(const FIRSTPASS_STATS *stats,
+ REGIONS *regions, int *num_regions) {
+ int i, k = 0;
+ // Blending regions exhibit large content change and therefore a large,
+ // consistent change in intra error.
+ int count_stable = 0;
+ while (k < *num_regions) {
+ if (regions[k].type == STABLE_REGION) {
+ k++;
+ count_stable++;
+ continue;
+ }
+ int dir = 0;
+ int start = 0, last;
+ for (i = regions[k].start; i <= regions[k].last; i++) {
+ // First mark the regions that have a consistent, large change in intra error.
+ if (k == 0 && i == regions[k].start) continue;
+ if (stats[i].is_flash || (i > 0 && stats[i - 1].is_flash)) continue;
+ double grad = stats[i].intra_error - stats[i - 1].intra_error;
+ int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05;
+ int this_dir = 0;
+ if (large_change) {
+ this_dir = (grad > 0) ? 1 : -1;
+ }
+ // the current trend continues
+ if (dir == this_dir) continue;
+ if (dir != 0) {
+ // Mark the end of a new large change group and add it
+ last = i - 1;
+ insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+ }
+ dir = this_dir;
+ if (k == 0 && i == regions[k].start + 1) {
+ start = i - 1;
+ } else {
+ start = i;
+ }
+ }
+ if (dir != 0) {
+ last = regions[k].last;
+ insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+ }
+ k++;
+ }
+
+ // If the blending region has very low correlation, mark it as high
+ // variance since we probably cannot benefit from it anyway.
+ get_region_stats(stats, regions, *num_regions);
+ for (k = 0; k < *num_regions; k++) {
+ if (regions[k].type != BLENDING_REGION) continue;
+ if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 ||
+ count_stable == 0)
+ regions[k].type = HIGH_VAR_REGION;
+ }
+ get_region_stats(stats, regions, *num_regions);
+
+ // It is possible for blending to result in a "dip" in intra error (first
+ // decrease then increase). Therefore we need to find the dip and combine the
+ // two regions.
+ k = 1;
+ while (k < *num_regions) {
+ if (k < *num_regions - 1 && regions[k].type == HIGH_VAR_REGION) {
+ // Check if this short high variance region is actually in the middle of
+ // a blending region.
+ if (regions[k - 1].type == BLENDING_REGION &&
+ regions[k + 1].type == BLENDING_REGION &&
+ regions[k].last - regions[k].start < 3) {
+ int prev_dir = (stats[regions[k - 1].last].intra_error -
+ stats[regions[k - 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ int next_dir = (stats[regions[k + 1].last].intra_error -
+ stats[regions[k + 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ if (prev_dir < 0 && next_dir > 0) {
+ // This is possibly the middle region of a blend. Check the ratios.
+ double ratio_thres = AOMMIN(regions[k - 1].avg_sr_fr_ratio,
+ regions[k + 1].avg_sr_fr_ratio) *
+ 0.95;
+ if (regions[k].avg_sr_fr_ratio > ratio_thres) {
+ regions[k].type = BLENDING_REGION;
+ remove_region(2, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ continue;
+ }
+ }
+ }
+ }
+ // Check if we have a pair of consecutive blending regions.
+ if (regions[k - 1].type == BLENDING_REGION &&
+ regions[k].type == BLENDING_REGION) {
+ int prev_dir = (stats[regions[k - 1].last].intra_error -
+ stats[regions[k - 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ int next_dir = (stats[regions[k].last].intra_error -
+ stats[regions[k].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+
+ // if both are too short, no need to check
+ int total_length = regions[k].last - regions[k - 1].start + 1;
+ if (total_length < 4) {
+ regions[k - 1].type = HIGH_VAR_REGION;
+ k++;
+ continue;
+ }
+
+ int to_merge = 0;
+ if (prev_dir < 0 && next_dir > 0) {
+ // In this case we check the last frame in the previous region.
+ double prev_length =
+ (double)(regions[k - 1].last - regions[k - 1].start + 1);
+ double last_ratio, ratio_thres;
+ if (prev_length < 2.01) {
+ // if the previous region is very short
+ double max_coded_error =
+ AOMMAX(stats[regions[k - 1].last].coded_error,
+ stats[regions[k - 1].last - 1].coded_error);
+ last_ratio = stats[regions[k - 1].last].sr_coded_error /
+ AOMMAX(max_coded_error, 0.001);
+ ratio_thres = regions[k].avg_sr_fr_ratio * 0.95;
+ } else {
+ double max_coded_error =
+ AOMMAX(stats[regions[k - 1].last].coded_error,
+ stats[regions[k - 1].last - 1].coded_error);
+ last_ratio = stats[regions[k - 1].last].sr_coded_error /
+ AOMMAX(max_coded_error, 0.001);
+ double prev_ratio =
+ (regions[k - 1].avg_sr_fr_ratio * prev_length - last_ratio) /
+ (prev_length - 1.0);
+ ratio_thres = AOMMIN(prev_ratio, regions[k].avg_sr_fr_ratio) * 0.95;
+ }
+ if (last_ratio > ratio_thres) {
+ to_merge = 1;
+ }
+ }
+
+ if (to_merge) {
+ remove_region(0, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ continue;
+ } else {
+ // These are possibly two separate blending regions. Mark the boundary
+ // frame as HIGH_VAR_REGION to separate the two.
+ int prev_k = k - 1;
+ insert_region(regions[prev_k].last, regions[prev_k].last,
+ HIGH_VAR_REGION, regions, num_regions, &prev_k);
+ analyze_region(stats, prev_k, regions);
+ k = prev_k + 1;
+ analyze_region(stats, k, regions);
+ }
+ }
+ k++;
+ }
+ cleanup_regions(regions, num_regions);
+}
+
+// Clean up the blending decisions. Remove blending regions that are too
+// short. Also, if a very short high var region lies between a blending and
+// a stable region, just merge it with one of them.
+static void cleanup_blendings(REGIONS *regions, int *num_regions) {
+ int k = 0;
+ while (k < *num_regions && *num_regions > 1) {
+ int is_short_blending = regions[k].type == BLENDING_REGION &&
+ regions[k].last - regions[k].start + 1 < 5;
+ int is_short_hv = regions[k].type == HIGH_VAR_REGION &&
+ regions[k].last - regions[k].start + 1 < 5;
+ int has_stable_neighbor =
+ ((k > 0 && regions[k - 1].type == STABLE_REGION) ||
+ (k < *num_regions - 1 && regions[k + 1].type == STABLE_REGION));
+ int has_blend_neighbor =
+ ((k > 0 && regions[k - 1].type == BLENDING_REGION) ||
+ (k < *num_regions - 1 && regions[k + 1].type == BLENDING_REGION));
+ int total_neighbors = (k > 0) + (k < *num_regions - 1);
+
+ if (is_short_blending ||
+ (is_short_hv &&
+ has_stable_neighbor + has_blend_neighbor >= total_neighbors)) {
+ // Remove this region. Try to determine whether to combine it with the
+ // previous or next region.
+ int merge;
+ double prev_diff =
+ (k > 0)
+ ? fabs(regions[k].avg_cor_coeff - regions[k - 1].avg_cor_coeff)
+ : 1;
+ double next_diff =
+ (k < *num_regions - 1)
+ ? fabs(regions[k].avg_cor_coeff - regions[k + 1].avg_cor_coeff)
+ : 1;
+ // merge == 0 means to merge with previous, 1 means to merge with next
+ merge = prev_diff > next_diff;
+ remove_region(merge, regions, num_regions, &k);
+ } else {
+ k++;
+ }
+ }
+ cleanup_regions(regions, num_regions);
+}
+
+static void free_firstpass_stats_buffers(REGIONS *temp_regions,
+ double *filt_intra_err,
+ double *filt_coded_err,
+ double *grad_coded) {
+ aom_free(temp_regions);
+ aom_free(filt_intra_err);
+ aom_free(filt_coded_err);
+ aom_free(grad_coded);
+}
+
+// Identify stable and unstable regions from first pass stats.
+// stats_start points to the first frame to analyze.
+// |offset| is the offset from the current frame to the frame stats_start is
+// pointing to.
+// Returns 0 on success, -1 on memory allocation failure.
+static int identify_regions(const FIRSTPASS_STATS *const stats_start,
+ int total_frames, int offset, REGIONS *regions,
+ int *total_regions) {
+ int k;
+ if (total_frames <= 1) return 0;
+
+ // store the initial decisions
+ REGIONS *temp_regions =
+ (REGIONS *)aom_malloc(total_frames * sizeof(temp_regions[0]));
+ // buffers for filtered stats
+ double *filt_intra_err =
+ (double *)aom_calloc(total_frames, sizeof(*filt_intra_err));
+ double *filt_coded_err =
+ (double *)aom_calloc(total_frames, sizeof(*filt_coded_err));
+ double *grad_coded = (double *)aom_calloc(total_frames, sizeof(*grad_coded));
+ if (!(temp_regions && filt_intra_err && filt_coded_err && grad_coded)) {
+ free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+ grad_coded);
+ return -1;
+ }
+ av1_zero_array(temp_regions, total_frames);
+
+ int cur_region = 0, this_start = 0, this_last;
+
+ int next_scenecut = -1;
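+  // Process the clip one scenecut-delimited segment at a time: classify the
+  // regions inside the segment, then append a single-frame SCENECUT_REGION
+  // and continue from the frame after the cut.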
+ do {
+ // first get the obvious scenecuts
+ next_scenecut =
+ find_next_scenecut(stats_start, this_start, total_frames - 1);
+ this_last = (next_scenecut >= 0) ? (next_scenecut - 1) : total_frames - 1;
+
+ // low-pass filter the needed stats
+ smooth_filter_stats(stats_start, this_start, this_last, filt_intra_err,
+ filt_coded_err);
+ get_gradient(filt_coded_err, this_start, this_last, grad_coded);
+
+ // find tentative stable regions and unstable regions
+ int num_regions = find_stable_regions(stats_start, grad_coded, this_start,
+ this_last, temp_regions);
+
+ adjust_unstable_region_bounds(stats_start, temp_regions, &num_regions);
+
+ get_region_stats(stats_start, temp_regions, num_regions);
+
+ // Try to identify blending regions in the unstable regions
+ find_blending_regions(stats_start, temp_regions, &num_regions);
+ cleanup_blendings(temp_regions, &num_regions);
+
+ // The flash points should all be considered high variance points
+ k = 0;
+ while (k < num_regions) {
+ if (temp_regions[k].type != STABLE_REGION) {
+ k++;
+ continue;
+ }
+ int start = temp_regions[k].start;
+ int last = temp_regions[k].last;
+ for (int i = start; i <= last; i++) {
+ if (stats_start[i].is_flash) {
+ insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k);
+ }
+ }
+ k++;
+ }
+ cleanup_regions(temp_regions, &num_regions);
+
+ // copy the regions in the scenecut group
+ for (k = 0; k < num_regions; k++) {
+ if (temp_regions[k].last < temp_regions[k].start &&
+ k == num_regions - 1) {
+ num_regions--;
+ break;
+ }
+ regions[k + cur_region] = temp_regions[k];
+ }
+ cur_region += num_regions;
+
+ // add the scenecut region
+ if (next_scenecut > -1) {
+ // add the scenecut region, and find the next scenecut
+ regions[cur_region].type = SCENECUT_REGION;
+ regions[cur_region].start = next_scenecut;
+ regions[cur_region].last = next_scenecut;
+ cur_region++;
+ this_start = next_scenecut + 1;
+ }
+ } while (next_scenecut >= 0);
+
+ *total_regions = cur_region;
+ get_region_stats(stats_start, regions, *total_regions);
+
+ for (k = 0; k < *total_regions; k++) {
+ // If scenecuts are very minor, mark them as high variance.
+ if (regions[k].type != SCENECUT_REGION ||
+ regions[k].avg_cor_coeff *
+ (1 - stats_start[regions[k].start].noise_var /
+ regions[k].avg_intra_err) <
+ 0.8) {
+ continue;
+ }
+ regions[k].type = HIGH_VAR_REGION;
+ }
+ cleanup_regions(regions, total_regions);
+ get_region_stats(stats_start, regions, *total_regions);
+
+ for (k = 0; k < *total_regions; k++) {
+ regions[k].start += offset;
+ regions[k].last += offset;
+ }
+
+ free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+ grad_coded);
+ return 0;
+}
+
+static int find_regions_index(const REGIONS *regions, int num_regions,
+ int frame_idx) {
+ for (int k = 0; k < num_regions; k++) {
+ if (regions[k].start <= frame_idx && regions[k].last >= frame_idx) {
+ return k;
+ }
+ }
+ return -1;
+}
+
+/*!\brief Determine the length of future GF groups.
+ *
+ * \ingroup gf_group_algo
+ * This function decides the gf group lengths of future frames in a batch.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] max_gop_length Maximum length of the GF group
+ * \param[in] max_intervals Maximum number of intervals to decide
+ *
+ * \remark Nothing is returned. Instead, cpi->ppi->rc.gf_intervals is
+ * changed to store the decided GF group lengths.
+ */
+static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length,
+ int max_intervals) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+ const FIRSTPASS_STATS *const stats = start_pos - (rc->frames_since_key == 0);
+
+ const int f_w = cpi->common.width;
+ const int f_h = cpi->common.height;
+ int i;
+
+ int flash_detected;
+
+ av1_zero(next_frame);
+
+ if (has_no_stats_stage(cpi)) {
+ for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) {
+ p_rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length);
+ }
+ p_rc->cur_gf_index = 0;
+ rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS;
+ return;
+ }
+
+ // TODO(urvang): Try logic to vary min and max interval based on q.
+ const int active_min_gf_interval = rc->min_gf_interval;
+ const int active_max_gf_interval =
+ AOMMIN(rc->max_gf_interval, max_gop_length);
+ const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval);
+
+ i = (rc->frames_since_key == 0);
+ max_intervals = cpi->ppi->lap_enabled ? 1 : max_intervals;
+ int count_cuts = 1;
+  // If cpi->ppi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF
+  // or GF.
+ int cur_start = -1 + !cpi->ppi->gf_state.arf_gf_boost_lst, cur_last;
+ int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 };
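+  // cut_pos[n] holds the last frame of the n-th decided GF group; the
+  // leading -1 entry lets interval lengths be taken as simple differences.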
+ int cut_here;
+ GF_GROUP_STATS gf_stats;
+ init_gf_stats(&gf_stats);
+ while (count_cuts < max_intervals + 1) {
+ // reaches next key frame, break here
+ if (i >= rc->frames_to_key) {
+ cut_here = 2;
+ } else if (i - cur_start >= rc->static_scene_max_gf_interval) {
+ // reached maximum len, but nothing special yet (almost static)
+ // let's look at the next interval
+ cut_here = 1;
+ } else if (EOF == input_stats(twopass, &cpi->twopass_frame, &next_frame)) {
+ // reaches last frame, break
+ cut_here = 2;
+ } else {
+ // Test for the case where there is a brief flash but the prediction
+ // quality back to an earlier frame is then restored.
+ flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0);
+ // TODO(bohanli): remove redundant accumulations here, or unify
+ // this and the ones in define_gf_group
+ accumulate_next_frame_stats(&next_frame, flash_detected,
+ rc->frames_since_key, i, &gf_stats, f_w, f_h);
+
+ cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected,
+ active_max_gf_interval, active_min_gf_interval,
+ &gf_stats);
+ }
+ if (cut_here) {
+ cur_last = i - 1; // the current last frame in the gf group
+ int ori_last = cur_last;
+ // The region frame idx does not start from the same frame as cur_start
+ // and cur_last. Need to offset them.
+ int offset = rc->frames_since_key - p_rc->regions_offset;
+ REGIONS *regions = p_rc->regions;
+ int num_regions = p_rc->num_regions;
+
+ int scenecut_idx = -1;
+ // only try shrinking if interval smaller than active_max_gf_interval
+ if (cur_last - cur_start <= active_max_gf_interval &&
+ cur_last > cur_start) {
+ // find the region indices of where the first and last frame belong.
+ int k_start =
+ find_regions_index(regions, num_regions, cur_start + offset);
+ int k_last =
+ find_regions_index(regions, num_regions, cur_last + offset);
+ if (cur_start + offset == 0) k_start = 0;
+
+ // See if we have a scenecut in between
+ for (int r = k_start + 1; r <= k_last; r++) {
+ if (regions[r].type == SCENECUT_REGION &&
+ regions[r].last - offset - cur_start > active_min_gf_interval) {
+ scenecut_idx = r;
+ break;
+ }
+ }
+
+ // if the found scenecut is very close to the end, ignore it.
+        if (scenecut_idx >= 0 &&
+            regions[num_regions - 1].last - regions[scenecut_idx].last < 4) {
+ scenecut_idx = -1;
+ }
+
+ if (scenecut_idx != -1) {
+ // If we have a scenecut, then stop at it.
+ // TODO(bohanli): add logic here to stop before the scenecut and for
+ // the next gop start from the scenecut with GF
+ int is_minor_sc =
+ (regions[scenecut_idx].avg_cor_coeff *
+ (1 - stats[regions[scenecut_idx].start - offset].noise_var /
+ regions[scenecut_idx].avg_intra_err) >
+ 0.6);
+ cur_last = regions[scenecut_idx].last - offset - !is_minor_sc;
+ } else {
+ int is_last_analysed = (k_last == num_regions - 1) &&
+ (cur_last + offset == regions[k_last].last);
+ int not_enough_regions =
+ k_last - k_start <=
+ 1 + (regions[k_start].type == SCENECUT_REGION);
+ // if we are very close to the end, then do not shrink since it may
+ // introduce intervals that are too short
+ if (!(is_last_analysed && not_enough_regions)) {
+ const double arf_length_factor = 0.1;
+ double best_score = 0;
+ int best_j = -1;
+ const int first_frame = regions[0].start - offset;
+ const int last_frame = regions[num_regions - 1].last - offset;
+ // score of how much the arf helps the whole GOP
+ double base_score = 0.0;
+            // Accumulate base_score up to the minimum GOP length. Each step
+            // applies base = (base + 1) * cor_coeff, so poorly correlated
+            // frames shrink the accumulated score.
+ for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) {
+ if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+ base_score = (base_score + 1.0) * stats[j].cor_coeff;
+ }
+ int met_blending = 0; // Whether we have met blending areas before
+            int last_blending = 0;  // Whether the previous frame is blending
+ for (int j = cur_start + min_shrink_int; j <= cur_last; j++) {
+ if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+ base_score = (base_score + 1.0) * stats[j].cor_coeff;
+ int this_reg =
+ find_regions_index(regions, num_regions, j + offset);
+ if (this_reg < 0) continue;
+ // A GOP should include at most 1 blending region.
+ if (regions[this_reg].type == BLENDING_REGION) {
+ last_blending = 1;
+ if (met_blending) {
+ break;
+ } else {
+ base_score = 0;
+ continue;
+ }
+ } else {
+ if (last_blending) met_blending = 1;
+ last_blending = 0;
+ }
+
+ // Add the factor of how good the neighborhood is for this
+ // candidate arf.
+ double this_score = arf_length_factor * base_score;
+ double temp_accu_coeff = 1.0;
+ // following frames
+ int count_f = 0;
+ for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) {
+ if (stats + n >= twopass->stats_buf_ctx->stats_in_end) break;
+ temp_accu_coeff *= stats[n].cor_coeff;
+ this_score +=
+ temp_accu_coeff *
+ sqrt(AOMMAX(0.5,
+ 1 - stats[n].noise_var /
+ AOMMAX(stats[n].intra_error, 0.001)));
+ count_f++;
+ }
+ // preceding frames
+ temp_accu_coeff = 1.0;
+ for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) {
+ if (stats + n < twopass->stats_buf_ctx->stats_in_start) break;
+ temp_accu_coeff *= stats[n].cor_coeff;
+ this_score +=
+ temp_accu_coeff *
+ sqrt(AOMMAX(0.5,
+ 1 - stats[n].noise_var /
+ AOMMAX(stats[n].intra_error, 0.001)));
+ }
+
+ if (this_score > best_score) {
+ best_score = this_score;
+ best_j = j;
+ }
+ }
+
+ // For blending areas, move one more frame in case we missed the
+ // first blending frame.
+ int best_reg =
+ find_regions_index(regions, num_regions, best_j + offset);
+ if (best_reg < num_regions - 1 && best_reg > 0) {
+ if (regions[best_reg - 1].type == BLENDING_REGION &&
+ regions[best_reg + 1].type == BLENDING_REGION) {
+ if (best_j + offset == regions[best_reg].start &&
+ best_j + offset < regions[best_reg].last) {
+ best_j += 1;
+ } else if (best_j + offset == regions[best_reg].last &&
+ best_j + offset > regions[best_reg].start) {
+ best_j -= 1;
+ }
+ }
+ }
+
+ if (cur_last - best_j < 2) best_j = cur_last;
+ if (best_j > 0 && best_score > 0.1) cur_last = best_j;
+            // if we cannot find anything, just cut at the original place.
+ }
+ }
+ }
+ cut_pos[count_cuts] = cur_last;
+ count_cuts++;
+
+ // reset pointers to the shrunken location
+ cpi->twopass_frame.stats_in = start_pos + cur_last;
+ cur_start = cur_last;
+ int cur_region_idx =
+ find_regions_index(regions, num_regions, cur_start + 1 + offset);
+ if (cur_region_idx >= 0)
+ if (regions[cur_region_idx].type == SCENECUT_REGION) cur_start++;
+
+ i = cur_last;
+
+ if (cut_here > 1 && cur_last == ori_last) break;
+
+ // reset accumulators
+ init_gf_stats(&gf_stats);
+ }
+ ++i;
+ }
+
+ // save intervals
+ rc->intervals_till_gf_calculate_due = count_cuts - 1;
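+  // e.g. cut_pos = { -1, 15, 31 } yields two gf_intervals of 16 frames each.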
+ for (int n = 1; n < count_cuts; n++) {
+ p_rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1];
+ }
+ p_rc->cur_gf_index = 0;
+ cpi->twopass_frame.stats_in = start_pos;
+}
+
+static void correct_frames_to_key(AV1_COMP *cpi) {
+ int lookahead_size =
+ (int)av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+ if (lookahead_size <
+ av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage)) {
+ assert(
+ IMPLIES(cpi->oxcf.pass != AOM_RC_ONE_PASS && cpi->ppi->frames_left > 0,
+ lookahead_size == cpi->ppi->frames_left));
+ cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size);
+ } else if (cpi->ppi->frames_left > 0) {
+ // Correct frames to key based on limit
+ cpi->rc.frames_to_key =
+ AOMMIN(cpi->rc.frames_to_key, cpi->ppi->frames_left);
+ }
+}
+
+/*!\brief Define a GF group in one pass mode when no look ahead stats are
+ * available.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup in the special
+ * case of one pass encoding where no lookahead stats are available.
+ *
+ * \param[in] cpi Top-level encoder structure
+ *
+ * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed.
+ */
+static void define_gf_group_pass0(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+ int target;
+
+ if (oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_set_golden_update(cpi);
+ } else {
+ p_rc->baseline_gf_interval = p_rc->gf_intervals[p_rc->cur_gf_index];
+ rc->intervals_till_gf_calculate_due--;
+ p_rc->cur_gf_index++;
+ }
+
+ // correct frames_to_key when lookahead queue is flushing
+ correct_frames_to_key(cpi);
+
+ if (p_rc->baseline_gf_interval > rc->frames_to_key)
+ p_rc->baseline_gf_interval = rc->frames_to_key;
+
+ p_rc->gfu_boost = DEFAULT_GF_BOOST;
+ p_rc->constrained_gf_group =
+ (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
+
+ gf_group->max_layer_depth_allowed = oxcf->gf_cfg.gf_max_pyr_height;
+
+  // Rare case when the look-ahead is shorter than the target GOP length; an
+  // ARF frame cannot be generated.
+ if (p_rc->baseline_gf_interval > gf_cfg->lag_in_frames ||
+ !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) ||
+ p_rc->baseline_gf_interval < rc->min_gf_interval)
+ gf_group->max_layer_depth_allowed = 0;
+
+ // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+ av1_gop_setup_structure(cpi);
+
+ // Allocate bits to each of the frames in the GF group.
+ // TODO(sarahparker) Extend this to work with pyramid structure.
+ for (int cur_index = 0; cur_index < gf_group->size; ++cur_index) {
+ const FRAME_UPDATE_TYPE cur_update_type = gf_group->update_type[cur_index];
+ if (oxcf->rc_cfg.mode == AOM_CBR) {
+ if (cur_update_type == KF_UPDATE) {
+ target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_cbr(cpi, cur_update_type);
+ }
+ } else {
+ if (cur_update_type == KF_UPDATE) {
+ target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_vbr(cpi, cur_update_type);
+ }
+ }
+ gf_group->bit_allocation[cur_index] = target;
+ }
+}
+
+static INLINE void set_baseline_gf_interval(PRIMARY_RATE_CONTROL *p_rc,
+ int arf_position) {
+ p_rc->baseline_gf_interval = arf_position;
+}
+
+// initialize GF_GROUP_STATS
+static void init_gf_stats(GF_GROUP_STATS *gf_stats) {
+ gf_stats->gf_group_err = 0.0;
+ gf_stats->gf_group_raw_error = 0.0;
+ gf_stats->gf_group_skip_pct = 0.0;
+ gf_stats->gf_group_inactive_zone_rows = 0.0;
+
+ gf_stats->mv_ratio_accumulator = 0.0;
+ gf_stats->decay_accumulator = 1.0;
+ gf_stats->zero_motion_accumulator = 1.0;
+ gf_stats->loop_decay_rate = 1.0;
+ gf_stats->last_loop_decay_rate = 1.0;
+ gf_stats->this_frame_mv_in_out = 0.0;
+ gf_stats->mv_in_out_accumulator = 0.0;
+ gf_stats->abs_mv_in_out_accumulator = 0.0;
+
+ gf_stats->avg_sr_coded_error = 0.0;
+ gf_stats->avg_pcnt_second_ref = 0.0;
+ gf_stats->avg_new_mv_count = 0.0;
+ gf_stats->avg_wavelet_energy = 0.0;
+ gf_stats->avg_raw_err_stdev = 0.0;
+ gf_stats->non_zero_stdev_count = 0;
+}
+
+static void accumulate_gop_stats(AV1_COMP *cpi, int is_intra_only, int f_w,
+ int f_h, FIRSTPASS_STATS *next_frame,
+ const FIRSTPASS_STATS *start_pos,
+ GF_GROUP_STATS *gf_stats, int *idx) {
+ int i, flash_detected;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ RATE_CONTROL *const rc = &cpi->rc;
+ FRAME_INFO *frame_info = &cpi->frame_info;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ init_gf_stats(gf_stats);
+ av1_zero(*next_frame);
+
+ // If this is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ i = is_intra_only;
+ // get the determined gf group length from p_rc->gf_intervals
+ while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) {
+ // read in the next frame
+ if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break;
+ // Accumulate error score of frames in this gf group.
+ double mod_frame_err =
+ calculate_modified_err(frame_info, twopass, oxcf, next_frame);
+ // accumulate stats for this frame
+ accumulate_this_frame_stats(next_frame, mod_frame_err, gf_stats);
+ ++i;
+ }
+
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+
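+  // Second scan over the same frames: accumulate the inter-frame statistics
+  // (flash detection, motion, decay) used to evaluate the group.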
+ i = is_intra_only;
+ input_stats(twopass, &cpi->twopass_frame, next_frame);
+ while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) {
+ // read in the next frame
+ if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break;
+
+ // Test for the case where there is a brief flash but the prediction
+ // quality back to an earlier frame is then restored.
+ flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0);
+
+ // accumulate stats for next frame
+ accumulate_next_frame_stats(next_frame, flash_detected,
+ rc->frames_since_key, i, gf_stats, f_w, f_h);
+
+ ++i;
+ }
+
+ i = p_rc->gf_intervals[p_rc->cur_gf_index];
+ average_gf_stats(i, gf_stats);
+
+ *idx = i;
+}
+
+static void update_gop_length(RATE_CONTROL *rc, PRIMARY_RATE_CONTROL *p_rc,
+ int idx, int is_final_pass) {
+ if (is_final_pass) {
+ rc->intervals_till_gf_calculate_due--;
+ p_rc->cur_gf_index++;
+ }
+
+ // Was the group length constrained by the requirement for a new KF?
+ p_rc->constrained_gf_group = (idx >= rc->frames_to_key) ? 1 : 0;
+
+ set_baseline_gf_interval(p_rc, idx);
+ rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+}
+
+#define MAX_GF_BOOST 5400
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+static void set_gop_bits_boost(AV1_COMP *cpi, int i, int is_intra_only,
+ int is_final_pass, int use_alt_ref,
+ int alt_offset, const FIRSTPASS_STATS *start_pos,
+ GF_GROUP_STATS *gf_stats) {
+ // Should we use the alternate reference frame.
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ FRAME_INFO *frame_info = &cpi->frame_info;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ int ext_len = i - is_intra_only;
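+  // ext_len excludes the key frame or overlay frame, whose cost has already
+  // been accounted for.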
+ if (use_alt_ref) {
+ const int forward_frames = (rc->frames_to_key - i >= ext_len)
+ ? ext_len
+ : AOMMAX(0, rc->frames_to_key - i);
+
+ // Calculate the boost for alt ref.
+ p_rc->gfu_boost = av1_calc_arf_boost(
+ twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset,
+ forward_frames, ext_len, &p_rc->num_stats_used_for_gfu_boost,
+ &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled);
+ } else {
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+ p_rc->gfu_boost = AOMMIN(
+ MAX_GF_BOOST,
+ av1_calc_arf_boost(
+ twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, ext_len,
+ 0, &p_rc->num_stats_used_for_gfu_boost,
+ &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled));
+ }
+
+#define LAST_ALR_BOOST_FACTOR 0.2f
+ p_rc->arf_boost_factor = 1.0;
+ if (use_alt_ref && !is_lossless_requested(rc_cfg)) {
+ // Reduce the boost of altref in the last gf group
+ if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY ||
+ rc->frames_to_key - ext_len == 0) {
+ p_rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+ }
+ }
+
+ // Reset the file position.
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+ if (cpi->ppi->lap_enabled) {
+ // Since we don't have enough stats to know the actual error of the
+    // gf group, we assume the error of each frame to be equal to 1 and set
+ // the error of the group as baseline_gf_interval.
+ gf_stats->gf_group_err = p_rc->baseline_gf_interval;
+ }
+ // Calculate the bits to be allocated to the gf/arf group as a whole
+ p_rc->gf_group_bits =
+ calculate_total_gf_group_bits(cpi, gf_stats->gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((rc_cfg->mode != AOM_Q) && (p_rc->baseline_gf_interval > 1) &&
+ is_final_pass) {
+ const int vbr_group_bits_per_frame =
+ (int)(p_rc->gf_group_bits / p_rc->baseline_gf_interval);
+ const double group_av_err =
+ gf_stats->gf_group_raw_error / p_rc->baseline_gf_interval;
+ const double group_av_skip_pct =
+ gf_stats->gf_group_skip_pct / p_rc->baseline_gf_interval;
+ const double group_av_inactive_zone =
+ ((gf_stats->gf_group_inactive_zone_rows * 2) /
+ (p_rc->baseline_gf_interval * (double)cm->mi_params.mb_rows));
+
+ int tmp_q;
+ tmp_q = get_twopass_worst_quality(
+ cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+ vbr_group_bits_per_frame);
+ rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1);
+ }
+#endif
+
+ // Adjust KF group bits and error remaining.
+ if (is_final_pass) twopass->kf_group_error_left -= gf_stats->gf_group_err;
+
+ // Reset the file position.
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ if (rc->frames_since_key != 0) {
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_pos, twopass->stats_buf_ctx->stats_in_end,
+ p_rc->baseline_gf_interval);
+ }
+
+ av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0,
+ use_alt_ref, p_rc->gf_group_bits);
+
+ // TODO(jingning): Generalize this condition.
+ if (is_final_pass) {
+ cpi->ppi->gf_state.arf_gf_boost_lst = use_alt_ref;
+
+ // Reset rolling actual and target bits counters for ARF groups.
+ twopass->rolling_arf_group_target_bits = 1;
+ twopass->rolling_arf_group_actual_bits = 1;
+ }
+#if CONFIG_BITRATE_ACCURACY
+ if (is_final_pass) {
+ av1_vbr_rc_set_gop_bit_budget(&cpi->vbr_rc_info,
+ p_rc->baseline_gf_interval);
+ }
+#endif
+}
+
+/*!\brief Define a GF group.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] frame_params Structure with frame parameters
+ * \param[in]    is_final_pass   Non-zero if this is the final pass for the
+ *                               GF group, rather than a trial
+ *
+ * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed.
+ */
+static void define_gf_group(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+ int is_final_pass) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ const int f_w = cm->width;
+ const int f_h = cm->height;
+ int i;
+ const int is_intra_only = rc->frames_since_key == 0;
+
+ cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1);
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (!is_intra_only) {
+ av1_zero(cpi->ppi->gf_group);
+ cpi->gf_frame_index = 0;
+ }
+
+ if (has_no_stats_stage(cpi)) {
+ define_gf_group_pass0(cpi);
+ return;
+ }
+
+ if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) {
+ int ret = define_gf_group_pass3(cpi, frame_params, is_final_pass);
+ if (ret == 0) return;
+
+ av1_free_thirdpass_ctx(cpi->third_pass_ctx);
+ cpi->third_pass_ctx = NULL;
+ }
+
+ // correct frames_to_key when lookahead queue is emptying
+ if (cpi->ppi->lap_enabled) {
+ correct_frames_to_key(cpi);
+ }
+
+ GF_GROUP_STATS gf_stats;
+ accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos,
+ &gf_stats, &i);
+
+ const int can_disable_arf = !gf_cfg->gf_min_pyr_height;
+
+ // If this is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ const int active_min_gf_interval = rc->min_gf_interval;
+
+ // Disable internal ARFs for "still" gf groups.
+ // zero_motion_accumulator: minimum percentage of (0,0) motion;
+ // avg_sr_coded_error: average of the SSE per pixel of each frame;
+ // avg_raw_err_stdev: average of the standard deviation of (0,0)
+ // motion error per block of each frame.
+ const int can_disable_internal_arfs = gf_cfg->gf_min_pyr_height <= 1;
+ if (can_disable_internal_arfs &&
+ gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION &&
+ gf_stats.avg_sr_coded_error < MAX_SR_CODED_ERROR &&
+ gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
+ cpi->ppi->internal_altref_allowed = 0;
+ }
+
+ int use_alt_ref;
+ if (can_disable_arf) {
+ use_alt_ref =
+ !is_almost_static(gf_stats.zero_motion_accumulator,
+ twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled) &&
+ p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) &&
+ (i >= MIN_GF_INTERVAL);
+ } else {
+ use_alt_ref = p_rc->use_arf_in_this_kf_group &&
+ (i < gf_cfg->lag_in_frames) && (i > 2);
+ }
+ if (use_alt_ref) {
+ gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height;
+ } else {
+ gf_group->max_layer_depth_allowed = 0;
+ }
+
+ int alt_offset = 0;
+ // The length reduction strategy is tweaked for certain cases, and doesn't
+ // work well for certain other cases.
+ const int allow_gf_length_reduction =
+ ((rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 128) ||
+ !cpi->ppi->internal_altref_allowed) &&
+ !is_lossless_requested(rc_cfg);
+
+ if (allow_gf_length_reduction && use_alt_ref) {
+    // adjust the length of this gf group if one of the following conditions
+    // is met:
+ // 1: only one overlay frame left and this gf is too long
+ // 2: next gf group is too short to have arf compared to the current gf
+
+ // maximum length of next gf group
+ const int next_gf_len = rc->frames_to_key - i;
+ const int single_overlay_left =
+ next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+    // the next gf is probably going to have an ARF but it will be shorter
+    // than this gf
+ const int unbalanced_gf =
+ i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 >= rc->min_gf_interval;
+
+ if (single_overlay_left || unbalanced_gf) {
+ const int roll_back = REDUCE_GF_LENGTH_BY;
+ // Reduce length only if active_min_gf_interval will be respected later.
+ if (i - roll_back >= active_min_gf_interval + 1) {
+ alt_offset = -roll_back;
+ i -= roll_back;
+ if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
+ p_rc->gf_intervals[p_rc->cur_gf_index] -= roll_back;
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+ accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame,
+ start_pos, &gf_stats, &i);
+ }
+ }
+ }
+
+ update_gop_length(rc, p_rc, i, is_final_pass);
+
+ // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+ av1_gop_setup_structure(cpi);
+
+ set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref,
+ alt_offset, start_pos, &gf_stats);
+
+ frame_params->frame_type =
+ rc->frames_since_key == 0 ? KEY_FRAME : INTER_FRAME;
+ frame_params->show_frame =
+ !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE);
+}
+
+/*!\brief Define a GF group for the third pass.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group for the third pass, along
+ * with various parameters regarding bit-allocation and quality setup based on
+ * the two-pass bitstream.
+ * Much of the function still uses the strategies from the second pass and
+ * relies on first pass statistics. It is expected that over time these
+ * portions will be replaced with strategies specific to the third pass.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] frame_params Structure with frame parameters
+ * \param[in]    is_final_pass  Non-zero if this is the final pass for the
+ *                              GF group, rather than a trial
+ *
+ * \return 0: Success;
+ *          -1: There are conflicts between the bitstream and the current
+ *              config. The values in cpi->ppi->gf_group are also changed.
+ */
+static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+ int is_final_pass) {
+ if (!cpi->third_pass_ctx) return -1;
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+ const int f_w = cm->width;
+ const int f_h = cm->height;
+ int i;
+ const int is_intra_only = rc->frames_since_key == 0;
+
+ cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1);
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (!is_intra_only) {
+ av1_zero(cpi->ppi->gf_group);
+ cpi->gf_frame_index = 0;
+ }
+
+ GF_GROUP_STATS gf_stats;
+ accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos,
+ &gf_stats, &i);
+
+ const int can_disable_arf = !gf_cfg->gf_min_pyr_height;
+
+ // TODO(any): set cpi->ppi->internal_altref_allowed accordingly;
+
+ int use_alt_ref = av1_check_use_arf(cpi->third_pass_ctx);
+ if (use_alt_ref == 0 && !can_disable_arf) return -1;
+ if (use_alt_ref) {
+ gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height;
+ } else {
+ gf_group->max_layer_depth_allowed = 0;
+ }
+
+ update_gop_length(rc, p_rc, i, is_final_pass);
+
+ // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+ av1_gop_setup_structure(cpi);
+
+ set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref, 0,
+ start_pos, &gf_stats);
+
+ frame_params->frame_type = cpi->third_pass_ctx->frame_info[0].frame_type;
+ frame_params->show_frame = cpi->third_pass_ctx->frame_info[0].is_show_frame;
+ return 0;
+}
+
+// #define FIXED_ARF_BITS
+#ifdef FIXED_ARF_BITS
+#define ARF_BITS_FRACTION 0.75
+#endif
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+ GF_GROUP *gf_group, int is_key_frame, int use_arf,
+ int64_t gf_group_bits) {
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ // Calculate the extra bits to be used for boosted frame(s)
+#ifdef FIXED_ARF_BITS
+ int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits);
+#else
+ int gf_arf_bits = calculate_boost_bits(
+ p_rc->baseline_gf_interval - (rc->frames_since_key == 0), p_rc->gfu_boost,
+ gf_group_bits);
+#endif
+
+ gf_arf_bits = adjust_boost_bits_for_target_level(cpi, rc, gf_arf_bits,
+ gf_group_bits, 1);
+
+ // Allocate bits to each of the frames in the GF group.
+ allocate_gf_group_bits(gf_group, p_rc, rc, gf_group_bits, gf_arf_bits,
+ is_key_frame, use_arf);
+}
+
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letterbox
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case even if the frame is not a scene cut coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 1.9
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra / inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+// Intra / Inter threshold very low
+#define VERY_LOW_II 1.5
+// For clean slide transitions we expect a sharp single-frame spike in error.
+#define ERROR_SPIKE 5.0
+
+// Slide show transition detection.
+// Tests for the case where there is very low error on either side of the
+// current frame but much higher just for this frame. This can help detect
+// key frames in slide shows even where the slides are pictures of different
+// sizes.
+// Also requires that intra and inter errors are very similar to help eliminate
+// harmful false positives.
+// It will not help if the transition is a fade or other multi-frame effect.
+static int slide_transition(const FIRSTPASS_STATS *this_frame,
+ const FIRSTPASS_STATS *last_frame,
+ const FIRSTPASS_STATS *next_frame) {
+ return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) &&
+ (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) &&
+ (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE));
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+// We adapt the threshold based on the number of frames in this key-frame
+// group so far.
+static double get_second_ref_usage_thresh(int frame_count_so_far) {
+ const int adapt_upto = 32;
+ const double min_second_ref_usage_thresh = 0.085;
+ const double second_ref_usage_thresh_max_delta = 0.035;
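+  // The threshold ramps linearly with the frame count: 0.085 at frame 0 up
+  // to 0.12 at frame 31 and beyond.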
+ if (frame_count_so_far >= adapt_upto) {
+ return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta;
+ }
+ return min_second_ref_usage_thresh +
+ ((double)frame_count_so_far / (adapt_upto - 1)) *
+ second_ref_usage_thresh_max_delta;
+}
+
+static int test_candidate_kf(const FIRSTPASS_INFO *firstpass_info,
+ int this_stats_index, int frame_count_so_far,
+ enum aom_rc_mode rc_mode, int scenecut_mode,
+ int num_mbs) {
+ const FIRSTPASS_STATS *last_stats =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index - 1);
+ const FIRSTPASS_STATS *this_stats =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index);
+ const FIRSTPASS_STATS *next_stats =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index + 1);
+ if (last_stats == NULL || this_stats == NULL || next_stats == NULL) {
+ return 0;
+ }
+
+ int is_viable_kf = 0;
+ double pcnt_intra = 1.0 - this_stats->pcnt_inter;
+ double modified_pcnt_inter =
+ this_stats->pcnt_inter - this_stats->pcnt_neutral;
+ const double second_ref_usage_thresh =
+ get_second_ref_usage_thresh(frame_count_so_far);
+ int frames_to_test_after_candidate_key = SCENE_CUT_KEY_TEST_INTERVAL;
+ int count_for_tolerable_prediction = 3;
+
+ // We do "-1" because the candidate key is not counted.
+ int stats_after_this_stats =
+ av1_firstpass_info_future_count(firstpass_info, this_stats_index) - 1;
+
+ if (scenecut_mode == ENABLE_SCENECUT_MODE_1) {
+ if (stats_after_this_stats < 3) {
+ return 0;
+ } else {
+ frames_to_test_after_candidate_key = 3;
+ count_for_tolerable_prediction = 1;
+ }
+ }
+ // Make sure we have enough stats after the candidate key.
+ frames_to_test_after_candidate_key =
+ AOMMIN(frames_to_test_after_candidate_key, stats_after_this_stats);
+
+ // Does the frame satisfy the primary criteria of a key frame?
+ // See above for an explanation of the test criteria.
+ // If so, then examine how well it predicts subsequent frames.
+ if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) &&
+ (this_stats->pcnt_second_ref < second_ref_usage_thresh) &&
+ (next_stats->pcnt_second_ref < second_ref_usage_thresh) &&
+ ((this_stats->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+ slide_transition(this_stats, last_stats, next_stats) ||
+ ((pcnt_intra > MIN_INTRA_LEVEL) &&
+ (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+ ((this_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_stats->coded_error)) <
+ KF_II_ERR_THRESHOLD) &&
+ ((fabs(last_stats->coded_error - this_stats->coded_error) /
+ DOUBLE_DIVIDE_CHECK(this_stats->coded_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ (fabs(last_stats->intra_error - this_stats->intra_error) /
+ DOUBLE_DIVIDE_CHECK(this_stats->intra_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ ((next_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(next_stats->coded_error)) >
+ II_IMPROVEMENT_THRESHOLD))))) {
+ int i;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+
+ // Examine how well the key frame predicts subsequent frames.
+ for (i = 1; i <= frames_to_test_after_candidate_key; ++i) {
+ // Get the next frame details
+ const FIRSTPASS_STATS *local_next_frame =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index + i);
+ double next_iiratio =
+ (BOOST_FACTOR * local_next_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(local_next_frame->coded_error));
+
+ if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+ // Cumulative effect of decay in prediction quality.
+ if (local_next_frame->pcnt_inter > 0.85)
+ decay_accumulator *= local_next_frame->pcnt_inter;
+ else
+ decay_accumulator *= (0.85 + local_next_frame->pcnt_inter) / 2.0;
+
+ // Keep a running total.
+ boost_score += (decay_accumulator * next_iiratio);
+
+ // Test various breakout clauses.
+ // TODO(any): Test of intra error should be normalized to an MB.
+ if ((local_next_frame->pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+ (((local_next_frame->pcnt_inter - local_next_frame->pcnt_neutral) <
+ 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 3.0) ||
+ (local_next_frame->intra_error < (200.0 / (double)num_mbs))) {
+ break;
+ }
+
+ old_boost_score = boost_score;
+ }
+
+    // If there is tolerable prediction for at least the next 3 frames then
+    // break out; otherwise discard this potential key frame and move on.
+ if (boost_score > 30.0 && (i > count_for_tolerable_prediction)) {
+ is_viable_kf = 1;
+ } else {
+ is_viable_kf = 0;
+ }
+ }
+ return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
+#define MIN_KF_BOOST 600 // Minimum boost for non-static KF interval
+#define MAX_KF_BOOST 3200
+#define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval
+
+static int detect_app_forced_key(AV1_COMP *cpi) {
+ int num_frames_to_app_forced_key = is_forced_keyframe_pending(
+ cpi->ppi->lookahead, cpi->ppi->lookahead->max_sz, cpi->compressor_stage);
+ return num_frames_to_app_forced_key;
+}
+
+static int get_projected_kf_boost(AV1_COMP *cpi) {
+ /*
+ * If num_stats_used_for_kf_boost >= frames_to_key, then
+ * all stats needed for prior boost calculation are available.
+   * Hence projecting the prior boost is not needed in this case.
+ */
+ if (cpi->ppi->p_rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key)
+ return cpi->ppi->p_rc.kf_boost;
+
+ // Get the current tpl factor (number of frames = frames_to_key).
+ double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key);
+ // Get the tpl factor when number of frames = num_stats_used_for_kf_boost.
+ double tpl_factor_num_stats = av1_get_kf_boost_projection_factor(
+ cpi->ppi->p_rc.num_stats_used_for_kf_boost);
+ int projected_kf_boost =
+ (int)rint((tpl_factor * cpi->ppi->p_rc.kf_boost) / tpl_factor_num_stats);
+ return projected_kf_boost;
+}
+
+/*!\brief Determine the location of the next key frame
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame when a
+ * scenecut is detected or the maximum key frame distance is reached.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] firstpass_info struct for firstpass info
+ * \param[in] num_frames_to_detect_scenecut Maximum lookahead frames.
+ * \param[in]    search_start_idx   The start index for the key frame search.
+ *                                  Set it to one if we already know the
+ *                                  current frame is a key frame. Otherwise,
+ *                                  set it to zero.
+ *
+ * \return       Number of frames to the next key frame, including the
+ *               current frame.
+ */
+static int define_kf_interval(AV1_COMP *cpi,
+ const FIRSTPASS_INFO *firstpass_info,
+ int num_frames_to_detect_scenecut,
+ int search_start_idx) {
+ const TWO_PASS *const twopass = &cpi->ppi->twopass;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
+ double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+ double decay_accumulator = 1.0;
+ int i = 0, j;
+ int frames_to_key = search_start_idx;
+ int frames_since_key = rc->frames_since_key + 1;
+ int scenecut_detected = 0;
+
+ int num_frames_to_next_key = detect_app_forced_key(cpi);
+
+ if (num_frames_to_detect_scenecut == 0) {
+ if (num_frames_to_next_key != -1)
+ return num_frames_to_next_key;
+ else
+ return rc->frames_to_key;
+ }
+
+ if (num_frames_to_next_key != -1)
+ num_frames_to_detect_scenecut =
+ AOMMIN(num_frames_to_detect_scenecut, num_frames_to_next_key);
+
+ // Initialize the decay rates for the recent frames to check
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
+ i = 0;
+ const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.mi_params.MBs;
+ const int future_stats_count =
+ av1_firstpass_info_future_count(firstpass_info, 0);
+ while (frames_to_key < future_stats_count &&
+ frames_to_key < num_frames_to_detect_scenecut) {
+ // Provided that we are not at the end of the file...
+ if ((cpi->ppi->p_rc.enable_scenecut_detection > 0) && kf_cfg->auto_key &&
+ frames_to_key + 1 < future_stats_count) {
+ double loop_decay_rate;
+
+ // Check for a scene cut.
+ if (frames_since_key >= kf_cfg->key_freq_min) {
+ scenecut_detected = test_candidate_kf(
+ &twopass->firstpass_info, frames_to_key, frames_since_key,
+ oxcf->rc_cfg.mode, cpi->ppi->p_rc.enable_scenecut_detection,
+ num_mbs);
+ if (scenecut_detected) {
+ break;
+ }
+ }
+
+ // How fast is the prediction quality decaying?
+ const FIRSTPASS_STATS *next_stats =
+ av1_firstpass_info_peek(firstpass_info, frames_to_key + 1);
+ loop_decay_rate = get_prediction_decay_rate(next_stats);
+
+      // Here we are interested in decay over the recent past, rather than,
+      // as elsewhere, decay in prediction quality since the last GF or KF.
+ recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+ decay_accumulator *= recent_loop_decay[j];
+
+ // Special check for transition or high motion followed by a
+ // static scene.
+ if (frames_since_key >= kf_cfg->key_freq_min) {
+ scenecut_detected = detect_transition_to_still(
+ firstpass_info, frames_to_key + 1, rc->min_gf_interval, i,
+ kf_cfg->key_freq_max - i, loop_decay_rate, decay_accumulator);
+ if (scenecut_detected) {
+          // In the case of a transition followed by a static scene, the key
+          // frame could be a good predictor for the following frames, so we
+          // do not use an arf.
+ p_rc->use_arf_in_this_kf_group = 0;
+ break;
+ }
+ }
+
+ // Step on to the next frame.
+ ++frames_to_key;
+ ++frames_since_key;
+
+ // If we don't have a real key frame within the next two
+ // key_freq_max intervals then break out of the loop.
+ if (frames_to_key >= 2 * kf_cfg->key_freq_max) {
+ break;
+ }
+ } else {
+ ++frames_to_key;
+ ++frames_since_key;
+ }
+ ++i;
+ }
+ if (cpi->ppi->lap_enabled && !scenecut_detected)
+ frames_to_key = num_frames_to_next_key;
+
+ return frames_to_key;
+}
+
+static double get_kf_group_avg_error(TWO_PASS *twopass,
+ TWO_PASS_FRAME *twopass_frame,
+ const FIRSTPASS_STATS *first_frame,
+ const FIRSTPASS_STATS *start_position,
+ int frames_to_key) {
+ FIRSTPASS_STATS cur_frame = *first_frame;
+ int num_frames, i;
+ double kf_group_avg_error = 0.0;
+
+ reset_fpf_position(twopass_frame, start_position);
+
+ for (i = 0; i < frames_to_key; ++i) {
+ kf_group_avg_error += cur_frame.coded_error;
+ if (EOF == input_stats(twopass, twopass_frame, &cur_frame)) break;
+ }
+ num_frames = i + 1;
+ num_frames = AOMMIN(num_frames, frames_to_key);
+ kf_group_avg_error = kf_group_avg_error / num_frames;
+
+ return (kf_group_avg_error);
+}
+
+static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err,
+ double kf_group_avg_error) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ int64_t kf_group_bits;
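+  // Under LAP only the average bandwidth is known, so budget an average-sized
+  // share per frame; with full two-pass stats, apportion bits_left by this
+  // group's share of the remaining modified error.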
+ if (cpi->ppi->lap_enabled) {
+ kf_group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth;
+ if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) {
+ double vbr_corpus_complexity_lap =
+ cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap / 10.0;
+      /* Scale the group's bit budget by its average complexity relative to
+         the assumed corpus complexity. */
+ kf_group_bits = (int64_t)(
+ kf_group_bits * (kf_group_avg_error / vbr_corpus_complexity_lap));
+ }
+ } else {
+ kf_group_bits = (int64_t)(twopass->bits_left *
+ (kf_group_err / twopass->modified_error_left));
+ }
+
+ return kf_group_bits;
+}
+
+static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS cur_frame;
+ av1_zero(cur_frame);
+ int num_frames = 0;
+ // Accumulate total stat using available number of stats.
+ for (num_frames = 0; num_frames < (rc->frames_to_key - 1); ++num_frames) {
+ if (EOF == input_stats(twopass, &cpi->twopass_frame, &cur_frame)) break;
+ av1_accumulate_stats(avg_frame_stat, &cur_frame);
+ }
+
+ if (num_frames < 2) {
+ return num_frames;
+ }
+ // Average the total stat
+ avg_frame_stat->weight = avg_frame_stat->weight / num_frames;
+ avg_frame_stat->intra_error = avg_frame_stat->intra_error / num_frames;
+ avg_frame_stat->frame_avg_wavelet_energy =
+ avg_frame_stat->frame_avg_wavelet_energy / num_frames;
+ avg_frame_stat->coded_error = avg_frame_stat->coded_error / num_frames;
+ avg_frame_stat->sr_coded_error = avg_frame_stat->sr_coded_error / num_frames;
+ avg_frame_stat->pcnt_inter = avg_frame_stat->pcnt_inter / num_frames;
+ avg_frame_stat->pcnt_motion = avg_frame_stat->pcnt_motion / num_frames;
+ avg_frame_stat->pcnt_second_ref =
+ avg_frame_stat->pcnt_second_ref / num_frames;
+ avg_frame_stat->pcnt_neutral = avg_frame_stat->pcnt_neutral / num_frames;
+ avg_frame_stat->intra_skip_pct = avg_frame_stat->intra_skip_pct / num_frames;
+ avg_frame_stat->inactive_zone_rows =
+ avg_frame_stat->inactive_zone_rows / num_frames;
+ avg_frame_stat->inactive_zone_cols =
+ avg_frame_stat->inactive_zone_cols / num_frames;
+ avg_frame_stat->MVr = avg_frame_stat->MVr / num_frames;
+ avg_frame_stat->mvr_abs = avg_frame_stat->mvr_abs / num_frames;
+ avg_frame_stat->MVc = avg_frame_stat->MVc / num_frames;
+ avg_frame_stat->mvc_abs = avg_frame_stat->mvc_abs / num_frames;
+ avg_frame_stat->MVrv = avg_frame_stat->MVrv / num_frames;
+ avg_frame_stat->MVcv = avg_frame_stat->MVcv / num_frames;
+ avg_frame_stat->mv_in_out_count =
+ avg_frame_stat->mv_in_out_count / num_frames;
+ avg_frame_stat->new_mv_count = avg_frame_stat->new_mv_count / num_frames;
+ avg_frame_stat->count = avg_frame_stat->count / num_frames;
+ avg_frame_stat->duration = avg_frame_stat->duration / num_frames;
+
+ return num_frames;
+}
+
+static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err,
+ double *zero_motion_accumulator,
+ double *sr_accumulator, int use_avg_stat) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ FIRSTPASS_STATS frame_stat;
+ av1_zero(frame_stat);
+ int i = 0, num_stat_used = 0;
+ double boost_score = 0.0;
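+  // In AOM_Q mode the per-frame boost cap scales with the KF interval,
+  // clamped to [KF_MIN_FRAME_BOOST, KF_MAX_FRAME_BOOST].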
+ const double kf_max_boost =
+ cpi->oxcf.rc_cfg.mode == AOM_Q
+ ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+ KF_MAX_FRAME_BOOST)
+ : KF_MAX_FRAME_BOOST;
+
+ // Calculate the average using available number of stats.
+ if (use_avg_stat) num_stat_used = calc_avg_stats(cpi, &frame_stat);
+
+ for (i = num_stat_used; i < (rc->frames_to_key - 1); ++i) {
+ if (!use_avg_stat &&
+ EOF == input_stats(twopass, &cpi->twopass_frame, &frame_stat))
+ break;
+
+ // Monitor for static sections.
+    // For the first frame in the kf group, the second ref indicator is
+    // invalid.
+ if (i > 0) {
+ *zero_motion_accumulator =
+ AOMMIN(*zero_motion_accumulator, get_zero_motion_factor(&frame_stat));
+ } else {
+ *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion;
+ }
+
+ // Not all frames in the group are necessarily used in calculating boost.
+ if ((*sr_accumulator < (kf_raw_err * 1.50)) &&
+ (i <= rc->max_gf_interval * 2)) {
+ double frame_boost;
+ double zm_factor;
+
+ // Factor 0.75-1.25 based on how much of frame is static.
+ zm_factor = (0.75 + (*zero_motion_accumulator / 2.0));
+
+ if (i < 2) *sr_accumulator = 0.0;
+ frame_boost =
+ calc_kf_frame_boost(&cpi->ppi->p_rc, frame_info, &frame_stat,
+ sr_accumulator, kf_max_boost);
+ boost_score += frame_boost * zm_factor;
+ }
+ }
+ return boost_score;
+}
+
+/*!\brief Interval (in seconds) to clip the key-frame distance to under LAP.
+ */
+#define MAX_KF_BITS_INTERVAL_SINGLE_PASS 5
+
+/*!\brief Determine the next key frame group
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame, and
+ * calculates the bit allocation of the KF group and the keyframe itself.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] this_frame Pointer to first pass stats
+ */
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
+ const FIRSTPASS_STATS first_frame = *this_frame;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_INFO *firstpass_info = &twopass->firstpass_info;
+ av1_zero(next_frame);
+
+ rc->frames_since_key = 0;
+ // Use arfs if possible.
+ p_rc->use_arf_in_this_kf_group = is_altref_enabled(
+ oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf);
+
+ // Reset the GF group data structures.
+ av1_zero(*gf_group);
+ cpi->gf_frame_index = 0;
+
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+
+ if (has_no_stats_stage(cpi)) {
+ int num_frames_to_app_forced_key = detect_app_forced_key(cpi);
+ p_rc->this_key_frame_forced =
+ current_frame->frame_number != 0 && rc->frames_to_key == 0;
+ if (num_frames_to_app_forced_key != -1)
+ rc->frames_to_key = num_frames_to_app_forced_key;
+ else
+ rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max);
+ correct_frames_to_key(cpi);
+ p_rc->kf_boost = DEFAULT_KF_BOOST;
+ gf_group->update_type[0] = KF_UPDATE;
+ return;
+ }
+ int i;
+ const FIRSTPASS_STATS *const start_position = cpi->twopass_frame.stats_in;
+ int kf_bits = 0;
+ double zero_motion_accumulator = 1.0;
+ double boost_score = 0.0;
+ double kf_raw_err = 0.0;
+ double kf_mod_err = 0.0;
+ double sr_accumulator = 0.0;
+ double kf_group_avg_error = 0.0;
+ int frames_to_key, frames_to_key_clipped = INT_MAX;
+ int64_t kf_group_bits_clipped = INT64_MAX;
+
+  // Is this a key frame forced by the maximum key frame interval?
+ p_rc->this_key_frame_forced = p_rc->next_key_frame_forced;
+
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0; // Group modified error score.
+
+ kf_raw_err = this_frame->intra_error;
+ kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+
+ // We assume the current frame is a key frame and we are looking for the next
+ // key frame. Therefore search_start_idx = 1
+ frames_to_key = define_kf_interval(cpi, firstpass_info, kf_cfg->key_freq_max,
+ /*search_start_idx=*/1);
+
+ if (frames_to_key != -1) {
+ rc->frames_to_key = AOMMIN(kf_cfg->key_freq_max, frames_to_key);
+ } else {
+ rc->frames_to_key = kf_cfg->key_freq_max;
+ }
+
+ if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi);
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already breakout of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural interval
+ // is between 1x and 2x.
+ if (kf_cfg->auto_key && rc->frames_to_key > kf_cfg->key_freq_max) {
+ FIRSTPASS_STATS tmp_frame = first_frame;
+
+ rc->frames_to_key /= 2;
+
+ // Reset to the start of the group.
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+ // Rescan to get the correct error data for the forced kf group.
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ if (EOF == input_stats(twopass, &cpi->twopass_frame, &tmp_frame)) break;
+ }
+ p_rc->next_key_frame_forced = 1;
+ } else if ((cpi->twopass_frame.stats_in ==
+ twopass->stats_buf_ctx->stats_in_end &&
+ is_stat_consumption_stage_twopass(cpi)) ||
+ rc->frames_to_key >= kf_cfg->key_freq_max) {
+ p_rc->next_key_frame_forced = 1;
+ } else {
+ p_rc->next_key_frame_forced = 0;
+ }
+
+ double kf_group_err = 0;
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ const FIRSTPASS_STATS *this_stats =
+ av1_firstpass_info_peek(&twopass->firstpass_info, i);
+ if (this_stats != NULL) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err_new(
+ frame_info, &firstpass_info->total_stats, this_stats,
+ oxcf->rc_cfg.vbrbias, twopass->modified_error_min,
+ twopass->modified_error_max);
+ ++p_rc->num_stats_used_for_kf_boost;
+ }
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) ||
+ (cpi->ppi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) {
+ // Maximum number of bits for a single normal frame (not key frame).
+ const int max_bits = frame_max_bits(rc, oxcf);
+
+ // Maximum number of bits allocated to the key frame group.
+ int64_t max_grp_bits;
+
+ if (oxcf->rc_cfg.vbr_corpus_complexity_lap) {
+ kf_group_avg_error =
+ get_kf_group_avg_error(twopass, &cpi->twopass_frame, &first_frame,
+ start_position, rc->frames_to_key);
+ }
+
+ // Default allocation based on bits left and relative
+ // complexity of the section.
+ twopass->kf_group_bits =
+ get_kf_group_bits(cpi, kf_group_err, kf_group_avg_error);
+ // Clip based on maximum per frame rate defined by the user.
+ max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+ if (twopass->kf_group_bits > max_grp_bits)
+ twopass->kf_group_bits = max_grp_bits;
+ } else {
+ twopass->kf_group_bits = 0;
+ }
+ twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
+
+ if (cpi->ppi->lap_enabled) {
+ // In the case of single pass based on LAP, frames to key may have an
+ // inaccurate value, and hence should be clipped to an appropriate
+ // interval.
+ frames_to_key_clipped =
+ (int)(MAX_KF_BITS_INTERVAL_SINGLE_PASS * cpi->framerate);
+
+ // Compute the bits allocated to the kf group using the clipped
+ // frames_to_key.
+ if (rc->frames_to_key > frames_to_key_clipped) {
+ kf_group_bits_clipped =
+ (int64_t)((double)twopass->kf_group_bits * frames_to_key_clipped /
+ rc->frames_to_key);
+ }
+ }
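+
+ // Worked example (hypothetical numbers, assuming
+ // MAX_KF_BITS_INTERVAL_SINGLE_PASS is expressed in seconds): at 30 fps
+ // and an interval of 4 s, frames_to_key_clipped = 120. If
+ // frames_to_key = 600 and kf_group_bits = 6,000,000, the clipped budget
+ // is 6,000,000 * 120 / 600 = 1,200,000 bits.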
+
+ // Reset the first pass file position.
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+
+ // Scan through the kf group collating various stats used to determine
+ // how many bits to spend on it.
+ boost_score = get_kf_boost_score(cpi, kf_raw_err, &zero_motion_accumulator,
+ &sr_accumulator, 0);
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+ // Store the zero motion percentage
+ twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key);
+
+ p_rc->kf_boost = (int)boost_score;
+
+ if (cpi->ppi->lap_enabled) {
+ if (oxcf->rc_cfg.mode == AOM_Q) {
+ p_rc->kf_boost = get_projected_kf_boost(cpi);
+ } else {
+ // TODO(any): Explore using average frame stats for AOM_Q as well.
+ boost_score = get_kf_boost_score(
+ cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1);
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+ p_rc->kf_boost += (int)boost_score;
+ }
+ }
+
+ // Special case for static / slide show content, but don't apply it
+ // if the kf group is very short.
+ if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
+ (rc->frames_to_key > 8)) {
+ p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_STATIC_KF_BOOST);
+ } else {
+ // Apply various clamps for min and max boost
+ p_rc->kf_boost = AOMMAX(p_rc->kf_boost, (rc->frames_to_key * 3));
+ p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_KF_BOOST);
+#ifdef STRICT_RC
+ p_rc->kf_boost = AOMMIN(p_rc->kf_boost, MAX_KF_BOOST);
+#endif
+ }
+
+ // Work out how many bits to allocate for the key frame itself.
+ // In case of LAP enabled for VBR, if the frames_to_key value is
+ // very high, we calculate the bits based on a clipped value of
+ // frames_to_key.
+ kf_bits = calculate_boost_bits(
+ AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, p_rc->kf_boost,
+ AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped));
+ // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n",
+ // p_rc->kf_boost,
+ // kf_bits, twopass->kf_zeromotion_pct);
+ kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits,
+ twopass->kf_group_bits, 0);
+
+ twopass->kf_group_bits -= kf_bits;
+
+ // Save the bits to spend on the key frame.
+ gf_group->bit_allocation[0] = kf_bits;
+ gf_group->update_type[0] = KF_UPDATE;
+
+ // Note the total error score of the kf group minus the key frame itself.
+ if (cpi->ppi->lap_enabled)
+ // As we don't have enough stats to know the actual error of the group,
+ // we assume the complexity of each frame to be equal to 1, and set the
+ // error as the number of frames in the group (minus the keyframe).
+ twopass->kf_group_error_left = (double)(rc->frames_to_key - 1);
+ else
+ twopass->kf_group_error_left = kf_group_err - kf_mod_err;
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
+ twopass->modified_error_left -= kf_group_err;
+}
+
+#define ARF_STATS_OUTPUT 0
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+
+static int get_section_target_bandwidth(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ int section_target_bandwidth;
+ const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
+ current_frame->frame_number);
+ if (cpi->ppi->lap_enabled)
+ section_target_bandwidth = (int)rc->avg_frame_bandwidth;
+ else
+ section_target_bandwidth = (int)(twopass->bits_left / frames_left);
+ return section_target_bandwidth;
+}
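+
+ // Example (hypothetical numbers): with bits_left = 5,000,000 and 100
+ // frames remaining, the section target is 50,000 bits per frame.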
+
+static INLINE void set_twopass_params_based_on_fp_stats(
+ AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) {
+ if (this_frame_ptr == NULL) return;
+
+ TWO_PASS_FRAME *twopass_frame = &cpi->twopass_frame;
+ // Use log1p() of the frame's intra error as a measure of average MB
+ // energy (log of the error plus one, to avoid log(0)).
+ twopass_frame->mb_av_energy = log1p(this_frame_ptr->intra_error);
+
+ const FIRSTPASS_STATS *const total_stats =
+ cpi->ppi->twopass.stats_buf_ctx->total_stats;
+ if (is_fp_wavelet_energy_invalid(total_stats) == 0) {
+ twopass_frame->frame_avg_haar_energy =
+ log1p(this_frame_ptr->frame_avg_wavelet_energy);
+ }
+
+ // Set the frame content type flag.
+ if (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH)
+ twopass_frame->fr_content_type = FC_GRAPHICS_ANIMATION;
+ else
+ twopass_frame->fr_content_type = FC_NORMAL;
+}
+
+static void process_first_pass_stats(AV1_COMP *cpi,
+ FIRSTPASS_STATS *this_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
+
+ if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 &&
+ cpi->gf_frame_index == 0 && total_stats &&
+ twopass->stats_buf_ctx->total_left_stats) {
+ if (cpi->ppi->lap_enabled) {
+ /*
+ * Accumulate total_stats using the limited number of stats available,
+ * and assign it to total_left_stats.
+ */
+ *twopass->stats_buf_ctx->total_left_stats = *total_stats;
+ }
+ // Special case code for first frame.
+ const int section_target_bandwidth = get_section_target_bandwidth(cpi);
+ const double section_length =
+ twopass->stats_buf_ctx->total_left_stats->count;
+ const double section_error =
+ twopass->stats_buf_ctx->total_left_stats->coded_error / section_length;
+ const double section_intra_skip =
+ twopass->stats_buf_ctx->total_left_stats->intra_skip_pct /
+ section_length;
+ const double section_inactive_zone =
+ (twopass->stats_buf_ctx->total_left_stats->inactive_zone_rows * 2) /
+ ((double)cm->mi_params.mb_rows * section_length);
+ const int tmp_q = get_twopass_worst_quality(
+ cpi, section_error, section_intra_skip + section_inactive_zone,
+ section_target_bandwidth);
+
+ rc->active_worst_quality = tmp_q;
+ rc->ni_av_qi = tmp_q;
+ p_rc->last_q[INTER_FRAME] = tmp_q;
+ p_rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params->bit_depth);
+ p_rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+ p_rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2;
+ p_rc->avg_frame_qindex[KEY_FRAME] = p_rc->last_q[KEY_FRAME];
+ }
+
+ if (cpi->twopass_frame.stats_in < twopass->stats_buf_ctx->stats_in_end) {
+ *this_frame = *cpi->twopass_frame.stats_in;
+ ++cpi->twopass_frame.stats_in;
+ }
+ set_twopass_params_based_on_fp_stats(cpi, this_frame);
+}
+
+static void setup_target_rate(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+ int target_rate = gf_group->bit_allocation[cpi->gf_frame_index];
+
+ if (has_no_stats_stage(cpi)) {
+ av1_rc_set_frame_target(cpi, target_rate, cpi->common.width,
+ cpi->common.height);
+ }
+
+ rc->base_frame_target = target_rate;
+}
+
+void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats) {
+ FIRSTPASS_STATS *this_stats = first_stats, *next_stats;
+ while (this_stats < last_stats - 1) {
+ next_stats = this_stats + 1;
+ if (next_stats->pcnt_second_ref > next_stats->pcnt_inter &&
+ next_stats->pcnt_second_ref >= 0.5) {
+ this_stats->is_flash = 1;
+ } else {
+ this_stats->is_flash = 0;
+ }
+ this_stats = next_stats;
+ }
+ // We always treat the last frame as a non-flash.
+ if (last_stats - 1 >= first_stats) {
+ (last_stats - 1)->is_flash = 0;
+ }
+}
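+
+ // A minimal sketch of the same rule for a single stats pair (hypothetical
+ // helper, not part of the encoder): frame n is marked as a flash when
+ // frame n+1 predicts better from its second (older) reference than from
+ // frame n itself.
+ //
+ // static int is_flash(const FIRSTPASS_STATS *next) {
+ //   return next->pcnt_second_ref > next->pcnt_inter &&
+ //          next->pcnt_second_ref >= 0.5;
+ // }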
+
+ // Smooth out the noise variance so that it is more stable.
+// Returns 0 on success, -1 on memory allocation failure.
+// TODO(bohanli): Use a better low-pass filter than averaging
+static int smooth_filter_noise(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats) {
+ int len = (int)(last_stats - first_stats);
+ double *smooth_noise = aom_malloc(len * sizeof(*smooth_noise));
+ if (!smooth_noise) return -1;
+
+ for (int i = 0; i < len; i++) {
+ double total_noise = 0;
+ double total_wt = 0;
+ for (int j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, 0), len - 1);
+ if (first_stats[idx].is_flash) continue;
+
+ total_noise += first_stats[idx].noise_var;
+ total_wt += 1.0;
+ }
+ if (total_wt > 0.01) {
+ total_noise /= total_wt;
+ } else {
+ total_noise = first_stats[i].noise_var;
+ }
+ smooth_noise[i] = total_noise;
+ }
+
+ for (int i = 0; i < len; i++) {
+ first_stats[i].noise_var = smooth_noise[i];
+ }
+
+ aom_free(smooth_noise);
+ return 0;
+}
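+
+ // In effect this is a boxcar average over a (2 * HALF_FILT_LEN + 1)-frame
+ // window, clamped at the clip boundaries; frames marked as flashes
+ // contribute neither to the sum nor to the weight, and a frame keeps its
+ // own noise_var if the whole window is flashes.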
+
+// Estimate the noise variance of each frame from the first pass stats
+void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats,
+ struct aom_internal_error_info *error_info) {
+ FIRSTPASS_STATS *this_stats, *next_stats;
+ double C1, C2, C3, noise;
+ for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+ this_stats->noise_var = 0.0;
+ // flashes tend to have high correlation of innovations, so ignore them.
+ if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+ (this_stats - 2)->is_flash)
+ continue;
+
+ C1 = (this_stats - 1)->intra_error *
+ (this_stats->intra_error - this_stats->coded_error);
+ C2 = (this_stats - 2)->intra_error *
+ ((this_stats - 1)->intra_error - (this_stats - 1)->coded_error);
+ C3 = (this_stats - 2)->intra_error *
+ (this_stats->intra_error - this_stats->sr_coded_error);
+ if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue;
+ C1 = sqrt(C1);
+ C2 = sqrt(C2);
+ C3 = sqrt(C3);
+
+ noise = (this_stats - 1)->intra_error - C1 * C2 / C3;
+ noise = AOMMAX(noise, 0.01);
+ this_stats->noise_var = noise;
+ }
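+
+ // One informal reading of the estimate above: each sqrt term behaves
+ // like a cross-correlation, C1 ~ corr(n-1, n), C2 ~ corr(n-2, n-1),
+ // C3 ~ corr(n-2, n). For a noise-free signal with multiplicative
+ // frame-to-frame correlation, C1 * C2 / C3 recovers the signal energy of
+ // frame n-1, so intra_error[n-1] - C1 * C2 / C3 is left as an estimate
+ // of the noise energy.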
+
+ // Copy noise from a neighboring frame if the estimate is not trustworthy.
+ for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+ if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+ (this_stats - 2)->is_flash)
+ continue;
+ if (this_stats->noise_var < 1.0) {
+ int found = 0;
+ // TODO(bohanli): consider expanding to two directions at the same time
+ for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) {
+ if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+ (next_stats - 2)->is_flash || next_stats->noise_var < 1.0)
+ continue;
+ found = 1;
+ this_stats->noise_var = next_stats->noise_var;
+ break;
+ }
+ if (found) continue;
+ for (next_stats = this_stats - 1; next_stats >= first_stats + 2;
+ next_stats--) {
+ if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+ (next_stats - 2)->is_flash || next_stats->noise_var < 1.0)
+ continue;
+ this_stats->noise_var = next_stats->noise_var;
+ break;
+ }
+ }
+ }
+
+ // If this frame is a flash, copy the noise from a neighboring non-flash
+ // frame.
+ for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+ if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+ (this_stats - 2)->is_flash) {
+ int found = 0;
+ for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) {
+ if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+ (next_stats - 2)->is_flash)
+ continue;
+ found = 1;
+ this_stats->noise_var = next_stats->noise_var;
+ break;
+ }
+ if (found) continue;
+ for (next_stats = this_stats - 1; next_stats >= first_stats + 2;
+ next_stats--) {
+ if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+ (next_stats - 2)->is_flash)
+ continue;
+ this_stats->noise_var = next_stats->noise_var;
+ break;
+ }
+ }
+ }
+
+ // The first two frames have no estimate of their own; copy the noise
+ // from frame 2.
+ for (this_stats = first_stats;
+ this_stats < first_stats + 2 && (first_stats + 2) < last_stats;
+ this_stats++) {
+ this_stats->noise_var = (first_stats + 2)->noise_var;
+ }
+
+ if (smooth_filter_noise(first_stats, last_stats) == -1) {
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers in smooth_filter_noise()");
+ }
+}
+
+// Estimate correlation coefficient of each frame with its previous frame.
+void av1_estimate_coeff(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats) {
+ FIRSTPASS_STATS *this_stats;
+ for (this_stats = first_stats + 1; this_stats < last_stats; this_stats++) {
+ const double C =
+ sqrt(AOMMAX((this_stats - 1)->intra_error *
+ (this_stats->intra_error - this_stats->coded_error),
+ 0.001));
+ const double cor_coeff =
+ C /
+ AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, 0.001);
+
+ this_stats->cor_coeff =
+ cor_coeff *
+ sqrt(AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var,
+ 0.001) /
+ AOMMAX(this_stats->intra_error - this_stats->noise_var, 0.001));
+ // clip correlation coefficient.
+ this_stats->cor_coeff = AOMMIN(AOMMAX(this_stats->cor_coeff, 0), 1);
+ }
+ first_stats->cor_coeff = 1.0;
+}
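+
+ // Worked example (hypothetical numbers): with intra_error[n-1] = 100,
+ // intra_error[n] = 90, coded_error[n] = 10 and zero noise,
+ // C = sqrt(100 * 80) ~= 89.4, so cor_coeff = 89.4 / 100 = 0.894; scaling
+ // by sqrt(100 / 90) ~= 1.054 and clipping to [0, 1] gives ~0.94.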
+
+void av1_get_second_pass_params(AV1_COMP *cpi,
+ EncodeFrameParams *const frame_params,
+ unsigned int frame_flags) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ if (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+ frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ frame_params->show_frame =
+ !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE);
+ if (cpi->gf_frame_index == 0) {
+ av1_tf_info_reset(&cpi->ppi->tf_info);
+ av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+ }
+ return;
+ }
+
+ const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+ int update_total_stats = 0;
+
+ if (is_stat_consumption_stage(cpi) && !cpi->twopass_frame.stats_in) return;
+
+ // Check forced key frames.
+ const int frames_to_next_forced_key = detect_app_forced_key(cpi);
+ if (frames_to_next_forced_key == 0) {
+ rc->frames_to_key = 0;
+ frame_flags &= FRAMEFLAGS_KEY;
+ } else if (frames_to_next_forced_key > 0 &&
+ frames_to_next_forced_key < rc->frames_to_key) {
+ rc->frames_to_key = frames_to_next_forced_key;
+ }
+
+ assert(cpi->twopass_frame.stats_in != NULL);
+ const int update_type = gf_group->update_type[cpi->gf_frame_index];
+ frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+
+ if (cpi->gf_frame_index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) {
+ assert(cpi->gf_frame_index < gf_group->size);
+
+ setup_target_rate(cpi);
+
+ // If this is an ARF frame then we don't want to read the stats file or
+ // advance the input pointer as we already have what we need.
+ if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) {
+ const FIRSTPASS_STATS *const this_frame_ptr =
+ read_frame_stats(twopass, &cpi->twopass_frame,
+ gf_group->arf_src_offset[cpi->gf_frame_index]);
+ set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
+ return;
+ }
+ }
+
+ if (oxcf->rc_cfg.mode == AOM_Q)
+ rc->active_worst_quality = oxcf->rc_cfg.cq_level;
+
+ if (cpi->gf_frame_index == gf_group->size) {
+ if (cpi->ppi->lap_enabled && cpi->ppi->p_rc.enable_scenecut_detection) {
+ const int num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
+ const int frames_to_key = define_kf_interval(
+ cpi, &twopass->firstpass_info, num_frames_to_detect_scenecut,
+ /*search_start_idx=*/0);
+ if (frames_to_key != -1)
+ rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
+ }
+ }
+
+ FIRSTPASS_STATS this_frame;
+ av1_zero(this_frame);
+ // Process the first pass stats for the current frame, if available.
+ if (is_stat_consumption_stage(cpi)) {
+ if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0) {
+ process_first_pass_stats(cpi, &this_frame);
+ update_total_stats = 1;
+ }
+ } else {
+ rc->active_worst_quality = oxcf->rc_cfg.cq_level;
+ }
+
+ // Keyframe and section processing.
+ FIRSTPASS_STATS this_frame_copy;
+ this_frame_copy = this_frame;
+ if (rc->frames_to_key <= 0) {
+ assert(rc->frames_to_key == 0);
+ // Define next KF group and assign bits to it.
+ frame_params->frame_type = KEY_FRAME;
+ find_next_key_frame(cpi, &this_frame);
+ this_frame = this_frame_copy;
+ }
+
+ if (rc->frames_to_fwd_kf <= 0)
+ rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
+
+ // Define a new GF/ARF group. (Should always enter here for key frames).
+ if (cpi->gf_frame_index == gf_group->size) {
+ av1_tf_info_reset(&cpi->ppi->tf_info);
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ vbr_rc_reset_gop_data(&cpi->vbr_rc_info);
+#endif // CONFIG_BITRATE_ACCURACY
+ int max_gop_length =
+ (oxcf->gf_cfg.lag_in_frames >= 32)
+ ? AOMMIN(MAX_GF_INTERVAL, oxcf->gf_cfg.lag_in_frames -
+ oxcf->algo_cfg.arnr_max_frames / 2)
+ : MAX_GF_LENGTH_LAP;
+
+ // Handle forward key frame when enabled.
+ if (oxcf->kf_cfg.fwd_kf_dist > 0)
+ max_gop_length = AOMMIN(rc->frames_to_fwd_kf + 1, max_gop_length);
+
+ // Use the provided gop size in low delay setting
+ if (oxcf->gf_cfg.lag_in_frames == 0) max_gop_length = rc->max_gf_interval;
+
+ // Limit the max gop length for the last gop in 1 pass setting.
+ max_gop_length = AOMMIN(max_gop_length, rc->frames_to_key);
+
+ // Identify regions if needed.
+ // TODO(bohanli): identify regions for all stats available.
+ if (rc->frames_since_key == 0 || rc->frames_since_key == 1 ||
+ (p_rc->frames_till_regions_update - rc->frames_since_key <
+ rc->frames_to_key &&
+ p_rc->frames_till_regions_update - rc->frames_since_key <
+ max_gop_length + 1)) {
+ // How many frames we can analyze starting from this frame.
+ int rest_frames =
+ AOMMIN(rc->frames_to_key, MAX_FIRSTPASS_ANALYSIS_FRAMES);
+ rest_frames =
+ AOMMIN(rest_frames, (int)(twopass->stats_buf_ctx->stats_in_end -
+ cpi->twopass_frame.stats_in +
+ (rc->frames_since_key == 0)));
+ p_rc->frames_till_regions_update = rest_frames;
+
+ int ret;
+ if (cpi->ppi->lap_enabled) {
+ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end,
+ cpi->common.error);
+ av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ ret = identify_regions(cpi->twopass_frame.stats_in, rest_frames,
+ (rc->frames_since_key == 0), p_rc->regions,
+ &p_rc->num_regions);
+ } else {
+ ret = identify_regions(
+ cpi->twopass_frame.stats_in - (rc->frames_since_key == 0),
+ rest_frames, 0, p_rc->regions, &p_rc->num_regions);
+ }
+ if (ret == -1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers in identify_regions");
+ }
+ }
+
+ int cur_region_idx =
+ find_regions_index(p_rc->regions, p_rc->num_regions,
+ rc->frames_since_key - p_rc->regions_offset);
+ if ((cur_region_idx >= 0 &&
+ p_rc->regions[cur_region_idx].type == SCENECUT_REGION) ||
+ rc->frames_since_key == 0) {
+ // If we start from a scenecut, then the last GOP's arf boost is not
+ // needed for this GOP.
+ cpi->ppi->gf_state.arf_gf_boost_lst = 0;
+ }
+
+ int need_gf_len = 1;
+ if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) {
+ // Set up the bitstream to read.
+ if (!cpi->third_pass_ctx->input_file_name && oxcf->two_pass_output) {
+ cpi->third_pass_ctx->input_file_name = oxcf->two_pass_output;
+ }
+ av1_open_second_pass_log(cpi, 1);
+ THIRD_PASS_GOP_INFO *gop_info = &cpi->third_pass_ctx->gop_info;
+ // Read in GOP information from the second pass file.
+ av1_read_second_pass_gop_info(cpi->second_pass_log_stream, gop_info,
+ cpi->common.error);
+#if CONFIG_BITRATE_ACCURACY
+ TPL_INFO *tpl_info;
+ AOM_CHECK_MEM_ERROR(cpi->common.error, tpl_info,
+ aom_malloc(sizeof(*tpl_info)));
+ av1_read_tpl_info(tpl_info, cpi->second_pass_log_stream,
+ cpi->common.error);
+ aom_free(tpl_info);
+#if CONFIG_THREE_PASS
+ // TODO(angiebird): Put this part into a func
+ cpi->vbr_rc_info.cur_gop_idx++;
+#endif // CONFIG_THREE_PASS
+#endif // CONFIG_BITRATE_ACCURACY
+ // Read in third_pass_info from the bitstream.
+ av1_set_gop_third_pass(cpi->third_pass_ctx);
+ // Read in per-frame info from second-pass encoding
+ av1_read_second_pass_per_frame_info(
+ cpi->second_pass_log_stream, cpi->third_pass_ctx->frame_info,
+ gop_info->num_frames, cpi->common.error);
+
+ p_rc->cur_gf_index = 0;
+ p_rc->gf_intervals[0] = cpi->third_pass_ctx->gop_info.gf_length;
+ need_gf_len = 0;
+ }
+
+ if (need_gf_len) {
+ // The GF group length could not be obtained from the second pass file.
+ // TODO(jingning): Resolve the redundant calls here.
+ if (rc->intervals_till_gf_calculate_due == 0 || 1) {
+ calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
+ }
+
+ if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model &&
+ oxcf->gf_cfg.lag_in_frames >= 32 &&
+ cpi->sf.tpl_sf.gop_length_decision_method != 3) {
+ int this_idx = rc->frames_since_key +
+ p_rc->gf_intervals[p_rc->cur_gf_index] -
+ p_rc->regions_offset - 1;
+ int this_region =
+ find_regions_index(p_rc->regions, p_rc->num_regions, this_idx);
+ int next_region =
+ find_regions_index(p_rc->regions, p_rc->num_regions, this_idx + 1);
+ // TODO(angiebird): Figure out why this_region and next_region are -1 in
+ // unit test like AltRefFramePresenceTestLarge (aomedia:3134)
+ int is_last_scenecut =
+ p_rc->gf_intervals[p_rc->cur_gf_index] >= rc->frames_to_key ||
+ (this_region != -1 &&
+ p_rc->regions[this_region].type == SCENECUT_REGION) ||
+ (next_region != -1 &&
+ p_rc->regions[next_region].type == SCENECUT_REGION);
+
+ int ori_gf_int = p_rc->gf_intervals[p_rc->cur_gf_index];
+
+ if (p_rc->gf_intervals[p_rc->cur_gf_index] > 16 &&
+ rc->min_gf_interval <= 16) {
+ // calculate_gf_length() was previously called with max_gop_length = 32,
+ // using look-ahead gf intervals.
+ define_gf_group(cpi, frame_params, 0);
+ av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+ this_frame = this_frame_copy;
+
+ if (is_shorter_gf_interval_better(cpi, frame_params)) {
+ // A shorter gf interval is better.
+ // TODO(jingning): Remove redundant computations here.
+ max_gop_length = 16;
+ calculate_gf_length(cpi, max_gop_length, 1);
+ if (is_last_scenecut &&
+ (ori_gf_int - p_rc->gf_intervals[p_rc->cur_gf_index] < 4)) {
+ p_rc->gf_intervals[p_rc->cur_gf_index] = ori_gf_int;
+ }
+ }
+ }
+ }
+ }
+
+ define_gf_group(cpi, frame_params, 0);
+
+ if (gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE &&
+ rc->frames_since_key > 0)
+ process_first_pass_stats(cpi, &this_frame);
+
+ define_gf_group(cpi, frame_params, 1);
+
+ // Write GOP info if needed for the third pass. Per-frame info is written
+ // after each frame is encoded.
+ av1_write_second_pass_gop_info(cpi);
+
+ av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+
+ rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+ assert(cpi->gf_frame_index == 0);
+#if ARF_STATS_OUTPUT
+ {
+ FILE *fpfile;
+ fpfile = fopen("arf.stt", "a");
+ ++arf_count;
+ fprintf(fpfile, "%10d %10d %10d %10d %10d\n",
+ cpi->common.current_frame.frame_number,
+ rc->frames_till_gf_update_due, cpi->ppi->p_rc.kf_boost, arf_count,
+ p_rc->gfu_boost);
+
+ fclose(fpfile);
+ }
+#endif
+ }
+ assert(cpi->gf_frame_index < gf_group->size);
+
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+
+ const FIRSTPASS_STATS *const this_frame_ptr =
+ read_frame_stats(twopass, &cpi->twopass_frame,
+ gf_group->arf_src_offset[cpi->gf_frame_index]);
+ set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
+ } else {
+ // Back up this frame's stats for updating total stats during post encode.
+ cpi->twopass_frame.this_frame = update_total_stats ? start_pos : NULL;
+ }
+
+ frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ setup_target_rate(cpi);
+}
+
+void av1_init_second_pass(AV1_COMP *cpi) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ double frame_rate;
+ FIRSTPASS_STATS *stats;
+
+ if (!twopass->stats_buf_ctx->stats_in_end) return;
+
+ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end, cpi->common.error);
+ av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+
+ stats = twopass->stats_buf_ctx->total_stats;
+
+ *stats = *twopass->stats_buf_ctx->stats_in_end;
+ *twopass->stats_buf_ctx->total_left_stats = *stats;
+
+ frame_rate = 10000000.0 * stats->count / stats->duration;
+ // Each frame can have a different duration, as the frame rate in the source
+ // isn't guaranteed to be constant. The frame rate prior to the first frame
+ // encoded in the second pass is a guess. However, the sum duration is not.
+ // It is calculated based on the actual durations of all frames from the
+ // first pass.
+ av1_new_framerate(cpi, frame_rate);
+ twopass->bits_left =
+ (int64_t)(stats->duration * oxcf->rc_cfg.target_bandwidth / 10000000.0);
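+
+ // Worked example (hypothetical numbers): durations are in 1/10,000,000 s
+ // units, so 300 frames with a total duration of 100,000,000 give
+ // frame_rate = 10,000,000 * 300 / 100,000,000 = 30 fps, and a target
+ // bandwidth of 1,000,000 bits/s gives bits_left = 10 s * 1,000,000 =
+ // 10,000,000 bits.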
+
+#if CONFIG_BITRATE_ACCURACY
+ av1_vbr_rc_init(&cpi->vbr_rc_info, twopass->bits_left,
+ (int)round(stats->count));
+#endif
+
+#if CONFIG_RATECTRL_LOG
+ rc_log_init(&cpi->rc_log);
+#endif
+
+ // This variable monitors how far behind the second ref update is lagging.
+ twopass->sr_update_lag = 1;
+
+ // Scan the first pass file and calculate a modified total error based upon
+ // the bias/power function used to allocate bits.
+ {
+ const double avg_error =
+ stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
+ const FIRSTPASS_STATS *s = cpi->twopass_frame.stats_in;
+ double modified_error_total = 0.0;
+ twopass->modified_error_min =
+ (avg_error * oxcf->rc_cfg.vbrmin_section) / 100;
+ twopass->modified_error_max =
+ (avg_error * oxcf->rc_cfg.vbrmax_section) / 100;
+ while (s < twopass->stats_buf_ctx->stats_in_end) {
+ modified_error_total +=
+ calculate_modified_err(frame_info, twopass, oxcf, s);
+ ++s;
+ }
+ twopass->modified_error_left = modified_error_total;
+ }
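+
+ // Example (hypothetical numbers): with avg_error = 100,
+ // vbrmin_section = 10 and vbrmax_section = 200, each frame's modified
+ // error is presumably clamped to [10, 200] before being accumulated.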
+
+ // Reset the vbr bits off target counters
+ cpi->ppi->p_rc.vbr_bits_off_target = 0;
+ cpi->ppi->p_rc.vbr_bits_off_target_fast = 0;
+
+ cpi->ppi->p_rc.rate_error_estimate = 0;
+
+ // Static sequence monitor variables.
+ twopass->kf_zeromotion_pct = 100;
+ twopass->last_kfgroup_zeromotion_pct = 100;
+
+ // Initialize the bits-per-macroblock estimate correction factor.
+ twopass->bpm_factor = 1.0;
+ // Initialize actual and target bits counters for ARF groups so that
+ // at the start we have a neutral bpm adjustment.
+ twopass->rolling_arf_group_target_bits = 1;
+ twopass->rolling_arf_group_actual_bits = 1;
+}
+
+void av1_init_single_pass_lap(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+
+ if (!twopass->stats_buf_ctx->stats_in_end) return;
+
+ // This variable monitors how far behind the second ref update is lagging.
+ twopass->sr_update_lag = 1;
+
+ twopass->bits_left = 0;
+ twopass->modified_error_min = 0.0;
+ twopass->modified_error_max = 0.0;
+ twopass->modified_error_left = 0.0;
+
+ // Reset the vbr bits off target counters
+ cpi->ppi->p_rc.vbr_bits_off_target = 0;
+ cpi->ppi->p_rc.vbr_bits_off_target_fast = 0;
+
+ cpi->ppi->p_rc.rate_error_estimate = 0;
+
+ // Static sequence monitor variables.
+ twopass->kf_zeromotion_pct = 100;
+ twopass->last_kfgroup_zeromotion_pct = 100;
+
+ // Initialize the bits-per-macroblock estimate correction factor.
+ twopass->bpm_factor = 1.0;
+ // Initialize actual and target bits counters for ARF groups so that
+ // at the start we have a neutral bpm adjustment.
+ twopass->rolling_arf_group_target_bits = 1;
+ twopass->rolling_arf_group_actual_bits = 1;
+}
+
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+void av1_twopass_postencode_update(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+
+ // Increment the stats_in pointer.
+ if (is_stat_consumption_stage(cpi) &&
+ !(cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.gop_mode ==
+ DUCKY_ENCODE_GOP_MODE_RCL) &&
+ (cpi->gf_frame_index < cpi->ppi->gf_group.size ||
+ rc->frames_to_key == 0)) {
+ const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+ if (update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE) {
+ FIRSTPASS_STATS this_frame;
+ assert(cpi->twopass_frame.stats_in >
+ twopass->stats_buf_ctx->stats_in_start);
+ --cpi->twopass_frame.stats_in;
+ if (cpi->ppi->lap_enabled) {
+ input_stats_lap(twopass, &cpi->twopass_frame, &this_frame);
+ } else {
+ input_stats(twopass, &cpi->twopass_frame, &this_frame);
+ }
+ } else if (cpi->ppi->lap_enabled) {
+ cpi->twopass_frame.stats_in = twopass->stats_buf_ctx->stats_in_start;
+ }
+ }
+
+ // VBR correction is done through rc->vbr_bits_off_target. Based on the
+ // sign of this value, a limited % adjustment is made to the target rate
+ // of subsequent frames, to try and push it back towards 0. This method
+ // is designed to prevent extreme behaviour at the end of a clip
+ // or group of frames.
+ p_rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+ twopass->bits_left = AOMMAX(twopass->bits_left - rc->base_frame_target, 0);
+
+ if (cpi->do_update_vbr_bits_off_target_fast) {
+ // Subtract current frame's fast_extra_bits.
+ p_rc->vbr_bits_off_target_fast -= rc->frame_level_fast_extra_bits;
+ rc->frame_level_fast_extra_bits = 0;
+ }
+
+ // Target vs actual bits for this arf group.
+ twopass->rolling_arf_group_target_bits += rc->base_frame_target;
+ twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
+
+ // Calculate the pct rc error.
+ if (p_rc->total_actual_bits) {
+ p_rc->rate_error_estimate =
+ (int)((p_rc->vbr_bits_off_target * 100) / p_rc->total_actual_bits);
+ p_rc->rate_error_estimate = clamp(p_rc->rate_error_estimate, -100, 100);
+ } else {
+ p_rc->rate_error_estimate = 0;
+ }
+
+#if CONFIG_FPMT_TEST
+ /* The variables temp_vbr_bits_off_target, temp_bits_left,
+ * temp_rolling_arf_group_target_bits, temp_rolling_arf_group_actual_bits
+ * and temp_rate_error_estimate are introduced for quality simulation
+ * purposes; they retain the values from before the parallel encode
+ * frames, and are updated based on the update flag.
+ *
+ * If there are show_existing_frames between parallel frames, the temp
+ * state is not updated, so that it is retained. */
+ const int simulate_parallel_frame =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ simulate_parallel_frame) {
+ cpi->ppi->p_rc.temp_vbr_bits_off_target = p_rc->vbr_bits_off_target;
+ cpi->ppi->p_rc.temp_bits_left = twopass->bits_left;
+ cpi->ppi->p_rc.temp_rolling_arf_group_target_bits =
+ twopass->rolling_arf_group_target_bits;
+ cpi->ppi->p_rc.temp_rolling_arf_group_actual_bits =
+ twopass->rolling_arf_group_actual_bits;
+ cpi->ppi->p_rc.temp_rate_error_estimate = p_rc->rate_error_estimate;
+ }
+#endif
+ // Update the active best quality pyramid.
+ if (!rc->is_src_frame_alt_ref) {
+ const int pyramid_level =
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+ int i;
+ for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) {
+ p_rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
+#if CONFIG_TUNE_VMAF
+ if (cpi->vmaf_info.original_qindex != -1 &&
+ (cpi->oxcf.tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ cpi->oxcf.tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
+ p_rc->active_best_quality[i] = cpi->vmaf_info.original_qindex;
+ }
+#endif
+ }
+ }
+
+#if 0
+ {
+ AV1_COMMON *cm = &cpi->common;
+ FILE *fpfile;
+ fpfile = fopen("details.stt", "a");
+ fprintf(fpfile,
+ "%10d %10d %10d %10" PRId64 " %10" PRId64
+ " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n",
+ cm->current_frame.frame_number, rc->base_frame_target,
+ rc->projected_frame_size, rc->total_actual_bits,
+ rc->vbr_bits_off_target, p_rc->rate_error_estimate,
+ twopass->rolling_arf_group_target_bits,
+ twopass->rolling_arf_group_actual_bits,
+ (double)twopass->rolling_arf_group_actual_bits /
+ (double)twopass->rolling_arf_group_target_bits,
+ twopass->bpm_factor,
+ av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex,
+ cm->seq_params->bit_depth),
+ av1_convert_qindex_to_q(rc->active_worst_quality,
+ cm->seq_params->bit_depth));
+ fclose(fpfile);
+ }
+#endif
+
+ if (cpi->common.current_frame.frame_type != KEY_FRAME) {
+ twopass->kf_group_bits -= rc->base_frame_target;
+ twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+ }
+ twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+ // If the rate control is drifting consider adjustment to min or maxq.
+ if ((rc_cfg->mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
+ int minq_adj_limit;
+ int maxq_adj_limit;
+ minq_adj_limit =
+ (rc_cfg->mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+ maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
+
+ // Undershoot
+ if ((rc_cfg->under_shoot_pct < 100) &&
+ (p_rc->rolling_actual_bits < p_rc->rolling_target_bits)) {
+ int pct_error =
+ ((p_rc->rolling_target_bits - p_rc->rolling_actual_bits) * 100) /
+ p_rc->rolling_target_bits;
+
+ if ((pct_error >= rc_cfg->under_shoot_pct) &&
+ (p_rc->rate_error_estimate > 0)) {
+ twopass->extend_minq += 1;
+ }
+ twopass->extend_maxq -= 1;
+ // Overshoot
+ } else if ((rc_cfg->over_shoot_pct < 100) &&
+ (p_rc->rolling_actual_bits > p_rc->rolling_target_bits)) {
+ int pct_error =
+ ((p_rc->rolling_actual_bits - p_rc->rolling_target_bits) * 100) /
+ p_rc->rolling_target_bits;
+
+ pct_error = clamp(pct_error, 0, 100);
+ if ((pct_error >= rc_cfg->over_shoot_pct) &&
+ (p_rc->rate_error_estimate < 0)) {
+ twopass->extend_maxq += 1;
+ }
+ twopass->extend_minq -= 1;
+ } else {
+ // Adjustment for extreme local overshoot.
+ // Only applies when normal adjustment above is not used (e.g.
+ // when threshold is set to 100).
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+ // Unwind extreme overshoot adjustment.
+ else if (p_rc->rolling_target_bits > p_rc->rolling_actual_bits)
+ --twopass->extend_maxq;
+ }
+ twopass->extend_minq =
+ clamp(twopass->extend_minq, -minq_adj_limit, minq_adj_limit);
+ twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+ // If there is a big and unexpected undershoot then feed the extra
+ // bits back in quickly. One situation where this may happen is if a
+ // frame is unexpectedly almost perfectly predicted by the ARF or GF
+ // but not very well predicted by the previous frame.
+ if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+ int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+ if (rc->projected_frame_size < fast_extra_thresh) {
+ p_rc->vbr_bits_off_target_fast +=
+ fast_extra_thresh - rc->projected_frame_size;
+ p_rc->vbr_bits_off_target_fast = AOMMIN(p_rc->vbr_bits_off_target_fast,
+ (4 * rc->avg_frame_bandwidth));
+ }
+ }
+
+#if CONFIG_FPMT_TEST
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ simulate_parallel_frame) {
+ cpi->ppi->p_rc.temp_vbr_bits_off_target_fast =
+ p_rc->vbr_bits_off_target_fast;
+ cpi->ppi->p_rc.temp_extend_minq = twopass->extend_minq;
+ cpi->ppi->p_rc.temp_extend_maxq = twopass->extend_maxq;
+ }
+#endif
+ }
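+
+ // Worked example (hypothetical numbers): rolling_target_bits = 100,000
+ // and rolling_actual_bits = 70,000 give pct_error = 30. With
+ // under_shoot_pct = 25 and a positive rate_error_estimate, extend_minq
+ // is incremented (allowing a lower minimum Q, spending more bits) while
+ // extend_maxq decays by one on each such frame.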
+
+ // Update the frame probabilities obtained from parallel encode frames
+ FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+#if CONFIG_FPMT_TEST
+ /* The variable temp_active_best_quality is introduced only for quality
+ * simulation purposes; it retains the value from before the parallel
+ * encode frames, and is updated based on the update flag.
+ *
+ * If there are show_existing_frames between parallel frames, the temp
+ * state is not updated, so that it is retained. */
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ simulate_parallel_frame) {
+ int i;
+ const int pyramid_level =
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+ if (!rc->is_src_frame_alt_ref) {
+ for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i)
+ cpi->ppi->p_rc.temp_active_best_quality[i] =
+ p_rc->active_best_quality[i];
+ }
+ }
+
+ // Update the frame probabilities obtained from parallel encode frames
+ FrameProbInfo *const temp_frame_probs_simulation =
+ simulate_parallel_frame ? &cpi->ppi->temp_frame_probs_simulation
+ : frame_probs;
+ FrameProbInfo *const temp_frame_probs =
+ simulate_parallel_frame ? &cpi->ppi->temp_frame_probs : NULL;
+#endif
+ int i, j, loop;
+ // Sequentially average into temp_frame_probs_simulation, which holds the
+ // probabilities of the last frame before the parallel encode.
+ for (loop = 0; loop <= cpi->num_frame_recode; loop++) {
+ // Sequentially update tx_type_probs
+ if (cpi->do_update_frame_probs_txtype[loop] &&
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ for (i = 0; i < TX_SIZES_ALL; i++) {
+ int left = 1024;
+
+ for (j = TX_TYPES - 1; j >= 0; j--) {
+ const int new_prob =
+ cpi->frame_new_probs[loop].tx_type_probs[update_type][i][j];
+#if CONFIG_FPMT_TEST
+ int prob =
+ (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ temp_frame_probs_simulation->tx_type_probs[update_type][i][j] = prob;
+#else
+ int prob =
+ (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->tx_type_probs[update_type][i][j] = prob;
+#endif
+ }
+ }
+ }
+
+ // Sequentially update obmc_probs
+ if (cpi->do_update_frame_probs_obmc[loop] &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ const int new_prob =
+ cpi->frame_new_probs[loop].obmc_probs[update_type][i];
+#if CONFIG_FPMT_TEST
+ temp_frame_probs_simulation->obmc_probs[update_type][i] =
+ (temp_frame_probs_simulation->obmc_probs[update_type][i] +
+ new_prob) >>
+ 1;
+#else
+ frame_probs->obmc_probs[update_type][i] =
+ (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+#endif
+ }
+ }
+
+ // Sequentially update warped_probs
+ if (cpi->do_update_frame_probs_warp[loop] &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int new_prob = cpi->frame_new_probs[loop].warped_probs[update_type];
+#if CONFIG_FPMT_TEST
+ temp_frame_probs_simulation->warped_probs[update_type] =
+ (temp_frame_probs_simulation->warped_probs[update_type] + new_prob) >>
+ 1;
+#else
+ frame_probs->warped_probs[update_type] =
+ (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+#endif
+ }
+
+ // Sequentially update switchable_interp_probs
+ if (cpi->do_update_frame_probs_interpfilter[loop] &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ int left = 1536;
+
+ for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+ const int new_prob = cpi->frame_new_probs[loop]
+ .switchable_interp_probs[update_type][i][j];
+#if CONFIG_FPMT_TEST
+ int prob = (temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] = prob;
+#else
+ int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+#endif
+ }
+ }
+ }
+ }
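+
+ // Worked example for the running average above (hypothetical numbers):
+ // averaging an old probability of 512 with a new one of 256 gives
+ // (512 + 256) >> 1 = 384. The `left` counter (1024 for tx types, 1536
+ // for interp filter contexts) absorbs the rounding loss into index 0 so
+ // each probability row keeps a constant total.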
+
+#if CONFIG_FPMT_TEST
+ // Copy temp_frame_probs_simulation to temp_frame_probs when the update
+ // flag is set.
+ if (cpi->do_frame_data_update &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ simulate_parallel_frame) {
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ temp_frame_probs->obmc_probs[update_type_idx][i] =
+ temp_frame_probs_simulation->obmc_probs[update_type_idx][i];
+ }
+ temp_frame_probs->warped_probs[update_type_idx] =
+ temp_frame_probs_simulation->warped_probs[update_type_idx];
+ for (i = 0; i < TX_SIZES_ALL; i++) {
+ for (j = 0; j < TX_TYPES; j++) {
+ temp_frame_probs->tx_type_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation->tx_type_probs[update_type_idx][i][j];
+ }
+ }
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ for (j = 0; j < SWITCHABLE_FILTERS; j++) {
+ temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type_idx][i][j];
+ }
+ }
+ }
+ }
+#endif
+ // Update framerate obtained from parallel encode frames
+ if (cpi->common.show_frame &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ cpi->framerate = cpi->new_framerate;
+#if CONFIG_FPMT_TEST
+ // For simulation purposes only.
+ int show_existing_between_parallel_frames_cndn =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+ if (cpi->common.show_frame && !show_existing_between_parallel_frames_cndn &&
+ cpi->do_frame_data_update && simulate_parallel_frame)
+ cpi->temp_framerate = cpi->framerate;
+#endif
+}
diff --git a/third_party/aom/av1/encoder/pass2_strategy.h b/third_party/aom/av1/encoder/pass2_strategy.h
new file mode 100644
index 0000000000..5987a78a23
--- /dev/null
+++ b/third_party/aom/av1/encoder/pass2_strategy.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_
+#define AOM_AV1_ENCODER_PASS2_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+#include "av1/encoder/encoder.h"
+
+/*!
+ * \brief accumulated stats and features in a gf group
+ */
+typedef struct {
+ /*!\cond */
+ double gf_group_err;
+ double gf_group_raw_error;
+ double gf_group_skip_pct;
+ double gf_group_inactive_zone_rows;
+
+ double mv_ratio_accumulator;
+ double decay_accumulator;
+ double zero_motion_accumulator;
+ double loop_decay_rate;
+ double last_loop_decay_rate;
+ double this_frame_mv_in_out;
+ double mv_in_out_accumulator;
+ double abs_mv_in_out_accumulator;
+
+ double avg_sr_coded_error;
+ double avg_pcnt_second_ref;
+ double avg_new_mv_count;
+ double avg_wavelet_energy;
+ double avg_raw_err_stdev;
+ int non_zero_stdev_count;
+ /*!\endcond */
+} GF_GROUP_STATS;
+
+/*!
+ * \brief accumulated stats and features for a frame
+ */
+typedef struct {
+ /*!\cond */
+ double frame_err;
+ double frame_coded_error;
+ double frame_sr_coded_error;
+ /*!\endcond */
+} GF_FRAME_STATS;
+/*!\cond */
+
+void av1_init_second_pass(struct AV1_COMP *cpi);
+
+void av1_init_single_pass_lap(AV1_COMP *cpi);
+
+/*!\endcond */
+/*!\brief Main per frame entry point for second pass of two pass encode
+ *
+ *\ingroup rate_control
+ *
+ * This function is called for each frame in the second pass of a two pass
+ * encode. It checks the frame type and if a new KF or GF/ARF is due.
+ * When a KF is due it calls find_next_key_frame() to work out how long
+ * this key frame group will be and assign bits to the key frame.
+ * At the start of a new GF/ARF group it calls calculate_gf_length()
+ * and define_gf_group() which are the main functions responsible for
+ * defining the size and structure of the new GF/ARF group.
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] frame_params Per frame encoding parameters
+ * \param[in] frame_flags Frame type and coding flags
+ *
+ * \remark No return but analyses first pass stats and assigns a target
+ * number of bits to the current frame and a target Q range.
+ */
+void av1_get_second_pass_params(struct AV1_COMP *cpi,
+ struct EncodeFrameParams *const frame_params,
+ unsigned int frame_flags);
+
+/*!\brief Adjustments to two pass and rate control after each frame.
+ *
+ *\ingroup rate_control
+ *
+ * This function is called after each frame to make adjustments to
+ * heuristics and data structures that relate to rate control.
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ *
+ * \remark No return value but this function updates various rate control
+ * related data structures that for example track overshoot and
+ * undershoot.
+ */
+void av1_twopass_postencode_update(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] rc Rate control data
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] is_key_frame Indicates if the first frame in the group is
+ * also a key frame.
+ * \param[in] use_arf Are ARF frames enabled, or is this a GF-only
+ * uni-directional group.
+ * \param[in] gf_group_bits Bits available to be allocated.
+ *
+ * \remark No return but updates the rate control and group data structures
+ * to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+ GF_GROUP *gf_group, int is_key_frame, int use_arf,
+ int64_t gf_group_bits);
+
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+ const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ int offset, int f_frames, int b_frames,
+ int *num_fpstats_used, int *num_fpstats_required,
+ int project_gfu_boost);
+
+void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats);
+void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats,
+ struct aom_internal_error_info *error_info);
+void av1_estimate_coeff(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PASS2_STRATEGY_H_
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
new file mode 100644
index 0000000000..232a2f9edb
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -0,0 +1,958 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/mcomp.h"
+
+// Get primary and secondary filter strength for the given strength index and
+// search method
+static INLINE void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method,
+ int *pri_strength,
+ int *sec_strength,
+ int strength_idx) {
+ const int tot_sec_filter =
+ (pick_method == CDEF_FAST_SEARCH_LVL5)
+ ? REDUCED_SEC_STRENGTHS_LVL5
+ : ((pick_method >= CDEF_FAST_SEARCH_LVL3) ? REDUCED_SEC_STRENGTHS_LVL3
+ : CDEF_SEC_STRENGTHS);
+ const int pri_idx = strength_idx / tot_sec_filter;
+ const int sec_idx = strength_idx % tot_sec_filter;
+ *pri_strength = pri_idx;
+ *sec_strength = sec_idx;
+ if (pick_method == CDEF_FULL_SEARCH) return;
+
+ switch (pick_method) {
+ case CDEF_FAST_SEARCH_LVL1:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL1);
+ *pri_strength = priconv_lvl1[pri_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL2:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2);
+ *pri_strength = priconv_lvl2[pri_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL3:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3);
+ *pri_strength = priconv_lvl2[pri_idx];
+ *sec_strength = secconv_lvl3[sec_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL4:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3);
+ *pri_strength = priconv_lvl4[pri_idx];
+ *sec_strength = secconv_lvl3[sec_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL5:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL5);
+ *pri_strength = priconv_lvl5[pri_idx];
+ *sec_strength = secconv_lvl5[sec_idx];
+ break;
+ default: assert(0 && "Invalid CDEF search method");
+ }
+}
+
+// Store CDEF filter strength calculated from strength index for given search
+// method
+#define STORE_CDEF_FILTER_STRENGTH(cdef_strength, pick_method, strength_idx) \
+ do { \
+ get_cdef_filter_strengths((pick_method), &pri_strength, &sec_strength, \
+ (strength_idx)); \
+ cdef_strength = pri_strength * CDEF_SEC_STRENGTHS + sec_strength; \
+ } while (0)
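+
+ // Hypothetical usage sketch (the locals pri_strength and sec_strength
+ // must already be in scope, since the macro writes to them):
+ //
+ //   int pri_strength, sec_strength;
+ //   int cdef_strength;
+ //   STORE_CDEF_FILTER_STRENGTH(cdef_strength, CDEF_FAST_SEARCH_LVL1, 0);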
+
+/* Search for the best strength to add as an option, knowing we
+ already selected nb_strengths options. */
+static uint64_t search_one(int *lev, int nb_strengths,
+ uint64_t mse[][TOTAL_STRENGTHS], int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t tot_mse[TOTAL_STRENGTHS];
+ const int total_strengths = nb_cdef_strengths[pick_method];
+ int i, j;
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ int best_id = 0;
+ memset(tot_mse, 0, sizeof(tot_mse));
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ /* Find best mse among already selected options. */
+ for (gi = 0; gi < nb_strengths; gi++) {
+ if (mse[i][lev[gi]] < best_mse) {
+ best_mse = mse[i][lev[gi]];
+ }
+ }
+ /* Find best mse when adding each possible new option. */
+ for (j = 0; j < total_strengths; j++) {
+ uint64_t best = best_mse;
+ if (mse[i][j] < best) best = mse[i][j];
+ tot_mse[j] += best;
+ }
+ }
+ for (j = 0; j < total_strengths; j++) {
+ if (tot_mse[j] < best_tot_mse) {
+ best_tot_mse = tot_mse[j];
+ best_id = j;
+ }
+ }
+ lev[nb_strengths] = best_id;
+ return best_tot_mse;
+}
+
+/* Search for the best luma+chroma strength to add as an option, knowing we
+ already selected nb_strengths options. */
+static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
+ uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+ int i, j;
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ int best_id0 = 0;
+ int best_id1 = 0;
+ const int total_strengths = nb_cdef_strengths[pick_method];
+ memset(tot_mse, 0, sizeof(tot_mse));
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ /* Find best mse among already selected options. */
+ for (gi = 0; gi < nb_strengths; gi++) {
+ uint64_t curr = mse[0][i][lev0[gi]];
+ curr += mse[1][i][lev1[gi]];
+ if (curr < best_mse) {
+ best_mse = curr;
+ }
+ }
+ /* Find best mse when adding each possible new option. */
+ for (j = 0; j < total_strengths; j++) {
+ int k;
+ for (k = 0; k < total_strengths; k++) {
+ uint64_t best = best_mse;
+ uint64_t curr = mse[0][i][j];
+ curr += mse[1][i][k];
+ if (curr < best) best = curr;
+ tot_mse[j][k] += best;
+ }
+ }
+ }
+ for (j = 0; j < total_strengths; j++) {
+ int k;
+ for (k = 0; k < total_strengths; k++) {
+ if (tot_mse[j][k] < best_tot_mse) {
+ best_tot_mse = tot_mse[j][k];
+ best_id0 = j;
+ best_id1 = k;
+ }
+ }
+ }
+ lev0[nb_strengths] = best_id0;
+ lev1[nb_strengths] = best_id1;
+ return best_tot_mse;
+}
+
+/* Search for the set of strengths that minimizes mse. */
+static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
+ uint64_t mse[][TOTAL_STRENGTHS],
+ int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t best_tot_mse;
+ int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
+ pick_method <= CDEF_FAST_SEARCH_LVL5);
+ int i;
+ best_tot_mse = (uint64_t)1 << 63;
+ /* Greedy search: add one strength option at a time. */
+ for (i = 0; i < nb_strengths; i++) {
+ best_tot_mse = search_one(best_lev, i, mse, sb_count, pick_method);
+ }
+ /* Try to refine the greedy search by reconsidering each
+ already-selected option. */
+ if (!fast) {
+ for (i = 0; i < 4 * nb_strengths; i++) {
+ int j;
+ for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1];
+ best_tot_mse =
+ search_one(best_lev, nb_strengths - 1, mse, sb_count, pick_method);
+ }
+ }
+ return best_tot_mse;
+}
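+
+ // Design note: this is a greedy set-selection search. Each pass over
+ // search_one() costs O(sb_count * total_strengths), so the greedy phase
+ // is O(nb_strengths * sb_count * total_strengths); the non-fast
+ // refinement phase then re-runs it 4 * nb_strengths times, each time
+ // dropping the oldest choice and re-adding a best one.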
+
+/* Search for the set of luma+chroma strengths that minimizes mse. */
+static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
+ int nb_strengths,
+ uint64_t (**mse)[TOTAL_STRENGTHS],
+ int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t best_tot_mse;
+ int i;
+ best_tot_mse = (uint64_t)1 << 63;
+ /* Greedy search: add one strength option at a time. */
+ for (i = 0; i < nb_strengths; i++) {
+ best_tot_mse =
+ search_one_dual(best_lev0, best_lev1, i, mse, sb_count, pick_method);
+ }
+ /* Try to refine the greedy search by reconsidering each
+ already-selected option. */
+ for (i = 0; i < 4 * nb_strengths; i++) {
+ int j;
+ for (j = 0; j < nb_strengths - 1; j++) {
+ best_lev0[j] = best_lev0[j + 1];
+ best_lev1[j] = best_lev1[j + 1];
+ }
+ best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse,
+ sb_count, pick_method);
+ }
+ return best_tot_mse;
+}
+
+static INLINE void init_src_params(int *src_stride, int *width, int *height,
+ int *width_log2, int *height_log2,
+ BLOCK_SIZE bsize) {
+ *src_stride = block_size_wide[bsize];
+ *width = block_size_wide[bsize];
+ *height = block_size_high[bsize];
+ *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
+ *height_log2 = MI_SIZE_LOG2 + mi_size_high_log2[bsize];
+}
+#if CONFIG_AV1_HIGHBITDEPTH
+/* Compute MSE only on the blocks we filtered. */
+static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count,
+ BLOCK_SIZE bsize, int coeff_shift,
+ int row, int col) {
+ assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+ bsize == BLOCK_8X8);
+ uint64_t sum = 0;
+ int bi, bx, by;
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR((uint8_t *)dst);
+ uint16_t *dst_buff = &dst16[row * dstride + col];
+ int src_stride, width, height, width_log2, height_log2;
+ init_src_params(&src_stride, &width, &height, &width_log2, &height_log2,
+ bsize);
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ sum += aom_mse_wxh_16bit_highbd(
+ &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
+ &src[bi << (height_log2 + width_log2)], src_stride, width, height);
+ }
+ return sum >> 2 * coeff_shift;
+}
+#endif
+
+ // Checks whether dual or quad block processing is applicable for block
+ // widths 8 and 4 respectively.
+static INLINE int is_dual_or_quad_applicable(cdef_list *dlist, int width,
+ int cdef_count, int bi, int iter) {
+ assert(width == 8 || width == 4);
+ const int blk_offset = (width == 8) ? 1 : 3;
+ if ((iter + blk_offset) >= cdef_count) return 0;
+
+ if (dlist[bi].by == dlist[bi + blk_offset].by &&
+ dlist[bi].bx + blk_offset == dlist[bi + blk_offset].bx)
+ return 1;
+
+ return 0;
+}
+
+static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count,
+ BLOCK_SIZE bsize, int coeff_shift, int row,
+ int col) {
+ assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+ bsize == BLOCK_8X8);
+ uint64_t sum = 0;
+ int bi, bx, by;
+ int iter = 0;
+ int inc = 1;
+ uint8_t *dst8 = (uint8_t *)dst;
+ uint8_t *dst_buff = &dst8[row * dstride + col];
+ int src_stride, width, height, width_log2, height_log2;
+ init_src_params(&src_stride, &width, &height, &width_log2, &height_log2,
+ bsize);
+
+ const int num_blks = 16 / width;
+ for (bi = 0; bi < cdef_count; bi += inc) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ uint16_t *src_tmp = &src[bi << (height_log2 + width_log2)];
+ uint8_t *dst_tmp =
+ &dst_buff[(by << height_log2) * dstride + (bx << width_log2)];
+
+ if (is_dual_or_quad_applicable(dlist, width, cdef_count, bi, iter)) {
+ sum += aom_mse_16xh_16bit(dst_tmp, dstride, src_tmp, width, height);
+ iter += num_blks;
+ inc = num_blks;
+ } else {
+ sum += aom_mse_wxh_16bit(dst_tmp, dstride, src_tmp, src_stride, width,
+ height);
+ iter += 1;
+ inc = 1;
+ }
+ }
+
+ return sum >> 2 * coeff_shift;
+}
+
+// Fill the boundary regions of the block with CDEF_VERY_LARGE, but only where
+// the region lies outside the frame boundary.
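+//
+// A sketch of the padded CDEF input buffer (CDEF_BSTRIDE-wide rows):
+//
+//                HBORDER      hfilt_size     HBORDER
+//              +----------+----------------+----------+
+//      VBORDER | top-left |      top       | top-right|
+//              +----------+----------------+----------+
+//   vfilt_size |   left   |  frame pixels  |  right   |
+//              +----------+----------------+----------+
+//      VBORDER | bot-left |     bottom     | bot-right|
+//              +----------+----------------+----------+
+//
+// Each of the eight border regions that falls outside the frame is filled
+// with CDEF_VERY_LARGE so that the filter taps landing there are ignored.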
+static INLINE void fill_borders_for_fbs_on_frame_boundary(
+ uint16_t *inbuf, int hfilt_size, int vfilt_size,
+ bool is_fb_on_frm_left_boundary, bool is_fb_on_frm_right_boundary,
+ bool is_fb_on_frm_top_boundary, bool is_fb_on_frm_bottom_boundary) {
+ if (!is_fb_on_frm_left_boundary && !is_fb_on_frm_right_boundary &&
+ !is_fb_on_frm_top_boundary && !is_fb_on_frm_bottom_boundary)
+ return;
+ if (is_fb_on_frm_bottom_boundary) {
+ // Fill bottom region of the block
+ const int buf_offset =
+ (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + CDEF_HBORDER;
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_left_boundary) {
+ const int buf_offset = (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE;
+ // Fill bottom-left region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_right_boundary) {
+ const int buf_offset =
+ (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + hfilt_size + CDEF_HBORDER;
+ // Fill bottom-right region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary) {
+ // Fill top region of the block
+ fill_rect(&inbuf[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary || is_fb_on_frm_left_boundary) {
+ // Fill top-left region of the block
+ fill_rect(inbuf, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary || is_fb_on_frm_right_boundary) {
+ const int buf_offset = hfilt_size + CDEF_HBORDER;
+ // Fill top-right region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_left_boundary) {
+ const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE;
+ // Fill left region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, vfilt_size, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_right_boundary) {
+ const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE;
+ // Fill right region of the block
+ fill_rect(&inbuf[buf_offset + hfilt_size + CDEF_HBORDER], CDEF_BSTRIDE,
+ vfilt_size, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+}
+
+// Calculates the number of 8x8/4x4 filter units for which the SSE can be
+// computed in a single function call after CDEF filtering.
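+// For example, with 8x8 units four horizontally adjacent entries in dlist[]
+// (same row, columns x .. x+3) can be combined into one 32x8 SSE call, a
+// pair (x, x+1) into one 16x8 call, and otherwise a single 8x8 call is made.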
+static AOM_FORCE_INLINE int get_error_calc_width_in_filt_units(
+ cdef_list *dlist, int cdef_count, int bi, int subsampling_x,
+ int subsampling_y) {
+ // TODO(Ranjit): Extend the optimization for 422
+ if (subsampling_x != subsampling_y) return 1;
+
+ // Combining more blocks seems to increase encode time due to the increase
+ // in control code.
+ if (bi + 3 < cdef_count && dlist[bi].by == dlist[bi + 3].by &&
+ dlist[bi].bx + 3 == dlist[bi + 3].bx) {
+ /* Calculate error for four 8x8/4x4 blocks using 32x8/16x4 block specific
+ * logic if y co-ordinates match and x co-ordinates are
+ * separated by 3 for first and fourth 8x8/4x4 blocks in dlist[]. */
+ return 4;
+ }
+ if (bi + 1 < cdef_count && dlist[bi].by == dlist[bi + 1].by &&
+ dlist[bi].bx + 1 == dlist[bi + 1].bx) {
+ /* Calculate error for two 8x8/4x4 blocks using 16x8/8x4 block specific
+ * logic if their y co-ordinates match and x co-ordinates are
+ * separated by 1 for first and second 8x8/4x4 blocks in dlist[]. */
+ return 2;
+ }
+ return 1;
+}
+
+// Returns the block error after CDEF filtering for a given strength
+static INLINE uint64_t get_filt_error(
+ const CdefSearchCtx *cdef_search_ctx, const struct macroblockd_plane *pd,
+ cdef_list *dlist, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS], uint16_t *in, uint8_t *ref_buffer,
+ int ref_stride, int row, int col, int pri_strength, int sec_strength,
+ int cdef_count, int pli, int coeff_shift, BLOCK_SIZE bs) {
+ uint64_t curr_sse = 0;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bs, pd->subsampling_x, pd->subsampling_y);
+ const int bw_log2 = 3 - pd->subsampling_x;
+ const int bh_log2 = 3 - pd->subsampling_y;
+
+ // TODO(Ranjit): Extend this optimization for HBD
+ if (!cdef_search_ctx->use_highbitdepth) {
+ // If all 8x8/4x4 blocks in the CDEF block need to be filtered, calculate
+ // the error at the CDEF block level.
+ const int tot_blk_count =
+ (block_size_wide[plane_bsize] * block_size_high[plane_bsize]) >>
+ (bw_log2 + bh_log2);
+ if (cdef_count == tot_blk_count) {
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row, col };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ if (pri_strength == 0 && sec_strength == 0) {
+ // When CDEF strength is zero, filtering is not applied. Hence
+ // error is calculated between source and unfiltered pixels
+ curr_sse =
+ aom_sse(&ref_buffer[buf_offset], ref_stride,
+ get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride,
+ block_size_wide[plane_bsize], block_size_high[plane_bsize]);
+ } else {
+ DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+
+ av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in,
+ cdef_search_ctx->xdec[pli],
+ cdef_search_ctx->ydec[pli], dir, dirinit, var, pli,
+ dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ curr_sse =
+ aom_sse(&ref_buffer[buf_offset], ref_stride, tmp_dst8,
+ (1 << MAX_SB_SIZE_LOG2), block_size_wide[plane_bsize],
+ block_size_high[plane_bsize]);
+ }
+ } else {
+ // If only some of the 8x8/4x4 blocks in the CDEF block need to be
+ // filtered, the filtering functions produce 8-bit output and the error
+ // is calculated in the 8-bit domain.
+ if (pri_strength == 0 && sec_strength == 0) {
+ int num_error_calc_filt_units = 1;
+ for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) {
+ const uint8_t by = dlist[bi].by;
+ const uint8_t bx = dlist[bi].bx;
+ const int16_t by_pos = (by << bh_log2);
+ const int16_t bx_pos = (bx << bw_log2);
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ num_error_calc_filt_units = get_error_calc_width_in_filt_units(
+ dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y);
+ curr_sse += aom_sse(
+ &ref_buffer[buf_offset], ref_stride,
+ get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride,
+ num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2));
+ }
+ } else {
+ DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+ av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in,
+ cdef_search_ctx->xdec[pli],
+ cdef_search_ctx->ydec[pli], dir, dirinit, var, pli,
+ dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ int num_error_calc_filt_units = 1;
+ for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) {
+ const uint8_t by = dlist[bi].by;
+ const uint8_t bx = dlist[bi].bx;
+ const int16_t by_pos = (by << bh_log2);
+ const int16_t bx_pos = (bx << bw_log2);
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos };
+ const FULLPEL_MV tmp_buf_pos = { by_pos, bx_pos };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ const int tmp_buf_offset =
+ get_offset_from_fullmv(&tmp_buf_pos, (1 << MAX_SB_SIZE_LOG2));
+ num_error_calc_filt_units = get_error_calc_width_in_filt_units(
+ dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y);
+ curr_sse += aom_sse(
+ &ref_buffer[buf_offset], ref_stride, &tmp_dst8[tmp_buf_offset],
+ (1 << MAX_SB_SIZE_LOG2),
+ num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2));
+ }
+ }
+ }
+ } else {
+ DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+
+ av1_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in,
+ cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli],
+ dir, dirinit, var, pli, dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ curr_sse = cdef_search_ctx->compute_cdef_dist_fn(
+ ref_buffer, ref_stride, tmp_dst, dlist, cdef_count,
+ cdef_search_ctx->bsize[pli], coeff_shift, row, col);
+ }
+ return curr_sse;
+}
+
+// Calculates MSE at the 64x64 block level.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context.
+// error_info: Error info struct (currently unused; see the TODO below).
+// fbr: Row index in units of 64x64 block
+// fbc: Column index in units of 64x64 block
+// sb_count: Index of the current filtered superblock in the MSE arrays
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info,
+ int fbr, int fbc, int sb_count) {
+ // TODO(aomedia:3276): Pass error_info to the low-level functions as required
+ // in future to handle error propagation.
+ (void)error_info;
+ const CommonModeInfoParams *const mi_params = cdef_search_ctx->mi_params;
+ const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref;
+ const int coeff_shift = cdef_search_ctx->coeff_shift;
+ const int *mi_wide_l2 = cdef_search_ctx->mi_wide_l2;
+ const int *mi_high_l2 = cdef_search_ctx->mi_high_l2;
+
+ // Declare and initialize the temporary buffers.
+ DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
+ cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
+ int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+ uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
+ int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+ int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+ int hb_step = 1, vb_step = 1;
+ BLOCK_SIZE bs;
+
+ const MB_MODE_INFO *const mbmi =
+ mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+ MI_SIZE_64X64 * fbc];
+
+ uint8_t *ref_buffer[MAX_MB_PLANE] = { ref->y_buffer, ref->u_buffer,
+ ref->v_buffer };
+ int ref_stride[MAX_MB_PLANE] = { ref->y_stride, ref->uv_stride,
+ ref->uv_stride };
+
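+ // For 128x128, 128x64 and 64x128 superblocks the MSE is computed over the
+ // whole superblock, so widen the 64x64-unit counts and step by two units
+ // in the corresponding direction(s).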
+ if (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64 ||
+ mbmi->bsize == BLOCK_64X128) {
+ bs = mbmi->bsize;
+ if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
+ nhb = AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+ hb_step = 2;
+ }
+ if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
+ nvb = AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+ vb_step = 2;
+ }
+ } else {
+ bs = BLOCK_64X64;
+ }
+ // Get the number of 8x8 blocks that are not skipped; CDEF processing is
+ // applied only to such blocks.
+ const int cdef_count = av1_cdef_compute_sb_list(
+ mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
+ const bool is_fb_on_frm_left_boundary = (fbc == 0);
+ const bool is_fb_on_frm_right_boundary =
+ (fbc + hb_step == cdef_search_ctx->nhfb);
+ const bool is_fb_on_frm_top_boundary = (fbr == 0);
+ const bool is_fb_on_frm_bottom_boundary =
+ (fbr + vb_step == cdef_search_ctx->nvfb);
+ const int yoff = CDEF_VBORDER * (!is_fb_on_frm_top_boundary);
+ const int xoff = CDEF_HBORDER * (!is_fb_on_frm_left_boundary);
+ int dirinit = 0;
+ for (int pli = 0; pli < cdef_search_ctx->num_planes; pli++) {
+ /* We avoid filtering the pixels for which some of the pixels to
+ average are outside the frame. We could change the filter instead,
+ but it would add special cases for any future vectorization. */
+ const int hfilt_size = (nhb << mi_wide_l2[pli]);
+ const int vfilt_size = (nvb << mi_high_l2[pli]);
+ const int ysize =
+ vfilt_size + CDEF_VBORDER * (!is_fb_on_frm_bottom_boundary) + yoff;
+ const int xsize =
+ hfilt_size + CDEF_HBORDER * (!is_fb_on_frm_right_boundary) + xoff;
+ const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli];
+ const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
+ struct macroblockd_plane pd = cdef_search_ctx->plane[pli];
+ cdef_search_ctx->copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+ pd.dst.buf, row - yoff, col - xoff, pd.dst.stride,
+ ysize, xsize);
+ fill_borders_for_fbs_on_frame_boundary(
+ inbuf, hfilt_size, vfilt_size, is_fb_on_frm_left_boundary,
+ is_fb_on_frm_right_boundary, is_fb_on_frm_top_boundary,
+ is_fb_on_frm_bottom_boundary);
+ for (int gi = 0; gi < cdef_search_ctx->total_strengths; gi++) {
+ int pri_strength, sec_strength;
+ get_cdef_filter_strengths(cdef_search_ctx->pick_method, &pri_strength,
+ &sec_strength, gi);
+ const uint64_t curr_mse = get_filt_error(
+ cdef_search_ctx, &pd, dlist, dir, &dirinit, var, in, ref_buffer[pli],
+ ref_stride[pli], row, col, pri_strength, sec_strength, cdef_count,
+ pli, coeff_shift, bs);
+ if (pli < 2)
+ cdef_search_ctx->mse[pli][sb_count][gi] = curr_mse;
+ else
+ cdef_search_ctx->mse[1][sb_count][gi] += curr_mse;
+ }
+ }
+ cdef_search_ctx->sb_index[sb_count] =
+ MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc;
+}
+
+// MSE calculation at frame level.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context.
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info) {
+ // Loop over each sb.
+ for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) {
+ for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) {
+ // Checks if cdef processing can be skipped for particular sb.
+ if (cdef_sb_skip(cdef_search_ctx->mi_params, fbr, fbc)) continue;
+ // Calculate mse for each sb and store the relevant sb index.
+ av1_cdef_mse_calc_block(cdef_search_ctx, error_info, fbr, fbc,
+ cdef_search_ctx->sb_count);
+ cdef_search_ctx->sb_count++;
+ }
+ }
+}
+
+// Allocates memory for members of CdefSearchCtx.
+// Inputs:
+// cm: Pointer to top level common structure (used to report allocation
+// failures).
+// cdef_search_ctx: Pointer to the structure containing parameters
+// related to CDEF search context.
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static void cdef_alloc_data(AV1_COMMON *cm, CdefSearchCtx *cdef_search_ctx) {
+ const int nvfb = cdef_search_ctx->nvfb;
+ const int nhfb = cdef_search_ctx->nhfb;
+ CHECK_MEM_ERROR(
+ cm, cdef_search_ctx->sb_index,
+ aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index[0])));
+ cdef_search_ctx->sb_count = 0;
+ CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[0],
+ aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb));
+ CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[1],
+ aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb));
+}
+
+// Deallocates the memory allocated for members of CdefSearchCtx.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters
+// related to CDEF search context.
+// Returns:
+// Nothing will be returned.
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) {
+ if (cdef_search_ctx) {
+ aom_free(cdef_search_ctx->mse[0]);
+ cdef_search_ctx->mse[0] = NULL;
+ aom_free(cdef_search_ctx->mse[1]);
+ cdef_search_ctx->mse[1] = NULL;
+ aom_free(cdef_search_ctx->sb_index);
+ cdef_search_ctx->sb_index = NULL;
+ }
+}
+
+// Initialize the parameters related to CDEF search context.
+// Inputs:
+// frame: Pointer to compressed frame buffer
+// ref: Pointer to the frame buffer holding the source frame
+// cm: Pointer to top level common structure
+// xd: Pointer to common current coding block structure
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context.
+// pick_method: Search method used to select CDEF parameters
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *ref,
+ AV1_COMMON *cm, MACROBLOCKD *xd,
+ CdefSearchCtx *cdef_search_ctx,
+ CDEF_PICK_METHOD pick_method) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ cdef_search_ctx->mi_params = &cm->mi_params;
+ cdef_search_ctx->ref = ref;
+ cdef_search_ctx->nvfb =
+ (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ cdef_search_ctx->nhfb =
+ (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+ cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6);
+ cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method];
+ cdef_search_ctx->num_planes = num_planes;
+ cdef_search_ctx->pick_method = pick_method;
+ cdef_search_ctx->sb_count = 0;
+ cdef_search_ctx->use_highbitdepth = cm->seq_params->use_highbitdepth;
+ av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
+ num_planes);
+ // Initialize plane wise information.
+ for (int pli = 0; pli < num_planes; pli++) {
+ cdef_search_ctx->xdec[pli] = xd->plane[pli].subsampling_x;
+ cdef_search_ctx->ydec[pli] = xd->plane[pli].subsampling_y;
+ cdef_search_ctx->bsize[pli] =
+ cdef_search_ctx->ydec[pli]
+ ? (cdef_search_ctx->xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
+ : (cdef_search_ctx->xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
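+ // e.g. 4:2:0 chroma (xdec = ydec = 1) uses BLOCK_4X4 units, 4:2:2 chroma
+ // (xdec = 1, ydec = 0) uses BLOCK_4X8, and an unsubsampled plane BLOCK_8X8.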
+ cdef_search_ctx->mi_wide_l2[pli] =
+ MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+ cdef_search_ctx->mi_high_l2[pli] =
+ MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+ cdef_search_ctx->plane[pli] = xd->plane[pli];
+ }
+ // Function pointer initialization.
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cm->seq_params->use_highbitdepth) {
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_highbd;
+ cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd;
+ } else {
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd;
+ cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
+ }
+#else
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd;
+ cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
+#endif
+}
+
+void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
+ int is_screen_content) {
+ const int bd = cm->seq_params->bit_depth;
+ const int q =
+ av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8);
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ // Check the speed feature to avoid extra signaling.
+ if (skip_cdef) {
+ cdef_info->cdef_bits = 1;
+ cdef_info->nb_cdef_strengths = 2;
+ } else {
+ cdef_info->cdef_bits = 0;
+ cdef_info->nb_cdef_strengths = 1;
+ }
+ cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6);
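+ // e.g. a base_qindex of 128 gives a damping of 3 + (128 >> 6) = 5.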
+
+ int predicted_y_f1 = 0;
+ int predicted_y_f2 = 0;
+ int predicted_uv_f1 = 0;
+ int predicted_uv_f2 = 0;
+ if (is_screen_content) {
+ predicted_y_f1 =
+ (int)(5.88217781e-06 * q * q + 6.10391455e-03 * q + 9.95043102e-02);
+ predicted_y_f2 =
+ (int)(-7.79934857e-06 * q * q + 6.58957830e-03 * q + 8.81045025e-01);
+ predicted_uv_f1 =
+ (int)(-6.79500136e-06 * q * q + 1.02695586e-02 * q + 1.36126802e-01);
+ predicted_uv_f2 =
+ (int)(-9.99613695e-08 * q * q - 1.79361339e-05 * q + 1.17022324e+00);
+ predicted_y_f1 = clamp(predicted_y_f1, 0, 15);
+ predicted_y_f2 = clamp(predicted_y_f2, 0, 3);
+ predicted_uv_f1 = clamp(predicted_uv_f1, 0, 15);
+ predicted_uv_f2 = clamp(predicted_uv_f2, 0, 3);
+ } else {
+ if (!frame_is_intra_only(cm)) {
+ predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f +
+ q * 0.0068615186f + 0.02709886f),
+ 0, 15);
+ predicted_y_f2 = clamp((int)roundf(q * q * -0.00000057629734f +
+ q * 0.0013993345f + 0.03831067f),
+ 0, 3);
+ predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000007095069f +
+ q * 0.0034628846f + 0.00887099f),
+ 0, 15);
+ predicted_uv_f2 = clamp((int)roundf(q * q * 0.00000023874085f +
+ q * 0.00028223585f + 0.05576307f),
+ 0, 3);
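+ // As a rough sanity check of the fit: at q = 100,
+ // predicted_y_f1 = roundf(-0.00000236 * 100 * 100 + 0.00686 * 100 +
+ // 0.0271) = roundf(0.69) = 1.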
+ } else {
+ predicted_y_f1 = clamp(
+ (int)roundf(q * q * 0.0000033731974f + q * 0.008070594f + 0.0187634f),
+ 0, 15);
+ predicted_y_f2 = clamp((int)roundf(q * q * 0.0000029167343f +
+ q * 0.0027798624f + 0.0079405f),
+ 0, 3);
+ predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000130790995f +
+ q * 0.012892405f - 0.00748388f),
+ 0, 15);
+ predicted_uv_f2 = clamp((int)roundf(q * q * 0.0000032651783f +
+ q * 0.00035520183f + 0.00228092f),
+ 0, 3);
+ }
+ }
+ cdef_info->cdef_strengths[0] =
+ predicted_y_f1 * CDEF_SEC_STRENGTHS + predicted_y_f2;
+ cdef_info->cdef_uv_strengths[0] =
+ predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2;
+
+ // mbmi->cdef_strength is already set in the encoding stage. We don't need to
+ // set it again here.
+ if (skip_cdef) {
+ cdef_info->cdef_strengths[1] = 0;
+ cdef_info->cdef_uv_strengths[1] = 0;
+ return;
+ }
+
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ MB_MODE_INFO **mbmi = mi_params->mi_grid_base;
+ // mbmi is NULL when real-time rate control library is used.
+ if (!mbmi) return;
+ for (int r = 0; r < nvfb; ++r) {
+ for (int c = 0; c < nhfb; ++c) {
+ MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c];
+ current_mbmi->cdef_strength = 0;
+ }
+ mbmi += MI_SIZE_64X64 * mi_params->mi_stride;
+ }
+}
+
+void av1_cdef_search(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ CDEF_CONTROL cdef_control = cpi->oxcf.tool_cfg.cdef_control;
+
+ assert(cdef_control != CDEF_NONE);
+ if (cdef_control == CDEF_REFERENCE && cpi->ppi->rtc_ref.non_reference_frame) {
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ cdef_info->nb_cdef_strengths = 1;
+ cdef_info->cdef_bits = 0;
+ cdef_info->cdef_strengths[0] = 0;
+ cdef_info->cdef_uv_strengths[0] = 0;
+ return;
+ }
+
+ // Indicates if external rate control is used (for testing).
+ const int rtc_ext_rc = cpi->rc.rtc_external_ratectrl;
+ if (rtc_ext_rc) {
+ av1_pick_cdef_from_qp(cm, 0, 0);
+ return;
+ }
+ CDEF_PICK_METHOD pick_method = cpi->sf.lpf_sf.cdef_pick_method;
+ if (pick_method == CDEF_PICK_FROM_Q) {
+ const int use_screen_content_model =
+ cm->quant_params.base_qindex >
+ AOMMAX(cpi->sf.rt_sf.screen_content_cdef_filter_qindex_thresh,
+ cpi->rc.best_quality + 5) &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ av1_pick_cdef_from_qp(cm, cpi->sf.rt_sf.skip_cdef_sb,
+ use_screen_content_model);
+ return;
+ }
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int damping = 3 + (cm->quant_params.base_qindex >> 6);
+ const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
+ pick_method <= CDEF_FAST_SEARCH_LVL5);
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+ if (!cpi->cdef_search_ctx)
+ CHECK_MEM_ERROR(cm, cpi->cdef_search_ctx,
+ aom_malloc(sizeof(*cpi->cdef_search_ctx)));
+ CdefSearchCtx *cdef_search_ctx = cpi->cdef_search_ctx;
+
+ // Initialize parameters related to CDEF search context.
+ cdef_params_init(&cm->cur_frame->buf, cpi->source, cm, xd, cdef_search_ctx,
+ pick_method);
+ // Allocate CDEF search context buffers.
+ cdef_alloc_data(cm, cdef_search_ctx);
+ // Frame level mse calculation.
+ if (cpi->mt_info.num_workers > 1) {
+ av1_cdef_mse_calc_frame_mt(cpi);
+ } else {
+ cdef_mse_calc_frame(cdef_search_ctx, cm->error);
+ }
+
+ /* Search over different numbers of signaling bits. */
+ int nb_strength_bits = 0;
+ uint64_t best_rd = UINT64_MAX;
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ int sb_count = cdef_search_ctx->sb_count;
+ uint64_t(*mse[2])[TOTAL_STRENGTHS];
+ mse[0] = cdef_search_ctx->mse[0];
+ mse[1] = cdef_search_ctx->mse[1];
+ /* Calculate the maximum number of bits required to signal CDEF strengths at
+ * block level */
+ const int total_strengths = nb_cdef_strengths[pick_method];
+ const int joint_strengths =
+ num_planes > 1 ? total_strengths * total_strengths : total_strengths;
+ const int max_signaling_bits =
+ joint_strengths == 1 ? 0 : get_msb(joint_strengths - 1) + 1;
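+ // e.g. with CDEF_FAST_SEARCH_LVL5 there are 2 strengths per plane, so two
+ // planes give 4 joint strengths and max_signaling_bits = get_msb(3) + 1 = 2.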
+ int rdmult = cpi->td.mb.rdmult;
+ for (int i = 0; i <= 3; i++) {
+ if (i > max_signaling_bits) break;
+ int best_lev0[CDEF_MAX_STRENGTHS];
+ int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
+ const int nb_strengths = 1 << i;
+ uint64_t tot_mse;
+ if (num_planes > 1) {
+ tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths,
+ mse, sb_count, pick_method);
+ } else {
+ tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count,
+ pick_method);
+ }
+
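+ // Rate: each filtered SB spends i bits to index one of the 2^i signaled
+ // strengths, and each strength costs CDEF_STRENGTH_BITS in the frame
+ // header (doubled when chroma strengths are coded too). The MSE is scaled
+ // by 16 to match the distortion units expected by RDCOST.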
+ const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS *
+ (num_planes > 1 ? 2 : 1);
+ const int rate_cost = av1_cost_literal(total_bits);
+ const uint64_t dist = tot_mse * 16;
+ const uint64_t rd = RDCOST(rdmult, rate_cost, dist);
+ if (rd < best_rd) {
+ best_rd = rd;
+ nb_strength_bits = i;
+ memcpy(cdef_info->cdef_strengths, best_lev0,
+ nb_strengths * sizeof(best_lev0[0]));
+ if (num_planes > 1) {
+ memcpy(cdef_info->cdef_uv_strengths, best_lev1,
+ nb_strengths * sizeof(best_lev1[0]));
+ }
+ }
+ }
+
+ cdef_info->cdef_bits = nb_strength_bits;
+ cdef_info->nb_cdef_strengths = 1 << nb_strength_bits;
+ for (int i = 0; i < sb_count; i++) {
+ uint64_t best_mse = UINT64_MAX;
+ int best_gi = 0;
+ for (int gi = 0; gi < cdef_info->nb_cdef_strengths; gi++) {
+ uint64_t curr = mse[0][i][cdef_info->cdef_strengths[gi]];
+ if (num_planes > 1) curr += mse[1][i][cdef_info->cdef_uv_strengths[gi]];
+ if (curr < best_mse) {
+ best_gi = gi;
+ best_mse = curr;
+ }
+ }
+ mi_params->mi_grid_base[cdef_search_ctx->sb_index[i]]->cdef_strength =
+ best_gi;
+ }
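+ // For the fast search methods, the chosen values are indices into reduced
+ // strength tables; remap them to actual primary/secondary strengths before
+ // they are written to the frame header.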
+ if (fast) {
+ for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) {
+ const int luma_strength = cdef_info->cdef_strengths[j];
+ const int chroma_strength = cdef_info->cdef_uv_strengths[j];
+ int pri_strength, sec_strength;
+
+ STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_strengths[j], pick_method,
+ luma_strength);
+ STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_uv_strengths[j], pick_method,
+ chroma_strength);
+ }
+ }
+
+ cdef_info->cdef_damping = damping;
+ // Deallocate CDEF search context buffers.
+ av1_cdef_dealloc_data(cdef_search_ctx);
+}
diff --git a/third_party/aom/av1/encoder/pickcdef.h b/third_party/aom/av1/encoder/pickcdef.h
new file mode 100644
index 0000000000..192e734fb0
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKCDEF_H_
+#define AOM_AV1_ENCODER_PICKCDEF_H_
+
+#include "av1/common/cdef.h"
+#include "av1/encoder/speed_features.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\enum CDEF_CONTROL
+ * \brief This enum controls to which frames CDEF is applied.
+ */
+typedef enum {
+ CDEF_NONE = 0, /*!< Disable CDEF on all frames. */
+ CDEF_ALL = 1, /*!< Enable CDEF for all frames. */
+ CDEF_REFERENCE = 2, /*!< Disable CDEF on non-reference frames. */
+} CDEF_CONTROL;
+
+/*!\cond */
+struct MultiThreadInfo;
+
+#define REDUCED_PRI_STRENGTHS_LVL1 8
+#define REDUCED_PRI_STRENGTHS_LVL2 5
+#define REDUCED_SEC_STRENGTHS_LVL3 2
+#define REDUCED_SEC_STRENGTHS_LVL5 1
+#define REDUCED_PRI_STRENGTHS_LVL4 2
+
+#define REDUCED_TOTAL_STRENGTHS_LVL1 \
+ (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL2 \
+ (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL3 \
+ (REDUCED_PRI_STRENGTHS_LVL2 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL4 \
+ (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL5 \
+ (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL5)
+#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+
+static const int priconv_lvl1[REDUCED_PRI_STRENGTHS_LVL1] = { 0, 1, 2, 3,
+ 5, 7, 10, 13 };
+static const int priconv_lvl2[REDUCED_PRI_STRENGTHS_LVL2] = { 0, 2, 4, 8, 14 };
+static const int priconv_lvl4[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 11 };
+static const int priconv_lvl5[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 5 };
+static const int secconv_lvl3[REDUCED_SEC_STRENGTHS_LVL3] = { 0, 2 };
+static const int secconv_lvl5[REDUCED_SEC_STRENGTHS_LVL5] = { 0 };
+static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
+ TOTAL_STRENGTHS,
+ REDUCED_TOTAL_STRENGTHS_LVL1,
+ REDUCED_TOTAL_STRENGTHS_LVL2,
+ REDUCED_TOTAL_STRENGTHS_LVL3,
+ REDUCED_TOTAL_STRENGTHS_LVL4,
+ REDUCED_TOTAL_STRENGTHS_LVL5,
+ TOTAL_STRENGTHS
+};
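+// For example, the exhaustive search evaluates all TOTAL_STRENGTHS =
+// CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS strength combinations per plane,
+// while the fastest reduced level evaluates only REDUCED_TOTAL_STRENGTHS_LVL5
+// (= 2) of them.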
+
+typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const uint8_t *src,
+ int src_voffset, int src_hoffset, int sstride,
+ int vsize, int hsize);
+typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count,
+ BLOCK_SIZE bsize, int coeff_shift,
+ int row, int col);
+
+/*! \brief CDEF search context.
+ */
+typedef struct {
+ /*!
+ * Pointer to the frame buffer holding the source frame
+ */
+ const YV12_BUFFER_CONFIG *ref;
+ /*!
+ * Pointer to params related to MB_MODE_INFO arrays and related info
+ */
+ CommonModeInfoParams *mi_params;
+ /*!
+ * Info specific to each plane
+ */
+ struct macroblockd_plane plane[MAX_MB_PLANE];
+ /*!
+ * Function pointer of copy_fn
+ */
+ copy_fn_t copy_fn;
+ /*!
+ * Function pointer of compute_cdef_dist_fn
+ */
+ compute_cdef_dist_t compute_cdef_dist_fn;
+ /*!
+ * Number of strengths evaluated in CDEF filter search
+ */
+ int total_strengths;
+ /*!
+ * Bit-depth dependent shift
+ */
+ int coeff_shift;
+ /*!
+ * CDEF damping factor
+ */
+ int damping;
+ /*!
+ * Search method used to select CDEF parameters
+ */
+ int pick_method;
+ /*!
+ * Number of planes
+ */
+ int num_planes;
+ /*!
+ * Log2 of width of the MI unit in pixels. mi_wide_l2[i]
+ * indicates the width of the MI unit in pixels for the ith plane
+ */
+ int mi_wide_l2[MAX_MB_PLANE];
+ /*!
+ * Log2 of height of the MI unit in pixels. mi_high_l2[i]
+ * indicates the height of the MI unit in pixels for the ith plane
+ */
+ int mi_high_l2[MAX_MB_PLANE];
+ /*!
+ * Subsampling in x direction. xdec[i] indicates the subsampling
+ * for the ith plane
+ */
+ int xdec[MAX_MB_PLANE];
+ /*!
+ * Subsampling in y direction. ydec[i] indicates the subsampling
+ * for the ith plane
+ */
+ int ydec[MAX_MB_PLANE];
+ /*!
+ * bsize[i] indicates the block size of ith plane
+ */
+ int bsize[MAX_MB_PLANE];
+ /*!
+ * Number of 64x64 blocks in vertical direction of a frame
+ */
+ int nvfb;
+ /*!
+ * Number of 64x64 blocks in horizontal direction of a frame
+ */
+ int nhfb;
+ /*!
+ * Pointer to the mean squared error between the CDEF filtered block and the
+ * source block. mse[i][j][k] stores the MSE of the ith plane (i=0 corresponds
+ * to Y-plane, i=1 corresponds to U and V planes), jth block and kth strength
+ * index
+ */
+ uint64_t (*mse[2])[TOTAL_STRENGTHS];
+ /*!
+ * Holds the position (in units of mi's) of the cdef filtered
+ * block in raster scan order
+ */
+ int *sb_index;
+ /*!
+ * Holds the count of cdef filtered blocks
+ */
+ int sb_count;
+ /*!
+ * Indicates if 16-bit frame buffers are to be used, i.e., the content
+ * bit depth is greater than 8
+ */
+ bool use_highbitdepth;
+} CdefSearchCtx;
+
+static INLINE int sb_all_skip(const CommonModeInfoParams *const mi_params,
+ int mi_row, int mi_col) {
+ const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64);
+ const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64);
+ const int stride = mi_params->mi_stride;
+ MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col;
+ for (int r = 0; r < maxr; ++r, mbmi += stride) {
+ for (int c = 0; c < maxc; ++c) {
+ if (!mbmi[c]->skip_txfm) return 0;
+ }
+ }
+ return 1;
+}
+
+// Checks if cdef processing can be skipped for particular sb.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context.
+// fbr: Row index in units of 64x64 block
+// fbc: Column index in units of 64x64 block
+// Returns:
+// 1 if CDEF processing of the superblock can be skipped, 0 otherwise.
+static INLINE int cdef_sb_skip(const CommonModeInfoParams *const mi_params,
+ int fbr, int fbc) {
+ const MB_MODE_INFO *const mbmi =
+ mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+ MI_SIZE_64X64 * fbc];
+ // No filtering if the entire filter block is skipped.
+ if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
+ return 1;
+// Skip odd-numbered 64x64 block rows when bsize is BLOCK_128X128 or
+// BLOCK_64X128, and odd-numbered columns when bsize is BLOCK_128X128 or
+// BLOCK_128X64, since CDEF filtering for such blocks is done at the
+// corresponding larger block sizes.
+ if (((fbc & 1) &&
+ (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) ||
+ ((fbr & 1) &&
+ (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128)))
+ return 1;
+ return 0;
+}
+
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx);
+
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info,
+ int fbr, int fbc, int sb_count);
+/*!\endcond */
+
+/*!\brief AV1 CDEF parameter search
+ *
+ * \ingroup in_loop_cdef
+ *
+ * Searches for optimal CDEF parameters for frame
+ *
+ * \param[in,out] cpi Top level encoder structure
+ *
+ * \remark Nothing is returned. Instead, optimal CDEF parameters are stored
+ * in the \c cdef_info structure of type \ref CdefInfo inside \c cm:
+ * \arg \c cdef_bits: Bits of strength parameters
+ * \arg \c nb_cdef_strengths: Number of strength parameters
+ * \arg \c cdef_strengths: list of \c nb_cdef_strengths strength parameters
+ * for the luma plane.
+ * \arg \c cdef_uv_strengths: list of \c nb_cdef_strengths strength parameters
+ * for the chroma planes.
+ * \arg \c cdef_damping: CDEF damping factor.
+ *
+ */
+void av1_cdef_search(struct AV1_COMP *cpi);
+
+/*!\brief AV1 CDEF level from QP
+ *
+ * \ingroup in_loop_cdef
+ *
+ * Calculates CDEF levels from frame QP. Only used for speed 7+ with RT mode.
+ *
+ * \param[in,out] cm Pointer to top level common structure
+ * \param[in] skip_cdef Flag to skip CDEF filtering
+ * \param[in] is_screen_content Flag indicating screen content
+ *
+ */
+void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
+ int is_screen_content);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_PICKCDEF_H_
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
new file mode 100644
index 0000000000..9084d3f13a
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/quant_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+
+static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
+ YV12_BUFFER_CONFIG *dst_bc, int plane) {
+ switch (plane) {
+ case 0: aom_yv12_copy_y(src_bc, dst_bc); break;
+ case 1: aom_yv12_copy_u(src_bc, dst_bc); break;
+ case 2: aom_yv12_copy_v(src_bc, dst_bc); break;
+ default: assert(plane >= 0 && plane <= 2); break;
+ }
+}
+
+int av1_get_max_filter_level(const AV1_COMP *cpi) {
+ if (is_stat_consumption_stage_twopass(cpi)) {
+ return cpi->ppi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
+ : MAX_LOOP_FILTER;
+ } else {
+ return MAX_LOOP_FILTER;
+ }
+}
+
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+ AV1_COMP *const cpi, int filt_level,
+ int partial_frame, int plane, int dir) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ int num_workers = mt_info->num_mod_workers[MOD_LPF];
+ AV1_COMMON *const cm = &cpi->common;
+ int64_t filt_err;
+
+ assert(plane >= 0 && plane <= 2);
+ int filter_level[2] = { filt_level, filt_level };
+ if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1];
+ if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0];
+
+ // set base filters for use of av1_get_filter_level when in DELTA_LF mode
+ switch (plane) {
+ case 0:
+ cm->lf.filter_level[0] = filter_level[0];
+ cm->lf.filter_level[1] = filter_level[1];
+ break;
+ case 1: cm->lf.filter_level_u = filter_level[0]; break;
+ case 2: cm->lf.filter_level_v = filter_level[0]; break;
+ }
+
+ // lpf_opt_level = 1 : Enables dual/quad loop-filtering.
+ int lpf_opt_level = is_inter_tx_size_search_level_one(&cpi->sf.tx_sf);
+
+ av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
+ plane + 1, partial_frame, mt_info->workers,
+ num_workers, &mt_info->lf_row_sync, lpf_opt_level);
+
+ filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane,
+ cm->seq_params->use_highbitdepth);
+
+ // Re-instate the unfiltered frame
+ yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane);
+
+ return filt_err;
+}
+
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ int partial_frame,
+ const int *last_frame_filter_level, int plane,
+ int dir) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ int filt_direction = 0;
+ int64_t best_err;
+ int filt_best;
+
+ // Start the search at the previous frame filter level unless it is now out of
+ // range.
+ int lvl;
+ switch (plane) {
+ case 0:
+ switch (dir) {
+ case 2:
+ lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >>
+ 1;
+ break;
+ case 0:
+ case 1: lvl = last_frame_filter_level[dir]; break;
+ default: assert(dir >= 0 && dir <= 2); return 0;
+ }
+ break;
+ case 1: lvl = last_frame_filter_level[2]; break;
+ case 2: lvl = last_frame_filter_level[3]; break;
+ default: assert(plane >= 0 && plane <= 2); return 0;
+ }
+ int filt_mid = clamp(lvl, min_filter_level, max_filter_level);
+ int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+ // Sum squared error at each filter level
+ int64_t ss_err[MAX_LOOP_FILTER + 1];
+
+ const int use_coarse_search = cpi->sf.lpf_sf.use_coarse_filter_level_search;
+ assert(use_coarse_search <= 1);
+ static const int min_filter_step_lookup[2] = { 0, 2 };
+ // min_filter_step_thresh determines the stopping criterion for the search:
+ // the search terminates once filter_step is no larger than
+ // min_filter_step_thresh.
+ const int min_filter_step_thresh = min_filter_step_lookup[use_coarse_search];
+
+ // Set each entry to -1
+ memset(ss_err, 0xFF, sizeof(ss_err));
+ yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane);
+ best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir);
+ filt_best = filt_mid;
+ ss_err[filt_mid] = best_err;
+
+ while (filter_step > min_filter_step_thresh) {
+ const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
+ const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
+
+ // Bias against raising loop filter in favor of lowering it.
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
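+ // The bias grows with the current best error, the mid filter level and the
+ // step size, so early coarse steps tolerate a larger SSE regression when
+ // moving toward a lower filter level.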
+
+ if ((is_stat_consumption_stage_twopass(cpi)) &&
+ (cpi->ppi->twopass.section_intra_rating < 20))
+ bias = (bias * cpi->ppi->twopass.section_intra_rating) / 20;
+
+ // Bias less when larger transform sizes are in use.
+ if (cm->features.tx_mode != ONLY_4X4) bias >>= 1;
+
+ if (filt_direction <= 0 && filt_low != filt_mid) {
+ // Get Low filter error score
+ if (ss_err[filt_low] < 0) {
+ ss_err[filt_low] =
+ try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir);
+ }
+ // If value is close to the best so far then bias towards a lower loop
+ // filter value.
+ if (ss_err[filt_low] < (best_err + bias)) {
+ // Was it actually better than the previous best?
+ if (ss_err[filt_low] < best_err) {
+ best_err = ss_err[filt_low];
+ }
+ filt_best = filt_low;
+ }
+ }
+
+ // Now look at filt_high
+ if (filt_direction >= 0 && filt_high != filt_mid) {
+ if (ss_err[filt_high] < 0) {
+ ss_err[filt_high] =
+ try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir);
+ }
+ // Accept the higher filter level only if it is significantly better than
+ // the previous best, i.e., it must overcome the bias against raising the
+ // filter value.
+ if (ss_err[filt_high] < (best_err - bias)) {
+ best_err = ss_err[filt_high];
+ filt_best = filt_high;
+ }
+ }
+
+ // Halve the step distance if the best filter value was the same as last time
+ if (filt_best == filt_mid) {
+ filter_step /= 2;
+ filt_direction = 0;
+ } else {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ return filt_best;
+}
+
+void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ LPF_PICK_METHOD method) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ struct loopfilter *const lf = &cm->lf;
+ int disable_filter_rt_screen = 0;
+ (void)sd;
+
+ lf->sharpness_level = 0;
+
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->sf.rt_sf.skip_lf_screen)
+ disable_filter_rt_screen = av1_cyclic_refresh_disable_lf_cdef(cpi);
+
+ if (disable_filter_rt_screen ||
+ cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_NONE ||
+ (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_REFERENCE &&
+ cpi->ppi->rtc_ref.non_reference_frame)) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ return;
+ }
+
+ if (method == LPF_PICK_MINIMAL_LPF) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ } else if (method >= LPF_PICK_FROM_Q) {
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
+ seq_params->bit_depth);
+ // Based on test results for the rtc test set:
+ // 0.04590 boosted or 0.02295 non-boosted, in 18-bit fixed point.
+ const int strength_boost_q_threshold = 0;
+ int inter_frame_multiplier =
+ (q > strength_boost_q_threshold ||
+ (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ cpi->common.width * cpi->common.height > 352 * 288))
+ ? 12034
+ : 6017;
+ // Increase the strength on base TL0 for temporal layers at low resolution,
+ // based on the frame's source_sad.
+ if (cpi->svc.number_temporal_layers > 1 &&
+ cpi->svc.temporal_layer_id == 0 &&
+ cpi->common.width * cpi->common.height <= 352 * 288 &&
+ cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ if (cpi->rc.frame_source_sad > 100000)
+ inter_frame_multiplier = inter_frame_multiplier << 1;
+ else if (cpi->rc.frame_source_sad > 50000)
+ inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1);
+ }
+ // These values were determined by linear fitting the result of the
+ // searched level for 8 bit depth:
+ // Keyframes: filt_guess = q * 0.06699 - 1.60817
+ // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225
+ //
+ // And high bit depth separately:
+ // filt_guess = q * 0.316206 + 3.87252
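+ // In 18-bit fixed point, 17563 / 2^18 ~= 0.06699, 421574 / 2^18 ~= 1.60817
+ // and 650707 / 2^18 ~= 2.48225; likewise 12034 / 2^18 ~= 0.04590 and
+ // 6017 / 2^18 ~= 0.02295 for the inter-frame multipliers.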
+ int filt_guess;
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8:
+ filt_guess =
+ (cm->current_frame.frame_type == KEY_FRAME)
+ ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
+ : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18);
+ break;
+ case AOM_BITS_10:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+ break;
+ case AOM_BITS_12:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+ break;
+ default:
+ assert(0 &&
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
+ "or AOM_BITS_12");
+ return;
+ }
+ if (seq_params->bit_depth != AOM_BITS_8 &&
+ cm->current_frame.frame_type == KEY_FRAME)
+ filt_guess -= 4;
+ // TODO(chengchen): retrain the model for Y, U, V filter levels
+ lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
+ lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level);
+ lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level);
+ lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level);
+ if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY &&
+ !frame_is_intra_only(cm) && !cpi->rc.high_source_sad) {
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ } else {
+ const int num4x4 = (cm->width >> 2) * (cm->height >> 2);
+ const int newmv_thresh = 7;
+ const int distance_since_key_thresh = 5;
+ if ((cpi->td.rd_counts.newmv_or_intra_blocks * 100 / num4x4) <
+ newmv_thresh &&
+ cpi->rc.frames_since_key > distance_since_key_thresh) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ }
+ }
+ }
+ } else {
+ int last_frame_filter_level[4] = { 0 };
+ if (!frame_is_intra_only(cm)) {
+ last_frame_filter_level[0] = cpi->ppi->filter_level[0];
+ last_frame_filter_level[1] = cpi->ppi->filter_level[1];
+ last_frame_filter_level[2] = cpi->ppi->filter_level_u;
+ last_frame_filter_level[3] = cpi->ppi->filter_level_v;
+ }
+ // The frame buffer last_frame_uf is used to store the non-loop filtered
+ // reconstructed frame in search_filter_level().
+ if (aom_realloc_frame_buffer(
+ &cpi->last_frame_uf, cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate last frame buffer");
+
+ lf->filter_level[0] = lf->filter_level[1] =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, 0, 2);
+ if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) {
+ lf->filter_level[0] =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, 0, 0);
+ lf->filter_level[1] =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, 0, 1);
+ }
+
+ if (num_planes > 1) {
+ lf->filter_level_u =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, 1, 0);
+ lf->filter_level_v =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, 2, 0);
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
new file mode 100644
index 0000000000..f567937c32
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PICKLPF_H_
+#define AOM_AV1_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+int av1_get_max_filter_level(const AV1_COMP *cpi);
+
+/*!\brief Algorithm for AV1 loop filter level selection.
+ *
+ * \ingroup in_loop_filter
+ * This function determines proper filter levels used for in-loop filter
+ * (deblock filter).
+ *
+ * \param[in] sd The pointer of frame buffer
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] method The method used to select filter levels
+ *
+ * \par
+ * method includes:
+ * \arg \c LPF_PICK_FROM_FULL_IMAGE: Try the full image with different values.
+ * \arg \c LPF_PICK_FROM_FULL_IMAGE_NON_DUAL: Try the full image filter search
+ * with non-dual filter only.
+ * \arg \c LPF_PICK_FROM_SUBIMAGE: Try a small portion of the image with
+ * different values.
+ * \arg \c LPF_PICK_FROM_Q: Estimate the level based on quantizer and frame type
+ * \arg \c LPF_PICK_MINIMAL_LPF: Pick 0 to disable LPF if LPF was enabled last
+ * frame
+ *
+ * \remark Nothing is returned. Instead, filter levels below are stored in the
+ * "loopfilter" structure inside "cpi":
+ * \arg \c filter_level[0]: the vertical filter level for Y plane
+ * \arg \c filter_level[1]: the horizontal filter level for Y plane
+ * \arg \c filter_level_u: the filter level for U plane
+ * \arg \c filter_level_v: the filter level for V plane
+ *
+ * \n
+ * \b Overview
+ * \par
+ * The workflow of deblock filter is shown in Fig.1. \n
+ * Boundary pixels pass through a non-flatness check, followed by a step that
+ * determines smoothness and selects proper types of filters
+ * (4-, 6-, 8-, 14-tap filter). \n
+ * If the non-flatness criterion is not satisfied, the encoder will not apply
+ * deblock filtering to these boundary pixels.
+ * \image html filter_flow.png "Fig.1. The workflow of deblock filter" width=70%
+ *
+ * \par
+ * The non-flatness is determined by the boundary pixels and thresholds as shown
+ * in Fig.2. \n
+ * Filtering is applied when \n
+ * \f$|p_0-p_1|<thr_1\f$ and \f$|q_0-q_1|<thr_1\f$ and
+ * \f$2*|p_0-q_0|+|p_1-q_1|/2<thr_2\f$ \n
+ * \image html filter_thr.png "Fig.2. Non-flatness of pixel boundary" height=40%
+ *
+ * \par
+ * Thresholds ("thr_1" and "thr_2") are determined by the filter level. \n
+ * In AV1, for each frame, we employ the four filter levels, based on these
+ * observations: \n
+ * Luma and chroma planes have different characteristics, including subsampling
+ * (different plane size), coding quality (chroma planes are better coded). \n
+ * Therefore chroma planes need less deblocking filtering than luma plane. \n
+ * In addition, content texture has different spatial characteristics: vertical
+ * and horizontal direction may need different level of filtering. \n
+ * The selection of these filter levels is described in the following section.
+ *
+ * \par
+ * \b Algorithm
+ * \par
+ * The encoder selects filter levels given the current frame buffer, and the
+ * method. \n
+ * By default, "LPF_PICK_FROM_FULL_IMAGE" is used, which should provide
+ * the most appropriate filter levels. \n
+ * For video on demand (VOD) mode, if speed setting is larger than 5,
+ * "LPF_PICK_FROM_FULL_IMAGE_NON_DUAL" is used. \n
+ * For real-time mode, if speed setting is larger than 5, "LPF_PICK_FROM_Q" is
+ * used.
+ *
+ * \par
+ * "LPF_PICK_FROM_FULL_IMAGE" method: determine filter levels sequentially
+ * by a filter level search procedure (function "search_filter_level"). \n
+ * The order is: \n
+ * First search and determine the filter level for Y plane.
+ * Let vertical filter level (filter_level[0]) and the horizontal filter level
+ * (filter_level[1]) be equal to it. \n
+ * Keep the horizontal filter level the same and search and determine the
+ * vertical filter level. \n
+ * Search and determine the horizontal filter level. \n
+ * Search and determine filter level for U plane. \n
+ * Search and determine filter level for V plane.
+ *
+ * \par
+ * Search and determine filter level is fulfilled by function
+ * "search_filter_level". \n
+ * It starts with a base filter level ("filt_mid") initialized by the
+ * corresponding last frame's filter level. \n
+ * A filter step ("filter_step") is determined as:
+ * filter_step = filt_mid < 16 ? 4 : filt_mid / 4. \n
+ * Then a modified binary search strategy is employed to find a proper
+ * filter level. \n
+ * In each iteration, set filt_low = filt_mid - filter_step,
+ * filt_high = filt_mid + filter_step. \n
+ * We now have three candidate levels, "filt_mid", "filt_low" and "filt_high".
+ * \n
+ * Deblock filtering is applied on the current frame with candidate filter
+ * levels and the sum of squared error (SSE) between source and filtered frame
+ * is computed. \n
+ * Set "filt_best" to the filter level of the smallest SSE. If "filter_best"
+ * equals to "filt_mid", halve the filter_step. Otherwise, set filt_mid =
+ * filt_best. \n
+ * Go to the next iteration until "filter_step" is 0. \n
+ * Note that in the comparison of SSEs between SSE[filt_low] and SSE[filt_mid],
+ * a "bias" is introduced so that the search slightly favors the lower filter
+ * level. \n
+ * It is based on the observation that raising the filter level tends to
+ * oversmooth the frame: even when it yields a slightly smaller SSE for the
+ * current frame, it degrades the frame as a prediction reference for future
+ * frames and leads to suboptimal performance overall. \n
+ * Function "try_filter_frame" is the reference for applying deblock filtering
+ * with a given filter level and computing the SSE.
+ *
+ * \par
+ * "LPF_PICK_FROM_FULL_IMAGE_NON_DUAL" method: almost the same as
+ * "LPF_PICK_FROM_FULL_IMAGE", \n
+ * just without separately searching for appropriate filter levels for vertical
+ * and horizontal filters.
+ *
+ * \par
+ * "LPF_PICK_FROM_Q" method: filter levels are determined by the
+ * quantization factor (q). \n
+ * For 8 bit: \n
+ * Keyframes: filt_guess = q * 0.06699 - 1.60817 \n
+ * Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 \n
+ * inter_frame_multiplier = q > strength_boost_q_threshold ? 0.04590 : 0.02295 \n
+ * For 10 bit and 12 bit: \n
+ * filt_guess = q * 0.316206 + 3.87252 \n
+ * Then filter_level[0] = filter_level[1] = filter_level_u = filter_level_v =
+ * clamp(filt_guess, min_filter_level, max_filter_level) \n
+ * Where min_filter_level = 0, max_filter_level = 64 \n
+ * The equations were determined by linear fitting using filter levels
+ * generated by "LPF_PICK_FROM_FULL_IMAGE" method.
+ *
+ */
+void av1_pick_filter_level(const struct yv12_buffer_config *sd,
+ struct AV1_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PICKLPF_H_
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
new file mode 100644
index 0000000000..6429064175
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -0,0 +1,2217 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+
+// Number of Wiener iterations
+#define NUM_WIENER_ITERS 5
+
+// Penalty factor for use of dual sgr
+#define DUAL_SGR_PENALTY_MULT 0.01
+
+// Working precision for Wiener filter coefficients
+#define WIENER_TAP_SCALE_FACTOR ((int64_t)1 << 16)
+
+#define SGRPROJ_EP_GRP1_START_IDX 0
+#define SGRPROJ_EP_GRP1_END_IDX 9
+#define SGRPROJ_EP_GRP1_SEARCH_COUNT 4
+#define SGRPROJ_EP_GRP2_3_SEARCH_COUNT 2
+static const int sgproj_ep_grp1_seed[SGRPROJ_EP_GRP1_SEARCH_COUNT] = { 0, 3, 6,
+ 9 };
+static const int sgproj_ep_grp2_3[SGRPROJ_EP_GRP2_3_SEARCH_COUNT][14] = {
+ { 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, -1, -1, -1, -1 },
+ { 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 }
+};
+
+#if DEBUG_LR_COSTING
+RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE]
+ [MAX_LR_UNITS_W * MAX_LR_UNITS_H];
+#endif // DEBUG_LR_COSTING
+
+typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ int hstart, int width, int vstart,
+ int height);
+typedef uint64_t (*var_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ int hstart, int width, int vstart,
+ int height);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define NUM_EXTRACTORS (3 * (1 + 1))
+#else
+#define NUM_EXTRACTORS 3
+#endif
+static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = {
+ aom_get_y_sse_part, aom_get_u_sse_part,
+ aom_get_v_sse_part,
+#if CONFIG_AV1_HIGHBITDEPTH
+ aom_highbd_get_y_sse_part, aom_highbd_get_u_sse_part,
+ aom_highbd_get_v_sse_part,
+#endif
+};
+static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = {
+ aom_get_y_var, aom_get_u_var, aom_get_v_var,
+#if CONFIG_AV1_HIGHBITDEPTH
+ aom_highbd_get_y_var, aom_highbd_get_u_var, aom_highbd_get_v_var,
+#endif
+};
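+
+// The extractor tables above are laid out as { Y, U, V } for low bitdepth
+// followed by { Y, U, V } for high bitdepth, so the helpers below can index
+// them directly with (3 * highbd + plane).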
+
+static int64_t sse_restoration_unit(const RestorationTileLimits *limits,
+ const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *dst, int plane,
+ int highbd) {
+ return sse_part_extractors[3 * highbd + plane](
+ src, dst, limits->h_start, limits->h_end - limits->h_start,
+ limits->v_start, limits->v_end - limits->v_start);
+}
+
+static uint64_t var_restoration_unit(const RestorationTileLimits *limits,
+ const YV12_BUFFER_CONFIG *src, int plane,
+ int highbd) {
+ return var_part_extractors[3 * highbd + plane](
+ src, limits->h_start, limits->h_end - limits->h_start, limits->v_start,
+ limits->v_end - limits->v_start);
+}
+
+typedef struct {
+ const YV12_BUFFER_CONFIG *src;
+ YV12_BUFFER_CONFIG *dst;
+
+ const AV1_COMMON *cm;
+ const MACROBLOCK *x;
+ int plane;
+ int plane_w;
+ int plane_h;
+ RestUnitSearchInfo *rusi;
+
+ // Speed features
+ const LOOP_FILTER_SPEED_FEATURES *lpf_sf;
+
+ uint8_t *dgd_buffer;
+ int dgd_stride;
+ const uint8_t *src_buffer;
+ int src_stride;
+
+ // SSE values for each restoration mode for the current RU
+ // These are saved by each search function for use in search_switchable()
+ int64_t sse[RESTORE_SWITCHABLE_TYPES];
+
+ // This flag will be set based on the speed feature
+ // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning.
+ uint8_t skip_sgr_eval;
+
+ // Total rate and distortion so far for each restoration type
+ // These are initialised by reset_rsc in search_rest_type
+ int64_t total_sse[RESTORE_TYPES];
+ int64_t total_bits[RESTORE_TYPES];
+
+ // Reference parameters for delta-coding
+ //
+ // For each restoration type, we need to store the latest parameter set which
+ // has been used, so that we can properly cost up the next parameter set.
+ // Note that we have two sets of these - one for the single-restoration-mode
+ // search (ie, frame_restoration_type = RESTORE_WIENER or RESTORE_SGRPROJ)
+ // and one for the switchable mode. This is because these two cases can lead
+ // to different sets of parameters being signaled, but we don't know which
+ // we will pick for sure until the end of the search process.
+ WienerInfo ref_wiener;
+ SgrprojInfo ref_sgrproj;
+ WienerInfo switchable_ref_wiener;
+ SgrprojInfo switchable_ref_sgrproj;
+
+ // Buffers used to hold dgd-avg and src-avg data respectively during SIMD
+ // call of Wiener filter.
+ int16_t *dgd_avg;
+ int16_t *src_avg;
+} RestSearchCtxt;
+
+static AOM_INLINE void rsc_on_tile(void *priv) {
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ set_default_wiener(&rsc->ref_wiener);
+ set_default_sgrproj(&rsc->ref_sgrproj);
+ set_default_wiener(&rsc->switchable_ref_wiener);
+ set_default_sgrproj(&rsc->switchable_ref_sgrproj);
+}
+
+static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) {
+ memset(rsc->total_sse, 0, sizeof(rsc->total_sse));
+ memset(rsc->total_bits, 0, sizeof(rsc->total_bits));
+}
+
+static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
+ const AV1_COMMON *cm, const MACROBLOCK *x,
+ const LOOP_FILTER_SPEED_FEATURES *lpf_sf,
+ int plane, RestUnitSearchInfo *rusi,
+ YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) {
+ rsc->src = src;
+ rsc->dst = dst;
+ rsc->cm = cm;
+ rsc->x = x;
+ rsc->plane = plane;
+ rsc->rusi = rusi;
+ rsc->lpf_sf = lpf_sf;
+
+ const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
+ const int is_uv = plane != AOM_PLANE_Y;
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+ assert(plane_w == src->crop_widths[is_uv]);
+ assert(plane_h == src->crop_heights[is_uv]);
+ assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]);
+ assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]);
+
+ rsc->plane_w = plane_w;
+ rsc->plane_h = plane_h;
+ rsc->src_buffer = src->buffers[plane];
+ rsc->src_stride = src->strides[is_uv];
+ rsc->dgd_buffer = dgd->buffers[plane];
+ rsc->dgd_stride = dgd->strides[is_uv];
+}
+
+static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
+ const RestorationTileLimits *limits,
+ const RestorationUnitInfo *rui) {
+ const AV1_COMMON *const cm = rsc->cm;
+ const int plane = rsc->plane;
+ const int is_uv = plane > 0;
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+ RestorationLineBuffers rlbs;
+ const int bit_depth = cm->seq_params->bit_depth;
+ const int highbd = cm->seq_params->use_highbitdepth;
+
+ const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf;
+  // TODO(yunqing): For now, the optimized LR filter is only used in the
+  // decoder. It can also be used in the encoder.
+ const int optimized_lr = 0;
+
+ av1_loop_restoration_filter_unit(
+ limits, rui, &rsi->boundaries, &rlbs, rsc->plane_w, rsc->plane_h,
+ is_uv && cm->seq_params->subsampling_x,
+ is_uv && cm->seq_params->subsampling_y, highbd, bit_depth,
+ fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
+ rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr, cm->error);
+
+ return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
+}
+
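+// The self-guided projection model evaluated below forms each restored
+// sample (before rounding) as
+//   v = (u << SGRPROJ_PRJ_BITS) + xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
+// where u is the degraded sample scaled up by SGRPROJ_RST_BITS. The branches
+// specialize this for the cases where only one of the two filters is active.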
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int xq[2],
+ const sgr_params_type *params) {
+ int i, j;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ }
+ } else if (params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ }
+ } else {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t e = (int32_t)(dat[j]) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+ return err;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2],
+ const sgr_params_type *params) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ int i, j;
+ int64_t err = 0;
+ const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ int xq0 = xq[0];
+ int xq1 = xq[1];
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+ int32_t v0 = flt0[j] - u;
+ int32_t v1 = flt1[j] - u;
+ int32_t v = half;
+ v += xq0 * v0;
+ v += xq1 * v1;
+ const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ src += src_stride;
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ int exq;
+ int32_t *flt;
+ int flt_stride;
+ if (params->r[0] > 0) {
+ exq = xq[0];
+ flt = flt0;
+ flt_stride = flt0_stride;
+ } else {
+ exq = xq[1];
+ flt = flt1;
+ flt_stride = flt1_stride;
+ }
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+ int32_t v = half;
+ v += exq * (flt[j] - u);
+ const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ flt += flt_stride;
+ src += src_stride;
+ }
+ } else {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t e = d - s;
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+ return err;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int use_highbitdepth,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int *xqd,
+ const sgr_params_type *params) {
+ int xq[2];
+ av1_decode_xq(xqd, xq, params);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_highbitdepth) {
+ return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, xq, params);
+
+ } else {
+ return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, xq, params);
+ }
+#else
+ (void)use_highbitdepth;
+ return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, xq, params);
+#endif
+}
+
+#define USE_SGRPROJ_REFINEMENT_SEARCH 1
+static int64_t finer_search_pixel_proj_error(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0,
+ int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd,
+ const sgr_params_type *params) {
+ int64_t err = get_pixel_proj_error(
+ src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ (void)start_step;
+#if USE_SGRPROJ_REFINEMENT_SEARCH
+ int64_t err2;
+ int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 };
+ int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 };
+ for (int s = start_step; s >= 1; s >>= 1) {
+ for (int p = 0; p < 2; ++p) {
+ if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) {
+ continue;
+ }
+ int skip = 0;
+ do {
+ if (xqd[p] - s >= tap_min[p]) {
+ xqd[p] -= s;
+ err2 =
+ get_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ if (err2 > err) {
+ xqd[p] += s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ if (skip) break;
+ do {
+ if (xqd[p] + s <= tap_max[p]) {
+ xqd[p] += s;
+ err2 =
+ get_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ if (err2 > err) {
+ xqd[p] -= s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ }
+#endif // USE_SGRPROJ_REFINEMENT_SEARCH
+ return err;
+}
+
+static int64_t signed_rounded_divide(int64_t dividend, int64_t divisor) {
+ if (dividend < 0)
+ return (dividend - divisor / 2) / divisor;
+ else
+ return (dividend + divisor / 2) / divisor;
+}
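+// For example, signed_rounded_divide(7, 2) == 4 and
+// signed_rounded_divide(-7, 2) == -4: ties round away from zero.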
+
+static AOM_INLINE void calc_proj_params_r0_r1_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ H[1][1] += (int64_t)f2 * f2;
+ H[0][1] += (int64_t)f1 * f2;
+ C[0] += (int64_t)f1 * s;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ H[1][1] += (int64_t)f2 * f2;
+ H[0][1] += (int64_t)f1 * f2;
+ C[0] += (int64_t)f1 * s;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int64_t H[2][2],
+ int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ C[0] += (int64_t)f1 * s;
+ }
+ }
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_high_bd_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ C[0] += (int64_t)f1 * s;
+ }
+ }
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8,
+ int dat_stride, int32_t *flt1,
+ int flt1_stride, int64_t H[2][2],
+ int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[1][1] += (int64_t)f2 * f2;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_high_bd_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[1][1] += (int64_t)f2 * f2;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// The function calls one of 3 subfunctions to handle the following cases:
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+// of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride, int32_t *flt1,
+ int flt1_stride, int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_c(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, flt1, flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_c(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_c(src8, width, height, src_stride, dat8, dat_stride,
+ flt1, flt1_stride, H, C);
+ }
+}
+
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int64_t H[2][2],
+ int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_high_bd_c(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_high_bd_c(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
+
+static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int use_highbitdepth, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int *xq,
+ const sgr_params_type *params) {
+ int64_t H[2][2] = { { 0, 0 }, { 0, 0 } };
+ int64_t C[2] = { 0, 0 };
+
+ // Default values to be returned if the problem becomes ill-posed
+ xq[0] = 0;
+ xq[1] = 0;
+
+ if (!use_highbitdepth) {
+ if ((width & 0x7) == 0) {
+ av1_calc_proj_params(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, flt1, flt1_stride, H, C, params);
+ } else {
+ av1_calc_proj_params_c(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, flt1, flt1_stride, H, C,
+ params);
+ }
+ }
+#if CONFIG_AV1_HIGHBITDEPTH
+ else { // NOLINT
+ if ((width & 0x7) == 0) {
+ av1_calc_proj_params_high_bd(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C, params);
+ } else {
+ av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C, params);
+ }
+ }
+#endif
+
+ if (params->r[0] == 0) {
+ // H matrix is now only the scalar H[1][1]
+ // C vector is now only the scalar C[1]
+ const int64_t Det = H[1][1];
+ if (Det == 0) return; // ill-posed, return default values
+ xq[0] = 0;
+ xq[1] = (int)signed_rounded_divide(C[1] * (1 << SGRPROJ_PRJ_BITS), Det);
+ } else if (params->r[1] == 0) {
+ // H matrix is now only the scalar H[0][0]
+ // C vector is now only the scalar C[0]
+ const int64_t Det = H[0][0];
+ if (Det == 0) return; // ill-posed, return default values
+ xq[0] = (int)signed_rounded_divide(C[0] * (1 << SGRPROJ_PRJ_BITS), Det);
+ xq[1] = 0;
+ } else {
+ const int64_t Det = H[0][0] * H[1][1] - H[0][1] * H[1][0];
+ if (Det == 0) return; // ill-posed, return default values
+
+ // If scaling up dividend would overflow, instead scale down the divisor
+ const int64_t div1 = H[1][1] * C[0] - H[0][1] * C[1];
+ if ((div1 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div1) ||
+ (div1 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div1))
+ xq[0] = (int)signed_rounded_divide(div1, Det / (1 << SGRPROJ_PRJ_BITS));
+ else
+ xq[0] = (int)signed_rounded_divide(div1 * (1 << SGRPROJ_PRJ_BITS), Det);
+
+ const int64_t div2 = H[0][0] * C[1] - H[1][0] * C[0];
+ if ((div2 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div2) ||
+ (div2 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div2))
+ xq[1] = (int)signed_rounded_divide(div2, Det / (1 << SGRPROJ_PRJ_BITS));
+ else
+ xq[1] = (int)signed_rounded_divide(div2 * (1 << SGRPROJ_PRJ_BITS), Det);
+ }
+}
+
+static AOM_INLINE void encode_xq(int *xq, int *xqd,
+ const sgr_params_type *params) {
+ if (params->r[0] == 0) {
+ xqd[0] = 0;
+ xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1,
+ SGRPROJ_PRJ_MAX1);
+ } else if (params->r[1] == 0) {
+ xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+ xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1,
+ SGRPROJ_PRJ_MAX1);
+ } else {
+ xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+ xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1,
+ SGRPROJ_PRJ_MAX1);
+ }
+}
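+// Note: the xqd values produced above are what get signaled in the
+// bitstream; av1_decode_xq() (av1/common/restoration.c) recovers xq from
+// them, inverting this mapping up to the clamping applied here.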
+
+// Apply the self-guided filter across an entire restoration unit.
+static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8,
+ int width, int height, int dat_stride,
+ int use_highbd, int bit_depth, int pu_width,
+ int pu_height, int32_t *flt0, int32_t *flt1,
+ int flt_stride,
+ struct aom_internal_error_info *error_info) {
+ for (int i = 0; i < height; i += pu_height) {
+ const int h = AOMMIN(pu_height, height - i);
+ int32_t *flt0_row = flt0 + i * flt_stride;
+ int32_t *flt1_row = flt1 + i * flt_stride;
+ const uint8_t *dat8_row = dat8 + i * dat_stride;
+
+ // Iterate over the stripe in blocks of width pu_width
+ for (int j = 0; j < width; j += pu_width) {
+ const int w = AOMMIN(pu_width, width - j);
+ if (av1_selfguided_restoration(
+ dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
+ flt_stride, sgr_params_idx, bit_depth, use_highbd) != 0) {
+ aom_internal_error(
+ error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffer in av1_selfguided_restoration");
+ }
+ }
+ }
+}
+
+static AOM_INLINE void compute_sgrproj_err(
+ const uint8_t *dat8, const int width, const int height,
+ const int dat_stride, const uint8_t *src8, const int src_stride,
+ const int use_highbitdepth, const int bit_depth, const int pu_width,
+ const int pu_height, const int ep, int32_t *flt0, int32_t *flt1,
+ const int flt_stride, int *exqd, int64_t *err,
+ struct aom_internal_error_info *error_info) {
+ int exq[2];
+ apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
+ pu_width, pu_height, flt0, flt1, flt_stride, error_info);
+ const sgr_params_type *const params = &av1_sgr_params[ep];
+ get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
+ use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
+ params);
+ encode_xq(exq, exqd, params);
+ *err = finer_search_pixel_proj_error(
+ src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
+ flt_stride, flt1, flt_stride, 2, exqd, params);
+}
+
+static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err,
+ const int *exqd, int *bestxqd,
+ int *bestep, const int ep) {
+ if (*besterr == -1 || err < *besterr) {
+ *bestep = ep;
+ *besterr = err;
+ bestxqd[0] = exqd[0];
+ bestxqd[1] = exqd[1];
+ }
+}
+
+static SgrprojInfo search_selfguided_restoration(
+ const uint8_t *dat8, int width, int height, int dat_stride,
+ const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
+ int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning,
+ struct aom_internal_error_info *error_info) {
+ int32_t *flt0 = rstbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ int ep, idx, bestep = 0;
+ int64_t besterr = -1;
+ int exqd[2], bestxqd[2] = { 0, 0 };
+ int flt_stride = ((width + 7) & ~7) + 8;
+ assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+ pu_width == RESTORATION_PROC_UNIT_SIZE);
+ assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+ pu_height == RESTORATION_PROC_UNIT_SIZE);
+ if (!enable_sgr_ep_pruning) {
+ for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
+ int64_t err;
+ compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+ use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+ flt0, flt1, flt_stride, exqd, &err, error_info);
+ get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+ }
+ } else {
+    // Evaluate the four seed ep values in the first group.
+ for (idx = 0; idx < SGRPROJ_EP_GRP1_SEARCH_COUNT; idx++) {
+ ep = sgproj_ep_grp1_seed[idx];
+ int64_t err;
+ compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+ use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+ flt0, flt1, flt_stride, exqd, &err, error_info);
+ get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+ }
+    // Evaluate the ep values immediately left and right of the best seed ep.
+ int bestep_ref = bestep;
+ for (ep = bestep_ref - 1; ep < bestep_ref + 2; ep += 2) {
+ if (ep < SGRPROJ_EP_GRP1_START_IDX || ep > SGRPROJ_EP_GRP1_END_IDX)
+ continue;
+ int64_t err;
+ compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+ use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+ flt0, flt1, flt_stride, exqd, &err, error_info);
+ get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+ }
+    // Evaluate the last two groups.
+ for (idx = 0; idx < SGRPROJ_EP_GRP2_3_SEARCH_COUNT; idx++) {
+ ep = sgproj_ep_grp2_3[idx][bestep];
+ int64_t err;
+ compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+ use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+ flt0, flt1, flt_stride, exqd, &err, error_info);
+ get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+ }
+ }
+
+ SgrprojInfo ret;
+ ret.ep = bestep;
+ ret.xqd[0] = bestxqd[0];
+ ret.xqd[1] = bestxqd[1];
+ return ret;
+}
+
+static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
+ SgrprojInfo *ref_sgrproj_info) {
+ int bits = SGRPROJ_PARAMS_BITS;
+ const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
+ if (params->r[0] > 0)
+ bits += aom_count_primitive_refsubexpfin(
+ SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ if (params->r[1] > 0)
+ bits += aom_count_primitive_refsubexpfin(
+ SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ return bits;
+}
+
+static AOM_INLINE void search_sgrproj(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)rlbs;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const MACROBLOCK *const x = rsc->x;
+ const AV1_COMMON *const cm = rsc->cm;
+ const int highbd = cm->seq_params->use_highbitdepth;
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0];
+ // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set
+ if (rsc->skip_sgr_eval) {
+ rsc->total_bits[RESTORE_SGRPROJ] += bits_none;
+ rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE;
+ rsc->sse[RESTORE_SGRPROJ] = INT64_MAX;
+ return;
+ }
+
+ uint8_t *dgd_start =
+ rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start;
+ const uint8_t *src_start =
+ rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start;
+
+ const int is_uv = rsc->plane > 0;
+ const int ss_x = is_uv && cm->seq_params->subsampling_x;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+ const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+
+ rusi->sgrproj = search_selfguided_restoration(
+ dgd_start, limits->h_end - limits->h_start,
+ limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
+ rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
+ tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning, error_info);
+
+ RestorationUnitInfo rui;
+ rui.restoration_type = RESTORE_SGRPROJ;
+ rui.sgrproj_info = rusi->sgrproj;
+
+ rsc->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, &rui);
+
+ const int64_t bits_sgr =
+ x->mode_costs.sgrproj_restore_cost[1] +
+ (count_sgrproj_bits(&rusi->sgrproj, &rsc->ref_sgrproj)
+ << AV1_PROB_COST_SHIFT);
+ double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE], bit_depth);
+ double cost_sgr = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_sgr >> 4, rsc->sse[RESTORE_SGRPROJ], bit_depth);
+ if (rusi->sgrproj.ep < 10)
+ cost_sgr *=
+ (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
+
+ RestorationType rtype =
+ (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE;
+ rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype;
+
+#if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_SGRPROJ][rsc->plane][rest_unit_idx].sgrproj_info =
+ rsc->ref_sgrproj;
+#endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[rtype];
+ rsc->total_bits[RESTORE_SGRPROJ] +=
+ (cost_sgr < cost_none) ? bits_sgr : bits_none;
+ if (cost_sgr < cost_none) rsc->ref_sgrproj = rusi->sgrproj;
+}
+
+static void acc_stat_one_line(const uint8_t *dgd, const uint8_t *src,
+ int dgd_stride, int h_start, int h_end,
+ uint8_t avg, const int wiener_halfwin,
+ const int wiener_win2, int32_t *M_int32,
+ int32_t *H_int32, int count) {
+ int j, k, l;
+ int16_t Y[WIENER_WIN2];
+
+ for (j = h_start; j < h_end; j++) {
+ const int16_t X = (int16_t)src[j] - (int16_t)avg;
+ int idx = 0;
+ for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
+ for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
+ Y[idx] =
+ (int16_t)dgd[(count + l) * dgd_stride + (j + k)] - (int16_t)avg;
+ idx++;
+ }
+ }
+ assert(idx == wiener_win2);
+ for (k = 0; k < wiener_win2; ++k) {
+ M_int32[k] += (int32_t)Y[k] * X;
+ for (l = k; l < wiener_win2; ++l) {
+        // H is a symmetric matrix, so we only need to fill out the upper
+        // triangle here. The caller copies it down to the lower triangle
+        // after accumulation.
+ H_int32[k * wiener_win2 + l] += (int32_t)Y[k] * Y[l];
+ }
+ }
+ }
+}
+
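+// M accumulates the cross-correlation between each Wiener window of degraded
+// samples (Y) and the corresponding source sample (X); H accumulates the
+// autocorrelation of the window. Together they form the normal equations
+// whose solution yields the Wiener filter taps.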
+void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int16_t *dgd_avg, int16_t *src_avg, int h_start,
+ int h_end, int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ (void)dgd_avg;
+ (void)src_avg;
+ int i, k, l;
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+ int32_t M_row[WIENER_WIN2] = { 0 };
+ int32_t H_row[WIENER_WIN2 * WIENER_WIN2] = { 0 };
+ int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+ memset(M, 0, sizeof(*M) * wiener_win2);
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+ for (i = v_start; i < v_end; i = i + downsample_factor) {
+ if (use_downsampled_wiener_stats &&
+ (v_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+ downsample_factor = v_end - i;
+ }
+
+ memset(M_row, 0, sizeof(int32_t) * WIENER_WIN2);
+ memset(H_row, 0, sizeof(int32_t) * WIENER_WIN2 * WIENER_WIN2);
+ acc_stat_one_line(dgd, src + i * src_stride, dgd_stride, h_start, h_end,
+ avg, wiener_halfwin, wiener_win2, M_row, H_row, i);
+
+ for (k = 0; k < wiener_win2; ++k) {
+ // Scale M matrix based on the downsampling factor
+ M[k] += ((int64_t)M_row[k] * downsample_factor);
+ for (l = k; l < wiener_win2; ++l) {
+        // H is a symmetric matrix, so we only need to fill out the upper
+        // triangle here. It is copied down to the lower triangle after the
+        // accumulation loops.
+        // Scale the H matrix based on the downsampling factor.
+ H[k * wiener_win2 + l] +=
+ ((int64_t)H_row[k * wiener_win2 + l] * downsample_factor);
+ }
+ }
+ }
+
+ for (k = 0; k < wiener_win2; ++k) {
+ for (l = k + 1; l < wiener_win2; ++l) {
+ H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8,
+ const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ int i, j, k, l;
+ int32_t Y[WIENER_WIN2];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ memset(M, 0, sizeof(*M) * wiener_win2);
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j++) {
+ const int32_t X = (int32_t)src[i * src_stride + j] - (int32_t)avg;
+ int idx = 0;
+ for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
+ for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
+ Y[idx] = (int32_t)dgd[(i + l) * dgd_stride + (j + k)] - (int32_t)avg;
+ idx++;
+ }
+ }
+ assert(idx == wiener_win2);
+ for (k = 0; k < wiener_win2; ++k) {
+ M[k] += (int64_t)Y[k] * X;
+ for (l = k; l < wiener_win2; ++l) {
+ // H is a symmetric matrix, so we only need to fill out the upper
+ // triangle here. We can copy it down to the lower triangle outside
+ // the (i, j) loops.
+ H[k * wiener_win2 + l] += (int64_t)Y[k] * Y[l];
+ }
+ }
+ }
+ }
+ for (k = 0; k < wiener_win2; ++k) {
+ M[k] /= bit_depth_divider;
+ H[k * wiener_win2 + k] /= bit_depth_divider;
+ for (l = k + 1; l < wiener_win2; ++l) {
+ H[k * wiener_win2 + l] /= bit_depth_divider;
+ H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+ }
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
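+// Map a tap index onto the symmetric half window: taps i and
+// (wiener_win - 1 - i) share a coefficient, so both fold to the same index.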
+static INLINE int wrap_index(int i, int wiener_win) {
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ return (i >= wiener_halfwin1 ? wiener_win - 1 - i : i);
+}
+
+// Solve linear equations to find Wiener filter tap values
+// Taps are output scaled by WIENER_FILT_STEP
+static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b,
+ int64_t *x) {
+ for (int k = 0; k < n - 1; k++) {
+ // Partial pivoting: bring the row with the largest pivot to the top
+ for (int i = n - 1; i > k; i--) {
+ // If row i has a better (bigger) pivot than row (i-1), swap them
+ if (llabs(A[(i - 1) * stride + k]) < llabs(A[i * stride + k])) {
+ for (int j = 0; j < n; j++) {
+ const int64_t c = A[i * stride + j];
+ A[i * stride + j] = A[(i - 1) * stride + j];
+ A[(i - 1) * stride + j] = c;
+ }
+ const int64_t c = b[i];
+ b[i] = b[i - 1];
+ b[i - 1] = c;
+ }
+ }
+
+ // b/278065963: The multiplies
+ // c / 256 * A[k * stride + j] / cd * 256
+ // and
+ // c / 256 * b[k] / cd * 256
+ // within Gaussian elimination can cause a signed integer overflow. Rework
+ // the multiplies so that larger scaling is used without significantly
+ // impacting the overall precision.
+ //
+ // Precision guidance:
+ // scale_threshold: Pick as high as possible.
+ // For max_abs_akj >= scale_threshold scenario:
+ // scaler_A: Pick as low as possible. Needed for A[(i + 1) * stride + j].
+ // scaler_c: Pick as low as possible while maintaining scaler_c >=
+ // (1 << 7). Needed for A[(i + 1) * stride + j] and b[i + 1].
+ int64_t max_abs_akj = 0;
+ for (int j = 0; j < n; j++) {
+ const int64_t abs_akj = llabs(A[k * stride + j]);
+ if (abs_akj > max_abs_akj) max_abs_akj = abs_akj;
+ }
+ const int scale_threshold = 1 << 22;
+ const int scaler_A = max_abs_akj < scale_threshold ? 1 : (1 << 5);
+ const int scaler_c = max_abs_akj < scale_threshold ? 1 : (1 << 7);
+ const int scaler = scaler_c * scaler_A;
+
+ // Forward elimination (convert A to row-echelon form)
+ for (int i = k; i < n - 1; i++) {
+ if (A[k * stride + k] == 0) return 0;
+ const int64_t c = A[(i + 1) * stride + k] / scaler_c;
+ const int64_t cd = A[k * stride + k];
+ for (int j = 0; j < n; j++) {
+ A[(i + 1) * stride + j] -=
+ A[k * stride + j] / scaler_A * c / cd * scaler;
+ }
+ b[i + 1] -= c * b[k] / cd * scaler_c;
+ }
+ }
+ // Back-substitution
+ for (int i = n - 1; i >= 0; i--) {
+ if (A[i * stride + i] == 0) return 0;
+ int64_t c = 0;
+ for (int j = i + 1; j <= n - 1; j++) {
+ c += A[i * stride + j] * x[j] / WIENER_TAP_SCALE_FACTOR;
+ }
+ // Store filter taps x in scaled form.
+ x[i] = WIENER_TAP_SCALE_FACTOR * (b[i] - c) / A[i * stride + i];
+ }
+
+ return 1;
+}
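+// The solver above works in fixed point and returns 0 when a zero pivot
+// makes the system singular. Inputs are expected to be large accumulated
+// correlation statistics, so the precision lost to the integer divides
+// during elimination is negligible at typical operating magnitudes.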
+
+// Fix vector b, update vector a
+static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
+ int64_t **Hc, int32_t *a, int32_t *b) {
+ int i, j;
+ int64_t S[WIENER_WIN];
+ int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; ++j) {
+ const int jj = wrap_index(j, wiener_win);
+ A[jj] += Mc[i][j] * b[i] / WIENER_TAP_SCALE_FACTOR;
+ }
+ }
+
+ // b/274668506: This is the dual branch for the issue in b/272139363. The fix
+ // is similar. See comments in update_b_sep_sym() below.
+ int32_t max_b_l = 0;
+ for (int l = 0; l < wiener_win; ++l) {
+ const int32_t abs_b_l = abs(b[l]);
+ if (abs_b_l > max_b_l) max_b_l = abs_b_l;
+ }
+ const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR;
+ const int scaler = max_b_l < scale_threshold ? 1 : 4;
+
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; j++) {
+ int k, l;
+ for (k = 0; k < wiener_win; ++k) {
+ const int kk = wrap_index(k, wiener_win);
+ for (l = 0; l < wiener_win; ++l) {
+ const int ll = wrap_index(l, wiener_win);
+ B[ll * wiener_halfwin1 + kk] +=
+ Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] /
+ (scaler * WIENER_TAP_SCALE_FACTOR) * b[j] /
+ (WIENER_TAP_SCALE_FACTOR / scaler);
+ }
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ A[i] -=
+ A[wiener_halfwin1 - 1] * 2 +
+ B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+ }
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ for (j = 0; j < wiener_halfwin1 - 1; ++j) {
+ B[i * wiener_halfwin1 + j] -=
+ 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+ B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+ (wiener_halfwin1 - 1)]);
+ }
+ }
+ if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+ S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR;
+ for (i = wiener_halfwin1; i < wiener_win; ++i) {
+ S[i] = S[wiener_win - 1 - i];
+ S[wiener_halfwin1 - 1] -= 2 * S[i];
+ }
+ for (i = 0; i < wiener_win; ++i) {
+ a[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)),
+ (1 << (WIENER_FILT_BITS - 1)) - 1);
+ }
+ }
+}
+
+// Fix vector a, update vector b
+static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
+ int64_t **Hc, int32_t *a, int32_t *b) {
+ int i, j;
+ int64_t S[WIENER_WIN];
+ int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < wiener_win; i++) {
+ const int ii = wrap_index(i, wiener_win);
+ for (j = 0; j < wiener_win; j++) {
+ A[ii] += Mc[i][j] * a[j] / WIENER_TAP_SCALE_FACTOR;
+ }
+ }
+
+ // b/272139363: The computation,
+ // Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
+ // WIENER_TAP_SCALE_FACTOR * a[l] / WIENER_TAP_SCALE_FACTOR;
+ // may generate a signed-integer-overflow. Conditionally scale the terms to
+ // avoid a potential overflow.
+ //
+ // Hc contains accumulated correlation statistics and it is desired to leave
+ // as much room as possible for Hc. It was experimentally observed that the
+ // primary issue manifests itself with the second, a[l], multiply. For
+ // max_a_l < WIENER_TAP_SCALE_FACTOR the first multiply with a[k] should not
+ // increase dynamic range and the second multiply should hence be safe.
+ // Thereafter a safe scale_threshold depends on the actual operational range
+ // of Hc. The largest scale_threshold is expected to depend on bit-depth
+ // (av1_compute_stats_highbd_c() scales highbd to 8-bit) and maximum
+ // restoration-unit size (256), leading up to 32-bit positive numbers in Hc.
+ // Noting that the caller, wiener_decompose_sep_sym(), initializes a[...]
+ // to a range smaller than 16 bits, the scale_threshold is set as below for
+ // convenience.
+ int32_t max_a_l = 0;
+ for (int l = 0; l < wiener_win; ++l) {
+ const int32_t abs_a_l = abs(a[l]);
+ if (abs_a_l > max_a_l) max_a_l = abs_a_l;
+ }
+ const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR;
+ const int scaler = max_a_l < scale_threshold ? 1 : 4;
+
+ for (i = 0; i < wiener_win; i++) {
+ const int ii = wrap_index(i, wiener_win);
+ for (j = 0; j < wiener_win; j++) {
+ const int jj = wrap_index(j, wiener_win);
+ int k, l;
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ B[jj * wiener_halfwin1 + ii] +=
+ Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
+ (scaler * WIENER_TAP_SCALE_FACTOR) * a[l] /
+ (WIENER_TAP_SCALE_FACTOR / scaler);
+ }
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ A[i] -=
+ A[wiener_halfwin1 - 1] * 2 +
+ B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+ }
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ for (j = 0; j < wiener_halfwin1 - 1; ++j) {
+ B[i * wiener_halfwin1 + j] -=
+ 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+ B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+ (wiener_halfwin1 - 1)]);
+ }
+ }
+ if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+ S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR;
+ for (i = wiener_halfwin1; i < wiener_win; ++i) {
+ S[i] = S[wiener_win - 1 - i];
+ S[wiener_halfwin1 - 1] -= 2 * S[i];
+ }
+ for (i = 0; i < wiener_win; ++i) {
+ b[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)),
+ (1 << (WIENER_FILT_BITS - 1)) - 1);
+ }
+ }
+}
+
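+// Decompose the 2D Wiener solution into separable vertical (a) and
+// horizontal (b) filters by alternating minimization: each pass holds one
+// vector fixed and solves the reduced linear system for the other via
+// update_a_sep_sym() / update_b_sep_sym().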
+static void wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H,
+ int32_t *a, int32_t *b) {
+ static const int32_t init_filt[WIENER_WIN] = {
+ WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV,
+ WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV,
+ WIENER_FILT_TAP0_MIDV,
+ };
+ int64_t *Hc[WIENER_WIN2];
+ int64_t *Mc[WIENER_WIN];
+ int i, j, iter;
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ const int wiener_win2 = wiener_win * wiener_win;
+ for (i = 0; i < wiener_win; i++) {
+ a[i] = b[i] =
+ WIENER_TAP_SCALE_FACTOR / WIENER_FILT_STEP * init_filt[i + plane_off];
+ }
+ for (i = 0; i < wiener_win; i++) {
+ Mc[i] = M + i * wiener_win;
+ for (j = 0; j < wiener_win; j++) {
+ Hc[i * wiener_win + j] =
+ H + i * wiener_win * wiener_win2 + j * wiener_win;
+ }
+ }
+
+ iter = 1;
+ while (iter < NUM_WIENER_ITERS) {
+ update_a_sep_sym(wiener_win, Mc, Hc, a, b);
+ update_b_sep_sym(wiener_win, Mc, Hc, a, b);
+ iter++;
+ }
+}
+
+// Computes the function x'*H*x - 2*x'*M for the learned 2D filter x and
+// compares it against the identity filter; the final score is defined as the
+// difference between the two function values.
+static int64_t compute_score(int wiener_win, int64_t *M, int64_t *H,
+ InterpKernel vfilt, InterpKernel hfilt) {
+ int32_t ab[WIENER_WIN * WIENER_WIN];
+ int16_t a[WIENER_WIN], b[WIENER_WIN];
+ int64_t P = 0, Q = 0;
+ int64_t iP = 0, iQ = 0;
+ int64_t Score, iScore;
+ int i, k, l;
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = WIENER_FILT_STEP;
+ for (i = 0; i < WIENER_HALFWIN; ++i) {
+ a[i] = a[WIENER_WIN - i - 1] = vfilt[i];
+ b[i] = b[WIENER_WIN - i - 1] = hfilt[i];
+ a[WIENER_HALFWIN] -= 2 * a[i];
+ b[WIENER_HALFWIN] -= 2 * b[i];
+ }
+ memset(ab, 0, sizeof(ab));
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l)
+ ab[k * wiener_win + l] = a[l + plane_off] * b[k + plane_off];
+ }
+ for (k = 0; k < wiener_win2; ++k) {
+ P += ab[k] * M[k] / WIENER_FILT_STEP / WIENER_FILT_STEP;
+ for (l = 0; l < wiener_win2; ++l) {
+ Q += ab[k] * H[k * wiener_win2 + l] * ab[l] / WIENER_FILT_STEP /
+ WIENER_FILT_STEP / WIENER_FILT_STEP / WIENER_FILT_STEP;
+ }
+ }
+ Score = Q - 2 * P;
+
+ iP = M[wiener_win2 >> 1];
+ iQ = H[(wiener_win2 >> 1) * wiener_win2 + (wiener_win2 >> 1)];
+ iScore = iQ - 2 * iP;
+
+ return Score - iScore;
+}
+
+static AOM_INLINE void finalize_sym_filter(int wiener_win, int32_t *f,
+ InterpKernel fi) {
+ int i;
+ const int wiener_halfwin = (wiener_win >> 1);
+
+ for (i = 0; i < wiener_halfwin; ++i) {
+ const int64_t dividend = (int64_t)f[i] * WIENER_FILT_STEP;
+ const int64_t divisor = WIENER_TAP_SCALE_FACTOR;
+ // Perform this division with proper rounding rather than truncation
+ if (dividend < 0) {
+ fi[i] = (int16_t)((dividend - (divisor / 2)) / divisor);
+ } else {
+ fi[i] = (int16_t)((dividend + (divisor / 2)) / divisor);
+ }
+ }
+ // Specialize for 7-tap filter
+ if (wiener_win == WIENER_WIN) {
+ fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+ fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+ fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+ } else {
+ fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+ fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+ fi[0] = 0;
+ }
+ // Satisfy filter constraints
+ fi[WIENER_WIN - 1] = fi[0];
+ fi[WIENER_WIN - 2] = fi[1];
+ fi[WIENER_WIN - 3] = fi[2];
+ // The central element has an implicit +WIENER_FILT_STEP
+ fi[3] = -2 * (fi[0] + fi[1] + fi[2]);
+}
+
+static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info) {
+ int bits = 0;
+ if (wiener_win == WIENER_WIN)
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+ if (wiener_win == WIENER_WIN)
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+ return bits;
+}
+
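+// Greedy refinement around the solved Wiener taps: for each symmetric tap
+// pair (horizontal first, then vertical), try stepping by -s then +s while
+// keeping the centre tap normalized via the +/- 2 * s adjustment, halving s
+// from 4 down to 1. A move is kept only if the filtered SSE does not
+// increase.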
+static int64_t finer_search_wiener(const RestSearchCtxt *rsc,
+ const RestorationTileLimits *limits,
+ RestorationUnitInfo *rui, int wiener_win) {
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ int64_t err = try_restoration_unit(rsc, limits, rui);
+
+ if (rsc->lpf_sf->disable_wiener_coeff_refine_search) return err;
+
+ // Refinement search around the wiener filter coefficients.
+ int64_t err2;
+ int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV,
+ WIENER_FILT_TAP2_MINV };
+ int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV,
+ WIENER_FILT_TAP2_MAXV };
+
+ WienerInfo *plane_wiener = &rui->wiener_info;
+
+ // printf("err pre = %"PRId64"\n", err);
+ const int start_step = 4;
+ for (int s = start_step; s >= 1; s >>= 1) {
+ for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
+ int skip = 0;
+ do {
+ if (plane_wiener->hfilter[p] - s >= tap_min[p]) {
+ plane_wiener->hfilter[p] -= s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ plane_wiener->hfilter[p] += s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ if (skip) break;
+ do {
+ if (plane_wiener->hfilter[p] + s <= tap_max[p]) {
+ plane_wiener->hfilter[p] += s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ plane_wiener->hfilter[p] -= s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
+ int skip = 0;
+ do {
+ if (plane_wiener->vfilter[p] - s >= tap_min[p]) {
+ plane_wiener->vfilter[p] -= s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ plane_wiener->vfilter[p] += s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ if (skip) break;
+ do {
+ if (plane_wiener->vfilter[p] + s <= tap_max[p]) {
+ plane_wiener->vfilter[p] += s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ plane_wiener->vfilter[p] -= s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ }
+ // printf("err post = %"PRId64"\n", err);
+ return err;
+}
+
+static AOM_INLINE void search_wiener(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)tmpbuf;
+ (void)rlbs;
+ (void)error_info;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const MACROBLOCK *const x = rsc->x;
+ const int64_t bits_none = x->mode_costs.wiener_restore_cost[0];
+
+  // Skip the Wiener search for low-variance content.
+ if (rsc->lpf_sf->prune_wiener_based_on_src_var) {
+ const int scale[3] = { 0, 1, 2 };
+ // Obtain the normalized Qscale
+ const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0,
+ rsc->cm->seq_params->bit_depth) >>
+ 3;
+    // Derive the threshold as (normalized Qscale)^2 * scale / 16.
+ const uint64_t thresh =
+ (qs * qs * scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4;
+ const int highbd = rsc->cm->seq_params->use_highbitdepth;
+ const uint64_t src_var =
+ var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
+ // Do not perform Wiener search if source variance is lower than threshold
+ // or if the reconstruction error is zero
+ int prune_wiener = (src_var < thresh) || (rsc->sse[RESTORE_NONE] == 0);
+ if (prune_wiener) {
+ rsc->total_bits[RESTORE_WIENER] += bits_none;
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+ rsc->sse[RESTORE_WIENER] = INT64_MAX;
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
+ return;
+ }
+ }
+
+ const int wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+ int reduced_wiener_win = wiener_win;
+ if (rsc->lpf_sf->reduce_wiener_window_size) {
+ reduced_wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA;
+ }
+
+ int64_t M[WIENER_WIN2];
+ int64_t H[WIENER_WIN2 * WIENER_WIN2];
+ int32_t vfilter[WIENER_WIN], hfilter[WIENER_WIN];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const AV1_COMMON *const cm = rsc->cm;
+ if (cm->seq_params->use_highbitdepth) {
+    // TODO(any): Add support for the use_downsampled_wiener_stats speed
+    // feature in the HBD functions. Optimize the HBD intrinsics similarly to
+    // the LBD ones (i.e., pre-calculate the d and s buffers and avoid most of
+    // the C operations).
+ av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer,
+ rsc->src_buffer, limits->h_start, limits->h_end,
+ limits->v_start, limits->v_end, rsc->dgd_stride,
+ rsc->src_stride, M, H, cm->seq_params->bit_depth);
+ } else {
+ av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ rsc->dgd_avg, rsc->src_avg, limits->h_start,
+ limits->h_end, limits->v_start, limits->v_end,
+ rsc->dgd_stride, rsc->src_stride, M, H,
+ rsc->lpf_sf->use_downsampled_wiener_stats);
+ }
+#else
+ av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ rsc->dgd_avg, rsc->src_avg, limits->h_start, limits->h_end,
+ limits->v_start, limits->v_end, rsc->dgd_stride,
+ rsc->src_stride, M, H,
+ rsc->lpf_sf->use_downsampled_wiener_stats);
+#endif
+
+ wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter);
+
+ RestorationUnitInfo rui;
+ memset(&rui, 0, sizeof(rui));
+ rui.restoration_type = RESTORE_WIENER;
+ finalize_sym_filter(reduced_wiener_win, vfilter, rui.wiener_info.vfilter);
+ finalize_sym_filter(reduced_wiener_win, hfilter, rui.wiener_info.hfilter);
+
+  // The filter score computes the value of the function x'*A*x - x'*b for
+  // the learned filter and compares it against the same score for the
+  // identity filter. If there is no reduction in the function value, the
+  // filter is reverted to the identity filter.
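+  // (A positive score means the learned filter does not reduce the objective
+  // relative to the identity filter, so the unit falls back to RESTORE_NONE.)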
+ if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter,
+ rui.wiener_info.hfilter) > 0) {
+ rsc->total_bits[RESTORE_WIENER] += bits_none;
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+ rsc->sse[RESTORE_WIENER] = INT64_MAX;
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
+ return;
+ }
+
+ rsc->sse[RESTORE_WIENER] =
+ finer_search_wiener(rsc, limits, &rui, reduced_wiener_win);
+ rusi->wiener = rui.wiener_info;
+
+ if (reduced_wiener_win != WIENER_WIN) {
+ assert(rui.wiener_info.vfilter[0] == 0 &&
+ rui.wiener_info.vfilter[WIENER_WIN - 1] == 0);
+ assert(rui.wiener_info.hfilter[0] == 0 &&
+ rui.wiener_info.hfilter[WIENER_WIN - 1] == 0);
+ }
+
+ const int64_t bits_wiener =
+ x->mode_costs.wiener_restore_cost[1] +
+ (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->ref_wiener)
+ << AV1_PROB_COST_SHIFT);
+
+ double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE],
+ rsc->cm->seq_params->bit_depth);
+ double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_wiener >> 4, rsc->sse[RESTORE_WIENER],
+ rsc->cm->seq_params->bit_depth);
+
+ RestorationType rtype =
+ (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
+ rusi->best_rtype[RESTORE_WIENER - 1] = rtype;
+
+ // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and
+ // RESTORE_NONE or based on best_rtype
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 1) {
+ rsc->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
+ } else if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) {
+ rsc->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
+ }
+
+#if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_WIENER][rsc->plane][rest_unit_idx].wiener_info =
+ rsc->ref_wiener;
+#endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[rtype];
+ rsc->total_bits[RESTORE_WIENER] +=
+ (cost_wiener < cost_none) ? bits_wiener : bits_none;
+ if (cost_wiener < cost_none) rsc->ref_wiener = rusi->wiener;
+}
+
+static AOM_INLINE void search_norestore(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)rest_unit_idx;
+ (void)tmpbuf;
+ (void)rlbs;
+ (void)error_info;
+
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+
+ const int highbd = rsc->cm->seq_params->use_highbitdepth;
+ rsc->sse[RESTORE_NONE] = sse_restoration_unit(
+ limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd);
+
+ rsc->total_sse[RESTORE_NONE] += rsc->sse[RESTORE_NONE];
+}
+
+static AOM_INLINE void search_switchable(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)limits;
+ (void)tmpbuf;
+ (void)rlbs;
+ (void)error_info;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const MACROBLOCK *const x = rsc->x;
+
+ const int wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+ double best_cost = 0;
+ int64_t best_bits = 0;
+ RestorationType best_rtype = RESTORE_NONE;
+
+ for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
+ // If this restoration mode was skipped, or could not find a solution
+ // that was better than RESTORE_NONE, then we can't select it here either.
+ //
+ // Note: It is possible for the restoration search functions to find a
+ // filter which is better than RESTORE_NONE when looking purely at SSE, but
+ // for it to be rejected overall due to its rate cost. In this case, there
+  // is a chance that it may have a lower rate cost when looking at
+ // RESTORE_SWITCHABLE, and so it might be acceptable here.
+ //
+ // Therefore we prune based on SSE, rather than on whether or not the
+ // previous search function selected this mode.
+ if (r > RESTORE_NONE) {
+ if (rsc->sse[r] > rsc->sse[RESTORE_NONE]) continue;
+ }
+
+ const int64_t sse = rsc->sse[r];
+ int64_t coeff_pcost = 0;
+ switch (r) {
+ case RESTORE_NONE: coeff_pcost = 0; break;
+ case RESTORE_WIENER:
+ coeff_pcost = count_wiener_bits(wiener_win, &rusi->wiener,
+ &rsc->switchable_ref_wiener);
+ break;
+ case RESTORE_SGRPROJ:
+ coeff_pcost =
+ count_sgrproj_bits(&rusi->sgrproj, &rsc->switchable_ref_sgrproj);
+ break;
+ default: assert(0); break;
+ }
+ const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT;
+ const int64_t bits = x->mode_costs.switchable_restore_cost[r] + coeff_bits;
+ double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits >> 4, sse, rsc->cm->seq_params->bit_depth);
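+    // Self-guided modes with ep < 10 use both filter passes, so their RD
+    // cost is inflated in proportion to dual_sgr_penalty_level to bias the
+    // search toward the cheaper single-pass modes.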
+ if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10)
+ cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
+ if (r == 0 || cost < best_cost) {
+ best_cost = cost;
+ best_bits = bits;
+ best_rtype = r;
+ }
+ }
+
+ rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype;
+
+#if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].wiener_info =
+ rsc->switchable_ref_wiener;
+ lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].sgrproj_info =
+ rsc->switchable_ref_sgrproj;
+#endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_SWITCHABLE] += rsc->sse[best_rtype];
+ rsc->total_bits[RESTORE_SWITCHABLE] += best_bits;
+ if (best_rtype == RESTORE_WIENER) rsc->switchable_ref_wiener = rusi->wiener;
+ if (best_rtype == RESTORE_SGRPROJ)
+ rsc->switchable_ref_sgrproj = rusi->sgrproj;
+}
+
+static AOM_INLINE void copy_unit_info(RestorationType frame_rtype,
+ const RestUnitSearchInfo *rusi,
+ RestorationUnitInfo *rui) {
+ assert(frame_rtype > 0);
+ rui->restoration_type = rusi->best_rtype[frame_rtype - 1];
+ if (rui->restoration_type == RESTORE_WIENER)
+ rui->wiener_info = rusi->wiener;
+ else
+ rui->sgrproj_info = rusi->sgrproj;
+}
+
+static void restoration_search(AV1_COMMON *cm, int plane, RestSearchCtxt *rsc,
+ bool *disable_lr_filter) {
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const CommonTileParams *tiles = &cm->tiles;
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ RestorationInfo *rsi = &cm->rst_info[plane];
+ const int ru_size = rsi->restoration_unit_size;
+ const int ext_size = ru_size * 3 / 2;
+
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+ static const rest_unit_visitor_t funs[RESTORE_TYPES] = {
+ search_norestore, search_wiener, search_sgrproj, search_switchable
+ };
+
+ const int plane_num_units = rsi->num_rest_units;
+ const RestorationType num_rtypes =
+ (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+
+ reset_rsc(rsc);
+
+ // Iterate over restoration units in encoding order, so that each RU gets
+ // the correct reference parameters when we cost it up. This is effectively
+ // a nested iteration over:
+ // * Each tile, order does not matter
+ // * Each superblock within that tile, in raster order
+ // * Each LR unit which is coded within that superblock, in raster order
+ for (int tile_row = 0; tile_row < tiles->rows; tile_row++) {
+ int sb_row_start = tiles->row_start_sb[tile_row];
+ int sb_row_end = tiles->row_start_sb[tile_row + 1];
+ for (int tile_col = 0; tile_col < tiles->cols; tile_col++) {
+ int sb_col_start = tiles->col_start_sb[tile_col];
+ int sb_col_end = tiles->col_start_sb[tile_col + 1];
+
+ // Reset reference parameters for delta-coding at the start of each tile
+ rsc_on_tile(rsc);
+
+ for (int sb_row = sb_row_start; sb_row < sb_row_end; sb_row++) {
+ int mi_row = sb_row << mib_size_log2;
+ for (int sb_col = sb_col_start; sb_col < sb_col_end; sb_col++) {
+ int mi_col = sb_col << mib_size_log2;
+
+ int rcol0, rcol1, rrow0, rrow1;
+ int has_lr_info = av1_loop_restoration_corners_in_sb(
+ cm, plane, mi_row, mi_col, sb_size, &rcol0, &rcol1, &rrow0,
+ &rrow1);
+
+ if (!has_lr_info) continue;
+
+ RestorationTileLimits limits;
+ for (int rrow = rrow0; rrow < rrow1; rrow++) {
+ int y0 = rrow * ru_size;
+ int remaining_h = plane_h - y0;
+ int h = (remaining_h < ext_size) ? remaining_h : ru_size;
+
+ limits.v_start = y0;
+ limits.v_end = y0 + h;
+ assert(limits.v_end <= plane_h);
+ // Offset upwards to align with the restoration processing stripe
+ const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+ limits.v_start = AOMMAX(0, limits.v_start - voffset);
+ if (limits.v_end < plane_h) limits.v_end -= voffset;
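+                  // (Net effect: each RU row is shifted up by voffset, which
+                  // makes the first row voffset pixels shorter and lets the
+                  // last row extend to the bottom of the plane.)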
+
+ for (int rcol = rcol0; rcol < rcol1; rcol++) {
+ int x0 = rcol * ru_size;
+ int remaining_w = plane_w - x0;
+ int w = (remaining_w < ext_size) ? remaining_w : ru_size;
+
+ limits.h_start = x0;
+ limits.h_end = x0 + w;
+ assert(limits.h_end <= plane_w);
+
+ const int unit_idx = rrow * rsi->horz_units + rcol;
+
+ rsc->skip_sgr_eval = 0;
+ for (RestorationType r = RESTORE_NONE; r < num_rtypes; r++) {
+ if (disable_lr_filter[r]) continue;
+
+ funs[r](&limits, unit_idx, rsc, rsc->cm->rst_tmpbuf, NULL,
+ cm->error);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static INLINE void av1_derive_flags_for_lr_processing(
+ const LOOP_FILTER_SPEED_FEATURES *lpf_sf, bool *disable_lr_filter) {
+ const bool is_wiener_disabled = lpf_sf->disable_wiener_filter;
+ const bool is_sgr_disabled = lpf_sf->disable_sgr_filter;
+
+  // Only disable the RESTORE_NONE option if both the Wiener and the
+  // self-guided filters are disabled.
+ disable_lr_filter[RESTORE_NONE] = (is_wiener_disabled && is_sgr_disabled);
+
+ disable_lr_filter[RESTORE_WIENER] = is_wiener_disabled;
+ disable_lr_filter[RESTORE_SGRPROJ] = is_sgr_disabled;
+
+  // Enable the switchable mode only if both the Wiener and the self-guided
+  // filters are enabled.
+ disable_lr_filter[RESTORE_SWITCHABLE] =
+ (is_wiener_disabled || is_sgr_disabled);
+}
+
+#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
+// Allocate both decoder-side and encoder-side info structs for a single plane.
+// The unit size passed in should be the minimum size which we are going to
+// search; before each search, set_restoration_unit_size() must be called to
+// configure the actual size.
+static RestUnitSearchInfo *allocate_search_structs(AV1_COMMON *cm,
+ RestorationInfo *rsi,
+ int is_uv,
+ int min_luma_unit_size) {
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+  int sx = cm->seq_params->subsampling_x;
+  int sy = cm->seq_params->subsampling_y;
+  int s = is_uv ? AOMMIN(sx, sy) : 0;
+#else
+ int s = 0;
+#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
+ int min_unit_size = min_luma_unit_size >> s;
+
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+ const int max_horz_units = av1_lr_count_units(min_unit_size, plane_w);
+ const int max_vert_units = av1_lr_count_units(min_unit_size, plane_h);
+ const int max_num_units = max_horz_units * max_vert_units;
+
+ aom_free(rsi->unit_info);
+ CHECK_MEM_ERROR(cm, rsi->unit_info,
+ (RestorationUnitInfo *)aom_memalign(
+ 16, sizeof(*rsi->unit_info) * max_num_units));
+
+ RestUnitSearchInfo *rusi;
+ CHECK_MEM_ERROR(
+ cm, rusi,
+ (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * max_num_units));
+
+  // If the plane dimensions are not multiples of
+  // rsi->restoration_unit_size then some elements of the rusi array may be
+ // left uninitialised when we reach copy_unit_info(...). This is not a
+ // problem, as these elements are ignored later, but in order to quiet
+ // Valgrind's warnings we initialise the array below.
+ memset(rusi, 0, sizeof(*rusi) * max_num_units);
+
+ return rusi;
+}
+
+static void set_restoration_unit_size(AV1_COMMON *cm, RestorationInfo *rsi,
+ int is_uv, int luma_unit_size) {
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+  int sx = cm->seq_params->subsampling_x;
+  int sy = cm->seq_params->subsampling_y;
+  int s = is_uv ? AOMMIN(sx, sy) : 0;
+#else
+ int s = 0;
+#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
+ int unit_size = luma_unit_size >> s;
+
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+ const int horz_units = av1_lr_count_units(unit_size, plane_w);
+ const int vert_units = av1_lr_count_units(unit_size, plane_h);
+
+ rsi->restoration_unit_size = unit_size;
+ rsi->num_rest_units = horz_units * vert_units;
+ rsi->horz_units = horz_units;
+ rsi->vert_units = vert_units;
+}
+
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf;
+ const int num_planes = av1_num_planes(cm);
+ const int highbd = cm->seq_params->use_highbitdepth;
+ assert(!cm->features.all_lossless);
+
+ av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx);
+
+ // Select unit size based on speed feature settings, and allocate
+ // rui structs based on this size
+ int min_lr_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
+ int max_lr_unit_size = cpi->sf.lpf_sf.max_lr_unit_size;
+
+  // The minimum allowed unit size at the syntax level is 1 superblock.
+  // Apply this constraint here so that the speed-feature code which sets
+  // cpi->sf.lpf_sf.min_lr_unit_size does not need to know the superblock
+  // size.
+ min_lr_unit_size =
+ AOMMAX(min_lr_unit_size, block_size_wide[cm->seq_params->sb_size]);
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ cpi->pick_lr_ctxt.rusi[plane] = allocate_search_structs(
+ cm, &cm->rst_info[plane], plane > 0, min_lr_unit_size);
+ }
+
+ x->rdmult = cpi->rd.RDMULT;
+
+ // Allocate the frame buffer trial_frame_rst, which is used to temporarily
+ // store the loop restored frame.
+ if (aom_realloc_frame_buffer(
+ &cpi->trial_frame_rst, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
+ cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate trial restored frame buffer");
+
+ RestSearchCtxt rsc;
+
+  // The buffers 'src_avg' and 'dgd_avg' are used to compute the H and M
+  // buffers. They are only required by the AVX2 and NEON implementations of
+  // av1_compute_stats. The required buffer size is derived from the maximum
+  // LRU dimensions allowed for Wiener filtering, i.e., 1.5 times
+  // RESTORATION_UNITSIZE_MAX (see foreach_rest_unit_in_plane()), with the
+  // width and height rounded up to a multiple of 16 for the intrinsics.
+ rsc.dgd_avg = NULL;
+ rsc.src_avg = NULL;
+#if HAVE_AVX2 || HAVE_NEON
+  // The buffers allocated below are used during Wiener filtering in the low
+  // bitdepth path, so only allocate them when the Wiener filter is enabled
+  // and the encode is low bitdepth.
+ if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+ const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 *
+ RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+ CHECK_MEM_ERROR(cm, cpi->pick_lr_ctxt.dgd_avg,
+ (int16_t *)aom_memalign(32, buf_size));
+
+ rsc.dgd_avg = cpi->pick_lr_ctxt.dgd_avg;
+    // When the LRU width is not a multiple of 16, the 256-bit loads used by
+    // the AVX2 intrinsics can read past the valid LRU data. To silence the
+    // resulting Valgrind warnings, the buffer is zero-initialized. The
+    // overhead is negligible since this is done once per frame.
+ memset(rsc.dgd_avg, 0, buf_size);
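+    // Layout: the first 3 * RESTORATION_UNITSIZE_MAX^2 int16_t values hold
+    // dgd_avg and the second half holds src_avg; each half comfortably
+    // covers the largest LRU (1.5 * RESTORATION_UNITSIZE_MAX per side, i.e.
+    // 2.25 * RESTORATION_UNITSIZE_MAX^2 samples).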
+ rsc.src_avg =
+ rsc.dgd_avg + 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+ // Asserts the starting address of src_avg is always 32-bytes aligned.
+ assert(!((intptr_t)rsc.src_avg % 32));
+ }
+#endif
+
+ // Initialize all planes, so that any planes we skip searching will still have
+ // valid data
+ for (int plane = 0; plane < num_planes; plane++) {
+ cm->rst_info[plane].frame_restoration_type = RESTORE_NONE;
+ }
+
+ // Decide which planes to search
+ int plane_start, plane_end;
+
+ if (lpf_sf->disable_loop_restoration_luma) {
+ plane_start = AOM_PLANE_U;
+ } else {
+ plane_start = AOM_PLANE_Y;
+ }
+
+ if (num_planes == 1 || lpf_sf->disable_loop_restoration_chroma) {
+ plane_end = AOM_PLANE_Y;
+ } else {
+ plane_end = AOM_PLANE_V;
+ }
+
+ // Derive the flags to enable/disable Loop restoration filters based on the
+ // speed features 'disable_wiener_filter' and 'disable_sgr_filter'.
+ bool disable_lr_filter[RESTORE_TYPES] = { false };
+ av1_derive_flags_for_lr_processing(lpf_sf, disable_lr_filter);
+
+ for (int plane = plane_start; plane <= plane_end; plane++) {
+ const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
+ const int is_uv = plane != AOM_PLANE_Y;
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+ av1_extend_frame(dgd->buffers[plane], plane_w, plane_h, dgd->strides[is_uv],
+ RESTORATION_BORDER, RESTORATION_BORDER, highbd);
+ }
+
+ double best_cost = DBL_MAX;
+ int best_luma_unit_size = max_lr_unit_size;
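+  // Try unit sizes from the largest down to the smallest, halving each time,
+  // and keep the size whose overall RD cost is best.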
+ for (int luma_unit_size = max_lr_unit_size;
+ luma_unit_size >= min_lr_unit_size; luma_unit_size >>= 1) {
+ int64_t bits_this_size = 0;
+ int64_t sse_this_size = 0;
+ RestorationType best_rtype[MAX_MB_PLANE] = { RESTORE_NONE, RESTORE_NONE,
+ RESTORE_NONE };
+ for (int plane = plane_start; plane <= plane_end; ++plane) {
+ set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+ luma_unit_size);
+ init_rsc(src, &cpi->common, x, lpf_sf, plane,
+ cpi->pick_lr_ctxt.rusi[plane], &cpi->trial_frame_rst, &rsc);
+
+ restoration_search(cm, plane, &rsc, disable_lr_filter);
+
+ const int plane_num_units = cm->rst_info[plane].num_rest_units;
+ const RestorationType num_rtypes =
+ (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+ double best_cost_this_plane = DBL_MAX;
+ for (RestorationType r = 0; r < num_rtypes; ++r) {
+ // Disable Loop restoration filter based on the flags set using speed
+ // feature 'disable_wiener_filter' and 'disable_sgr_filter'.
+ if (disable_lr_filter[r]) continue;
+
+ double cost_this_plane = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, rsc.total_bits[r] >> 4, rsc.total_sse[r],
+ cm->seq_params->bit_depth);
+
+ if (cost_this_plane < best_cost_this_plane) {
+ best_cost_this_plane = cost_this_plane;
+ best_rtype[plane] = r;
+ }
+ }
+
+ bits_this_size += rsc.total_bits[best_rtype[plane]];
+ sse_this_size += rsc.total_sse[best_rtype[plane]];
+ }
+
+ double cost_this_size = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_this_size >> 4, sse_this_size,
+ cm->seq_params->bit_depth);
+
+ if (cost_this_size < best_cost) {
+ best_cost = cost_this_size;
+ best_luma_unit_size = luma_unit_size;
+ // Copy parameters out of rusi struct, before we overwrite it at
+ // the start of the next iteration
+ bool all_none = true;
+ for (int plane = plane_start; plane <= plane_end; ++plane) {
+ cm->rst_info[plane].frame_restoration_type = best_rtype[plane];
+ if (best_rtype[plane] != RESTORE_NONE) {
+ all_none = false;
+ const int plane_num_units = cm->rst_info[plane].num_rest_units;
+ for (int u = 0; u < plane_num_units; ++u) {
+ copy_unit_info(best_rtype[plane], &cpi->pick_lr_ctxt.rusi[plane][u],
+ &cm->rst_info[plane].unit_info[u]);
+ }
+ }
+ }
+ // Heuristic: If all best_rtype entries are RESTORE_NONE, this means we
+ // couldn't find any good filters at this size. So we likely won't find
+ // any good filters at a smaller size either, so skip
+ if (all_none) {
+ break;
+ }
+ } else {
+ // Heuristic: If this size is worse than the previous (larger) size, then
+ // the next size down will likely be even worse, so skip
+ break;
+ }
+ }
+
+ // Final fixup to set the correct unit size
+ // We set this for all planes, even ones we have skipped searching,
+ // so that other code does not need to care which planes were and weren't
+ // searched
+ for (int plane = 0; plane < num_planes; ++plane) {
+ set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+ best_luma_unit_size);
+ }
+
+#if HAVE_AVX2 || HAVE_NEON
+ if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+ aom_free(cpi->pick_lr_ctxt.dgd_avg);
+ cpi->pick_lr_ctxt.dgd_avg = NULL;
+ }
+#endif
+ for (int plane = 0; plane < num_planes; plane++) {
+ aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+ cpi->pick_lr_ctxt.rusi[plane] = NULL;
+ }
+}
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
new file mode 100644
index 0000000000..d1d0b0cec6
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKRST_H_
+#define AOM_AV1_ENCODER_PICKRST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+// Enable extra debugging for loop restoration costing?
+//
+// If this is set to 1, then we record not just the selected LR parameters, but
+// also the values which the search process thinks they should be delta-coded
+// against. Then, when writing out the bitstream, we verify this information,
+// to help ensure that the search code is costing things properly
+#define DEBUG_LR_COSTING 0
+
+#if DEBUG_LR_COSTING
+#define MAX_LR_UNITS_W 64
+#define MAX_LR_UNITS_H 64
+
+// Storage for reference parameters.
+//
+// The storage size is determined by:
+// * This is always written and then checked within the same frame encode pass,
+// so we do not need to buffer multiple frames of data
+// * The parameters can be different per plane within one frame
+// * The relevant set of ref parameters can differ between the search where
+// we set the frame restoration mode to RESTORE_WIENER, and the search where
+// we set it to RESTORE_SWITCHABLE.
+// So we need to store at least two sets of Wiener params and two sets of
+// SGR params, and the easiest way to do this is to index by
+// frame_restoration_type
+extern RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE]
+ [MAX_LR_UNITS_W * MAX_LR_UNITS_H];
+#endif // DEBUG_LR_COSTING
+
+static const uint8_t g_shuffle_stats_data[16] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+};
+
+static const uint8_t g_shuffle_stats_highbd_data[32] = {
+ 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
+ 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
+};
+
+static INLINE uint8_t find_average(const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int stride) {
+ uint64_t sum = 0;
+ for (int i = v_start; i < v_end; i++) {
+ for (int j = h_start; j < h_end; j++) {
+ sum += src[i * stride + j];
+ }
+ }
+ uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start));
+ return (uint8_t)avg;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start,
+ int h_end, int v_start, int v_end,
+ int stride) {
+ uint64_t sum = 0;
+ for (int i = v_start; i < v_end; i++) {
+ for (int j = h_start; j < h_end; j++) {
+ sum += src[i * stride + j];
+ }
+ }
+ uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start));
+ return (uint16_t)avg;
+}
+#endif
+
+/*!\brief Algorithm for AV1 loop restoration search and estimation.
+ *
+ * \ingroup in_loop_restoration
+ * This function determines proper restoration filter types and
+ * associated parameters for each restoration unit in a frame.
+ *
+ * \param[in] sd Source frame buffer
+ * \param[in,out] cpi Top-level encoder structure
+ *
+ * \remark Nothing is returned. Instead, chosen restoration filter
+ * types and parameters are stored per plane in the \c rst_info structure
+ * of type \ref RestorationInfo inside \c cpi->common:
+ * \arg \c rst_info[ \c 0 ]: Chosen parameters for Y plane
+ * \arg \c rst_info[ \c 1 ]: Chosen parameters for U plane if it exists
+ * \arg \c rst_info[ \c 2 ]: Chosen parameters for V plane if it exists
+ * \par
+ * The following fields in each \c rst_info[ \c p], \c p = 0, 1, 2
+ * are populated:
+ * \arg \c rst_info[ \c p ].\c frame_restoration_type
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ],
+ * for each \c u in 0, 1, ..., \c n( \c p ) - 1,
+ * where \c n( \c p ) is the number of restoration units in plane \c p.
+ * \par
+ * The following fields in each \c rst_info[ \c p ].\c unit_info[ \c u ],
+ * \c p = 0, 1, 2 and \c u = 0, 1, ..., \c n( \c p ) - 1, of type
+ * \ref RestorationUnitInfo are populated:
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c wiener_info OR
+ * \c rst_info[ \c p ].\c unit_info[ \c u ].\c sgrproj_info OR
+ * neither, depending on
+ * \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type
+ *
+ */
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PICKRST_H_
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
new file mode 100644
index 0000000000..2e8710108b
--- /dev/null
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PUSTATS_H_
+#define AOM_AV1_ENCODER_PUSTATS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define NUM_FEATURES_PUSTATS 8
+#define NUM_HIDDEN_LAYERS 2
+#define HIDDEN_LAYERS_0_NODES 12
+#define HIDDEN_LAYERS_1_NODES 10
+#define LOGITS_NODES 1
+
+static const float
+ av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
+ HIDDEN_LAYERS_0_NODES] = {
+ -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f,
+ -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f,
+ 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f,
+ 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f,
+ -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f,
+ -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f,
+ -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f,
+ -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f,
+ 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f,
+ -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f,
+ -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f,
+ -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f,
+ 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f,
+ -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = {
+ 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f,
+ 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+ HIDDEN_LAYERS_1_NODES] = {
+ 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f,
+ 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f,
+ -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f,
+ 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f,
+ 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f,
+ -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f,
+ -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f,
+ -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f,
+ 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f,
+ 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f,
+ -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f,
+ -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f,
+ -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f,
+ 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f,
+ -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f,
+ -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f,
+ 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f,
+ -2.7566f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = {
+ 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f,
+ 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f,
+ };
+
+static const float
+ av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+ 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f,
+ 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f,
+ };
+
+static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
+ 4.5103f,
+};
+
+static const NN_CONFIG av1_pustats_rate_nnconfig = {
+ NUM_FEATURES_PUSTATS, // num_inputs
+ LOGITS_NODES, // num_outputs
+ NUM_HIDDEN_LAYERS, // num_hidden_layers
+ { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
+ {
+ av1_pustats_rate_hiddenlayer_0_kernel,
+ av1_pustats_rate_hiddenlayer_1_kernel,
+ av1_pustats_rate_logits_kernel,
+ },
+ {
+ av1_pustats_rate_hiddenlayer_0_bias,
+ av1_pustats_rate_hiddenlayer_1_bias,
+ av1_pustats_rate_logits_bias,
+ },
+};
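+// The rate model above (and the distortion model below) is a small MLP with
+// NUM_FEATURES_PUSTATS (8) inputs, hidden layers of 12 and 10 nodes, and a
+// single output logit; it is evaluated with av1_nn_predict() (see
+// "av1/encoder/ml.h").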
+
+static const float
+ av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
+ HIDDEN_LAYERS_0_NODES] = {
+ -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f,
+ 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f,
+ 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f,
+ 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f,
+ 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f,
+ -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f,
+ -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f,
+ -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f,
+ 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f,
+ 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f,
+ -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f,
+ -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f,
+ 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f,
+ -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = {
+ 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f,
+ 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+ HIDDEN_LAYERS_1_NODES] = {
+ -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f,
+ -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f,
+ 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f,
+ 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f,
+ -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f,
+ -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f,
+ -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f,
+ 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f,
+ -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f,
+ 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f,
+ 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f,
+ -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f,
+ 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f,
+ 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f,
+ 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f,
+ -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f,
+ -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f,
+ -0.4164f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = {
+ -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f,
+ 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f,
+ };
+
+static const float
+ av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+ -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f,
+ 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f,
+ };
+
+static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
+ 2.3371f,
+};
+
+static const NN_CONFIG av1_pustats_dist_nnconfig = {
+ NUM_FEATURES_PUSTATS, // num_inputs
+ LOGITS_NODES, // num_outputs
+ NUM_HIDDEN_LAYERS, // num_hidden_layers
+ { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
+ {
+ av1_pustats_dist_hiddenlayer_0_kernel,
+ av1_pustats_dist_hiddenlayer_1_kernel,
+ av1_pustats_dist_logits_kernel,
+ },
+ {
+ av1_pustats_dist_hiddenlayer_0_bias,
+ av1_pustats_dist_hiddenlayer_1_bias,
+ av1_pustats_dist_logits_bias,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS
+#undef HIDDEN_LAYERS_0_NODES
+#undef HIDDEN_LAYERS_1_NODES
+#undef LOGITS_NODES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PUSTATS_H_
diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h
new file mode 100644
index 0000000000..efe909b6db
--- /dev/null
+++ b/third_party/aom/av1/encoder/random.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RANDOM_H_
+#define AOM_AV1_ENCODER_RANDOM_H_
+
+#include <assert.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Advance the generator to its next state, and generate the next 32-bit output.
+// Note that the low bits of this output are comparatively low-quality, so users
+// of this function should ensure that the high bits factor through to their
+// outputs.
+static INLINE uint32_t lcg_next(uint32_t *state) {
+ *state = (uint32_t)(*state * 1103515245ULL + 12345);
+ return *state;
+}
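+// (These are the classic ANSI C LCG constants. For an LCG modulo 2^32, bit k
+// of the state has period at most 2^(k+1), which is why the note above
+// steers callers toward the high bits.)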
+
+// Generate a random number in the range [0, 32768).
+static INLINE uint32_t lcg_rand16(uint32_t *state) {
+ return (lcg_next(state) / 65536) % 32768;
+}
+
+// Generate a random number in the range [0, n).
+// This is implemented as (rand() * n) / <range of RNG> rather than
+// rand() % n, for a few reasons: this implementation is faster and less
+// biased, and if n is a power of 2, it uses the higher-quality top bits of
+// the RNG output rather than the lower-quality bottom bits.
+static INLINE uint32_t lcg_randint(uint32_t *state, uint32_t n) {
+ uint64_t v = ((uint64_t)lcg_next(state) * n) >> 32;
+ return (uint32_t)v;
+}
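+// For example, lcg_randint(&state, 6) returns a value in {0, ..., 5},
+// computed as (r * 6) >> 32 for a 32-bit draw r, so it consumes the
+// generator's high bits as recommended above.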
+
+// Generate a random number in the range [lo, hi)
+static INLINE uint32_t lcg_randrange(uint32_t *state, uint32_t lo,
+ uint32_t hi) {
+ assert(lo < hi);
+ return lo + lcg_randint(state, hi - lo);
+}
+
+// Pick k distinct numbers from the set {0, ..., n-1}
+// All possible sets of k numbers, and all possible orderings of those numbers,
+// are equally likely.
+//
+// Note: The algorithm used here uses resampling to avoid choosing repeated
+// values. This works well as long as n >> k, but can potentially lead to many
+// resampling attempts if n is equal to or only slightly larger than k.
+static INLINE void lcg_pick(int n, int k, int *out, unsigned int *seed) {
+ assert(0 <= k && k <= n);
+ for (int i = 0; i < k; i++) {
+ int v;
+
+ // Inner resampling loop
+ // We have to use a goto here because C does not have a multi-level continue
+ // statement
+ resample:
+ v = (int)lcg_randint(seed, n);
+ for (int j = 0; j < i; j++) {
+ if (v == out[j]) {
+ // Repeated v, resample
+ goto resample;
+ }
+ }
+
+ // New v, accept
+ out[i] = v;
+ }
+}
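+// Usage sketch: picking 3 distinct values from {0, ..., 9}:
+//   unsigned int seed = 1;
+//   int out[3];
+//   lcg_pick(10, 3, out, &seed);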
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RANDOM_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
new file mode 100644
index 0000000000..df86380272
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -0,0 +1,3587 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_once.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#define USE_UNRESTRICTED_Q_IN_CQ_MODE 0
+
+// Max rate target for 1080P and below encodes under normal circumstances:
+// (1920 * 1080 / (16 * 16)) MBs * MAX_MB_RATE bits per MB
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+#define SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO 0
+#define SUPERRES_QADJ_PER_DENOM_KEYFRAME 2
+#define SUPERRES_QADJ_PER_DENOM_ARFFRAME 0
+
+#define FRAME_OVERHEAD_BITS 200
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ switch (bit_depth) { \
+ case AOM_BITS_8: name = name##_8; break; \
+ case AOM_BITS_10: name = name##_10; break; \
+ case AOM_BITS_12: name = name##_12; break; \
+ default: \
+ assert(0 && \
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10" \
+ " or AOM_BITS_12"); \
+ name = NULL; \
+ } \
+ } while (0)
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+
+static int gf_high = 2400;
+static int gf_low = 300;
+#ifdef STRICT_RC
+static int kf_high = 3200;
+#else
+static int kf_high = 5000;
+#endif
+static int kf_low = 400;
+
+// How many times fewer pixels there are to encode given the current scaling.
+// Temporary replacement for rcf_mult and rate_thresh_mult.
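+// For example, encoding at half width and half height gives a factor of 4.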
+static double resize_rate_factor(const FrameDimensionCfg *const frm_dim_cfg,
+ int width, int height) {
+ return (double)(frm_dim_cfg->width * frm_dim_cfg->height) / (width * height);
+}
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+ aom_bit_depth_t bit_depth) {
+ const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
+
+ // Special case handling to deal with the step from q2.0
+ // down to lossless mode represented by q 1.0.
+ if (minqtarget <= 2.0) return 0;
+
+ return av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1);
+}
+
+static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
+ int *arfgf_high, int *inter, int *rtc,
+ aom_bit_depth_t bit_depth) {
+ int i;
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ const double maxq = av1_convert_qindex_to_q(i, bit_depth);
+ kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+ kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
+ arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+ arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+ inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+ rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+ }
+}
+
+static void rc_init_minq_luts(void) {
+ init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+ arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+ inter_minq_8, rtc_minq_8, AOM_BITS_8);
+ init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+ arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+ inter_minq_10, rtc_minq_10, AOM_BITS_10);
+ init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+ arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+ inter_minq_12, rtc_minq_12, AOM_BITS_12);
+}
+
+void av1_rc_init_minq_luts(void) { aom_once(rc_init_minq_luts); }
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
+ // Convert the index to a real Q value (scaled down to match old Q values)
+ switch (bit_depth) {
+ case AOM_BITS_8: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 4.0;
+ case AOM_BITS_10: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 16.0;
+ case AOM_BITS_12: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 64.0;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1.0;
+ }
+}
+
+int av1_get_bpmb_enumerator(FRAME_TYPE frame_type,
+ const int is_screen_content_type) {
+ int enumerator;
+
+ if (is_screen_content_type) {
+ enumerator = (frame_type == KEY_FRAME) ? 1000000 : 750000;
+ } else {
+ enumerator = (frame_type == KEY_FRAME) ? 2000000 : 1500000;
+ }
+
+ return enumerator;
+}
+
+static int get_init_ratio(double sse) { return (int)(300000 / sse); }
+
+int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex,
+ double correction_factor, int accurate_estimate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_screen_content_type = cpi->is_screen_content_type;
+ const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+ const double q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int enumerator = av1_get_bpmb_enumerator(frame_type, is_screen_content_type);
+
+ assert(correction_factor <= MAX_BPB_FACTOR &&
+ correction_factor >= MIN_BPB_FACTOR);
+
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR && frame_type != KEY_FRAME &&
+ accurate_estimate && cpi->rec_sse != UINT64_MAX) {
+ const int mbs = cm->mi_params.MBs;
+ const double sse_sqrt =
+ (double)((int)sqrt((double)(cpi->rec_sse)) << BPER_MB_NORMBITS) /
+ (double)mbs;
+ const int ratio = (cpi->rc.bit_est_ratio == 0) ? get_init_ratio(sse_sqrt)
+ : cpi->rc.bit_est_ratio;
+    // Clamp the enumerator to reduce q fluctuations.
+ enumerator = AOMMIN(AOMMAX((int)(ratio * sse_sqrt), 20000), 170000);
+ }
+
+ // q based adjustment to baseline enumerator
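+  // (The estimate scales as correction_factor / q, so e.g. doubling q
+  // roughly halves the predicted bits per MB.)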
+ return (int)(enumerator * correction_factor / q);
+}
+
+int av1_estimate_bits_at_q(const AV1_COMP *cpi, int q,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ const int mbs = cm->mi_params.MBs;
+ const int bpm =
+ (int)(av1_rc_bits_per_mb(cpi, frame_type, q, correction_factor,
+ cpi->sf.hl_sf.accurate_bit_estimate));
+ return AOMMAX(FRAME_OVERHEAD_BITS,
+ (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+}
+
+int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target,
+ FRAME_UPDATE_TYPE frame_update_type) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const int min_frame_target =
+ AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+ // Clip the frame target to the minimum setup value.
+ if (frame_update_type == OVERLAY_UPDATE ||
+ frame_update_type == INTNL_OVERLAY_UPDATE) {
+ // If there is an active ARF at this location use the minimum
+ // bits on this frame even if it is a constructed arf.
+    // The active maximum quantizer ensures that an appropriate
+ // number of bits will be spent if needed for constructed ARFs.
+ target = min_frame_target;
+ } else if (target < min_frame_target) {
+ target = min_frame_target;
+ }
+
+ // Clip the frame target to the maximum allowed value.
+ if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+ if (oxcf->rc_cfg.max_inter_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * oxcf->rc_cfg.max_inter_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+
+ return target;
+}
+
+int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int64_t target) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ if (rc_cfg->max_intra_bitrate_pct) {
+ const int64_t max_rate =
+ (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_intra_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+ if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+ return (int)target;
+}
+
+// Update the buffer level for higher temporal layers, given the encoded current
+// temporal layer.
+static void update_layer_buffer_level(SVC *svc, int encoded_frame_size,
+ bool is_screen) {
+ const int current_temporal_layer = svc->temporal_layer_id;
+ for (int i = current_temporal_layer + 1; i < svc->number_temporal_layers;
+ ++i) {
+ const int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+ lp_rc->bits_off_target +=
+ (int)round(lc->target_bandwidth / lc->framerate) - encoded_frame_size;
+ // Clip buffer level to maximum buffer size for the layer.
+ lp_rc->bits_off_target =
+ AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+ lp_rc->buffer_level = lp_rc->bits_off_target;
+
+    // For screen-content mode: don't let the buffer level go below the
+    // threshold, given here as -lp_rc->maximum_buffer_size, to allow the
+    // buffer to come back up sooner after a slide change with big overshoot.
+ if (is_screen) {
+ lp_rc->bits_off_target =
+ AOMMAX(lp_rc->bits_off_target, -lp_rc->maximum_buffer_size);
+ lp_rc->buffer_level = lp_rc->bits_off_target;
+ }
+ }
+}
+// Update the buffer level: leaky bucket model.
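+// Each shown frame fills the bucket with avg_frame_bandwidth bits and drains
+// it by the bits actually spent; the resulting level is clipped to the
+// configured maximum buffer size.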
+static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+ // Non-viewable frames are a special case and are treated as pure overhead.
+ if (!cm->show_frame)
+ p_rc->bits_off_target -= encoded_frame_size;
+ else
+ p_rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+
+ // Clip the buffer level to the maximum specified buffer size.
+ p_rc->bits_off_target =
+ AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+  // For screen-content mode: don't let the buffer level go below the
+  // threshold, given here as -p_rc->maximum_buffer_size, to allow the buffer
+  // to come back up sooner after a slide change with big overshoot.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)
+ p_rc->bits_off_target =
+ AOMMAX(p_rc->bits_off_target, -p_rc->maximum_buffer_size);
+ p_rc->buffer_level = p_rc->bits_off_target;
+
+ if (cpi->ppi->use_svc)
+ update_layer_buffer_level(&cpi->svc, encoded_frame_size,
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+
+#if CONFIG_FPMT_TEST
+  /* The variable temp_buffer_level is introduced for quality simulation
+   * purposes. It retains the buffer level from before the parallel encode
+   * frames and is updated based on the update flag.
+   *
+   * If there are show_existing_frames between the parallel frames, do not
+   * update it, so that the temp state is retained. */
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ p_rc->temp_buffer_level = p_rc->buffer_level;
+ }
+#endif
+}
+
+int av1_rc_get_default_min_gf_interval(int width, int height,
+ double framerate) {
+ // Assume we do not need any constraint lower than 4K 20 fps
+ static const double factor_safe = 3840 * 2160 * 20.0;
+ const double factor = (double)width * height * framerate;
+ const int default_interval =
+ clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+
+ if (factor <= factor_safe)
+ return default_interval;
+ else
+ return AOMMAX(default_interval,
+ (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+ // Note this logic makes:
+ // 4K24: 5
+ // 4K30: 6
+ // 4K60: 12
+}
+
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+ int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+ interval += (interval & 0x01); // Round to even value
+ interval = AOMMAX(MAX_GF_INTERVAL, interval);
+ return AOMMAX(interval, min_gf_interval);
+}
+
+void av1_primary_rc_init(const AV1EncoderConfig *oxcf,
+ PRIMARY_RATE_CONTROL *p_rc) {
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ int worst_allowed_q = rc_cfg->worst_allowed_q;
+
+ int min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ int max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ if (min_gf_interval == 0)
+ min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+ oxcf->input_cfg.init_framerate);
+ if (max_gf_interval == 0)
+ max_gf_interval = av1_rc_get_default_max_gf_interval(
+ oxcf->input_cfg.init_framerate, min_gf_interval);
+ p_rc->baseline_gf_interval = (min_gf_interval + max_gf_interval) / 2;
+ p_rc->this_key_frame_forced = 0;
+ p_rc->next_key_frame_forced = 0;
+ p_rc->ni_frames = 0;
+
+ p_rc->tot_q = 0.0;
+ p_rc->total_actual_bits = 0;
+ p_rc->total_target_bits = 0;
+ p_rc->buffer_level = p_rc->starting_buffer_level;
+
+ if (oxcf->target_seq_level_idx[0] < SEQ_LEVELS) {
+ worst_allowed_q = 255;
+ }
+ if (oxcf->pass == AOM_RC_ONE_PASS && rc_cfg->mode == AOM_CBR) {
+ p_rc->avg_frame_qindex[KEY_FRAME] = worst_allowed_q;
+ p_rc->avg_frame_qindex[INTER_FRAME] = worst_allowed_q;
+ } else {
+ p_rc->avg_frame_qindex[KEY_FRAME] =
+ (worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+ p_rc->avg_frame_qindex[INTER_FRAME] =
+ (worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+ }
+ p_rc->avg_q = av1_convert_qindex_to_q(rc_cfg->worst_allowed_q,
+ oxcf->tool_cfg.bit_depth);
+ p_rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q;
+ p_rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q;
+
+ for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+ p_rc->rate_correction_factors[i] = 0.7;
+ }
+ p_rc->rate_correction_factors[KF_STD] = 1.0;
+ p_rc->bits_off_target = p_rc->starting_buffer_level;
+
+ p_rc->rolling_target_bits =
+ (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate);
+ p_rc->rolling_actual_bits =
+ (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate);
+}
+
+void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) {
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ rc->frames_since_key = 8; // Sensible default for first frame.
+ rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
+
+ rc->frames_till_gf_update_due = 0;
+ rc->ni_av_qi = rc_cfg->worst_allowed_q;
+ rc->ni_tot_qi = 0;
+
+ rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+ oxcf->input_cfg.init_framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ oxcf->input_cfg.init_framerate, rc->min_gf_interval);
+ rc->avg_frame_low_motion = 0;
+
+ rc->resize_state = ORIG;
+ rc->resize_avg_qp = 0;
+ rc->resize_buffer_underflow = 0;
+ rc->resize_count = 0;
+ rc->rtc_external_ratectrl = 0;
+ rc->frame_level_fast_extra_bits = 0;
+ rc->use_external_qp_one_pass = 0;
+}
+
+static bool check_buffer_below_thresh(AV1_COMP *cpi, int64_t buffer_level,
+ int drop_mark) {
+ SVC *svc = &cpi->svc;
+ if (!cpi->ppi->use_svc || cpi->svc.number_spatial_layers == 1 ||
+ cpi->svc.framedrop_mode == AOM_LAYER_DROP) {
+ return (buffer_level <= drop_mark);
+ } else {
+    // For SVC in AOM_FULL_SUPERFRAME_DROP mode: the buffer condition is
+    // checked on the current and upper spatial layers.
+ for (int i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ PRIMARY_RATE_CONTROL *lrc = &lc->p_rc;
+ // Exclude check for layer whose bitrate is 0.
+ if (lc->target_bandwidth > 0) {
+ const int drop_thresh = cpi->oxcf.rc_cfg.drop_frames_water_mark;
+ const int drop_mark_layer =
+ (int)(drop_thresh * lrc->optimal_buffer_level / 100);
+ if (lrc->buffer_level <= drop_mark_layer) return true;
+ }
+ }
+ return false;
+ }
+}
+
+int av1_rc_drop_frame(AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int64_t buffer_level =
+ simulate_parallel_frame ? p_rc->temp_buffer_level : p_rc->buffer_level;
+#else
+ int64_t buffer_level = p_rc->buffer_level;
+#endif
+ // Never drop on key frame, or for frame whose base layer is key.
+ // If drop_count_consec hits or exceeds max_consec_drop then don't drop.
+ if (cpi->common.current_frame.frame_type == KEY_FRAME ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
+ !oxcf->rc_cfg.drop_frames_water_mark ||
+ (rc->max_consec_drop > 0 &&
+ rc->drop_count_consec >= rc->max_consec_drop)) {
+ return 0;
+ } else {
+ SVC *svc = &cpi->svc;
+ // In the full_superframe framedrop mode for svc, if the previous spatial
+ // layer was dropped, drop the current spatial layer.
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ svc->drop_spatial_layer[svc->spatial_layer_id - 1] &&
+ svc->framedrop_mode == AOM_FULL_SUPERFRAME_DROP)
+ return 1;
+ // -1 is passed here for drop_mark since we are checking if
+ // buffer goes below 0 (<= -1).
+ if (check_buffer_below_thresh(cpi, buffer_level, -1)) {
+ // Always drop if buffer is below 0.
+ rc->drop_count_consec++;
+ return 1;
+ } else {
+ // If buffer is below drop_mark, for now just drop every other frame
+ // (starting with the next frame) until it increases back over drop_mark.
+ const int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark *
+ p_rc->optimal_buffer_level / 100);
+ const bool buffer_below_thresh =
+ check_buffer_below_thresh(cpi, buffer_level, drop_mark);
+ if (!buffer_below_thresh && rc->decimation_factor > 0) {
+ --rc->decimation_factor;
+ } else if (buffer_below_thresh && rc->decimation_factor == 0) {
+ rc->decimation_factor = 1;
+ }
+ if (rc->decimation_factor > 0) {
+ if (rc->decimation_count > 0) {
+ --rc->decimation_count;
+ rc->drop_count_consec++;
+ return 1;
+ } else {
+ rc->decimation_count = rc->decimation_factor;
+ return 0;
+ }
+ } else {
+ rc->decimation_count = 0;
+ return 0;
+ }
+ }
+ }
+}
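+
+// Illustrative sketch (hypothetical helper, not part of libaom): a minimal
+// model of the decimation state machine above. With decimation_factor == 1
+// it drops every other frame; more generally it drops decimation_factor out
+// of every (decimation_factor + 1) frames.
+#if 0
+static int model_decimation_drop(int *decimation_count,
+                                 int decimation_factor) {
+  if (decimation_factor <= 0) {
+    *decimation_count = 0;
+    return 0;  // No decimation active: encode this frame.
+  }
+  if (*decimation_count > 0) {
+    --*decimation_count;  // Still inside the drop run: drop this frame.
+    return 1;
+  }
+  *decimation_count = decimation_factor;  // Re-arm, then encode this frame.
+  return 0;
+}
+#endif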
+
+static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality,
+ int width, int height) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1_COMMON *const cm = &cpi->common;
+ const SVC *const svc = &cpi->svc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  // Flag to indicate that the previous frame overshot and that the buffer
+  // level for the current frame is low (less than ~half of optimal). For
+  // such (inter) frames, if the source_sad is non-zero, relax the
+  // max_delta_up and the clamp applied below.
+ const bool overshoot_buffer_low =
+ cpi->rc.rc_1_frame == -1 && rc->frame_source_sad > 1000 &&
+ p_rc->buffer_level < (p_rc->optimal_buffer_level >> 1) &&
+ rc->frames_since_key > 4;
+ int max_delta_down;
+ int max_delta_up = overshoot_buffer_low ? 60 : 20;
+ const int change_avg_frame_bandwidth =
+ abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
+ 0.1 * (rc->avg_frame_bandwidth);
+
+ // Set the maximum adjustment down for Q for this frame.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->apply_cyclic_refresh) {
+    // For static screen content, limit the Q drop until the start of the
+    // next refresh cycle.
+ if (cpi->is_screen_content_type &&
+ (cpi->cyclic_refresh->sb_index > cpi->cyclic_refresh->last_sb_index)) {
+ max_delta_down = AOMMIN(8, AOMMAX(1, rc->q_1_frame / 32));
+ } else {
+ max_delta_down = AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8));
+ }
+ if (!cpi->ppi->use_svc && cpi->is_screen_content_type) {
+ // Link max_delta_up to max_delta_down and buffer status.
+ if (p_rc->buffer_level > p_rc->optimal_buffer_level) {
+ max_delta_up = AOMMAX(4, max_delta_down);
+ } else {
+ max_delta_up = AOMMAX(8, max_delta_down);
+ }
+ }
+ } else {
+ max_delta_down = (cpi->is_screen_content_type)
+ ? AOMMIN(8, AOMMAX(1, rc->q_1_frame / 16))
+ : AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8));
+ }
+ // If resolution changes or avg_frame_bandwidth significantly changed,
+ // then set this flag to indicate change in target bits per macroblock.
+ const int change_target_bits_mb =
+ cm->prev_frame &&
+ (width != cm->prev_frame->width || height != cm->prev_frame->height ||
+ change_avg_frame_bandwidth);
+ // Apply some control/clamp to QP under certain conditions.
+  // For svc, delay the use of the clamping until after the first
+  // number_temporal_layers superframes, to make sure the rate targets have
+  // been set for each temporal layer.
+ if (!frame_is_intra_only(cm) && rc->frames_since_key > 1 &&
+ (!cpi->ppi->use_svc ||
+ svc->current_superframe > (unsigned int)svc->number_temporal_layers) &&
+ !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl &&
+ (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct ||
+ !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) {
+ // If in the previous two frames we have seen both overshoot and undershoot
+ // clamp Q between the two. Check for rc->q_1/2_frame > 0 in case they have
+ // not been set due to dropped frames.
+ if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
+ rc->q_1_frame != rc->q_2_frame && rc->q_1_frame > 0 &&
+ rc->q_2_frame > 0 && !overshoot_buffer_low) {
+ int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame),
+ AOMMAX(rc->q_1_frame, rc->q_2_frame));
+ // If the previous frame had overshoot and the current q needs to
+ // increase above the clamped value, reduce the clamp for faster reaction
+ // to overshoot.
+ if (cpi->rc.rc_1_frame == -1 && q > qclamp && rc->frames_since_key > 10)
+ q = (q + qclamp) >> 1;
+ else
+ q = qclamp;
+ }
+    // Adjust Q based on source content change from scene detection.
+ if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 &&
+ rc->frames_since_key > 10 && rc->frame_source_sad > 0 &&
+ !cpi->rc.rtc_external_ratectrl) {
+ const int bit_depth = cm->seq_params->bit_depth;
+ double delta =
+ (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0;
+ // Push Q downwards if content change is decreasing and buffer level
+ // is stable (at least 1/4-optimal level), so not overshooting. Do so
+ // only for high Q to avoid excess overshoot.
+ // Else reduce decrease in Q from previous frame if content change is
+ // increasing and buffer is below max (so not undershooting).
+ if (delta < 0.0 &&
+ p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
+ q > (rc->worst_quality >> 1)) {
+ double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta);
+ double q_val = av1_convert_qindex_to_q(q, bit_depth);
+ q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ } else if (rc->q_1_frame - q > 0 && delta > 0.1 &&
+ p_rc->buffer_level < AOMMIN(p_rc->maximum_buffer_size,
+ p_rc->optimal_buffer_level << 1)) {
+ q = (3 * q + rc->q_1_frame) >> 2;
+ }
+ }
+ // Limit the decrease in Q from previous frame.
+ if (rc->q_1_frame - q > max_delta_down) q = rc->q_1_frame - max_delta_down;
+ // Limit the increase in Q from previous frame.
+ else if (q - rc->q_1_frame > max_delta_up)
+ q = rc->q_1_frame + max_delta_up;
+ }
+ // Adjustment for temporal layers.
+ if (svc->number_temporal_layers > 1 && svc->spatial_layer_id == 0 &&
+ !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl &&
+ cpi->oxcf.resize_cfg.resize_mode != RESIZE_DYNAMIC) {
+ if (svc->temporal_layer_id > 0) {
+ // Constrain enhancement relative to the previous base TL0.
+ // Get base temporal layer TL0.
+ const int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ // lc->rc.avg_frame_bandwidth and lc->p_rc.last_q correspond to the
+ // last TL0 frame.
+ if (rc->avg_frame_bandwidth < lc->rc.avg_frame_bandwidth &&
+ q < lc->p_rc.last_q[INTER_FRAME] - 4)
+ q = lc->p_rc.last_q[INTER_FRAME] - 4;
+ } else if (cpi->svc.temporal_layer_id == 0 &&
+ p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
+ rc->frame_source_sad < 100000) {
+ // Push base TL0 Q down if buffer is stable and frame_source_sad
+ // is below threshold.
+ int delta = (svc->number_temporal_layers == 2) ? 4 : 10;
+ q = q - delta;
+ }
+ }
+ // For non-svc (single layer): if resolution has increased push q closer
+ // to the active_worst to avoid excess overshoot.
+ if (!cpi->ppi->use_svc && cm->prev_frame &&
+ (width * height > 1.5 * cm->prev_frame->width * cm->prev_frame->height))
+ q = (q + active_worst_quality) >> 1;
+ // For single layer RPS: Bias Q based on distance of closest reference.
+ if (cpi->ppi->rtc_ref.bias_recovery_frame) {
+ const int min_dist = av1_svc_get_min_ref_dist(cpi);
+ q = q - AOMMIN(min_dist, 20);
+ }
+ return AOMMAX(AOMMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
+}
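+
+// Illustrative sketch (hypothetical helper, not part of libaom): the core
+// clamp applied above when the previous two frames saw one overshoot and
+// one undershoot (rc_1_frame * rc_2_frame == -1), combined with the
+// per-frame delta limits. The scene-detection adjustment that sits between
+// the two steps in adjust_q_cbr() is omitted here.
+#if 0
+static int model_cbr_q_clamp(int q, int q_prev1, int q_prev2,
+                             int max_delta_down, int max_delta_up) {
+  const int lo = q_prev1 < q_prev2 ? q_prev1 : q_prev2;
+  const int hi = q_prev1 < q_prev2 ? q_prev2 : q_prev1;
+  // Keep q within the range spanned by the previous two frame Qs.
+  if (q < lo) q = lo;
+  if (q > hi) q = hi;
+  // Then limit the per-frame change relative to the previous frame.
+  if (q_prev1 - q > max_delta_down)
+    q = q_prev1 - max_delta_down;
+  else if (q - q_prev1 > max_delta_up)
+    q = q_prev1 + max_delta_up;
+  return q;
+}
+#endif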
+
+static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = {
+ KF_STD, // KF_UPDATE
+ INTER_NORMAL, // LF_UPDATE
+ GF_ARF_STD, // GF_UPDATE
+ GF_ARF_STD, // ARF_UPDATE
+ INTER_NORMAL, // OVERLAY_UPDATE
+ INTER_NORMAL, // INTNL_OVERLAY_UPDATE
+ GF_ARF_LOW, // INTNL_ARF_UPDATE
+};
+
+static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group,
+ int gf_frame_index) {
+ const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index];
+ assert(update_type < FRAME_UPDATE_TYPES);
+ return rate_factor_levels[update_type];
+}
+
+/*!\brief Gets a rate vs Q correction factor
+ *
+ * This function returns the current value of a correction factor used to
+ * dynamically adjust the relationship between Q and the expected number
+ * of bits for the frame.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \return Returns a correction factor for the current frame
+ */
+static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
+ int height) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ double rcf;
+ double rate_correction_factors_kfstd;
+ double rate_correction_factors_gfarfstd;
+ double rate_correction_factors_internormal;
+
+ rate_correction_factors_kfstd =
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ ? rc->frame_level_rate_correction_factors[KF_STD]
+ : p_rc->rate_correction_factors[KF_STD];
+ rate_correction_factors_gfarfstd =
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ ? rc->frame_level_rate_correction_factors[GF_ARF_STD]
+ : p_rc->rate_correction_factors[GF_ARF_STD];
+ rate_correction_factors_internormal =
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ ? rc->frame_level_rate_correction_factors[INTER_NORMAL]
+ : p_rc->rate_correction_factors[INTER_NORMAL];
+
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+ rcf = rate_correction_factors_kfstd;
+ } else if (is_stat_consumption_stage(cpi)) {
+ const RATE_FACTOR_LEVEL rf_lvl =
+ get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ double rate_correction_factors_rflvl =
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ ? rc->frame_level_rate_correction_factors[rf_lvl]
+ : p_rc->rate_correction_factors[rf_lvl];
+ rcf = rate_correction_factors_rflvl;
+ } else {
+ if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+ !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+ (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+ cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20))
+ rcf = rate_correction_factors_gfarfstd;
+ else
+ rcf = rate_correction_factors_internormal;
+ }
+ rcf *= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
+ return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+/*!\brief Sets a rate vs Q correction factor
+ *
+ * This function updates the current value of a correction factor used to
+ * dynamically adjust the relationship between Q and the expected number
+ * of bits for the frame.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] is_encode_stage Indicates if recode loop or post-encode
+ * \param[in] factor New correction factor
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \remark Updates the rate correction factor for the
+ * current frame type in cpi->rc.
+ */
+static void set_rate_correction_factor(AV1_COMP *cpi, int is_encode_stage,
+ double factor, int width, int height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ int update_default_rcf = 1;
+ // Normalize RCF to account for the size-dependent scaling factor.
+ factor /= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
+
+ factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+ p_rc->rate_correction_factors[KF_STD] = factor;
+ } else if (is_stat_consumption_stage(cpi)) {
+ const RATE_FACTOR_LEVEL rf_lvl =
+ get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ if (is_encode_stage &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ rc->frame_level_rate_correction_factors[rf_lvl] = factor;
+ update_default_rcf = 0;
+ }
+ if (update_default_rcf) p_rc->rate_correction_factors[rf_lvl] = factor;
+ } else {
+ if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+ !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+ (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+ cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) {
+ p_rc->rate_correction_factors[GF_ARF_STD] = factor;
+ } else {
+ if (is_encode_stage &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ rc->frame_level_rate_correction_factors[INTER_NORMAL] = factor;
+ update_default_rcf = 0;
+ }
+ if (update_default_rcf)
+ p_rc->rate_correction_factors[INTER_NORMAL] = factor;
+ }
+ }
+}
+
+void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int is_encode_stage,
+ int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ double correction_factor = 1.0;
+ double rate_correction_factor =
+ get_rate_correction_factor(cpi, width, height);
+ double adjustment_limit;
+ int projected_size_based_on_q = 0;
+ int cyclic_refresh_active =
+ cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled;
+
+ // Do not update the rate factors for arf overlay frames.
+ if (cpi->rc.is_src_frame_alt_ref) return;
+
+ // Don't update rate correction factors here on scene changes as
+ // it is already reset in av1_encodedframe_overshoot_cbr(),
+ // but reset variables related to previous frame q and size.
+ // Note that the counter of frames since the last scene change
+ // is only valid when cyclic refresh mode is enabled and that
+ // this break out only applies to scene changes that are not
+ // recorded as INTRA only key frames.
+ if ((cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) &&
+ (cpi->cyclic_refresh->counter_encode_maxq_scene_change == 0) &&
+ !frame_is_intra_only(cm) && !cpi->ppi->use_svc) {
+ cpi->rc.q_2_frame = cm->quant_params.base_qindex;
+ cpi->rc.q_1_frame = cm->quant_params.base_qindex;
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
+ return;
+ }
+
+ // Work out how big we would have expected the frame to be at this Q given
+ // the current correction factor.
+ // Stay in double to avoid int overflow when values are large
+ if (cyclic_refresh_active) {
+ projected_size_based_on_q =
+ av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+ } else {
+ projected_size_based_on_q = av1_estimate_bits_at_q(
+ cpi, cm->quant_params.base_qindex, rate_correction_factor);
+ }
+ // Work out a size correction factor.
+ if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+ correction_factor = (double)cpi->rc.projected_frame_size /
+ (double)projected_size_based_on_q;
+
+ // Clamp correction factor to prevent anything too extreme
+ correction_factor = AOMMAX(correction_factor, 0.25);
+
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->quant_params.base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ if (correction_factor > 1.1)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 0.9)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
+
+ // Decide how heavily to dampen the adjustment
+ if (correction_factor > 0.0) {
+ if (cpi->is_screen_content_type) {
+ adjustment_limit =
+ 0.25 + 0.5 * AOMMIN(0.5, fabs(log10(correction_factor)));
+ } else {
+ adjustment_limit =
+ 0.25 + 0.75 * AOMMIN(0.5, fabs(log10(correction_factor)));
+ }
+ } else {
+ adjustment_limit = 0.75;
+ }
+
+  // Adjust the delta Q and number of blocks updated in cyclic refresh
+  // based on overshoot or undershoot of the target in the current frame.
+ if (cyclic_refresh_active && cpi->rc.this_frame_target > 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ if (correction_factor > 1.25) {
+ cr->percent_refresh_adjustment =
+ AOMMAX(cr->percent_refresh_adjustment - 1, -5);
+ cr->rate_ratio_qdelta_adjustment =
+ AOMMAX(cr->rate_ratio_qdelta_adjustment - 0.05, -0.0);
+ } else if (correction_factor < 0.5) {
+ cr->percent_refresh_adjustment =
+ AOMMIN(cr->percent_refresh_adjustment + 1, 5);
+ cr->rate_ratio_qdelta_adjustment =
+ AOMMIN(cr->rate_ratio_qdelta_adjustment + 0.05, 0.25);
+ }
+ }
+
+ if (correction_factor > 1.01) {
+ // We are not already at the worst allowable quality
+ correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit));
+ rate_correction_factor = rate_correction_factor * correction_factor;
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ } else if (correction_factor < 0.99) {
+ // We are not already at the best allowable quality
+ correction_factor = 1.0 / correction_factor;
+ correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit));
+ correction_factor = 1.0 / correction_factor;
+
+ rate_correction_factor = rate_correction_factor * correction_factor;
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ set_rate_correction_factor(cpi, is_encode_stage, rate_correction_factor,
+ width, height);
+}
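+
+// Illustrative sketch (hypothetical helper, not part of libaom): the damped
+// multiplicative RCF update above, reduced to the non-screen path. For
+// example, with actual/projected bits = 2.0 the damping limit is
+// 0.25 + 0.75 * min(0.5, log10(2.0)) ~= 0.476, so the applied factor is
+// 1 + (2.0 - 1.0) * 0.476 ~= 1.476 rather than the raw 2.0.
+#if 0
+#include <math.h>
+static double model_damped_rcf_update(double rcf, double correction_factor,
+                                      double min_bpb, double max_bpb) {
+  if (correction_factor < 0.25) correction_factor = 0.25;
+  const double limit =
+      0.25 + 0.75 * fmin(0.5, fabs(log10(correction_factor)));
+  if (correction_factor > 1.01) {
+    // Frame used more bits than projected: raise the correction factor.
+    rcf *= 1.0 + (correction_factor - 1.0) * limit;
+  } else if (correction_factor < 0.99) {
+    // Frame used fewer bits than projected: damp the reciprocal, invert.
+    double inv = 1.0 / correction_factor;
+    inv = 1.0 + (inv - 1.0) * limit;
+    rcf /= inv;
+  }
+  if (rcf > max_bpb) rcf = max_bpb;
+  if (rcf < min_bpb) rcf = min_bpb;
+  return rcf;
+}
+#endif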
+
+// Calculate rate for the given 'q'.
+static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh,
+ double correction_factor, int q) {
+ const AV1_COMMON *const cm = &cpi->common;
+ return use_cyclic_refresh
+ ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor)
+ : av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, q,
+ correction_factor,
+ cpi->sf.hl_sf.accurate_bit_estimate);
+}
+
+/*!\brief Searches for a Q index value predicted to give an average macro
+ * block rate closest to the target value.
+ *
+ * Similar to find_qindex_by_rate() function, but returns a q index with a
+ * rate just above or below the desired rate, depending on which of the two
+ * rates is closer to the desired rate.
+ * Also, respects the selected aq_mode when computing the rate.
+ *
+ * \ingroup rate_control
+ * \param[in] desired_bits_per_mb Target bits per mb
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] correction_factor Current Q to rate correction factor
+ * \param[in] best_qindex Min allowed Q value.
+ * \param[in] worst_qindex Max allowed Q value.
+ *
+ * \return Returns the q index whose estimated rate is closest to the
+ *         desired rate
+ */
+static int find_closest_qindex_by_rate(int desired_bits_per_mb,
+ const AV1_COMP *cpi,
+ double correction_factor,
+ int best_qindex, int worst_qindex) {
+ const int use_cyclic_refresh = cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->apply_cyclic_refresh;
+
+ // Find 'qindex' based on 'desired_bits_per_mb'.
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const int mid_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid);
+ if (mid_bits_per_mb > desired_bits_per_mb) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ assert(low == high);
+
+ // Calculate rate difference of this q index from the desired rate.
+ const int curr_q = low;
+ const int curr_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q);
+ const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb)
+ ? desired_bits_per_mb - curr_bits_per_mb
+ : INT_MAX;
+ assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) ||
+ curr_q == worst_qindex);
+
+ // Calculate rate difference for previous q index too.
+ const int prev_q = curr_q - 1;
+ int prev_bit_diff;
+ if (curr_bit_diff == INT_MAX || curr_q == best_qindex) {
+ prev_bit_diff = INT_MAX;
+ } else {
+ const int prev_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q);
+ assert(prev_bits_per_mb > desired_bits_per_mb);
+ prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb;
+ }
+
+ // Pick one of the two q indices, depending on which one has rate closer to
+ // the desired rate.
+ return (curr_bit_diff <= prev_bit_diff) ? curr_q : prev_q;
+}
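+
+// Illustrative sketch (hypothetical helper, not part of libaom): the search
+// above specialized to any monotonically non-increasing rate(q) function.
+// It returns the q index in [best_q, worst_q] whose rate is closest to the
+// target, preferring the higher q index (lower rate) on ties.
+#if 0
+#include <limits.h>
+static int model_find_closest_q(int (*rate)(int q), int target, int best_q,
+                                int worst_q) {
+  int low = best_q, high = worst_q;
+  while (low < high) {
+    const int mid = (low + high) >> 1;
+    if (rate(mid) > target)
+      low = mid + 1;  // Rate still too high: move to larger q.
+    else
+      high = mid;
+  }
+  // 'low' is now the smallest q with rate(q) <= target, or worst_q.
+  const int below = rate(low) <= target ? target - rate(low) : INT_MAX;
+  if (low == best_q || below == INT_MAX) return low;
+  const int above = rate(low - 1) - target;  // rate(low - 1) > target here.
+  return (below <= above) ? low : low - 1;
+}
+#endif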
+
+int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality,
+ int width, int height) {
+ const int MBs = av1_get_MBs(width, height);
+ const double correction_factor =
+ get_rate_correction_factor(cpi, width, height);
+ const int target_bits_per_mb =
+ (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / MBs);
+
+ int q =
+ find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor,
+ active_best_quality, active_worst_quality);
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR && has_no_stats_stage(cpi))
+ return adjust_q_cbr(cpi, q, active_worst_quality, width, height);
+
+ return q;
+}
+
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+ int *low_motion_minq, int *high_motion_minq) {
+ if (gfu_boost > high) {
+ return low_motion_minq[q];
+ } else if (gfu_boost < low) {
+ return high_motion_minq[q];
+ } else {
+ const int gap = high - low;
+ const int offset = high - gfu_boost;
+ const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+ const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+ return low_motion_minq[q] + adjustment;
+ }
+}
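+
+// Illustrative worked example (hypothetical numbers): with low = 400,
+// high = 2000, gfu_boost = 1200, low_motion_minq[q] = 20 and
+// high_motion_minq[q] = 36, the interpolation above gives
+//   adjustment = ((2000 - 1200) * (36 - 20) + 800) / 1600 = 8,
+// so the active quality is 20 + 8 = 28, i.e. halfway between the two
+// tables for a boost halfway between the two thresholds.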
+
+static int get_kf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q,
+ aom_bit_depth_t bit_depth) {
+ int *kf_low_motion_minq;
+ int *kf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
+ return get_active_quality(q, p_rc->kf_boost, kf_low, kf_high,
+ kf_low_motion_minq, kf_high_motion_minq);
+}
+
+static int get_gf_active_quality_no_rc(int gfu_boost, int q,
+ aom_bit_depth_t bit_depth) {
+ int *arfgf_low_motion_minq;
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return get_active_quality(q, gfu_boost, gf_low, gf_high,
+ arfgf_low_motion_minq, arfgf_high_motion_minq);
+}
+
+static int get_gf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q,
+ aom_bit_depth_t bit_depth) {
+ return get_gf_active_quality_no_rc(p_rc->gfu_boost, q, bit_depth);
+}
+
+static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return arfgf_high_motion_minq[q];
+}
+
+static int calc_active_worst_quality_no_stats_vbr(const AV1_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const unsigned int curr_frame = cpi->common.current_frame.frame_number;
+ int active_worst_quality;
+ int last_q_key_frame;
+ int last_q_inter_frame;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ last_q_key_frame = simulate_parallel_frame ? p_rc->temp_last_q[KEY_FRAME]
+ : p_rc->last_q[KEY_FRAME];
+ last_q_inter_frame = simulate_parallel_frame ? p_rc->temp_last_q[INTER_FRAME]
+ : p_rc->last_q[INTER_FRAME];
+#else
+ last_q_key_frame = p_rc->last_q[KEY_FRAME];
+ last_q_inter_frame = p_rc->last_q[INTER_FRAME];
+#endif
+
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+ active_worst_quality =
+ curr_frame == 0 ? rc->worst_quality : last_q_key_frame * 2;
+ } else {
+ if (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || refresh_frame->bwd_ref_frame ||
+ refresh_frame->alt_ref_frame)) {
+ active_worst_quality =
+ curr_frame == 1 ? last_q_key_frame * 5 / 4 : last_q_inter_frame;
+ } else {
+ active_worst_quality =
+ curr_frame == 1 ? last_q_key_frame * 2 : last_q_inter_frame * 2;
+ }
+ }
+ return AOMMIN(active_worst_quality, rc->worst_quality);
+}
+
+// Adjust active_worst_quality level based on buffer level.
+static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) {
+ // Adjust active_worst_quality: If buffer is above the optimal/target level,
+ // bring active_worst_quality down depending on fullness of buffer.
+ // If buffer is below the optimal level, let the active_worst_quality go from
+ // ambient Q (at buffer = optimal level) to worst_quality level
+ // (at buffer = critical level).
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+ const SVC *const svc = &cpi->svc;
+ unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
+ // Buffer level below which we push active_worst to worst_quality.
+ int64_t critical_level = p_rc->optimal_buffer_level >> 3;
+ int64_t buff_lvl_step = 0;
+ int adjustment = 0;
+ int active_worst_quality;
+ int ambient_qp;
+ if (cm->current_frame.frame_type == KEY_FRAME) return rc->worst_quality;
+  // For ambient_qp we use the minimum of avg_frame_qindex[KEY_FRAME] and
+  // avg_frame_qindex[INTER_FRAME] for the first few frames following a key
+  // frame. These are both initialized to worst_quality and updated with a
+  // (3/4, 1/4) average in postencode_update, so for the first few frames
+  // following a key frame, the qp of that key frame is weighted into the
+  // active_worst_quality setting. For SVC the key frame should correspond
+  // to layer (0, 0), so use that layer context.
+ int avg_qindex_key = p_rc->avg_frame_qindex[KEY_FRAME];
+ if (svc->number_temporal_layers > 1) {
+ int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers);
+ const LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ const PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+ avg_qindex_key =
+ AOMMIN(lp_rc->avg_frame_qindex[KEY_FRAME], lp_rc->last_q[KEY_FRAME]);
+ }
+ ambient_qp = (cm->current_frame.frame_number < num_frames_weight_key)
+ ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key)
+ : p_rc->avg_frame_qindex[INTER_FRAME];
+ ambient_qp = AOMMIN(rc->worst_quality, ambient_qp);
+
+ if (p_rc->buffer_level > p_rc->optimal_buffer_level) {
+ // Adjust down.
+ int max_adjustment_down; // Maximum adjustment down for Q
+
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && !cpi->ppi->use_svc &&
+ (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)) {
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp);
+ max_adjustment_down = AOMMIN(4, active_worst_quality / 16);
+ } else {
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4);
+ max_adjustment_down = active_worst_quality / 3;
+ }
+
+ if (max_adjustment_down) {
+ buff_lvl_step =
+ ((p_rc->maximum_buffer_size - p_rc->optimal_buffer_level) /
+ max_adjustment_down);
+ if (buff_lvl_step)
+ adjustment = (int)((p_rc->buffer_level - p_rc->optimal_buffer_level) /
+ buff_lvl_step);
+ active_worst_quality -= adjustment;
+ }
+ } else if (p_rc->buffer_level > critical_level) {
+ // Adjust up from ambient Q.
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp);
+ if (critical_level) {
+ buff_lvl_step = (p_rc->optimal_buffer_level - critical_level);
+ if (buff_lvl_step) {
+ adjustment = (int)((rc->worst_quality - ambient_qp) *
+ (p_rc->optimal_buffer_level - p_rc->buffer_level) /
+ buff_lvl_step);
+ }
+ active_worst_quality += adjustment;
+ }
+ } else {
+ // Set to worst_quality if buffer is below critical level.
+ active_worst_quality = rc->worst_quality;
+ }
+ return active_worst_quality;
+}
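+
+// Illustrative sketch (hypothetical helper, not part of libaom): the three
+// buffer regimes above, reduced to the non-screen path. Above the optimal
+// level the worst quality steps down toward ambient; between optimal and
+// critical it is interpolated up toward worst_q; below critical it
+// saturates at worst_q.
+#if 0
+#include <stdint.h>
+static int model_active_worst_cbr(int64_t buffer, int64_t optimal,
+                                  int64_t maximum, int64_t critical,
+                                  int ambient_qp, int worst_q) {
+  if (buffer > optimal) {
+    int aw = worst_q < ambient_qp * 5 / 4 ? worst_q : ambient_qp * 5 / 4;
+    const int max_adj_down = aw / 3;
+    const int64_t step =
+        max_adj_down ? (maximum - optimal) / max_adj_down : 0;
+    if (step) aw -= (int)((buffer - optimal) / step);
+    return aw;
+  }
+  if (buffer > critical) {
+    int aw = worst_q < ambient_qp ? worst_q : ambient_qp;
+    const int64_t span = optimal - critical;
+    if (critical > 0 && span > 0)
+      aw += (int)((worst_q - ambient_qp) * (optimal - buffer) / span);
+    return aw;
+  }
+  return worst_q;  // Below the critical level: allow the worst quality.
+}
+#endif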
+
+// Calculate the active_best_quality level.
+static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi,
+ int active_worst_quality,
+ int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ int *rtc_minq;
+ const int bit_depth = cm->seq_params->bit_depth;
+ int active_best_quality = rc->best_quality;
+ ASSIGN_MINQ_TABLE(bit_depth, rtc_minq);
+
+ if (frame_is_intra_only(cm)) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (p_rc->this_key_frame_forced) {
+ int qindex = p_rc->last_boosted_qindex;
+ double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ (last_boosted_q * 0.75), bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (current_frame->frame_number > 0) {
+ // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+ double q_val;
+ active_best_quality = get_kf_active_quality(
+ p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+ cpi->oxcf.rc_cfg.gf_cbr_boost_pct &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ int q = active_worst_quality;
+ if (rc->frames_since_key > 1 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = p_rc->avg_frame_qindex[INTER_FRAME];
+ }
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ FRAME_TYPE frame_type =
+ (current_frame->frame_number > 1) ? INTER_FRAME : KEY_FRAME;
+ if (p_rc->avg_frame_qindex[frame_type] < active_worst_quality)
+ active_best_quality = rtc_minq[p_rc->avg_frame_qindex[frame_type]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ }
+ return active_best_quality;
+}
+
+#if RT_PASSIVE_STRATEGY
+static int get_q_passive_strategy(const AV1_COMP *const cpi,
+ const int q_candidate, const int threshold) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ int sum = 0;
+ int count = 0;
+ int i = 1;
+ while (i < MAX_Q_HISTORY) {
+ int frame_id = current_frame->frame_number - i;
+ if (frame_id <= 0) break;
+ sum += p_rc->q_history[frame_id % MAX_Q_HISTORY];
+ ++count;
+ ++i;
+ }
+ if (count > 0) {
+ const int avg_q = sum / count;
+ if (abs(avg_q - q_candidate) <= threshold) return avg_q;
+ }
+ return q_candidate;
+}
+#endif // RT_PASSIVE_STRATEGY
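+
+// Illustrative sketch (hypothetical helper, not part of libaom): the
+// smoothing above applied to a plain history array. If the candidate q is
+// within 'threshold' of the average of recent frame Qs, the average is used
+// instead, damping frame-to-frame Q oscillation on static screen content.
+#if 0
+#include <stdlib.h>
+static int model_passive_q(const int *q_history, int history_len,
+                           int q_candidate, int threshold) {
+  if (history_len <= 0) return q_candidate;
+  int sum = 0;
+  for (int i = 0; i < history_len; ++i) sum += q_history[i];
+  const int avg_q = sum / history_len;
+  return (abs(avg_q - q_candidate) <= threshold) ? avg_q : q_candidate;
+}
+#endif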
+
+/*!\brief Picks q and q bounds given CBR rate control parameters in \c cpi->rc.
+ *
+ * Handles the special case when using:
+ * - Constant bit-rate mode: \c cpi->oxcf.rc_cfg.mode == \ref AOM_CBR, and
+ * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are
+ * NOT available.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ int q;
+ int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+ int active_best_quality = calc_active_best_quality_no_stats_cbr(
+ cpi, active_worst_quality, width, height);
+ assert(has_no_stats_stage(cpi));
+ assert(cpi->oxcf.rc_cfg.mode == AOM_CBR);
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced &&
+ current_frame->frame_number != 0) {
+ int qdelta = 0;
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 2.0);
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+#if RT_PASSIVE_STRATEGY
+ if (current_frame->frame_type != KEY_FRAME &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ q = get_q_passive_strategy(cpi, q, 50);
+ }
+#endif // RT_PASSIVE_STRATEGY
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+static int gf_group_pyramid_level(const GF_GROUP *gf_group, int gf_index) {
+ return gf_group->layer_depth[gf_index];
+}
+
+static int get_active_cq_level(const RATE_CONTROL *rc,
+ const PRIMARY_RATE_CONTROL *p_rc,
+ const AV1EncoderConfig *const oxcf,
+ int intra_only, aom_superres_mode superres_mode,
+ int superres_denom) {
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ static const double cq_adjust_threshold = 0.1;
+ int active_cq_level = rc_cfg->cq_level;
+ if (rc_cfg->mode == AOM_CQ || rc_cfg->mode == AOM_Q) {
+ // printf("Superres %d %d %d = %d\n", superres_denom, intra_only,
+ // rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1));
+ if ((superres_mode == AOM_SUPERRES_QTHRESH ||
+ superres_mode == AOM_SUPERRES_AUTO) &&
+ superres_denom != SCALE_NUMERATOR) {
+ int mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO;
+ if (intra_only && rc->frames_to_key <= 1) {
+ mult = 0;
+ } else if (intra_only) {
+ mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME;
+ } else {
+ mult = SUPERRES_QADJ_PER_DENOM_ARFFRAME;
+ }
+ active_cq_level = AOMMAX(
+ active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0);
+ }
+ }
+ if (rc_cfg->mode == AOM_CQ && p_rc->total_target_bits > 0) {
+ const double x = (double)p_rc->total_actual_bits / p_rc->total_target_bits;
+ if (x < cq_adjust_threshold) {
+ active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
+ }
+ }
+ return active_cq_level;
+}
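+
+// Illustrative worked example (hypothetical numbers): for a key frame with
+// cq_level = 60, superres_denom = 16 and SCALE_NUMERATOR = 8, and assuming
+// a per-denominator step (mult) of 2, the superres branch above yields
+//   active_cq_level = max(60 - (16 - 8) * 2, 0) = 44,
+// lowering the CQ level to compensate for quality lost to upscaling.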
+
+/*!\brief Picks q and q bounds given non-CBR rate control params in \c cpi->rc.
+ *
+ * Handles the special case when using:
+ * - Any rate control other than constant bit-rate mode:
+ * \c cpi->oxcf.rc_cfg.mode != \ref AOM_CBR, and
+ * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are
+ * NOT available.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
+
+ assert(has_no_stats_stage(cpi));
+ assert(rc_mode == AOM_VBR ||
+ (!USE_UNRESTRICTED_Q_IN_CQ_MODE && rc_mode == AOM_CQ) ||
+ rc_mode == AOM_Q);
+
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_no_stats_vbr(cpi);
+ int q;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+ if (frame_is_intra_only(cm)) {
+ if (rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (p_rc->this_key_frame_forced) {
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int qindex = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ int qindex = p_rc->last_boosted_qindex;
+#endif
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else { // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+
+ active_best_quality = get_kf_active_quality(
+ p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta on active_best_quality.
+ {
+ const double q_val =
+ av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ q = (rc->frames_since_key > 1 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ ? p_rc->avg_frame_qindex[INTER_FRAME]
+ : p_rc->avg_frame_qindex[KEY_FRAME];
+    // For constrained quality don't allow Q less than the cq level.
+ if (rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+      // Constrained quality uses a slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+ } else if (rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ (refresh_frame->alt_ref_frame)
+ ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
+ : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+ }
+ } else {
+ if (rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
+ 0.70, 1.0, 0.85, 1.0 };
+ const int delta_qindex = av1_compute_qdelta(
+ rc, q_val,
+ q_val * delta_rate[current_frame->frame_number % FIXED_GF_INTERVAL],
+ bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ active_best_quality =
+ (current_frame->frame_number > 1)
+ ? inter_minq[p_rc->avg_frame_qindex[INTER_FRAME]]
+ : inter_minq[p_rc->avg_frame_qindex[KEY_FRAME]];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ {
+ int qdelta = 0;
+ if (current_frame->frame_type == KEY_FRAME &&
+ !p_rc->this_key_frame_forced && current_frame->frame_number != 0) {
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 2.0);
+ } else if (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 1.75);
+ }
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ if (rc_mode == AOM_Q) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames
+ } else if ((current_frame->frame_type == KEY_FRAME) &&
+ p_rc->this_key_frame_forced) {
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ q = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ q = p_rc->last_boosted_qindex;
+#endif
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = { 2.50, 2.00, 1.75,
+ 1.50, 1.25, 1.15,
+ 1.0 };
+int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const RATE_FACTOR_LEVEL rf_lvl =
+ get_rate_factor_level(gf_group, cpi->gf_frame_index);
+ const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ const int arf_layer = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const double rate_factor =
+ (rf_lvl == INTER_NORMAL) ? 1.0 : arf_layer_deltas[arf_layer];
+
+ return av1_compute_qdelta_by_rate(cpi, frame_type, q, rate_factor);
+}
+
+// This unrestricted Q selection in CQ mode is useful when testing new
+// features, but may lead to Q being out of range under current RC
+// restrictions.
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+static int rc_pick_q_and_bounds_no_stats_cq(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int cq_level =
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params->bit_depth;
+ const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth);
+ (void)width;
+ (void)height;
+ assert(has_no_stats_stage(cpi));
+ assert(cpi->oxcf.rc_cfg.mode == AOM_CQ);
+
+ *top_index = q;
+ *bottom_index = q;
+
+ return q;
+}
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+
+#define STATIC_MOTION_THRESH 95
+static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+ int *active_best, int *active_worst,
+ int cq_level) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int active_best_quality;
+ int active_worst_quality = *active_worst;
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) {
+ // If the next frame is also a key frame or the current frame is the
+ // only frame in the sequence in AOM_Q mode, just use the cq_level
+ // as q.
+ active_best_quality = cq_level;
+ active_worst_quality = cq_level;
+ } else if (p_rc->this_key_frame_forced) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ double last_boosted_q;
+ int delta_qindex;
+ int qindex;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int last_boosted_qindex = simulate_parallel_frame
+ ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ int last_boosted_qindex = p_rc->last_boosted_qindex;
+#endif
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ qindex = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
+ active_best_quality = qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 1.25, bit_depth);
+ active_worst_quality =
+ AOMMIN(qindex + delta_qindex, active_worst_quality);
+ } else {
+ qindex = last_boosted_qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 0.50, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ }
+ } else {
+ // Not forced keyframe.
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ // Baseline value derived from active_worst_quality and kf boost.
+ active_best_quality =
+ get_kf_active_quality(p_rc, active_worst_quality, bit_depth);
+ if (cpi->is_screen_content_type) {
+ active_best_quality /= 2;
+ }
+
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+ active_best_quality /= 3;
+ }
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Make a further adjustment based on the kf zero motion measure.
+ if (is_stat_consumption_stage_twopass(cpi))
+ q_adj_factor +=
+ 0.05 - (0.001 * (double)cpi->ppi->twopass.kf_zeromotion_pct);
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+
+ // Tweak active_best_quality for AOM_Q mode when superres is on, as this
+ // will be used directly as 'q' later.
+ if (oxcf->rc_cfg.mode == AOM_Q &&
+ (cpi->superres_mode == AOM_SUPERRES_QTHRESH ||
+ cpi->superres_mode == AOM_SUPERRES_AUTO) &&
+ cm->superres_scale_denominator != SCALE_NUMERATOR) {
+ active_best_quality =
+ AOMMAX(active_best_quality -
+ ((cm->superres_scale_denominator - SCALE_NUMERATOR) *
+ SUPERRES_QADJ_PER_DENOM_KEYFRAME),
+ 0);
+ }
+ }
+ *active_best = active_best_quality;
+ *active_worst = active_worst_quality;
+}
+
+static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
+ const int is_intrl_arf_boost,
+ int *active_worst,
+ int *active_best) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ int active_best_quality = *active_best;
+ int active_worst_quality = *active_worst;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int extend_minq = simulate_parallel_frame ? p_rc->temp_extend_minq
+ : cpi->ppi->twopass.extend_minq;
+ int extend_maxq = simulate_parallel_frame ? p_rc->temp_extend_maxq
+ : cpi->ppi->twopass.extend_maxq;
+#endif
+ // Extension to max or min Q if undershoot or overshoot is outside
+ // the permitted range.
+ if (cpi->oxcf.rc_cfg.mode != AOM_Q) {
+ if (frame_is_intra_only(cm) ||
+ (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || is_intrl_arf_boost ||
+ refresh_frame->alt_ref_frame))) {
+#if CONFIG_FPMT_TEST
+ active_best_quality -= extend_minq;
+ active_worst_quality += (extend_maxq / 2);
+#else
+ active_best_quality -= cpi->ppi->twopass.extend_minq / 4;
+ active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2);
+#endif
+ } else {
+#if CONFIG_FPMT_TEST
+ active_best_quality -= extend_minq / 2;
+ active_worst_quality += extend_maxq;
+#else
+ active_best_quality -= cpi->ppi->twopass.extend_minq / 4;
+ active_worst_quality += cpi->ppi->twopass.extend_maxq;
+#endif
+ }
+ }
+
+#ifndef STRICT_RC
+ // Static forced key frames Q restrictions dealt with elsewhere.
+ if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced ||
+ (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+ const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality);
+ active_worst_quality =
+ AOMMAX(active_worst_quality + qdelta, active_best_quality);
+ }
+#endif
+
+ // Modify active_best_quality for downscaled normal frames.
+ if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
+ int qdelta = av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type,
+ active_best_quality, 2.0);
+ active_best_quality =
+ AOMMAX(active_best_quality + qdelta, rc->best_quality);
+ }
+
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *active_best = active_best_quality;
+ *active_worst = active_worst_quality;
+}
+
+/*!\brief Gets a Q value to use for the current frame
+ *
+ * Selects a Q value from a permitted range that we estimate
+ * will result in approximately the target number of bits.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] width Width of frame
+ * \param[in] height Height of frame
+ * \param[in] active_worst_quality Max Q allowed
+ * \param[in] active_best_quality Min Q allowed
+ *
+ * \return The suggested Q for this frame.
+ */
+static int get_q(const AV1_COMP *cpi, const int width, const int height,
+ const int active_worst_quality,
+ const int active_best_quality) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int q;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg;
+ int last_boosted_qindex = simulate_parallel_frame
+ ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ int last_boosted_qindex = p_rc->last_boosted_qindex;
+#endif
+
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q ||
+ (frame_is_intra_only(cm) && !p_rc->this_key_frame_forced &&
+ cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
+ rc->frames_to_key > 1)) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames.
+ } else if (frame_is_intra_only(cm) && p_rc->this_key_frame_forced) {
+ // If static since last kf use better of last boosted and last kf q.
+ if (cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ q = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
+ } else {
+ q = AOMMIN(last_boosted_qindex,
+ (active_best_quality + active_worst_quality) / 2);
+ }
+ q = clamp(q, active_best_quality, active_worst_quality);
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > active_worst_quality) {
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target < rc->max_frame_bandwidth) {
+ q = active_worst_quality;
+ }
+ }
+ q = AOMMAX(q, active_best_quality);
+ }
+ return q;
+}
+
+// Returns |active_best_quality| for an inter frame.
+// The |active_best_quality| depends on different rate control modes:
+// VBR, Q, CQ, CBR.
+// The returned active_best_quality may be further adjusted in
+// adjust_active_best_and_worst_quality().
+static int get_active_best_quality(const AV1_COMP *const cpi,
+ const int active_worst_quality,
+ const int cq_level, const int gf_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cm->seq_params->bit_depth;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+ int active_best_quality = 0;
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+ int is_leaf_frame =
+ !(gf_group->update_type[gf_index] == ARF_UPDATE ||
+ gf_group->update_type[gf_index] == GF_UPDATE || is_intrl_arf_boost);
+
+  // TODO(jingning): Consider reworking this hack that covers issues incurred
+  // in the lightfield setting.
+ if (cm->tiles.large_scale) {
+ is_leaf_frame = !(refresh_frame->golden_frame ||
+ refresh_frame->alt_ref_frame || is_intrl_arf_boost);
+ }
+ const int is_overlay_frame = rc->is_src_frame_alt_ref;
+
+ if (is_leaf_frame || is_overlay_frame) {
+ if (rc_mode == AOM_Q) return cq_level;
+
+ active_best_quality = inter_minq[active_worst_quality];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ return active_best_quality;
+ }
+
+ // Determine active_best_quality for frames that are not leaf or overlay.
+ int q = active_worst_quality;
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = p_rc->avg_frame_qindex[INTER_FRAME];
+ }
+ if (rc_mode == AOM_CQ && q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+  // Constrained quality uses a slightly lower active best.
+ if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16;
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+ active_best_quality = min_boost - (int)(boost * p_rc->arf_boost_factor);
+ if (!is_intrl_arf_boost) return active_best_quality;
+
+ if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = p_rc->arf_q;
+ int this_height = gf_group_pyramid_level(gf_group, gf_index);
+ while (this_height > 1) {
+ active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
+ --this_height;
+ }
+ return active_best_quality;
+}
+
+// Returns the q_index for a single frame in the GOP.
+// This function assumes that rc_mode == AOM_Q mode.
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+ int gf_pyramid_level, int arf_q) {
+ const int is_intrl_arf_boost = gf_update_type == INTNL_ARF_UPDATE;
+ int is_leaf_or_overlay_frame = gf_update_type == LF_UPDATE ||
+ gf_update_type == OVERLAY_UPDATE ||
+ gf_update_type == INTNL_OVERLAY_UPDATE;
+
+ if (is_leaf_or_overlay_frame) return base_q_index;
+
+ if (!is_intrl_arf_boost) return arf_q;
+
+ int active_best_quality = arf_q;
+ int active_worst_quality = base_q_index;
+
+ while (gf_pyramid_level > 1) {
+ active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
+ --gf_pyramid_level;
+ }
+ return active_best_quality;
+}
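+
+// Illustrative worked example (hypothetical numbers): with arf_q = 40,
+// base_q_index = 120 and an internal ARF at pyramid level 3, the loop above
+// computes
+//   level 3: (40 + 120 + 1) / 2 = 80,  level 2: (80 + 120 + 1) / 2 = 100,
+// so each additional pyramid level moves the frame's q halfway from the
+// ARF q toward the base q.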
+
+// Returns the q_index for the ARF in the GOP.
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+ double arf_boost_factor) {
+ int active_best_quality =
+ get_gf_active_quality_no_rc(gfu_boost, base_q_index, bit_depth);
+ const int min_boost = get_gf_high_motion_quality(base_q_index, bit_depth);
+ const int boost = min_boost - active_best_quality;
+ return min_boost - (int)(boost * arf_boost_factor);
+}
+
+static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width,
+ int height, int gf_index,
+ int *bottom_index, int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+ int active_best_quality = 0;
+ int active_worst_quality = rc->active_worst_quality;
+ int q;
+
+ if (frame_is_intra_only(cm)) {
+ get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+ &active_worst_quality, cq_level);
+ } else {
+ // Active best quality limited by previous layer.
+ active_best_quality =
+ get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index);
+ }
+
+ if (cq_level > 0) active_best_quality = AOMMAX(1, active_best_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ *top_index = AOMMAX(*top_index, rc->best_quality);
+ *top_index = AOMMIN(*top_index, rc->worst_quality);
+
+ *bottom_index = AOMMAX(*bottom_index, rc->best_quality);
+ *bottom_index = AOMMIN(*bottom_index, rc->worst_quality);
+
+ q = active_best_quality;
+
+ q = AOMMAX(q, rc->best_quality);
+ q = AOMMIN(q, rc->worst_quality);
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+
+ return q;
+}
+
+/*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc.
+ *
+ * Handles the general cases not covered by
+ * \ref rc_pick_q_and_bounds_no_stats_cbr() and
+ * \ref rc_pick_q_and_bounds_no_stats()
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[in] gf_index Index of this frame in the golden frame group
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+ int gf_index, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ assert(IMPLIES(has_no_stats_stage(cpi),
+ cpi->oxcf.rc_cfg.mode == AOM_Q &&
+ gf_group->update_type[gf_index] != ARF_UPDATE));
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+
+ if (oxcf->rc_cfg.mode == AOM_Q) {
+ return rc_pick_q_and_bounds_q_mode(cpi, width, height, gf_index,
+ bottom_index, top_index);
+ }
+
+ int active_best_quality = 0;
+ int active_worst_quality = rc->active_worst_quality;
+ int q;
+
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+
+ if (frame_is_intra_only(cm)) {
+ get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+ &active_worst_quality, cq_level);
+#ifdef STRICT_RC
+ active_best_quality = 0;
+#endif
+ } else {
+ // Active best quality limited by previous layer.
+ const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
+
+ if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS)) {
+ active_best_quality = get_active_best_quality(cpi, active_worst_quality,
+ cq_level, gf_index);
+ } else {
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int local_active_best_quality =
+ simulate_parallel_frame
+ ? p_rc->temp_active_best_quality[pyramid_level - 1]
+ : p_rc->active_best_quality[pyramid_level - 1];
+ active_best_quality = local_active_best_quality + 1;
+#else
+ active_best_quality = p_rc->active_best_quality[pyramid_level - 1] + 1;
+#endif
+
+ active_best_quality = AOMMIN(active_best_quality, active_worst_quality);
+#ifdef STRICT_RC
+ active_best_quality += (active_worst_quality - active_best_quality) / 16;
+#else
+ active_best_quality += (active_worst_quality - active_best_quality) / 2;
+#endif
+ }
+
+ // For alt_ref and GF frames (including internal arf frames) adjust the
+    // worst allowed quality as well. This ensures that even on hard
+    // sections we don't clamp the Q at the same value for arf frames and
+ // leaf (non arf) frames. This is important to the TPL model which assumes
+ // Q drops with each arf level.
+ if (!(rc->is_src_frame_alt_ref) &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame ||
+ is_intrl_arf_boost)) {
+ active_worst_quality =
+ (active_best_quality + (3 * active_worst_quality) + 2) / 4;
+ }
+ }
+
+ adjust_active_best_and_worst_quality(
+ cpi, is_intrl_arf_boost, &active_worst_quality, &active_best_quality);
+ q = get_q(cpi, width, height, active_worst_quality, active_best_quality);
+
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target >= rc->max_frame_bandwidth &&
+ q > active_worst_quality) {
+ active_worst_quality = q;
+ }
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+
+ return q;
+}
+
+static void rc_compute_variance_onepass_rt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ YV12_BUFFER_CONFIG const *const unscaled_src = cpi->unscaled_source;
+ if (unscaled_src == NULL) return;
+
+ const uint8_t *src_y = unscaled_src->y_buffer;
+ const int src_ystride = unscaled_src->y_stride;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const uint8_t *pre_y = yv12->buffers[0];
+ const int pre_ystride = yv12->strides[0];
+
+ // TODO(yunqing): support scaled reference frames.
+ if (cpi->scaled_ref_buf[LAST_FRAME - 1]) return;
+
+ for (int i = 0; i < 2; ++i) {
+ if (unscaled_src->widths[i] != yv12->widths[i] ||
+ unscaled_src->heights[i] != yv12->heights[i]) {
+ return;
+ }
+ }
+
+ const int num_mi_cols = cm->mi_params.mi_cols;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ int num_samples = 0;
+ // sse is computed on 64x64 blocks
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+
+ uint64_t fsse = 0;
+ cpi->rec_sse = 0;
+
+ for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
+ for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+ unsigned int sse;
+ uint8_t src[64 * 64] = { 0 };
+ // Apply 4x4 block averaging/denoising on source frame.
+ for (int i = 0; i < 64; i += 4) {
+ for (int j = 0; j < 64; j += 4) {
+ const unsigned int avg =
+ aom_avg_4x4(src_y + i * src_ystride + j, src_ystride);
+
+ for (int m = 0; m < 4; ++m) {
+ for (int n = 0; n < 4; ++n) src[i * 64 + j + m * 64 + n] = avg;
+ }
+ }
+ }
+
+ cpi->ppi->fn_ptr[bsize].vf(src, 64, pre_y, pre_ystride, &sse);
+ fsse += sse;
+ num_samples++;
+ src_y += 64;
+ pre_y += 64;
+ }
+ src_y += (src_ystride << 6) - (sb_cols << 6);
+ pre_y += (pre_ystride << 6) - (sb_cols << 6);
+ }
+ assert(num_samples > 0);
+ // Ensure rec_sse > 0
+ if (num_samples > 0) cpi->rec_sse = fsse > 0 ? fsse : 1;
+}
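+
+// Sketch of the flow above (names as in this file): each 64x64 source block
+// is first smoothed by replacing every 4x4 sub-block with its average
+// (aom_avg_4x4), then compared against the co-located reconstructed LAST
+// pixels with the 64x64 variance function. The accumulated SSE is stored in
+// cpi->rec_sse and later feeds the bit-estimation ratio
+//   projected_frame_size * q / sqrt(rec_sse)
+// in av1_rc_postencode_update().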
+
+int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height, int gf_index,
+ int *bottom_index, int *top_index) {
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int q;
+ // TODO(sarahparker) merge no-stats vbr and altref q computation
+ // with rc_pick_q_and_bounds().
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if ((cpi->oxcf.rc_cfg.mode != AOM_Q ||
+ gf_group->update_type[gf_index] == ARF_UPDATE) &&
+ has_no_stats_stage(cpi)) {
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+ // TODO(yunqing): the results could be used for encoder optimization.
+ cpi->rec_sse = UINT64_MAX;
+ if (cpi->sf.hl_sf.accurate_bit_estimate &&
+ cpi->common.current_frame.frame_type != KEY_FRAME)
+ rc_compute_variance_onepass_rt(cpi);
+
+ q = rc_pick_q_and_bounds_no_stats_cbr(cpi, width, height, bottom_index,
+ top_index);
+      // Preserve a copy of the selected active worst quality.
+ cpi->rc.active_worst_quality = *top_index;
+
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+ } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+ q = rc_pick_q_and_bounds_no_stats_cq(cpi, width, height, bottom_index,
+ top_index);
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+ } else {
+ q = rc_pick_q_and_bounds_no_stats(cpi, width, height, bottom_index,
+ top_index);
+ }
+ } else {
+ q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index,
+ top_index);
+ }
+ if (gf_group->update_type[gf_index] == ARF_UPDATE) p_rc->arf_q = q;
+
+ return q;
+}
+
+void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ } else {
+ // For very small rate targets where the fractional adjustment
+ // may be tiny make sure there is at least a minimum range.
+ assert(cpi->sf.hl_sf.recode_tolerance <= 100);
+ const int tolerance = (int)AOMMAX(
+ 100, ((int64_t)cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
+ *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0);
+ *frame_over_shoot_limit =
+ AOMMIN(frame_target + tolerance, cpi->rc.max_frame_bandwidth);
+ }
+}
+
+void av1_rc_set_frame_target(AV1_COMP *cpi, int target, int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ rc->this_frame_target = target;
+
+ // Modify frame size target when down-scaled.
+ if (av1_frame_scaled(cm) && cpi->oxcf.rc_cfg.mode != AOM_CBR) {
+ rc->this_frame_target =
+ (int)(rc->this_frame_target *
+ resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height));
+ }
+
+  // Target rate per SB64 (including partial SB64s).
+ rc->sb64_target_rate =
+ (int)(((int64_t)rc->this_frame_target << 12) / (width * height));
+}
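+
+// Example (hypothetical values): for a 1280x720 frame with
+// this_frame_target = 60000 bits,
+//   sb64_target_rate = (60000 << 12) / (1280 * 720) = 266 bits per SB64,
+// since << 12 scales the per-pixel rate by the 64 * 64 = 4096 pixels
+// in each SB64.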
+
+static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+  // This frame refreshes; subsequent frames don't unless specified by the
+  // user.
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->frames_since_golden = 0;
+}
+
+static void update_golden_frame_stats(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // Update the Golden frame usage counts.
+ if (cpi->refresh_frame.golden_frame || rc->is_src_frame_alt_ref) {
+ rc->frames_since_golden = 0;
+ } else if (cpi->common.show_frame) {
+ rc->frames_since_golden++;
+ }
+}
+
+void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+
+ const int is_intrnl_arf =
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+
+ const int qindex = cm->quant_params.base_qindex;
+
+#if RT_PASSIVE_STRATEGY
+ const int frame_number = current_frame->frame_number % MAX_Q_HISTORY;
+ p_rc->q_history[frame_number] = qindex;
+#endif // RT_PASSIVE_STRATEGY
+
+ // Update rate control heuristics
+ rc->projected_frame_size = (int)(bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ av1_rc_update_rate_correction_factors(cpi, 0, cm->width, cm->height);
+
+ // Update bit estimation ratio.
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->sf.hl_sf.accurate_bit_estimate) {
+ const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
+ cm->seq_params->bit_depth);
+ const int this_bit_est_ratio =
+ (int)(rc->projected_frame_size * q / sqrt((double)cpi->rec_sse));
+ cpi->rc.bit_est_ratio =
+ cpi->rc.bit_est_ratio == 0
+ ? this_bit_est_ratio
+ : (7 * cpi->rc.bit_est_ratio + this_bit_est_ratio) / 8;
+ }
+
+ // Keep a record of last Q and ambient average Q.
+ if (current_frame->frame_type == KEY_FRAME) {
+ p_rc->last_q[KEY_FRAME] = qindex;
+ p_rc->avg_frame_qindex[KEY_FRAME] =
+ ROUND_POWER_OF_TWO(3 * p_rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+ } else {
+ if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) ||
+ cpi->rc.rtc_external_ratectrl ||
+ (!rc->is_src_frame_alt_ref &&
+ !(refresh_frame->golden_frame || is_intrnl_arf ||
+ refresh_frame->alt_ref_frame))) {
+ p_rc->last_q[INTER_FRAME] = qindex;
+ p_rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(
+ 3 * p_rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+ p_rc->ni_frames++;
+ p_rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params->bit_depth);
+ p_rc->avg_q = p_rc->tot_q / p_rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / p_rc->ni_frames;
+ }
+ }
+  // Keep a record of the last boosted (KF/GF/ARF) Q value.
+  // If the current frame is coded at a lower Q, then we also update it.
+  // If all MBs in this group are skipped, only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping.
+ if ((qindex < p_rc->last_boosted_qindex) ||
+ (current_frame->frame_type == KEY_FRAME) ||
+ (!p_rc->constrained_gf_group &&
+ (refresh_frame->alt_ref_frame || is_intrnl_arf ||
+ (refresh_frame->golden_frame && !rc->is_src_frame_alt_ref)))) {
+ p_rc->last_boosted_qindex = qindex;
+ }
+ if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex;
+
+ update_buffer_level(cpi, rc->projected_frame_size);
+ rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+
+  // Rolling monitors of whether we are over- or under-spending, used to help
+  // regulate min and max Q in two pass.
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target = (int)(rc->this_frame_target /
+ resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+ cm->width, cm->height));
+ if (current_frame->frame_type != KEY_FRAME) {
+ p_rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
+ p_rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ p_rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
+ p_rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ }
+
+ // Actual bits spent
+ p_rc->total_actual_bits += rc->projected_frame_size;
+ p_rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+
+ if (is_altref_enabled(cpi->oxcf.gf_cfg.lag_in_frames,
+ cpi->oxcf.gf_cfg.enable_auto_arf) &&
+ refresh_frame->alt_ref_frame &&
+ (current_frame->frame_type != KEY_FRAME && !frame_is_sframe(cm)))
+ // Update the alternate reference frame stats as appropriate.
+ update_alt_ref_frame_stats(cpi);
+ else
+ // Update the Golden frame stats as appropriate.
+ update_golden_frame_stats(cpi);
+
+#if CONFIG_FPMT_TEST
+  /* The variables temp_avg_frame_qindex, temp_last_q, temp_avg_q and
+   * temp_last_boosted_qindex are introduced only for quality simulation
+   * purposes; they retain the values from before the parallel encode frames.
+   * The variables are updated based on the update flag.
+   *
+   * If show_existing_frames exist between parallel frames, do not update the
+   * temp state, so that it is retained. */
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ for (int i = 0; i < FRAME_TYPES; i++) {
+ p_rc->temp_last_q[i] = p_rc->last_q[i];
+ }
+ p_rc->temp_avg_q = p_rc->avg_q;
+ p_rc->temp_last_boosted_qindex = p_rc->last_boosted_qindex;
+ p_rc->temp_total_actual_bits = p_rc->total_actual_bits;
+ p_rc->temp_projected_frame_size = rc->projected_frame_size;
+ for (int i = 0; i < RATE_FACTOR_LEVELS; i++)
+ p_rc->temp_rate_correction_factors[i] = p_rc->rate_correction_factors[i];
+ }
+#endif
+ if (current_frame->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+ if (cpi->refresh_frame.golden_frame)
+ rc->frame_num_last_gf_refresh = current_frame->frame_number;
+ rc->prev_coded_width = cm->width;
+ rc->prev_coded_height = cm->height;
+ rc->frame_number_encoded++;
+ rc->prev_frame_is_dropped = 0;
+ rc->drop_count_consec = 0;
+ // if (current_frame->frame_number == 1 && cm->show_frame)
+ /*
+ rc->this_frame_target =
+ (int)(rc->this_frame_target / resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+ cm->width, cm->height));
+ */
+}
+
+void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
+ // Update buffer level with zero size, update frame counters, and return.
+ update_buffer_level(cpi, 0);
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ }
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
+ cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
+ cpi->rc.prev_coded_width = cpi->common.width;
+ cpi->rc.prev_coded_height = cpi->common.height;
+ cpi->rc.prev_frame_is_dropped = 1;
+ // On a scene/slide change for dropped frame: reset the avg_source_sad to 0,
+ // otherwise the avg_source_sad can get too large and subsequent frames
+ // may miss the scene/slide detection.
+ if (cpi->rc.high_source_sad) cpi->rc.avg_source_sad = 0;
+ if (cpi->ppi->use_svc && cpi->svc.number_spatial_layers > 1) {
+ cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = true;
+ cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = true;
+ }
+}
+
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+ int best_qindex, int worst_qindex) {
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const double mid_q = av1_convert_qindex_to_q(mid, bit_depth);
+ if (mid_q < desired_q) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ assert(low == high);
+ assert(av1_convert_qindex_to_q(low, bit_depth) >= desired_q ||
+ low == worst_qindex);
+ return low;
+}
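+
+// Usage sketch (hypothetical caller): find the smallest q index whose real
+// q value is at least 4.0 for an 8-bit stream:
+//   const int qindex = av1_find_qindex(4.0, AOM_BITS_8, rc->best_quality,
+//                                      rc->worst_quality);
+// Because av1_convert_qindex_to_q() is non-decreasing in the index, a
+// binary search for the lower bound is sufficient; if no index reaches
+// desired_q, worst_qindex is returned.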
+
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth) {
+ const int start_index =
+ av1_find_qindex(qstart, bit_depth, rc->best_quality, rc->worst_quality);
+ const int target_index =
+ av1_find_qindex(qtarget, bit_depth, rc->best_quality, rc->worst_quality);
+ return target_index - start_index;
+}
+
+// Find q_index for the desired_bits_per_mb, within [best_qindex, worst_qindex],
+// assuming 'correction_factor' is 1.0.
+// To be precise, 'q_index' is the smallest integer for which the
+// corresponding bits per mb <= desired_bits_per_mb.
+// If no such q index is found, returns 'worst_qindex'.
+static int find_qindex_by_rate(const AV1_COMP *const cpi,
+ int desired_bits_per_mb, FRAME_TYPE frame_type,
+ int best_qindex, int worst_qindex) {
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const int mid_bits_per_mb =
+ av1_rc_bits_per_mb(cpi, frame_type, mid, 1.0, 0);
+ if (mid_bits_per_mb > desired_bits_per_mb) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ assert(low == high);
+ assert(av1_rc_bits_per_mb(cpi, frame_type, low, 1.0, 0) <=
+ desired_bits_per_mb ||
+ low == worst_qindex);
+ return low;
+}
+
+int av1_compute_qdelta_by_rate(const AV1_COMP *cpi, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio) {
+ const RATE_CONTROL *rc = &cpi->rc;
+
+ // Look up the current projected bits per block for the base index
+ const int base_bits_per_mb =
+ av1_rc_bits_per_mb(cpi, frame_type, qindex, 1.0, 0);
+
+ // Find the target bits per mb based on the base value and given ratio.
+ const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+ const int target_index = find_qindex_by_rate(
+ cpi, target_bits_per_mb, frame_type, rc->best_quality, rc->worst_quality);
+ return target_index - qindex;
+}
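+
+// Example (hypothetical values): with rate_target_ratio = 0.5 the search
+// above looks for the smallest q index producing at most half the bits per
+// mb of the base index. Since bits per mb fall as the q index rises, the
+// returned delta is positive (a higher q) for ratios below 1.0 and negative
+// for ratios above 1.0.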
+
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+ RATE_CONTROL *const rc) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Special case code for 1 pass fixed Q mode tests
+ if ((has_no_stats_stage(cpi)) && (oxcf->rc_cfg.mode == AOM_Q)) {
+ rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ rc->static_scene_max_gf_interval = rc->min_gf_interval + 1;
+ } else {
+ // Set Maximum gf/arf interval
+ rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, cpi->framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ cpi->framerate, rc->min_gf_interval);
+ /*
+ * Extended max interval for genuinely static scenes like slide shows.
+     * The number of stats available in the case of LAP is limited,
+     * hence setting it to max_gf_interval.
+ */
+ if (cpi->ppi->lap_enabled)
+ rc->static_scene_max_gf_interval = rc->max_gf_interval + 1;
+ else
+ rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
+
+ if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+ rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+ }
+}
+
+void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int vbr_max_bits;
+ const int MBs = av1_get_MBs(width, height);
+
+ rc->avg_frame_bandwidth =
+ (int)round(oxcf->rc_cfg.target_bandwidth / cpi->framerate);
+ rc->min_frame_bandwidth =
+ (int)(rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100);
+
+ rc->min_frame_bandwidth =
+ AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+ // A maximum bitrate for a frame is defined.
+ // The baseline for this aligns with HW implementations that
+ // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+ // per 16x16 MB (averaged over a frame). However this limit is extended if
+  // a very high rate is given on the command line or the rate cannot
+  // be achieved because of a user-specified max q (e.g. when the user
+  // specifies lossless encoding).
+ vbr_max_bits =
+ (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section) /
+ 100);
+ rc->max_frame_bandwidth =
+ AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+ av1_rc_set_gf_interval_range(cpi, rc);
+}
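+
+// Example (hypothetical values): at 2 Mbps and 30 fps,
+//   avg_frame_bandwidth = round(2000000 / 30) = 66667 bits;
+// with vbrmax_section = 2000 (percent),
+//   vbr_max_bits = 66667 * 2000 / 100 = 1333340 bits,
+// and max_frame_bandwidth is the largest of MBs * MAX_MB_RATE,
+// MAXRATE_1080P and vbr_max_bits.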
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR...adjustment to the frame target based on error from previous frames
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int64_t vbr_bits_off_target = simulate_parallel_frame
+ ? cpi->ppi->p_rc.temp_vbr_bits_off_target
+ : p_rc->vbr_bits_off_target;
+#else
+ int64_t vbr_bits_off_target = p_rc->vbr_bits_off_target;
+#endif
+ const int stats_count =
+ cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL
+ ? (int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count
+ : 0;
+ const int frame_window = AOMMIN(
+ 16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
+ assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100);
+ if (frame_window > 0) {
+ const int max_delta = (int)AOMMIN(
+ abs((int)(vbr_bits_off_target / frame_window)),
+ ((int64_t)(*this_frame_target) * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
+
+    // vbr_bits_off_target > 0 means we have extra bits to spend.
+    // vbr_bits_off_target < 0 means we are currently overshooting.
+ *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta;
+ }
+
+#if CONFIG_FPMT_TEST
+ int64_t vbr_bits_off_target_fast =
+ simulate_parallel_frame ? cpi->ppi->p_rc.temp_vbr_bits_off_target_fast
+ : p_rc->vbr_bits_off_target_fast;
+#endif
+ // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+ if (!frame_is_kf_gf_arf(cpi) &&
+#if CONFIG_FPMT_TEST
+ vbr_bits_off_target_fast &&
+#else
+ p_rc->vbr_bits_off_target_fast &&
+#endif
+ !rc->is_src_frame_alt_ref) {
+ int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+ int fast_extra_bits;
+#if CONFIG_FPMT_TEST
+ fast_extra_bits = (int)AOMMIN(vbr_bits_off_target_fast, one_frame_bits);
+ fast_extra_bits =
+ (int)AOMMIN(fast_extra_bits,
+ AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8));
+#else
+ fast_extra_bits =
+ (int)AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits);
+ fast_extra_bits = (int)AOMMIN(
+ fast_extra_bits,
+ AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8));
+#endif
+ if (fast_extra_bits > 0) {
+ // Update this_frame_target only if additional bits are available from
+ // local undershoot.
+ *this_frame_target += (int)fast_extra_bits;
+ }
+ // Store the fast_extra_bits of the frame and reduce it from
+ // vbr_bits_off_target_fast during postencode stage.
+ rc->frame_level_fast_extra_bits = fast_extra_bits;
+    // Retain the condition to update during the postencode stage since
+    // fast_extra_bits is calculated based on vbr_bits_off_target_fast.
+ cpi->do_update_vbr_bits_off_target_fast = 1;
+ }
+}
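+
+// Worked example (hypothetical values): with vbr_bits_off_target = +800000,
+// frame_window = 16 and *this_frame_target = 60000,
+//   max_delta = min(800000 / 16, 60000 * 50 / 100) = min(50000, 30000)
+// so the target grows by 30000 bits; the VBR_PCT_ADJUSTMENT_LIMIT cap keeps
+// a single frame from absorbing too much of the accumulated error.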
+
+void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target_rate = rc->base_frame_target;
+
+ // Correction to rate target based on prior over or under shoot.
+ if (cpi->oxcf.rc_cfg.mode == AOM_VBR || cpi->oxcf.rc_cfg.mode == AOM_CQ)
+ vbr_rate_correction(cpi, &target_rate);
+ av1_rc_set_frame_target(cpi, target_rate, width, height);
+}
+
+int av1_calc_pframe_target_size_one_pass_vbr(
+ const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
+ static const int af_ratio = 10;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int64_t target;
+#if USE_ALTREF_FOR_ONE_PASS
+ if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE ||
+ frame_update_type == ARF_UPDATE) {
+ target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval *
+ af_ratio) /
+ (p_rc->baseline_gf_interval + af_ratio - 1);
+ } else {
+ target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval) /
+ (p_rc->baseline_gf_interval + af_ratio - 1);
+ }
+ if (target > INT_MAX) target = INT_MAX;
+#else
+ target = rc->avg_frame_bandwidth;
+#endif
+ return av1_rc_clamp_pframe_target_size(cpi, (int)target, frame_update_type);
+}
+
+int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+ static const int kf_ratio = 25;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int64_t target = (int64_t)rc->avg_frame_bandwidth * kf_ratio;
+ return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+int av1_calc_pframe_target_size_one_pass_cbr(
+ const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+ const RateControlCfg *rc_cfg = &oxcf->rc_cfg;
+ const int64_t diff = p_rc->optimal_buffer_level - p_rc->buffer_level;
+ const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100;
+ int min_frame_target =
+ AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+ int target;
+
+ if (rc_cfg->gf_cbr_boost_pct) {
+ const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100;
+ if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
+ target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval *
+ af_ratio_pct) /
+ (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ } else {
+ target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * 100) /
+ (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ }
+ } else {
+ target = rc->avg_frame_bandwidth;
+ }
+ if (cpi->ppi->use_svc) {
+ // Note that for layers, avg_frame_bandwidth is the cumulative
+ // per-frame-bandwidth. For the target size of this frame, use the
+ // layer average frame size (i.e., non-cumulative per-frame-bw).
+ int layer =
+ LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id,
+ cpi->svc.number_temporal_layers);
+ const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ target = lc->avg_frame_size;
+ min_frame_target = AOMMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
+ }
+ if (diff > 0) {
+ // Lower the target bandwidth for this frame.
+ const int pct_low =
+ (int)AOMMIN(diff / one_pct_bits, rc_cfg->under_shoot_pct);
+ target -= (target * pct_low) / 200;
+ } else if (diff < 0) {
+ // Increase the target bandwidth for this frame.
+ const int pct_high =
+ (int)AOMMIN(-diff / one_pct_bits, rc_cfg->over_shoot_pct);
+ target += (target * pct_high) / 200;
+ }
+ if (rc_cfg->max_inter_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+ return AOMMAX(min_frame_target, target);
+}
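+
+// Example (hypothetical values): with target = 40000 bits, a buffer 30%
+// below optimal (diff = 30 * one_pct_bits) and under_shoot_pct = 50,
+//   pct_low = min(30, 50) = 30
+//   target -= 40000 * 30 / 200 = 6000  ->  34000 bits,
+// i.e. the percentage correction is applied at half strength (divide by
+// 200 rather than 100).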
+
+int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+ int64_t target;
+ if (cpi->common.current_frame.frame_number == 0) {
+ target = ((p_rc->starting_buffer_level / 2) > INT_MAX)
+ ? INT_MAX
+ : (int)(p_rc->starting_buffer_level / 2);
+ if (cpi->svc.number_temporal_layers > 1 && target < (INT_MAX >> 2)) {
+ target = target << AOMMIN(2, (cpi->svc.number_temporal_layers - 1));
+ }
+ } else {
+ int kf_boost = 32;
+ int framerate = (int)round(cpi->framerate);
+
+ kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
+ if (rc->frames_since_key < framerate / 2) {
+ kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+ }
+ target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+ }
+ return av1_rc_clamp_iframe_target_size(cpi, target);
+}
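+
+// Example (hypothetical values): at 30 fps, kf_boost starts at
+// max(32, 2 * 30 - 16) = 44, so a keyframe far from the previous key gets
+//   target = (16 + 44) * avg_frame_bandwidth / 16 = 3.75x the average;
+// at frames_since_key = 5 the boost scales down to 44 * 5 / 15 = 14,
+// giving (16 + 14) / 16 = 1.875x, before the iframe clamp is applied.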
+
+static void set_golden_update(AV1_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int divisor = 10;
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+ divisor = cpi->cyclic_refresh->percent_refresh;
+
+  // Set the minimum gf_interval for GF updates to a multiple of the refresh
+  // period, with some max limit. Depending on past encoding stats, the GF
+  // flag may be reset and the update may not occur until the next
+  // baseline_gf_interval.
+ const int gf_length_mult[2] = { 8, 4 };
+ if (divisor > 0)
+ p_rc->baseline_gf_interval =
+ AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] * (100 / divisor),
+ MAX_GF_INTERVAL_RT);
+ else
+ p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT;
+ if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40)
+ p_rc->baseline_gf_interval = 16;
+}
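+
+// Example (hypothetical values): with CYCLIC_REFRESH_AQ and
+// percent_refresh = 10, the refresh period is 100 / 10 = 10 frames, so
+// gf_length_lvl = 0 gives baseline_gf_interval =
+// min(8 * 10, MAX_GF_INTERVAL_RT) and gf_length_lvl = 1 gives
+// min(4 * 10, MAX_GF_INTERVAL_RT).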
+
+static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+ set_golden_update(cpi);
+
+ if (p_rc->baseline_gf_interval > rc->frames_to_key &&
+ cpi->oxcf.kf_cfg.auto_key)
+ p_rc->baseline_gf_interval = rc->frames_to_key;
+ p_rc->gfu_boost = DEFAULT_GF_BOOST_RT;
+ p_rc->constrained_gf_group =
+ (p_rc->baseline_gf_interval >= rc->frames_to_key &&
+ cpi->oxcf.kf_cfg.auto_key)
+ ? 1
+ : 0;
+ rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+ cpi->gf_frame_index = 0;
+ // SVC does not use GF as periodic boost.
+ // TODO(marpan): Find better way to disable this for SVC.
+ if (cpi->ppi->use_svc) {
+ SVC *const svc = &cpi->svc;
+ p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
+ p_rc->gfu_boost = 1;
+ p_rc->constrained_gf_group = 0;
+ rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+ for (int layer = 0;
+ layer < svc->number_spatial_layers * svc->number_temporal_layers;
+ ++layer) {
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval;
+ lc->p_rc.gfu_boost = p_rc->gfu_boost;
+ lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group;
+ lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
+ lc->group_index = 0;
+ }
+ }
+ gf_group->size = p_rc->baseline_gf_interval;
+ gf_group->update_type[0] = (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
+ gf_group->refbuf_state[cpi->gf_frame_index] =
+ (frame_type == KEY_FRAME) ? REFBUF_RESET : REFBUF_UPDATE;
+}
+
+void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ const int resize_pending = is_frame_resize_pending(cpi);
+ if (!resize_pending && !rc->high_source_sad) {
+ // Check if we should disable GF refresh (if period is up),
+    // or force a GF refresh update (if we are at least halfway through the
+    // period) based on QP. Look into adding info on segment deltaq.
+ PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+ const int avg_qp = p_rc->avg_frame_qindex[INTER_FRAME];
+ const int allow_gf_update =
+ rc->frames_till_gf_update_due <= (p_rc->baseline_gf_interval - 10);
+ int gf_update_changed = 0;
+ int thresh = 87;
+ if ((cm->current_frame.frame_number - cpi->rc.frame_num_last_gf_refresh) <
+ FIXED_GF_INTERVAL_RT &&
+ rc->frames_till_gf_update_due == 1 &&
+ cm->quant_params.base_qindex > avg_qp) {
+ // Disable GF refresh since QP is above the running average QP.
+ rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 0;
+ gf_update_changed = 1;
+ cpi->refresh_frame.golden_frame = 0;
+ } else if (allow_gf_update &&
+ ((cm->quant_params.base_qindex < thresh * avg_qp / 100) ||
+ (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 20))) {
+ // Force refresh since QP is well below average QP or this is a high
+ // motion frame.
+ rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 1;
+ gf_update_changed = 1;
+ cpi->refresh_frame.golden_frame = 1;
+ }
+ if (gf_update_changed) {
+ set_baseline_gf_interval(cpi, INTER_FRAME);
+ int refresh_mask = 0;
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ refresh_mask |= rtc_ref->refresh[ref_frame_map_idx]
+ << ref_frame_map_idx;
+ }
+ cm->current_frame.refresh_frame_flags = refresh_mask;
+ }
+ }
+}
+
+/*!\brief Setup the reference prediction structure for 1 pass real-time
+ *
+ * Set the reference prediction structure for 1 layer.
+ * The current structure is to use 3 references (LAST, GOLDEN, ALTREF),
+ * where ALT_REF is always lag_alt frames behind the current frame, and
+ * GOLDEN is either updated on LAST with period baseline_gf_interval
+ * (fixed slot) or always lag_gld frames behind current (gld_fixed_slot = 0,
+ * lag_gld <= 7).
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] gf_update Flag to indicate if GF is updated
+ *
+ * \remark Nothing is returned. Instead the settings for the prediction
+ * structure are set in \c cpi->ext_flags, and the buffer slot index
+ * (for each of 7 references) and refresh flags (for each of the 8 slots)
+ * are set in \c cpi->svc.ref_idx[] and \c cpi->svc.refresh[].
+ */
+void av1_set_rtc_reference_structure_one_layer(AV1_COMP *cpi, int gf_update) {
+ AV1_COMMON *const cm = &cpi->common;
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ RATE_CONTROL *const rc = &cpi->rc;
+ ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+ &ext_flags->refresh_frame;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ unsigned int frame_number = (cpi->oxcf.rc_cfg.drop_frames_water_mark)
+ ? rc->frame_number_encoded
+ : cm->current_frame.frame_number;
+ unsigned int lag_alt = 4;
+ int last_idx = 0;
+ int last_idx_refresh = 0;
+ int gld_idx = 0;
+ int alt_ref_idx = 0;
+ int last2_idx = 0;
+ ext_refresh_frame_flags->update_pending = 1;
+ ext_flags->ref_frame_flags = 0;
+ ext_refresh_frame_flags->last_frame = 1;
+ ext_refresh_frame_flags->golden_frame = 0;
+ ext_refresh_frame_flags->alt_ref_frame = 0;
+ // Decide altref lag adaptively for rt
+ if (cpi->sf.rt_sf.sad_based_adp_altref_lag) {
+ lag_alt = 6;
+ const uint64_t th_frame_sad[4][3] = {
+ { 18000, 18000, 18000 }, // HDRES CPU 9
+ { 25000, 25000, 25000 }, // MIDRES CPU 9
+ { 40000, 30000, 20000 }, // HDRES CPU10
+ { 30000, 25000, 20000 } // MIDRES CPU 10
+ };
+ int th_idx = cpi->sf.rt_sf.sad_based_adp_altref_lag - 1;
+ assert(th_idx < 4);
+ if (rc->avg_source_sad > th_frame_sad[th_idx][0])
+ lag_alt = 3;
+ else if (rc->avg_source_sad > th_frame_sad[th_idx][1])
+ lag_alt = 4;
+ else if (rc->avg_source_sad > th_frame_sad[th_idx][2])
+ lag_alt = 5;
+ }
+ // This defines the reference structure for 1 layer (non-svc) RTC encoding.
+ // To avoid the internal/default reference structure for non-realtime
+ // overwriting this behavior, we use the "svc" ref parameters from the
+ // external control SET_SVC_REF_FRAME_CONFIG.
+ // TODO(marpan): rename that control and the related internal parameters
+ // to rtc_ref.
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) rtc_ref->ref_idx[i] = 7;
+ for (int i = 0; i < REF_FRAMES; ++i) rtc_ref->refresh[i] = 0;
+ // Set the reference frame flags.
+ ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
+ if (!cpi->sf.rt_sf.force_only_last_ref) {
+ ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
+ ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
+ ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG;
+ }
+ const int sh = 6;
+ // Moving index slot for last: 0 - (sh - 1).
+ if (frame_number > 1) last_idx = ((frame_number - 1) % sh);
+ // Moving index for refresh of last: one ahead for next frame.
+ last_idx_refresh = (frame_number % sh);
+ gld_idx = 6;
+
+ // Moving index for alt_ref, lag behind LAST by lag_alt frames.
+ if (frame_number > lag_alt) alt_ref_idx = ((frame_number - lag_alt) % sh);
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+ // Moving index for LAST2, lag behind LAST by 2 frames.
+ if (frame_number > 2) last2_idx = ((frame_number - 2) % sh);
+ }
+ rtc_ref->ref_idx[0] = last_idx; // LAST
+ rtc_ref->ref_idx[1] = last_idx_refresh; // LAST2 (for refresh of last).
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+ rtc_ref->ref_idx[1] = last2_idx; // LAST2
+ rtc_ref->ref_idx[2] = last_idx_refresh; // LAST3 (for refresh of last).
+ }
+ rtc_ref->ref_idx[3] = gld_idx; // GOLDEN
+ rtc_ref->ref_idx[6] = alt_ref_idx; // ALT_REF
+ // Refresh this slot, which will become LAST on next frame.
+ rtc_ref->refresh[last_idx_refresh] = 1;
+ // Update GOLDEN on period for fixed slot case.
+ if (gf_update && cm->current_frame.frame_type != KEY_FRAME) {
+ ext_refresh_frame_flags->golden_frame = 1;
+ rtc_ref->refresh[gld_idx] = 1;
+ }
+ rtc_ref->gld_idx_1layer = gld_idx;
+ // Set the flag to reduce the number of reference frame buffers used.
+ // This assumes that slot 7 is never used.
+ cpi->rt_reduce_num_ref_buffers = 1;
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[0] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[1] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[3] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[6] < 7);
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[2] < 7);
+}
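+
+// Worked example (hypothetical frame_number = 10, sh = 6, lag_alt = 4):
+//   LAST    slot = (10 - 1) % 6 = 3
+//   refresh slot = 10 % 6      = 4  (becomes LAST for frame 11)
+//   ALT_REF slot = (10 - 4) % 6 = 0 (the frame coded 4 frames earlier)
+//   GOLDEN  slot = 6            (fixed)
+// so LAST rotates through slots 0-5 while GOLDEN keeps a fixed slot.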
+
+/*!\brief Check for scene detection, for 1 pass real-time mode.
+ *
+ * Compute average source sad (temporal sad: between current source and
+ * previous source) over a subset of superblocks. Use this to detect big
+ * changes in content and set the \c cpi->rc.high_source_sad flag.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_input Current and last input source frames
+ *
+ * \remark Nothing is returned. Instead the flag \c cpi->rc.high_source_sad
+ * is set if scene change is detected, and \c cpi->rc.avg_source_sad is updated.
+ */
+static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
+ const EncodeFrameInput *frame_input) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ YV12_BUFFER_CONFIG const *const unscaled_src = frame_input->source;
+ YV12_BUFFER_CONFIG const *const unscaled_last_src = frame_input->last_source;
+ uint8_t *src_y;
+ int src_ystride;
+ int src_width;
+ int src_height;
+ uint8_t *last_src_y;
+ int last_src_ystride;
+ int last_src_width;
+ int last_src_height;
+ int width = cm->width;
+ int height = cm->height;
+ if (cpi->svc.number_spatial_layers > 1) {
+ width = cpi->oxcf.frm_dim_cfg.width;
+ height = cpi->oxcf.frm_dim_cfg.height;
+ }
+ if (width != cm->render_width || height != cm->render_height ||
+ unscaled_src == NULL || unscaled_last_src == NULL) {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ }
+ if (unscaled_src == NULL || unscaled_last_src == NULL) return;
+ src_y = unscaled_src->y_buffer;
+ src_ystride = unscaled_src->y_stride;
+ src_width = unscaled_src->y_width;
+ src_height = unscaled_src->y_height;
+ last_src_y = unscaled_last_src->y_buffer;
+ last_src_ystride = unscaled_last_src->y_stride;
+ last_src_width = unscaled_last_src->y_width;
+ last_src_height = unscaled_last_src->y_height;
+ if (src_width != last_src_width || src_height != last_src_height) {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ return;
+ }
+ rc->high_source_sad = 0;
+ rc->percent_blocks_with_motion = 0;
+ rc->max_block_source_sad = 0;
+ rc->prev_avg_source_sad = rc->avg_source_sad;
+ int num_mi_cols = cm->mi_params.mi_cols;
+ int num_mi_rows = cm->mi_params.mi_rows;
+ if (cpi->svc.number_spatial_layers > 1) {
+ num_mi_cols = cpi->svc.mi_cols_full_resoln;
+ num_mi_rows = cpi->svc.mi_rows_full_resoln;
+ }
+ int num_zero_temp_sad = 0;
+ uint32_t min_thresh = 10000;
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+ min_thresh = cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0
+ ? 50000
+ : 100000;
+ }
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+  // Loop over a sub-sample of the frame; compute the average SAD over
+  // 64x64 blocks.
+ uint64_t avg_sad = 0;
+ uint64_t tmp_sad = 0;
+ int num_samples = 0;
+ const int thresh =
+ cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 ? 5 : 6;
+ // SAD is computed on 64x64 blocks
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+  uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / (64 * 64)) ~ 1.5
+ int num_low_var_high_sumdiff = 0;
+ int light_change = 0;
+ // Flag to check light change or not.
+ const int check_light_change = 0;
+  // TODO(marpan): There seems to be some difference along the bottom border
+  // when using the source_last_tl0 for last_source (used for temporal layers
+  // or when the previous frame is dropped).
+  // Remove this border parameter when the issue is resolved: the difference
+  // is that non-zero SAD exists along the bottom border even though the
+  // source is static.
+ const int border =
+ rc->prev_frame_is_dropped || cpi->svc.number_temporal_layers > 1;
+  // Store block-wise SAD for later use.
+ if (width == cm->render_width && height == cm->render_height) {
+ if (cpi->src_sad_blk_64x64 == NULL) {
+ CHECK_MEM_ERROR(cm, cpi->src_sad_blk_64x64,
+ (uint64_t *)aom_calloc(sb_cols * sb_rows,
+ sizeof(*cpi->src_sad_blk_64x64)));
+ }
+ }
+ // Avoid bottom and right border.
+ for (int sbi_row = 0; sbi_row < sb_rows - border; ++sbi_row) {
+ for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+ tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+ last_src_ystride);
+ if (cpi->src_sad_blk_64x64 != NULL)
+ cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad;
+ if (check_light_change) {
+ unsigned int sse, variance;
+ variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+ last_src_ystride, &sse);
+ // Note: sse - variance = ((sum * sum) >> 12)
+ // Detect large lighting change.
+ if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) {
+ num_low_var_high_sumdiff++;
+ }
+ }
+ avg_sad += tmp_sad;
+ num_samples++;
+ if (tmp_sad == 0) num_zero_temp_sad++;
+ if (tmp_sad > rc->max_block_source_sad)
+ rc->max_block_source_sad = tmp_sad;
+
+ src_y += 64;
+ last_src_y += 64;
+ }
+ src_y += (src_ystride << 6) - (sb_cols << 6);
+ last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+ }
+ if (check_light_change && num_samples > 0 &&
+ num_low_var_high_sumdiff > (num_samples >> 1))
+ light_change = 1;
+ if (num_samples > 0) avg_sad = avg_sad / num_samples;
+ // Set high_source_sad flag if we detect very high increase in avg_sad
+ // between current and previous frame value(s). Use minimum threshold
+ // for cases where there is small change from content that is completely
+ // static.
+ if (!light_change &&
+ avg_sad >
+ AOMMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) &&
+ rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
+ num_zero_temp_sad < 3 * (num_samples >> 2))
+ rc->high_source_sad = 1;
+ else
+ rc->high_source_sad = 0;
+ rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2;
+ rc->frame_source_sad = avg_sad;
+ if (num_samples > 0)
+ rc->percent_blocks_with_motion =
+ ((num_samples - num_zero_temp_sad) * 100) / num_samples;
+  // Scene detection is only on the base spatial layer, and uses the
+  // full/original resolution. Pass the state to the upper spatial layers.
+ if (cpi->svc.number_spatial_layers > 1) {
+ SVC *svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ int tl = svc->temporal_layer_id;
+ const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->high_source_sad = rc->high_source_sad;
+ lrc->frame_source_sad = rc->frame_source_sad;
+ lrc->avg_source_sad = rc->avg_source_sad;
+ lrc->percent_blocks_with_motion = rc->percent_blocks_with_motion;
+ lrc->max_block_source_sad = rc->max_block_source_sad;
+ }
+ }
+}
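+
+// Example (hypothetical values): with a running rc->avg_source_sad = 30000,
+// thresh = 6 and min_thresh = 100000, high_source_sad is raised only if the
+// new frame-average SAD exceeds max(100000, 6 * 30000) = 180000, we are far
+// enough past the last key frame, and fewer than 3/4 of the sampled blocks
+// have zero temporal SAD.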
+
+/*!\brief Set the GF baseline interval for 1 pass real-time mode.
+ *
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_type frame type
+ *
+ * \return Return GF update flag, and update the \c cpi->rc with
+ * the next GF interval settings.
+ */
+static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi,
+ FRAME_TYPE frame_type) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int gf_update = 0;
+ const int resize_pending = is_frame_resize_pending(cpi);
+  // GF update based on frames_till_gf_update_due; also force an update on a
+  // resize-pending frame or on scene change.
+ if ((resize_pending || rc->high_source_sad ||
+ rc->frames_till_gf_update_due == 0) &&
+ cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
+ set_baseline_gf_interval(cpi, frame_type);
+ gf_update = 1;
+ }
+ return gf_update;
+}
+
+static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
+ int prev_width, int prev_height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ SVC *const svc = &cpi->svc;
+ int target_bits_per_frame;
+ int active_worst_quality;
+ int qindex;
+ double tot_scale_change = (double)(resize_width * resize_height) /
+ (double)(prev_width * prev_height);
+ // Disable the skip mv search for svc on resize frame.
+ svc->skip_mvsearch_last = 0;
+ svc->skip_mvsearch_gf = 0;
+ svc->skip_mvsearch_altref = 0;
+ // Reset buffer level to optimal, update target size.
+ p_rc->buffer_level = p_rc->optimal_buffer_level;
+ p_rc->bits_off_target = p_rc->optimal_buffer_level;
+ rc->this_frame_target =
+ av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME);
+ target_bits_per_frame = rc->this_frame_target;
+ if (tot_scale_change > 4.0)
+ p_rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+ else if (tot_scale_change > 1.0)
+ p_rc->avg_frame_qindex[INTER_FRAME] =
+ (p_rc->avg_frame_qindex[INTER_FRAME] + rc->worst_quality) >> 1;
+ active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+ qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
+ active_worst_quality, resize_width, resize_height);
+ // If resize is down, check if projected q index is close to worst_quality,
+ // and if so, reduce the rate correction factor (since likely can afford
+ // lower q for resized frame).
+ if (tot_scale_change < 1.0 && qindex > 90 * rc->worst_quality / 100)
+ p_rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+  // If resize is back up: check if the projected q index is too far above the
+  // previous index, and if so, reduce the rate correction factor (since we
+  // prefer to keep q for the resized frame at least close to the previous q).
+  // Also check if the projected qindex is close to the previous qindex; if
+  // so, increase the correction factor (to push qindex higher and avoid
+  // overshoot).
+ if (tot_scale_change >= 1.0) {
+ if (tot_scale_change < 4.0 &&
+ qindex > 130 * p_rc->last_q[INTER_FRAME] / 100)
+ p_rc->rate_correction_factors[INTER_NORMAL] *= 0.8;
+ if (qindex <= 120 * p_rc->last_q[INTER_FRAME] / 100)
+ p_rc->rate_correction_factors[INTER_NORMAL] *= 1.5;
+ }
+ if (svc->number_temporal_layers > 1) {
+ // Apply the same rate control reset to all temporal layers.
+ for (int tl = 0; tl < svc->number_temporal_layers; tl++) {
+ LAYER_CONTEXT *lc = NULL;
+ lc = &svc->layer_context[svc->spatial_layer_id *
+ svc->number_temporal_layers +
+ tl];
+ lc->rc.resize_state = rc->resize_state;
+ lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level;
+ lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level;
+ lc->p_rc.rate_correction_factors[INTER_NORMAL] =
+ p_rc->rate_correction_factors[INTER_NORMAL];
+ lc->p_rc.avg_frame_qindex[INTER_FRAME] =
+ p_rc->avg_frame_qindex[INTER_FRAME];
+ }
+ }
+}
+
+/*!\brief Check for resize based on Q, for 1 pass real-time mode.
+ *
+ * Check if we should resize, based on the average QP over a window of past
+ * frames. Only allow resizing down to at most 1/2 scale for now. The scaling
+ * factor for each step may be 3/4 or 1/2.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Return resized width/height in \c cpi->resize_pending_params,
+ * and update some resize counters in \c rc.
+ */
+static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ RESIZE_ACTION resize_action = NO_RESIZE;
+ const int avg_qp_thr1 = 70;
+ const int avg_qp_thr2 = 50;
+  // Don't allow the resized frame to go below 160x90; resize in steps of 3/4.
+ const int min_width = (160 * 4) / 3;
+ const int min_height = (90 * 4) / 3;
+ int down_size_on = 1;
+ // Don't resize on key frame; reset the counters on key frame.
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ rc->resize_avg_qp = 0;
+ rc->resize_count = 0;
+ rc->resize_buffer_underflow = 0;
+ return;
+ }
+ // No resizing down if frame size is below some limit.
+ if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0;
+
+ // Resize based on average buffer underflow and QP over some window.
+ // Ignore samples close to key frame, since QP is usually high after key.
+ if (cpi->rc.frames_since_key > cpi->framerate) {
+ const int window = AOMMIN(30, (int)(2 * cpi->framerate));
+ rc->resize_avg_qp += p_rc->last_q[INTER_FRAME];
+ if (cpi->ppi->p_rc.buffer_level <
+ (int)(30 * p_rc->optimal_buffer_level / 100))
+ ++rc->resize_buffer_underflow;
+ ++rc->resize_count;
+ // Check for resize action every "window" frames.
+ if (rc->resize_count >= window) {
+ int avg_qp = rc->resize_avg_qp / rc->resize_count;
+      // Resize down if the buffer level has underflowed a sufficient amount
+      // in the past window, and we are at the original or 3/4 of the original
+      // resolution. Resize back up if the average QP is low, and we are
+      // currently in a resized-down state, i.e. 1/2 or 3/4 of the original
+      // resolution. Currently, use a flag to turn the 3/4 resizing feature
+      // on/off.
+ if (rc->resize_buffer_underflow > (rc->resize_count >> 2) &&
+ down_size_on) {
+ if (rc->resize_state == THREE_QUARTER) {
+ resize_action = DOWN_ONEHALF;
+ rc->resize_state = ONE_HALF;
+ } else if (rc->resize_state == ORIG) {
+ resize_action = DOWN_THREEFOUR;
+ rc->resize_state = THREE_QUARTER;
+ }
+ } else if (rc->resize_state != ORIG &&
+ avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) {
+ if (rc->resize_state == THREE_QUARTER ||
+ avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100) {
+ resize_action = UP_ORIG;
+ rc->resize_state = ORIG;
+ } else if (rc->resize_state == ONE_HALF) {
+ resize_action = UP_THREEFOUR;
+ rc->resize_state = THREE_QUARTER;
+ }
+ }
+ // Reset for next window measurement.
+ rc->resize_avg_qp = 0;
+ rc->resize_count = 0;
+ rc->resize_buffer_underflow = 0;
+ }
+ }
+  // If the decision is to resize, reset some quantities, and check if we
+  // should reduce the rate correction factor.
+ if (resize_action != NO_RESIZE) {
+ int resize_width = cpi->oxcf.frm_dim_cfg.width;
+ int resize_height = cpi->oxcf.frm_dim_cfg.height;
+ int resize_scale_num = 1;
+ int resize_scale_den = 1;
+ if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) {
+ resize_scale_num = 3;
+ resize_scale_den = 4;
+ } else if (resize_action == DOWN_ONEHALF) {
+ resize_scale_num = 1;
+ resize_scale_den = 2;
+ }
+ resize_width = resize_width * resize_scale_num / resize_scale_den;
+ resize_height = resize_height * resize_scale_num / resize_scale_den;
+ resize_reset_rc(cpi, resize_width, resize_height, cm->width, cm->height);
+ }
+ return;
+}
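+
+// The resize decision above acts as a small state machine (sketch):
+//   down (buffer underflow): ORIG -> THREE_QUARTER -> ONE_HALF
+//   up   (low average QP):   THREE_QUARTER -> ORIG      (avg_qp < thr1)
+//                            ONE_HALF -> THREE_QUARTER  (avg_qp < thr1)
+//                            ONE_HALF -> ORIG           (avg_qp < thr2)
+// where thr1/thr2 are avg_qp_thr1/avg_qp_thr2 percent of worst_quality.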
+
+static INLINE int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ AV1_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+
+ // Very first frame has to be key frame.
+ if (cm->current_frame.frame_number == 0) return 1;
+ // Set key frame if forced by frame flags.
+ if (frame_flags & FRAMEFLAGS_KEY) return 1;
+ if (!cpi->ppi->use_svc) {
+ // Non-SVC
+ if (cpi->oxcf.kf_cfg.auto_key && rc->frames_to_key == 0) return 1;
+ } else {
+ // SVC
+ if (svc->spatial_layer_id == 0 &&
+ (cpi->oxcf.kf_cfg.auto_key &&
+ (cpi->oxcf.kf_cfg.key_freq_max == 0 ||
+ svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0)))
+ return 1;
+ }
+
+ return 0;
+}
+
+// Set to true if this frame is a recovery frame, for 1 layer RPS,
+// and whether we should apply some boost (QP, adjust speed features, etc).
+// Recovery frame here means frame whose closest reference suddenly
+// switched from previous frame to one much further away.
+// TODO(marpan): Consider adding on/off flag to SVC_REF_FRAME_CONFIG to
+// allow more control for applications.
+static bool set_flag_rps_bias_recovery_frame(const AV1_COMP *const cpi) {
+ if (cpi->ppi->rtc_ref.set_ref_frame_config &&
+ cpi->svc.number_temporal_layers == 1 &&
+ cpi->svc.number_spatial_layers == 1 &&
+ cpi->ppi->rtc_ref.reference_was_previous_frame) {
+ int min_dist = av1_svc_get_min_ref_dist(cpi);
+ // Only consider boost for this frame if its closest reference is further
+ // than x frames away, using x = 4 for now.
+ if (min_dist != INT_MAX && min_dist > 4) return true;
+ }
+ return false;
+}
+
+void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type,
+ const EncodeFrameInput *frame_input,
+ unsigned int frame_flags) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ SVC *const svc = &cpi->svc;
+ ResizePendingParams *const resize_pending_params =
+ &cpi->resize_pending_params;
+ int target;
+ const int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ if (cpi->ppi->use_svc) {
+ av1_update_temporal_layer_framerate(cpi);
+ av1_restore_layer_context(cpi);
+ }
+ cpi->ppi->rtc_ref.bias_recovery_frame = set_flag_rps_bias_recovery_frame(cpi);
+ // Set frame type.
+ if (set_key_frame(cpi, frame_flags)) {
+ *frame_type = KEY_FRAME;
+ p_rc->this_key_frame_forced =
+ cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
+ rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max;
+ p_rc->kf_boost = DEFAULT_KF_BOOST_RT;
+ gf_group->update_type[cpi->gf_frame_index] = KF_UPDATE;
+ gf_group->frame_type[cpi->gf_frame_index] = KEY_FRAME;
+ gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_RESET;
+ if (cpi->ppi->use_svc) {
+ if (cm->current_frame.frame_number > 0)
+ av1_svc_reset_temporal_layers(cpi, 1);
+ svc->layer_context[layer].is_key_frame = 1;
+ }
+ rc->frame_number_encoded = 0;
+ cpi->ppi->rtc_ref.non_reference_frame = 0;
+ } else {
+ *frame_type = INTER_FRAME;
+ gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE;
+ gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE;
+ if (cpi->ppi->use_svc) {
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ lc->is_key_frame =
+ svc->spatial_layer_id == 0
+ ? 0
+ : svc->layer_context[svc->temporal_layer_id].is_key_frame;
+ // If the user is setting the reference structure with
+ // set_ref_frame_config and did not set any references, set the
+ // frame type to Intra-only.
+ if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+ int no_references_set = 1;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (cpi->ppi->rtc_ref.reference[i]) {
+ no_references_set = 0;
+ break;
+ }
+ }
+ // Set to intra_only_frame if no references are set.
+ // The stream can start decoding on INTRA_ONLY_FRAME so long as the
+ // layer with the intra_only_frame doesn't signal a reference to a slot
+ // that hasn't been set yet.
+ if (no_references_set) *frame_type = INTRA_ONLY_FRAME;
+ }
+ }
+ }
+ // Check for scene change: for SVC check on base spatial layer only.
+ if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0) {
+ if (rc->prev_coded_width == cm->width &&
+ rc->prev_coded_height == cm->height) {
+ rc_scene_detection_onepass_rt(cpi, frame_input);
+ } else {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ }
+ }
+ // Check for dynamic resize, for single spatial layer for now.
+ // For temporal layers only check on base temporal layer.
+ if (cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC) {
+ if (svc->number_spatial_layers == 1 && svc->temporal_layer_id == 0)
+ dynamic_resize_one_pass_cbr(cpi);
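+    // Note: the pending dimensions below round to the nearest integer:
+    // e.g., a 1920x1080 source maps to 1440x810 at THREE_QUARTER and to
+    // 960x540 at ONE_HALF.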
+ if (rc->resize_state == THREE_QUARTER) {
+ resize_pending_params->width = (3 + cpi->oxcf.frm_dim_cfg.width * 3) >> 2;
+ resize_pending_params->height =
+ (3 + cpi->oxcf.frm_dim_cfg.height * 3) >> 2;
+ } else if (rc->resize_state == ONE_HALF) {
+ resize_pending_params->width = (1 + cpi->oxcf.frm_dim_cfg.width) >> 1;
+ resize_pending_params->height = (1 + cpi->oxcf.frm_dim_cfg.height) >> 1;
+ } else {
+ resize_pending_params->width = cpi->oxcf.frm_dim_cfg.width;
+ resize_pending_params->height = cpi->oxcf.frm_dim_cfg.height;
+ }
+ } else if (is_frame_resize_pending(cpi)) {
+ resize_reset_rc(cpi, resize_pending_params->width,
+ resize_pending_params->height, cm->width, cm->height);
+ }
+ // Set the GF interval and update flag.
+ if (!rc->rtc_external_ratectrl)
+ set_gf_interval_update_onepass_rt(cpi, *frame_type);
+ // Set target size.
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+ if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) {
+ target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_cbr(
+ cpi, gf_group->update_type[cpi->gf_frame_index]);
+ }
+ } else {
+ if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) {
+ target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_vbr(
+ cpi, gf_group->update_type[cpi->gf_frame_index]);
+ }
+ }
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q)
+ rc->active_worst_quality = cpi->oxcf.rc_cfg.cq_level;
+
+ av1_rc_set_frame_target(cpi, target, cm->width, cm->height);
+ rc->base_frame_target = target;
+ cm->current_frame.frame_type = *frame_type;
+  // For fixed-mode SVC: if KSVC is enabled, remove inter-layer
+  // prediction on spatial enhancement layer frames whose base layer
+  // frame is not a key frame.
+ if (cpi->ppi->use_svc && !svc->use_flexible_mode && svc->ksvc_fixed_mode &&
+ svc->number_spatial_layers > 1 &&
+ !svc->layer_context[layer].is_key_frame) {
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+ }
+}
+
+int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) {
+ AV1_COMMON *const cm = &cpi->common;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ double rate_correction_factor =
+ cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL];
+ const int target_size = cpi->rc.avg_frame_bandwidth;
+ double new_correction_factor;
+ int target_bits_per_mb;
+ double q2;
+ int enumerator;
+ int is_screen_content = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+ *q = (3 * cpi->rc.worst_quality + *q) >> 2;
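+  // This moves q three quarters of the way toward worst_quality.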
+ // For screen content use the max-q set by the user to allow for less
+ // overshoot on slide changes.
+ if (is_screen_content) *q = cpi->rc.worst_quality;
+ cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
+ // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
+ // these parameters will affect QP selection for subsequent frames. If they
+ // have settled down to a very different (low QP) state, then not adjusting
+ // them may cause next frame to select low QP and overshoot again.
+ p_rc->avg_frame_qindex[INTER_FRAME] = *q;
+ p_rc->buffer_level = p_rc->optimal_buffer_level;
+ p_rc->bits_off_target = p_rc->optimal_buffer_level;
+ // Reset rate under/over-shoot flags.
+ cpi->rc.rc_1_frame = 0;
+ cpi->rc.rc_2_frame = 0;
+ // Adjust rate correction factor.
+ target_bits_per_mb =
+ (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs);
+ // Reset rate correction factor: for now base it on target_bits_per_mb
+ // and qp (==max_QP). This comes from the inverse computation of
+ // av1_rc_bits_per_mb().
+ q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth);
+ enumerator = av1_get_bpmb_enumerator(INTER_NORMAL, is_screen_content);
+ new_correction_factor = (double)target_bits_per_mb * q2 / enumerator;
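+  // Sketch of the inverse relation assumed here: av1_rc_bits_per_mb()
+  // computes roughly bits_per_mb = enumerator * correction_factor / q, so
+  // the factor that would have hit the target is
+  // target_bits_per_mb * q / enumerator, as computed above.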
+ if (new_correction_factor > rate_correction_factor) {
+ rate_correction_factor =
+ (new_correction_factor + rate_correction_factor) / 2.0;
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL] =
+ rate_correction_factor;
+ }
+ // For temporal layers: reset the rate control parameters across all
+ // temporal layers.
+ if (cpi->svc.number_temporal_layers > 1) {
+ SVC *svc = &cpi->svc;
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int sl = svc->spatial_layer_id;
+ const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+ lp_rc->avg_frame_qindex[INTER_FRAME] = *q;
+ lp_rc->buffer_level = lp_rc->optimal_buffer_level;
+ lp_rc->bits_off_target = lp_rc->optimal_buffer_level;
+ lrc->rc_1_frame = 0;
+ lrc->rc_2_frame = 0;
+ lp_rc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+ }
+ }
+ return 1;
+}
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
new file mode 100644
index 0000000000..6802ad42d0
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -0,0 +1,864 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RATECTRL_H_
+#define AOM_AV1_ENCODER_RATECTRL_H_
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS 9
+
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS 1
+
+// Threshold used to define if a KF group is static (e.g. a slide show).
+// Essentially, this means that no frame in the group has more than 1% of MBs
+// that are not marked as coded with 0,0 motion in the first pass.
+#define STATIC_KF_GROUP_THRESH 99
+#define STATIC_KF_GROUP_FLOAT_THRESH 0.99
+
+// The maximum duration of a GF group that is static (e.g. a slide show).
+#define MAX_STATIC_GF_GROUP_LENGTH 250
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 32
+#define FIXED_GF_INTERVAL 16
+#define MAX_GF_LENGTH_LAP 16
+
+#define FIXED_GF_INTERVAL_RT 80
+#define MAX_GF_INTERVAL_RT 160
+
+#define MAX_NUM_GF_INTERVALS 15
+
+#define MAX_ARF_LAYERS 6
+// #define STRICT_RC
+
+#define DEFAULT_KF_BOOST_RT 2300
+#define DEFAULT_GF_BOOST_RT 2000
+
+// A passive rate control strategy for screen content type in real-time mode.
+// When it is turned on, the compression performance is improved by
+// 7.8% (overall_psnr), 5.0% (VMAF) on average. Some clips see gains of
+// over 20% on these metrics.
+// The downside is that it does not guarantee frame size.
+// Since RT mode has a tight restriction on buffer overflow control, we
+// turn it off by default.
+#define RT_PASSIVE_STRATEGY 0
+#define MAX_Q_HISTORY 1000
+
+typedef struct {
+ int resize_width;
+ int resize_height;
+ uint8_t superres_denom;
+} size_params_type;
+
+enum {
+ INTER_NORMAL,
+ GF_ARF_LOW,
+ GF_ARF_STD,
+ KF_STD,
+ RATE_FACTOR_LEVELS
+} UENUM1BYTE(RATE_FACTOR_LEVEL);
+
+enum {
+ KF_UPDATE,
+ LF_UPDATE,
+ GF_UPDATE,
+ ARF_UPDATE,
+ OVERLAY_UPDATE,
+ INTNL_OVERLAY_UPDATE, // Internal Overlay Frame
+ INTNL_ARF_UPDATE, // Internal Altref Frame
+ FRAME_UPDATE_TYPES
+} UENUM1BYTE(FRAME_UPDATE_TYPE);
+
+enum {
+ REFBUF_RESET, // Clear reference frame buffer
+ REFBUF_UPDATE, // Refresh reference frame buffer
+ REFBUF_STATES
+} UENUM1BYTE(REFBUF_STATE);
+
+typedef enum {
+ NO_RESIZE = 0,
+ DOWN_THREEFOUR = 1, // From orig to 3/4.
+ DOWN_ONEHALF = 2, // From orig or 3/4 to 1/2.
+ UP_THREEFOUR = -1, // From 1/2 to 3/4.
+ UP_ORIG = -2, // From 1/2 or 3/4 to orig.
+} RESIZE_ACTION;
+
+typedef enum { ORIG = 0, THREE_QUARTER = 1, ONE_HALF = 2 } RESIZE_STATE;
+
+#define MAX_FIRSTPASS_ANALYSIS_FRAMES 150
+typedef enum region_types {
+ STABLE_REGION = 0,
+ HIGH_VAR_REGION = 1,
+ SCENECUT_REGION = 2,
+ BLENDING_REGION = 3,
+} REGION_TYPES;
+
+typedef struct regions {
+ int start;
+ int last;
+ double avg_noise_var;
+ double avg_cor_coeff;
+ double avg_sr_fr_ratio;
+ double avg_intra_err;
+ double avg_coded_err;
+ REGION_TYPES type;
+} REGIONS;
+
+/*!\endcond */
+/*!
+ * \brief Rate Control parameters and status
+ */
+typedef struct {
+  // Rate targeting variables
+
+ /*!
+ * Baseline target rate for frame before adjustment for previous under or
+ * over shoot.
+ */
+ int base_frame_target;
+ /*!
+ * Target rate for frame after adjustment for previous under or over shoot.
+ */
+ int this_frame_target; // Actual frame target after rc adjustment.
+
+ /*!
+ * Projected size for current frame
+ */
+ int projected_frame_size;
+
+ /*!
+   * Bit size of transform coefficients for the current frame.
+ */
+ int coefficient_size;
+
+ /*!
+ * Super block rate target used with some adaptive quantization strategies.
+ */
+ int sb64_target_rate;
+
+ /*!
+ * Number of frames since the last ARF / GF.
+ */
+ int frames_since_golden;
+
+ /*!
+ * Number of frames till the next ARF / GF is due.
+ */
+ int frames_till_gf_update_due;
+
+ /*!
+ * Number of determined gf groups left
+ */
+ int intervals_till_gf_calculate_due;
+
+ /*!\cond */
+ int min_gf_interval;
+ int max_gf_interval;
+ int static_scene_max_gf_interval;
+ /*!\endcond */
+ /*!
+ * Frames before the next key frame
+ */
+ int frames_to_key;
+ /*!\cond */
+ int frames_since_key;
+ int frames_to_fwd_kf;
+ int is_src_frame_alt_ref;
+ int sframe_due;
+
+ int high_source_sad;
+ uint64_t avg_source_sad;
+ uint64_t prev_avg_source_sad;
+ uint64_t frame_source_sad;
+
+ int avg_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
+ int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
+ int prev_avg_frame_bandwidth;
+
+ int ni_av_qi;
+ int ni_tot_qi;
+
+ int decimation_factor;
+ int decimation_count;
+ int prev_frame_is_dropped;
+ int drop_count_consec;
+ int max_consec_drop;
+
+ /*!
+ * Frame number for encoded frames (non-dropped).
+ * Use for setting the rtc reference structure.
+ */
+ unsigned int frame_number_encoded;
+
+ /*!\endcond */
+ /*!
+ * User specified maximum Q allowed for current frame
+ */
+ int worst_quality;
+ /*!
+ * User specified minimum Q allowed for current frame
+ */
+ int best_quality;
+
+ /*!\cond */
+
+ // rate control history for last frame(1) and the frame before(2).
+ // -1: overshoot
+ // 1: undershoot
+ // 0: not initialized.
+ int rc_1_frame;
+ int rc_2_frame;
+ int q_1_frame;
+ int q_2_frame;
+
+ /*!\endcond */
+ /*!
+ * Proposed maximum allowed Q for current frame
+ */
+ int active_worst_quality;
+
+ /*!\cond */
+ // Track amount of low motion in scene
+ int avg_frame_low_motion;
+ int cnt_zeromv;
+
+ // signals if number of blocks with motion is high
+ int percent_blocks_with_motion;
+
+ // Maximum value of source sad across all blocks of frame.
+ uint64_t max_block_source_sad;
+
+ // For dynamic resize, 1 pass cbr.
+ RESIZE_STATE resize_state;
+ int resize_avg_qp;
+ int resize_buffer_underflow;
+ int resize_count;
+
+ // Flag to disable content related qp adjustment.
+ int rtc_external_ratectrl;
+
+ // Stores fast_extra_bits of the current frame.
+ int frame_level_fast_extra_bits;
+
+ double frame_level_rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ int frame_num_last_gf_refresh;
+
+ int prev_coded_width;
+ int prev_coded_height;
+
+ // The ratio used for inter frames in bit estimation.
+ // TODO(yunqing): if golden frame is treated differently (e.g. gf_cbr_boost_
+  // pct > THR), consider adding bit_est_ratio_g for golden frames.
+ int bit_est_ratio;
+
+ // Whether to use a fixed qp for the frame, bypassing internal rate control.
+ // This flag will reset to 0 after every frame.
+ int use_external_qp_one_pass;
+ /*!\endcond */
+} RATE_CONTROL;
+
+/*!
+ * \brief Primary Rate Control parameters and status
+ */
+typedef struct {
+  // Sub-GOP level rate targeting variables
+
+ /*!
+ * Target bit budget for the current GF / ARF group of frame.
+ */
+ int64_t gf_group_bits;
+
+ /*!
+ * Boost factor used to calculate the extra bits allocated to the key frame
+ */
+ int kf_boost;
+
+ /*!
+ * Boost factor used to calculate the extra bits allocated to ARFs and GFs
+ */
+ int gfu_boost;
+
+ /*!
+ * Stores the determined gf group lengths for a set of gf groups
+ */
+ int gf_intervals[MAX_NUM_GF_INTERVALS];
+
+ /*!
+ * The current group's index into gf_intervals[]
+ */
+ int cur_gf_index;
+
+ /*!\cond */
+ int num_regions;
+
+ REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
+ int regions_offset; // offset of regions from the last keyframe
+ int frames_till_regions_update;
+
+ int baseline_gf_interval;
+
+ int constrained_gf_group;
+
+ int this_key_frame_forced;
+
+ int next_key_frame_forced;
+ /*!\endcond */
+
+ /*!
+   * Initial buffer level in ms for CBR / low delay encoding
+ */
+ int64_t starting_buffer_level;
+
+ /*!
+   * Optimum / target buffer level in ms for CBR / low delay encoding
+ */
+ int64_t optimal_buffer_level;
+
+ /*!
+   * Maximum target buffer level in ms for CBR / low delay encoding
+ */
+ int64_t maximum_buffer_size;
+
+ /*!
+ * Q index used for ALT frame
+ */
+ int arf_q;
+
+ /*!\cond */
+ float_t arf_boost_factor;
+
+ int base_layer_qp;
+
+ // Total number of stats used only for kf_boost calculation.
+ int num_stats_used_for_kf_boost;
+
+ // Total number of stats used only for gfu_boost calculation.
+ int num_stats_used_for_gfu_boost;
+
+ // Total number of stats required by gfu_boost calculation.
+ int num_stats_required_for_gfu_boost;
+
+ int enable_scenecut_detection;
+
+ int use_arf_in_this_kf_group;
+
+ int ni_frames;
+
+ double tot_q;
+ /*!\endcond */
+
+ /*!
+ * Q used for last boosted (non leaf) frame
+ */
+ int last_kf_qindex;
+
+ /*!
+ * Average of q index of previous encoded frames in a sequence.
+ */
+ int avg_frame_qindex[FRAME_TYPES];
+
+#if CONFIG_FPMT_TEST
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * active_best_quality.
+ */
+ int temp_active_best_quality[MAX_ARF_LAYERS + 1];
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * last_boosted_qindex.
+ */
+ int temp_last_boosted_qindex;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * avg_q.
+ */
+ double temp_avg_q;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * last_q.
+ */
+ int temp_last_q[FRAME_TYPES];
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * projected_frame_size.
+ */
+ int temp_projected_frame_size;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * total_actual_bits.
+ */
+ int64_t temp_total_actual_bits;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * buffer_level.
+ */
+ int64_t temp_buffer_level;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * vbr_bits_off_target.
+ */
+ int64_t temp_vbr_bits_off_target;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * vbr_bits_off_target_fast.
+ */
+ int64_t temp_vbr_bits_off_target_fast;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * rate_correction_factors.
+ */
+ double temp_rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * rate_error_estimate.
+ */
+ int temp_rate_error_estimate;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * rolling_arf_group_target_bits.
+ */
+ int temp_rolling_arf_group_target_bits;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+   * rolling_arf_group_actual_bits.
+ */
+ int temp_rolling_arf_group_actual_bits;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+   * bits_left.
+ */
+ int64_t temp_bits_left;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * extend_minq.
+ */
+ int temp_extend_minq;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * extend_maxq.
+ */
+ int temp_extend_maxq;
+
+#endif
+ /*!
+   * Proposed minimum allowed Q for different layers in a coding pyramid
+ */
+ int active_best_quality[MAX_ARF_LAYERS + 1];
+
+ /*!
+ * Q used for last boosted (non leaf) frame (GF/KF/ARF)
+ */
+ int last_boosted_qindex;
+
+ /*!
+ * Average Q value of previous inter frames
+ */
+ double avg_q;
+
+ /*!
+ * Q used on last encoded frame of the given type.
+ */
+ int last_q[FRAME_TYPES];
+
+ /*!
+ * Correction factors used to adjust the q estimate for a given target rate
+ * in the encode loop.
+ */
+ double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ /*!
+ * Current total consumed bits.
+ */
+ int64_t total_actual_bits;
+
+ /*!
+ * Current total target bits.
+ */
+ int64_t total_target_bits;
+
+ /*!
+ * Current buffer level.
+ */
+ int64_t buffer_level;
+
+ /*!
+   * Rate control error expressed as a percentage (PCT).
+ */
+ int rate_error_estimate;
+
+ /*!
+ * Error bits available from previously encoded frames.
+ */
+ int64_t vbr_bits_off_target;
+
+ /*!
+ * Error bits available from previously encoded frames undershoot.
+ */
+ int64_t vbr_bits_off_target_fast;
+
+ /*!
+ * Total bits deviated from the average frame target, from previously
+ * encoded frames.
+ */
+ int64_t bits_off_target;
+
+ /*!
+ * Rolling monitor target bits updated based on current frame target size.
+ */
+ int rolling_target_bits;
+
+ /*!
+ * Rolling monitor actual bits updated based on current frame final projected
+ * size.
+ */
+ int rolling_actual_bits;
+
+ /*!
+ * The history of qindex for each frame.
+ * Only used when RT_PASSIVE_STRATEGY = 1.
+ */
+ int q_history[MAX_Q_HISTORY];
+} PRIMARY_RATE_CONTROL;
+
+/*!\cond */
+
+struct AV1_COMP;
+struct AV1EncoderConfig;
+struct GF_GROUP;
+
+void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf,
+ PRIMARY_RATE_CONTROL *p_rc);
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, RATE_CONTROL *rc);
+
+int av1_estimate_bits_at_q(const struct AV1_COMP *cpi, int q,
+ double correction_factor);
+
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
+
+void av1_rc_init_minq_luts(void);
+
+int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
+
+// At a high level, the following flow is expected to be enforced for
+// rate control:
+// First call per frame, one of:
+// av1_get_one_pass_rt_params()
+// av1_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by:
+// av1_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the av1_get_..._params() functions and
+// updated during the av1_rc_postencode_update...() functions.
+// The only exceptions are av1_rc_drop_frame() and
+// av1_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+struct EncodeFrameInput;
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
+
+/*!\endcond */
+/*!\brief Updates the rate correction factor linking Q to output bits
+ *
+ * This function updates the Q rate correction factor after an encode
+ * cycle depending on whether we overshot or undershot the target rate.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] is_encode_stage Indicates if recode loop or post-encode
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \remark Updates the relevant rate correction factor in cpi->rc
+ */
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi,
+ int is_encode_stage, int width,
+ int height);
+/*!\cond */
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int av1_rc_drop_frame(struct AV1_COMP *cpi);
+
+// Computes frame size bounds.
+void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
+ int this_frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+
+/*!\endcond */
+
+/*!\brief Picks q and q bounds given the rate control parameters in \c cpi->rc.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[in] gf_index Index of this frame in the golden frame group
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ * Also, updates \c rc->arf_q.
+ */
+int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
+ int gf_index, int *bottom_index, int *top_index);
+
+/*!\brief Estimates q to achieve a target bits per frame
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] target_bits_per_frame Frame rate target
+ * \param[in] active_worst_quality Max Q allowed
+ * \param[in] active_best_quality Min Q allowed
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \return Returns a q index value
+ */
+int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality,
+ int width, int height);
+
+/*!\cond */
+// Gets the appropriate bpmb enumerator based on the frame and content type
+int av1_get_bpmb_enumerator(FRAME_TYPE frame_type,
+ const int is_screen_content_type);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int av1_rc_bits_per_mb(const struct AV1_COMP *cpi, FRAME_TYPE frame_type,
+ int qindex, double correction_factor,
+ int accurate_estimate);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
+ int64_t target);
+int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
+ int target, uint8_t frame_update_type);
+
+// Find q_index corresponding to desired_q, within [best_qindex, worst_qindex].
+// To be precise, 'q_index' is the smallest integer for which the corresponding
+// q >= desired_q.
+// If no such q index is found, returns 'worst_qindex'.
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+ int best_qindex, int worst_qindex);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int av1_compute_qdelta_by_rate(const struct AV1_COMP *cpi,
+ FRAME_TYPE frame_type, int qindex,
+ double rate_target_ratio);
+
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
+
+void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
+
+void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
+ RATE_CONTROL *const rc);
+
+void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
+
+int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
+
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
+ int height);
+
+void av1_adjust_gf_refresh_qp_one_pass_rt(struct AV1_COMP *cpi);
+
+void av1_set_rtc_reference_structure_one_layer(struct AV1_COMP *cpi,
+ int gf_update);
+
+/*!\endcond */
+/*!\brief Calculates how many bits to use for a P frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_update_type Type of frame
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_pframe_target_size_one_pass_vbr(
+ const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+/*!\brief Calculates how many bits to use for an i frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi);
+
+/*!\brief Calculates how many bits to use for a P frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_update_type Type of frame
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_pframe_target_size_one_pass_cbr(
+ const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+/*!\brief Calculates how many bits to use for an i frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi);
+
+/*!\brief Setup the rate control parameters for 1 pass real-time mode.
+ *
+ * - Sets the frame type and target frame size.
+ * - Sets the GF update.
+ * - Checks for scene change.
+ * - Sets the reference prediction structure for 1 layer (non-SVC).
+ * - Resets and updates are done for SVC.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_type Encoder frame type
+ * \param[in] frame_input Current and last input source frames
+ * \param[in] frame_flags Encoder frame flags
+ *
+ * \remark Nothing is returned. Instead the settings computed in this
+ * function are set in: \c frame_params, \c cpi->common, \c cpi->rc,
+ * \c cpi->svc.
+ */
+void av1_get_one_pass_rt_params(struct AV1_COMP *cpi,
+ FRAME_TYPE *const frame_type,
+ const struct EncodeFrameInput *frame_input,
+ unsigned int frame_flags);
+
+/*!\brief Increase q on expected encoder overshoot, for CBR mode.
+ *
+ * Handles the case when encoder is expected to create a large frame:
+ * - q is increased to value closer to \c cpi->rc.worst_quality
+ * - avg_frame_qindex is reset
+ * - buffer levels are reset
+ * - rate correction factor is adjusted
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] q Current q index
+ *
+ * \return q is returned, and updates are done to \c cpi->rc.
+ */
+int av1_encodedframe_overshoot_cbr(struct AV1_COMP *cpi, int *q);
+
+/*!\brief Compute the q_indices for a single frame.
+ *
+ * Intended to be used with AOM_Q mode.
+ *
+ * \param[in] base_q_index Base q index
+ * \param[in] gf_update_type GOP update type
+ * \param[in] gf_pyramid_level GOP level of the current frame
+ * \param[in] arf_q ARF q_index
+ *
+ * \return Returns the q_index for the current frame.
+ */
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+ int gf_pyramid_level, int arf_q);
+
+/*!\brief Compute the q_indices for the ARF of a GOP.
+ *
+ * \param[in] base_q_index Base q index
+ * \param[in] gfu_boost GFU boost
+ * \param[in] bit_depth Bit depth
+ * \param[in] arf_boost_factor ARF boost factor
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+ double arf_boost_factor);
+
+#if !CONFIG_REALTIME_ONLY
+struct TplDepFrame;
+/*!\brief Compute the q_indices for the ARF of a GOP in Q mode.
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] tpl_frame Tpl Frame stats
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index_q_mode(struct AV1_COMP *cpi,
+ struct TplDepFrame *tpl_frame);
+#endif
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RATECTRL_H_
diff --git a/third_party/aom/av1/encoder/rc_utils.h b/third_party/aom/av1/encoder/rc_utils.h
new file mode 100644
index 0000000000..fe22ee5afb
--- /dev/null
+++ b/third_party/aom/av1/encoder/rc_utils.h
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RC_UTILS_H_
+#define AOM_AV1_ENCODER_RC_UTILS_H_
+
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/psnr.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) {
+ RATE_CONTROL *rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ if (cpi->common.current_frame.frame_number >
+ (unsigned int)cpi->svc.number_spatial_layers) {
+ if (cpi->ppi->use_svc) {
+ av1_svc_check_reset_layer_rc_flag(cpi);
+ } else {
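+      // Reset the rc over/undershoot history and the buffer levels when the
+      // average frame bandwidth moved outside the (0.5x, 1.5x) band around
+      // its previous value.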
+ if (rc->avg_frame_bandwidth > (3 * rc->prev_avg_frame_bandwidth >> 1) ||
+ rc->avg_frame_bandwidth < (rc->prev_avg_frame_bandwidth >> 1)) {
+ rc->rc_1_frame = 0;
+ rc->rc_2_frame = 0;
+ p_rc->bits_off_target = p_rc->optimal_buffer_level;
+ p_rc->buffer_level = p_rc->optimal_buffer_level;
+ }
+ }
+ }
+}
+
+static AOM_INLINE void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf,
+ AV1_PRIMARY *ppi) {
+ PRIMARY_RATE_CONTROL *p_rc = &ppi->p_rc;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ const int64_t bandwidth = rc_cfg->target_bandwidth;
+ const int64_t starting = rc_cfg->starting_buffer_level_ms;
+ const int64_t optimal = rc_cfg->optimal_buffer_level_ms;
+ const int64_t maximum = rc_cfg->maximum_buffer_size_ms;
+
+ p_rc->starting_buffer_level = starting * bandwidth / 1000;
+ p_rc->optimal_buffer_level =
+ (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
+ p_rc->maximum_buffer_size =
+ (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
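+  // Worked example (assumed numbers): with target_bandwidth = 1,000,000 bps
+  // and optimal_buffer_level_ms = 600, the optimal level is
+  // 600 * 1000000 / 1000 = 600,000 bits; a level left at 0 falls back to
+  // bandwidth / 8, i.e. 125,000 bits here.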
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ p_rc->bits_off_target =
+ AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+ p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size);
+}
+
+static AOM_INLINE void config_target_level(AV1_COMP *const cpi,
+ AV1_LEVEL target_level, int tier) {
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ SequenceHeader *const seq_params = cpi->common.seq_params;
+ TileConfig *const tile_cfg = &oxcf->tile_cfg;
+ RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ // Adjust target bitrate to be no larger than 70% of level limit.
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ const double level_bitrate_limit =
+ av1_get_max_bitrate_for_level(target_level, tier, profile);
+ const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70);
+ rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, max_bitrate);
+ // Also need to update cpi->ppi->twopass.bits_left.
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats;
+ if (stats != NULL)
+ cpi->ppi->twopass.bits_left =
+ (int64_t)(stats->duration * rc_cfg->target_bandwidth / 10000000.0);
+
+ // Adjust max over-shoot percentage.
+ rc_cfg->over_shoot_pct = 0;
+
+ // Adjust max quantizer.
+ rc_cfg->worst_allowed_q = 255;
+
+ // Adjust number of tiles and tile columns to be under level limit.
+ int max_tiles, max_tile_cols;
+ av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols);
+ while (tile_cfg->tile_columns > 0 &&
+ (1 << tile_cfg->tile_columns) > max_tile_cols) {
+ --tile_cfg->tile_columns;
+ }
+ const int tile_cols = (1 << tile_cfg->tile_columns);
+ while (tile_cfg->tile_rows > 0 &&
+ tile_cols * (1 << tile_cfg->tile_rows) > max_tiles) {
+ --tile_cfg->tile_rows;
+ }
+
+ // Adjust min compression ratio.
+ const int still_picture = seq_params->still_picture;
+ const double min_cr =
+ av1_get_min_cr_for_level(target_level, tier, still_picture);
+ rc_cfg->min_cr = AOMMAX(rc_cfg->min_cr, (unsigned int)(min_cr * 100));
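+  // Note: min_cr is stored as a percentage, so e.g. a level-mandated minimum
+  // compression ratio of 2.0 becomes 200 here.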
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Function to test for conditions that indicate we should loop
+ * back and recode a frame.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] high_limit Upper rate threshold
+ * \param[in] low_limit Lower rate threshold
+ * \param[in] q Current q index
+ * \param[in] maxq Maximum allowed q index
+ * \param[in] minq Minimum allowed q index
+ *
+ * \return Indicates if a recode is required.
+ * \retval 1 Recode Required
+ * \retval 0 No Recode required
+ */
+static AOM_INLINE int recode_loop_test(AV1_COMP *cpi, int high_limit,
+ int low_limit, int q, int maxq,
+ int minq) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
+ int force_recode = 0;
+
+ if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+ (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) ||
+ (frame_is_kfgfarf &&
+ (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+ // TODO(agrange) high_limit could be greater than the scale-down threshold.
+ if ((rc->projected_frame_size > high_limit && q < maxq) ||
+ (rc->projected_frame_size < low_limit && q > minq)) {
+ force_recode = 1;
+ } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+ // Deal with frame undershoot and whether or not we are
+ // below the automatically set cq level.
+ if (q > oxcf->rc_cfg.cq_level &&
+ rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
+ force_recode = 1;
+ }
+ }
+ }
+ return force_recode;
+}
+
+static AOM_INLINE double av1_get_gfu_boost_projection_factor(double min_factor,
+ double max_factor,
+ int frame_count) {
+ double factor = sqrt((double)frame_count);
+ factor = AOMMIN(factor, max_factor);
+ factor = AOMMAX(factor, min_factor);
+ factor = (200.0 + 10.0 * factor);
+ return factor;
+}
+
+static AOM_INLINE int get_gfu_boost_from_r0_lap(double min_factor,
+ double max_factor, double r0,
+ int frames_to_key) {
+ double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor,
+ frames_to_key);
+ const int boost = (int)rint(factor / r0);
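+  // Worked example (assumed inputs): min_factor = 4, max_factor = 10 and
+  // frames_to_key = 64 give factor = 200 + 10 * sqrt(64) = 280; with
+  // r0 = 0.1 the boost is rint(280 / 0.1) = 2800.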
+ return boost;
+}
+
+static AOM_INLINE double av1_get_kf_boost_projection_factor(int frame_count) {
+ double factor = sqrt((double)frame_count);
+ factor = AOMMIN(factor, 10.0);
+ factor = AOMMAX(factor, 4.0);
+ factor = (75.0 + 14.0 * factor);
+ return factor;
+}
+
+static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi,
+ int is_encode_stage, int q_low,
+ int q_high, int top_index,
+ int bottom_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+
+ av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+ cm->height);
+
+ int q_regulated =
+ av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index), cm->width, cm->height);
+
+ int retries = 0;
+ while (q_regulated < q_low && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+ cm->height);
+ q_regulated =
+ av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index), cm->width, cm->height);
+ retries++;
+ }
+ return q_regulated;
+}
+
+static AOM_INLINE int get_regulated_q_undershoot(AV1_COMP *const cpi,
+ int is_encode_stage,
+ int q_high, int top_index,
+ int bottom_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+
+ av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+ cm->height);
+ int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index, cm->width, cm->height);
+
+ int retries = 0;
+ while (q_regulated > q_high && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+ cm->height);
+ q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index, cm->width, cm->height);
+ retries++;
+ }
+ return q_regulated;
+}
+
+/*!\brief Called after encode_with_recode_loop() has just encoded a frame.
+ * This function works out whether we undershot or overshot our bitrate
+ * target and adjusts q as appropriate. It also decides whether or not
+ * we need to recode the frame to get closer to the target rate.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[out] loop Should we go around the recode loop again
+ * \param[in,out] q New q index value
+ * \param[in,out] q_low Low q index limit for this loop iteration
+ * \param[in,out] q_high High q index limit for this loop iteration
+ * \param[in] top_index Max permitted new value for q index
+ * \param[in] bottom_index Min permitted new value for q index
+ * \param[in,out] undershoot_seen Have we seen undershoot on this frame
+ * \param[in,out] overshoot_seen Have we seen overshoot on this frame
+ * \param[in,out] low_cr_seen Have we previously triggered recode
+ * because the compression ratio was less
+ * than a given minimum threshold.
+ * \param[in] loop_count Loop iterations so far.
+ *
+ */
+static AOM_INLINE void recode_loop_update_q(
+ AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low,
+ int *const q_high, const int top_index, const int bottom_index,
+ int *const undershoot_seen, int *const overshoot_seen,
+ int *const low_cr_seen, const int loop_count) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ *loop = 0;
+
+ // Special case for overlay frame.
+ if (rc->is_src_frame_alt_ref &&
+ rc->projected_frame_size < rc->max_frame_bandwidth)
+ return;
+
+ const int min_cr = rc_cfg->min_cr;
+ if (min_cr > 0) {
+ const double compression_ratio =
+ av1_get_compression_ratio(cm, rc->projected_frame_size >> 3);
+ const double target_cr = min_cr / 100.0;
+ if (compression_ratio < target_cr) {
+ *low_cr_seen = 1;
+ if (*q < rc->worst_quality) {
+ const double cr_ratio = target_cr / compression_ratio;
+ const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio));
+ *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality);
+ *q_low = AOMMAX(*q, *q_low);
+ *q_high = AOMMAX(*q, *q_high);
+ *loop = 1;
+ }
+ }
+ if (*low_cr_seen) return;
+ }
+
+ if (cpi->ppi->level_params.keep_level_stats &&
+ !is_stat_generation_stage(cpi)) {
+ // Initialize level info. at the beginning of each sequence.
+ if (cm->current_frame.frame_type == KEY_FRAME &&
+ cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+ av1_init_level_info(cpi);
+ }
+ const AV1LevelParams *const level_params = &cpi->ppi->level_params;
+ // TODO(any): currently only checking operating point 0
+ const AV1LevelInfo *const level_info = level_params->level_info[0];
+ const DECODER_MODEL *const decoder_models = level_info->decoder_models;
+ const AV1_LEVEL target_level = level_params->target_seq_level_idx[0];
+
+ if (target_level < SEQ_LEVELS &&
+ decoder_models[target_level].status == DECODER_MODEL_OK) {
+ DECODER_MODEL_STATUS status = av1_decoder_model_try_smooth_buf(
+ cpi, rc->projected_frame_size, &decoder_models[target_level]);
+
+ if ((status == SMOOTHING_BUFFER_UNDERFLOW ||
+ status == SMOOTHING_BUFFER_OVERFLOW) &&
+ *q < rc->worst_quality) {
+ *q = AOMMIN(*q + 10, rc->worst_quality);
+ *q_low = AOMMAX(*q, *q_low);
+ *q_high = AOMMAX(*q, *q_high);
+ *loop = 1;
+ return;
+ }
+ }
+ }
+
+ if (rc_cfg->mode == AOM_Q) return;
+
+ const int last_q = *q;
+ int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0;
+ av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
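+  // Guard against a degenerate zero overshoot limit below.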
+ if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+
+ if (cm->current_frame.frame_type == KEY_FRAME &&
+ p_rc->this_key_frame_forced &&
+ rc->projected_frame_size < rc->max_frame_bandwidth) {
+ int64_t kf_err;
+ const int64_t high_err_target = cpi->ambient_err;
+ const int64_t low_err_target = cpi->ambient_err >> 1;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cm->seq_params->use_highbitdepth) {
+ kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ } else {
+ kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ }
+#else
+ kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ rc->projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ rc->projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ *q_high = AOMMAX(*q - 1, *q_low);
+
+ // Adjust Q
+ *q = (int)((*q * high_err_target) / kf_err);
+ *q = AOMMIN(*q, (*q_high + *q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ rc->projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ *q_low = AOMMIN(*q + 1, *q_high);
+
+ // Adjust Q
+ *q = (int)((*q * low_err_target) / kf_err);
+ *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1);
+ }
+
+ // Clamp Q to upper and lower limits:
+ *q = clamp(*q, *q_low, *q_high);
+ *loop = (*q != last_q);
+ return;
+ }
+
+ if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q,
+ AOMMAX(*q_high, top_index), bottom_index)) {
+ // Is the projected frame size out of range and are we allowed
+ // to attempt to recode.
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+ // Frame is too large
+ if (rc->projected_frame_size > rc->this_frame_target) {
+ // Special case if the projected size is > the max allowed.
+ if (*q == *q_high &&
+ rc->projected_frame_size >= rc->max_frame_bandwidth) {
+ const double q_val_high_current =
+ av1_convert_qindex_to_q(*q_high, cm->seq_params->bit_depth);
+ const double q_val_high_new =
+ q_val_high_current *
+ ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
+ *q_high = av1_find_qindex(q_val_high_new, cm->seq_params->bit_depth,
+ rc->best_quality, rc->worst_quality);
+ }
+
+      // Raise Qlow to at least the current value
+ *q_low = AOMMIN(*q + 1, *q_high);
+
+ if (*undershoot_seen || loop_count > 2 ||
+ (loop_count == 2 && !frame_is_intra_only(cm))) {
+ av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height);
+
+ *q = (*q_high + *q_low + 1) / 2;
+ } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+ const int q_mid = (*q_high + *q_low + 1) / 2;
+ const int q_regulated = get_regulated_q_overshoot(
+ cpi, 1, *q_low, *q_high, top_index, bottom_index);
+ // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+ // transition between loop_count < 2 and loop_count > 2.
+ *q = (q_mid + q_regulated + 1) / 2;
+ } else {
+ *q = get_regulated_q_overshoot(cpi, 1, *q_low, *q_high, top_index,
+ bottom_index);
+ }
+
+ *overshoot_seen = 1;
+ } else {
+ // Frame is too small
+ *q_high = AOMMAX(*q - 1, *q_low);
+
+ if (*overshoot_seen || loop_count > 2 ||
+ (loop_count == 2 && !frame_is_intra_only(cm))) {
+ av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height);
+ *q = (*q_high + *q_low) / 2;
+ } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+ const int q_mid = (*q_high + *q_low) / 2;
+ const int q_regulated = get_regulated_q_undershoot(
+ cpi, 1, *q_high, top_index, bottom_index);
+ // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+ // transition between loop_count < 2 and loop_count > 2.
+ *q = (q_mid + q_regulated) / 2;
+
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+        // the user passed in value.
+ if (rc_cfg->mode == AOM_CQ && q_regulated < *q_low) {
+ *q_low = *q;
+ }
+ } else {
+ *q = get_regulated_q_undershoot(cpi, 1, *q_high, top_index,
+ bottom_index);
+
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+        // the user passed in value.
+ if (rc_cfg->mode == AOM_CQ && *q < *q_low) {
+ *q_low = *q;
+ }
+ }
+
+ *undershoot_seen = 1;
+ }
+
+ // Clamp Q to upper and lower limits:
+ *q = clamp(*q, *q_low, *q_high);
+ }
+
+ *loop = (*q != last_q);
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RC_UTILS_H_
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
new file mode 100644
index 0000000000..c2d76e7a9a
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.c
@@ -0,0 +1,1580 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_once.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+
+#define RD_THRESH_POW 1.25
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = {
+ 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16
+};
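+// e.g., the BLOCK_4X4 entry of 2 scales the 8x8 baseline threshold by
+// 2 / 4 = 0.5, while the BLOCK_8X8 entry of 4 leaves it at 4 / 4 = 1.0.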
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA]
+ [EXT_TX_SIZES] = {
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ };
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER]
+ [EXT_TX_SIZES] = {
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ { 0, 1, 1, 1 },
+ };
+
+static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
+ EXT_TX_SETS_INTER)] = {
+ {
+ // Intra
+ EXT_TX_SET_DCTONLY,
+ EXT_TX_SET_DTT4_IDTX_1DDCT,
+ EXT_TX_SET_DTT4_IDTX,
+ },
+ {
+ // Inter
+ EXT_TX_SET_DCTONLY,
+ EXT_TX_SET_ALL16,
+ EXT_TX_SET_DTT9_IDTX_1DDCT,
+ EXT_TX_SET_DCT_IDTX,
+ },
+};
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
+ FRAME_CONTEXT *fc) {
+ int i, j;
+
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->partition_cost[i],
+ fc->partition_cdf[i], NULL);
+
+ if (cm->current_frame.skip_mode_info.skip_mode_flag) {
+ for (i = 0; i < SKIP_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->skip_mode_cost[i],
+ fc->skip_mode_cdfs[i], NULL);
+ }
+ }
+
+ for (i = 0; i < SKIP_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->skip_txfm_cost[i],
+ fc->skip_txfm_cdfs[i], NULL);
+ }
+
+ for (i = 0; i < KF_MODE_CONTEXTS; ++i)
+ for (j = 0; j < KF_MODE_CONTEXTS; ++j)
+ av1_cost_tokens_from_cdf(mode_costs->y_mode_costs[i][j],
+ fc->kf_y_cdf[i][j], NULL);
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->mbmode_cost[i], fc->y_mode_cdf[i],
+ NULL);
+ for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_cost_tokens_from_cdf(mode_costs->intra_uv_mode_cost[i][j],
+ fc->uv_mode_cdf[i][j], NULL);
+
+ av1_cost_tokens_from_cdf(mode_costs->filter_intra_mode_cost,
+ fc->filter_intra_mode_cdf, NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ if (av1_filter_intra_allowed_bsize(cm, i))
+ av1_cost_tokens_from_cdf(mode_costs->filter_intra_cost[i],
+ fc->filter_intra_cdfs[i], NULL);
+ }
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->switchable_interp_costs[i],
+ fc->switchable_interp_cdf[i], NULL);
+
+ for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_y_size_cost[i],
+ fc->palette_y_size_cdf[i], NULL);
+ av1_cost_tokens_from_cdf(mode_costs->palette_uv_size_cost[i],
+ fc->palette_uv_size_cdf[i], NULL);
+ for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_y_mode_cost[i][j],
+ fc->palette_y_mode_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_uv_mode_cost[i],
+ fc->palette_uv_mode_cdf[i], NULL);
+ }
+
+ for (i = 0; i < PALETTE_SIZES; ++i) {
+ for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_y_color_cost[i][j],
+ fc->palette_y_color_index_cdf[i][j], NULL);
+ av1_cost_tokens_from_cdf(mode_costs->palette_uv_color_cost[i][j],
+ fc->palette_uv_color_index_cdf[i][j], NULL);
+ }
+ }
+
+ int sign_cost[CFL_JOINT_SIGNS];
+ av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL);
+ for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+ int *cost_u = mode_costs->cfl_cost[joint_sign][CFL_PRED_U];
+ int *cost_v = mode_costs->cfl_cost[joint_sign][CFL_PRED_V];
+ if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) {
+ memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u));
+ } else {
+ const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+ av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL);
+ }
+ if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) {
+ memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v));
+ } else {
+ const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+ av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL);
+ }
+ for (int u = 0; u < CFL_ALPHABET_SIZE; u++)
+ cost_u[u] += sign_cost[joint_sign];
+ }
+
+ for (i = 0; i < MAX_TX_CATS; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ av1_cost_tokens_from_cdf(mode_costs->tx_size_cost[i][j],
+ fc->tx_size_cdf[i][j], NULL);
+
+ for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->txfm_partition_cost[i],
+ fc->txfm_partition_cdf[i], NULL);
+ }
+
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ if (use_inter_ext_tx_for_txsize[s][i]) {
+ av1_cost_tokens_from_cdf(
+ mode_costs->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i],
+ av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]);
+ }
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (use_intra_ext_tx_for_txsize[s][i]) {
+ for (j = 0; j < INTRA_MODES; ++j) {
+ av1_cost_tokens_from_cdf(
+ mode_costs->intra_tx_type_costs[s][i][j],
+ fc->intra_ext_tx_cdf[s][i][j],
+ av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]);
+ }
+ }
+ }
+ }
+ for (i = 0; i < DIRECTIONAL_MODES; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->angle_delta_cost[i],
+ fc->angle_delta_cdf[i], NULL);
+ }
+ av1_cost_tokens_from_cdf(mode_costs->intrabc_cost, fc->intrabc_cdf, NULL);
+
+ for (i = 0; i < SPATIAL_PREDICTION_PROBS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->spatial_pred_cost[i],
+ fc->seg.spatial_pred_seg_cdf[i], NULL);
+ }
+
+ for (i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->tmp_pred_cost[i], fc->seg.pred_cdf[i],
+ NULL);
+ }
+
+ if (!frame_is_intra_only(cm)) {
+ for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_inter_cost[i],
+ fc->comp_inter_cdf[i], NULL);
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < SINGLE_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->single_ref_cost[i][j],
+ fc->single_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_ref_type_cost[i],
+ fc->comp_ref_type_cdf[i], NULL);
+ }
+
+ for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) {
+ for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->uni_comp_ref_cost[i][j],
+ fc->uni_comp_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < FWD_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_ref_cost[i][j],
+ fc->comp_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < BWD_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_bwdref_cost[i][j],
+ fc->comp_bwdref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->intra_inter_cost[i],
+ fc->intra_inter_cdf[i], NULL);
+ }
+
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->newmv_mode_cost[i], fc->newmv_cdf[i],
+ NULL);
+ }
+
+ for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->zeromv_mode_cost[i],
+ fc->zeromv_cdf[i], NULL);
+ }
+
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->refmv_mode_cost[i], fc->refmv_cdf[i],
+ NULL);
+ }
+
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->drl_mode_cost0[i], fc->drl_cdf[i],
+ NULL);
+ }
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->inter_compound_mode_cost[i],
+ fc->inter_compound_mode_cdf[i], NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->compound_type_cost[i],
+ fc->compound_type_cdf[i], NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ if (av1_is_wedge_used(i)) {
+ av1_cost_tokens_from_cdf(mode_costs->wedge_idx_cost[i],
+ fc->wedge_idx_cdf[i], NULL);
+ }
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->interintra_cost[i],
+ fc->interintra_cdf[i], NULL);
+ av1_cost_tokens_from_cdf(mode_costs->interintra_mode_cost[i],
+ fc->interintra_mode_cdf[i], NULL);
+ }
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->wedge_interintra_cost[i],
+ fc->wedge_interintra_cdf[i], NULL);
+ }
+ for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
+ av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost[i],
+ fc->motion_mode_cdf[i], NULL);
+ }
+ for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
+ av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost1[i],
+ fc->obmc_cdf[i], NULL);
+ }
+ for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_idx_cost[i],
+ fc->compound_index_cdf[i], NULL);
+ }
+ for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_group_idx_cost[i],
+ fc->comp_group_idx_cdf[i], NULL);
+ }
+ }
+}
+
+void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc) {
+ av1_cost_tokens_from_cdf(mode_costs->switchable_restore_cost,
+ fc->switchable_restore_cdf, NULL);
+ av1_cost_tokens_from_cdf(mode_costs->wiener_restore_cost,
+ fc->wiener_restore_cdf, NULL);
+ av1_cost_tokens_from_cdf(mode_costs->sgrproj_restore_cost,
+ fc->sgrproj_restore_cdf, NULL);
+}
+
+// Values are now correlated to quantizer.
+static int sad_per_bit_lut_8[QINDEX_RANGE];
+static int sad_per_bit_lut_10[QINDEX_RANGE];
+static int sad_per_bit_lut_12[QINDEX_RANGE];
+
+static void init_me_luts_bd(int *bit16lut, int range,
+ aom_bit_depth_t bit_depth) {
+ int i;
+  // Initialize the sad lut tables using a formulaic calculation for now.
+  // This is to make it easier to assess the impact of experimental changes
+  // to the quantizer tables.
+ for (i = 0; i < range; i++) {
+ const double q = av1_convert_qindex_to_q(i, bit_depth);
+ bit16lut[i] = (int)(0.0418 * q + 2.4107);
+ }
+}
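+
+// For example, for a hypothetical qindex at which av1_convert_qindex_to_q()
+// returns 40.0, the table entry above is (int)(0.0418 * 40.0 + 2.4107) =
+// (int)4.0827 = 4.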
+
+static void init_me_luts(void) {
+ init_me_luts_bd(sad_per_bit_lut_8, QINDEX_RANGE, AOM_BITS_8);
+ init_me_luts_bd(sad_per_bit_lut_10, QINDEX_RANGE, AOM_BITS_10);
+ init_me_luts_bd(sad_per_bit_lut_12, QINDEX_RANGE, AOM_BITS_12);
+}
+
+void av1_init_me_luts(void) { aom_once(init_me_luts); }
+
+static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
+ 8, 8, 4, 4, 2, 2, 1, 0 };
+
+static const int rd_layer_depth_factor[7] = {
+ 160, 160, 160, 160, 192, 208, 224
+};
+
+// Returns the default rd multiplier for inter frames for a given qindex.
+// The function here is a first-pass estimate based on data from
+// a previous Vizer run.
+static double def_inter_rd_multiplier(int qindex) {
+ return 3.2 + (0.0015 * (double)qindex);
+}
+
+// Returns the default rd multiplier for ARF/Golden frames for a given
+// qindex. The function here is a first-pass estimate based on data from
+// a previous Vizer run.
+static double def_arf_rd_multiplier(int qindex) {
+ return 3.25 + (0.0015 * (double)qindex);
+}
+
+// Returns the default rd multiplier for key frames for a given qindex.
+// The function here is a first-pass estimate based on data from
+// a previous Vizer run.
+static double def_kf_rd_multiplier(int qindex) {
+ return 3.3 + (0.0015 * (double)qindex);
+}
+
+int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
+ FRAME_UPDATE_TYPE update_type,
+ int qindex) {
+ const int q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ int64_t rdmult = q * q;
+ if (update_type == KF_UPDATE) {
+ double def_rd_q_mult = def_kf_rd_multiplier(q);
+ rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
+ } else if ((update_type == GF_UPDATE) || (update_type == ARF_UPDATE)) {
+ double def_rd_q_mult = def_arf_rd_multiplier(q);
+ rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
+ } else {
+ double def_rd_q_mult = def_inter_rd_multiplier(q);
+ rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
+ }
+
+ switch (bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
+ case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ return rdmult > 0 ? (int)AOMMIN(rdmult, INT_MAX) : 1;
+}
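+
+// Worked example for the function above, with a hypothetical 8-bit dc quant
+// step q = 32 and an inter update type:
+//   rdmult = 32 * 32 * (3.2 + 0.0015 * 32) = 1024 * 3.248 = 3325 (truncated)
+// and no bit-depth shift applies at AOM_BITS_8.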
+
+int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth,
+ const FRAME_UPDATE_TYPE update_type,
+ const int layer_depth, const int boost_index,
+ const FRAME_TYPE frame_type,
+ const int use_fixed_qp_offsets,
+ const int is_stat_consumption_stage) {
+ int64_t rdmult =
+ av1_compute_rd_mult_based_on_qindex(bit_depth, update_type, qindex);
+ if (is_stat_consumption_stage && !use_fixed_qp_offsets &&
+ (frame_type != KEY_FRAME)) {
+ // Layer depth adjustment
+ rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7;
+ // ARF boost adjustment
+ rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
+ }
+ return (int)rdmult;
+}
+
+int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) {
+ assert(beta > 0.0);
+ int q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ int newq = (int)rint(q / sqrt(beta));
+ int orig_qindex = qindex;
+ if (newq == q) {
+ return 0;
+ }
+ if (newq < q) {
+ while (qindex > 0) {
+ qindex--;
+ q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ if (newq >= q) {
+ break;
+ }
+ }
+ } else {
+ while (qindex < MAXQ) {
+ qindex++;
+ q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ if (newq <= q) {
+ break;
+ }
+ }
+ }
+ return qindex - orig_qindex;
+}
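+
+// For example, beta = 4.0 targets a quantizer step of q / sqrt(4) = q / 2,
+// so the function above walks qindex downward until av1_dc_quant_QTX()
+// first reaches q / 2 or below, and returns the resulting (negative)
+// qindex delta.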
+
+int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
+ int curr_qindex) {
+ curr_qindex = clamp(curr_qindex, delta_q_res, 256 - delta_q_res);
+ const int sign_deltaq_index = curr_qindex - prev_qindex >= 0 ? 1 : -1;
+ const int deltaq_deadzone = delta_q_res / 4;
+ const int qmask = ~(delta_q_res - 1);
+ int abs_deltaq_index = abs(curr_qindex - prev_qindex);
+ abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask;
+ int adjust_qindex = prev_qindex + sign_deltaq_index * abs_deltaq_index;
+ adjust_qindex = AOMMAX(adjust_qindex, MINQ + 1);
+ return adjust_qindex;
+}
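+
+// Worked example for the function above: with delta_q_res = 4,
+// prev_qindex = 100 and curr_qindex = 103, the clamp leaves 103 unchanged,
+// the deadzone is 4 / 4 = 1 and qmask = ~3, so
+// abs_deltaq_index = (3 + 1) & ~3 = 4 and the result is 100 + 4 = 104,
+// i.e. the requested delta-q is snapped to a multiple of delta_q_res.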
+
+int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) {
+ assert(beta > 0.0);
+ const AV1_COMMON *cm = &cpi->common;
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ const int qindex_rdmult = cm->quant_params.base_qindex;
+ return (int)(av1_compute_rd_mult(
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+ layer_depth, boost_index, frame_type,
+ cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi)) /
+ beta);
+}
+
+static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
+ double q;
+ switch (bit_depth) {
+ case AOM_BITS_8: q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_8) / 4.0; break;
+ case AOM_BITS_10:
+ q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_10) / 16.0;
+ break;
+ case AOM_BITS_12:
+ q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_12) / 64.0;
+ break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ // TODO(debargha): Adjust the function below.
+ return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
+}
+
+void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) {
+ switch (cpi->common.seq_params->bit_depth) {
+ case AOM_BITS_8: *sadperbit = sad_per_bit_lut_8[qindex]; break;
+ case AOM_BITS_10: *sadperbit = sad_per_bit_lut_10[qindex]; break;
+ case AOM_BITS_12: *sadperbit = sad_per_bit_lut_12[qindex]; break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ }
+}
+
+static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd,
+ int use_nonrd_pick_mode) {
+ int i, bsize, segment_id;
+ THR_MODES mode_indices[RTC_REFS * RTC_MODES] = { 0 };
+ int num_modes_count = use_nonrd_pick_mode ? 0 : MAX_MODES;
+
+ if (use_nonrd_pick_mode) {
+ for (int r_idx = 0; r_idx < RTC_REFS; r_idx++) {
+ const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0];
+ if (ref != INTRA_FRAME) {
+ for (i = 0; i < RTC_INTER_MODES; i++)
+ mode_indices[num_modes_count++] =
+ mode_idx[ref][mode_offset(inter_mode_list[i])];
+ } else {
+ for (i = 0; i < RTC_INTRA_MODES; i++)
+ mode_indices[num_modes_count++] =
+ mode_idx[ref][mode_offset(intra_mode_list[i])];
+ }
+ }
+ }
+
+ for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+ const int qindex = clamp(
+ av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) +
+ cm->quant_params.y_dc_delta_q,
+ 0, MAXQ);
+ const int q = compute_rd_thresh_factor(qindex, cm->seq_params->bit_depth);
+
+ for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ // Threshold here seems unnecessarily harsh but fine given actual
+ // range of values used for cpi->sf.thresh_mult[].
+ const int t = q * rd_thresh_block_size_factor[bsize];
+ const int thresh_max = INT_MAX / t;
+
+ for (i = 0; i < num_modes_count; ++i) {
+ const int mode_index = use_nonrd_pick_mode ? mode_indices[i] : i;
+ rd->threshes[segment_id][bsize][mode_index] =
+ rd->thresh_mult[mode_index] < thresh_max
+ ? rd->thresh_mult[mode_index] * t / 4
+ : INT_MAX;
+ }
+ }
+ }
+}
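+
+// For example, a mode with thresh_mult[mode] = 1000 is stored as
+// 1000 * q * rd_thresh_block_size_factor[bsize] / 4 for each segment's q,
+// or as INT_MAX when that product would overflow.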
+
+void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc,
+ const int num_planes) {
+ const int nplanes = AOMMIN(num_planes, PLANE_TYPES);
+ for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) {
+ for (int plane = 0; plane < nplanes; ++plane) {
+ LV_MAP_EOB_COST *pcost = &coeff_costs->eob_costs[eob_multi_size][plane];
+
+ for (int ctx = 0; ctx < 2; ++ctx) {
+ aom_cdf_prob *pcdf;
+ switch (eob_multi_size) {
+ case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break;
+ case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break;
+ case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break;
+ case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break;
+ case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break;
+ case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break;
+ case 6:
+ default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break;
+ }
+ av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL);
+ }
+ }
+ }
+ for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+ for (int plane = 0; plane < nplanes; ++plane) {
+ LV_MAP_COEFF_COST *pcost = &coeff_costs->coeff_costs[tx_size][plane];
+
+ for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx],
+ fc->txb_skip_cdf[tx_size][ctx], NULL);
+
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx],
+ fc->coeff_base_eob_cdf[tx_size][plane][ctx],
+ NULL);
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->base_cost[ctx],
+ fc->coeff_base_cdf[tx_size][plane][ctx], NULL);
+
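+      // The extra entries 4..7 of base_cost[] hold incremental costs:
+      // entry 4 is 0, entry 5 is the extra cost of level 1 over level 0
+      // (plus one literal bit for the sign), and entries 6 and 7 are the
+      // extra costs of levels 2 and 3 over the previous level. These are
+      // used when coefficient optimization re-costs a +/-1 level change
+      // incrementally.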
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+ pcost->base_cost[ctx][4] = 0;
+ pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] +
+ av1_cost_literal(1) -
+ pcost->base_cost[ctx][0];
+ pcost->base_cost[ctx][6] =
+ pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1];
+ pcost->base_cost[ctx][7] =
+ pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2];
+ }
+
+ for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx],
+ fc->eob_extra_cdf[tx_size][plane][ctx], NULL);
+
+ for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx],
+ fc->dc_sign_cdf[plane][ctx], NULL);
+
+ for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ int br_rate[BR_CDF_SIZE];
+ int prev_cost = 0;
+ int i, j;
+ av1_cost_tokens_from_cdf(
+ br_rate, fc->coeff_br_cdf[AOMMIN(tx_size, TX_32X32)][plane][ctx],
+ NULL);
+ for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) {
+ for (j = 0; j < BR_CDF_SIZE - 1; j++) {
+ pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j];
+ }
+ prev_cost += br_rate[j];
+ }
+ pcost->lps_cost[ctx][i] = prev_cost;
+ }
+ for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] =
+ pcost->lps_cost[ctx][0];
+ for (int i = 1; i <= COEFF_BASE_RANGE; ++i) {
+ pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] =
+ pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1];
+ }
+ }
+ }
+ }
+}
+
+void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp,
+ MvCosts *mv_costs) {
+ // Avoid accessing 'mv_costs' when it is not allocated.
+ if (mv_costs == NULL) return;
+
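+  // Point each cost array at the middle of its allocation so that it can
+  // be indexed directly by signed mv components in [-MV_MAX, MV_MAX].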
+ mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX];
+ mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX];
+ mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX];
+ mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX];
+ if (integer_mv) {
+ mv_costs->mv_cost_stack = (int **)&mv_costs->nmv_cost;
+ av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+ nmvc, MV_SUBPEL_NONE);
+ } else {
+ mv_costs->mv_cost_stack =
+ usehp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost;
+ av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+ nmvc, usehp);
+ }
+}
+
+void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs) {
+ dv_costs->dv_costs[0] = &dv_costs->dv_costs_alloc[0][MV_MAX];
+ dv_costs->dv_costs[1] = &dv_costs->dv_costs_alloc[1][MV_MAX];
+ av1_build_nmv_cost_table(dv_costs->joint_mv, dv_costs->dv_costs, ndvc,
+ MV_SUBPEL_NONE);
+}
+
+// Populates speed features based on codec control settings (of type
+// COST_UPDATE_TYPE) and expected speed feature settings (of type
+// INTERNAL_COST_UPDATE_TYPE) by considering the least frequent cost update.
+// The populated/updated speed features are used for cost updates in the
+// encoder.
+// WARNING: The population of the unified cost update frequency must be
+// revisited if the enums COST_UPDATE_TYPE / INTERNAL_COST_UPDATE_TYPE are
+// modified or extended.
+static INLINE void populate_unified_cost_update_freq(
+ const CostUpdateFreq cost_upd_freq, SPEED_FEATURES *const sf) {
+ INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf;
+ // Mapping of entropy cost update frequency from the encoder's codec control
+ // settings of type COST_UPDATE_TYPE to speed features of type
+ // INTERNAL_COST_UPDATE_TYPE.
+ static const INTERNAL_COST_UPDATE_TYPE
+ map_cost_upd_to_internal_cost_upd[NUM_COST_UPDATE_TYPES] = {
+ INTERNAL_COST_UPD_SB, INTERNAL_COST_UPD_SBROW, INTERNAL_COST_UPD_TILE,
+ INTERNAL_COST_UPD_OFF
+ };
+
+ inter_sf->mv_cost_upd_level =
+ AOMMIN(inter_sf->mv_cost_upd_level,
+ map_cost_upd_to_internal_cost_upd[cost_upd_freq.mv]);
+ inter_sf->coeff_cost_upd_level =
+ AOMMIN(inter_sf->coeff_cost_upd_level,
+ map_cost_upd_to_internal_cost_upd[cost_upd_freq.coeff]);
+ inter_sf->mode_cost_upd_level =
+ AOMMIN(inter_sf->mode_cost_upd_level,
+ map_cost_upd_to_internal_cost_upd[cost_upd_freq.mode]);
+ sf->intra_sf.dv_cost_upd_level =
+ AOMMIN(sf->intra_sf.dv_cost_upd_level,
+ map_cost_upd_to_internal_cost_upd[cost_upd_freq.dv]);
+}
+
+// Checks whether entropy costs should be initialized/updated at the frame
+// level.
+static INLINE int is_frame_level_cost_upd_freq_set(
+ const AV1_COMMON *const cm, const INTERNAL_COST_UPDATE_TYPE cost_upd_level,
+ const int use_nonrd_pick_mode, const int frames_since_key) {
+ const int fill_costs =
+ frame_is_intra_only(cm) ||
+ (use_nonrd_pick_mode ? frames_since_key < 2
+ : (cm->current_frame.frame_number & 0x07) == 1);
+ return ((!use_nonrd_pick_mode && cost_upd_level != INTERNAL_COST_UPD_OFF) ||
+ cost_upd_level == INTERNAL_COST_UPD_TILE || fill_costs);
+}
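+
+// Under nonrd pick mode the function above returns true on intra-only
+// frames, for the first two frames after a key frame, or when the update
+// level is INTERNAL_COST_UPD_TILE; under the rd path it returns true
+// whenever the update level is not INTERNAL_COST_UPD_OFF, and otherwise on
+// intra-only frames or when (frame_number & 0x07) == 1.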
+
+// Decide whether we want to update the mode entropy cost for the current
+// frame. The logic is currently inherited from selective_disable_cdf_rtc.
+static AOM_INLINE int should_force_mode_cost_update(const AV1_COMP *cpi) {
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ if (!rt_sf->frame_level_mode_cost_update) {
+ return false;
+ }
+
+ if (cpi->oxcf.algo_cfg.cdf_update_mode == 2) {
+ return cpi->frames_since_last_update == 1;
+ } else if (cpi->oxcf.algo_cfg.cdf_update_mode == 1) {
+ if (cpi->svc.number_spatial_layers == 1 &&
+ cpi->svc.number_temporal_layers == 1) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+
+ return frame_is_intra_only(cm) || is_frame_resize_pending(cpi) ||
+ rc->high_source_sad || rc->frames_since_key < 10 ||
+ cpi->cyclic_refresh->counter_encode_maxq_scene_change < 10 ||
+ cm->current_frame.frame_number % 8 == 0;
+ } else if (cpi->svc.number_temporal_layers > 1) {
+ return cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1;
+ }
+ }
+
+ return false;
+}
+
+void av1_initialize_rd_consts(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ RD_OPT *const rd = &cpi->rd;
+ int use_nonrd_pick_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+ int frames_since_key = cpi->rc.frames_since_key;
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ const int qindex_rdmult =
+ cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q;
+ rd->RDMULT = av1_compute_rd_mult(
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+#if CONFIG_RD_COMMAND
+ if (cpi->oxcf.pass == 2) {
+ const RD_COMMAND *rd_command = &cpi->rd_command;
+ if (rd_command->option_ls[rd_command->frame_index] ==
+ RD_OPTION_SET_Q_RDMULT) {
+ rd->RDMULT = rd_command->rdmult_ls[rd_command->frame_index];
+ }
+ }
+#endif // CONFIG_RD_COMMAND
+
+ av1_set_error_per_bit(&x->errorperbit, rd->RDMULT);
+
+ set_block_thresholds(cm, rd, cpi->sf.rt_sf.use_nonrd_pick_mode);
+
+ populate_unified_cost_update_freq(cpi->oxcf.cost_upd_freq, sf);
+ const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
+ // Frame level mv cost update
+ if (is_frame_level_cost_upd_freq_set(cm, inter_sf->mv_cost_upd_level,
+ use_nonrd_pick_mode, frames_since_key))
+ av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
+ cm->features.allow_high_precision_mv, x->mv_costs);
+
+ // Frame level coefficient cost update
+ if (is_frame_level_cost_upd_freq_set(cm, inter_sf->coeff_cost_upd_level,
+ use_nonrd_pick_mode, frames_since_key))
+ av1_fill_coeff_costs(&x->coeff_costs, cm->fc, av1_num_planes(cm));
+
+ // Frame level mode cost update
+ if (should_force_mode_cost_update(cpi) ||
+ is_frame_level_cost_upd_freq_set(cm, inter_sf->mode_cost_upd_level,
+ use_nonrd_pick_mode, frames_since_key))
+ av1_fill_mode_rates(cm, &x->mode_costs, cm->fc);
+
+ // Frame level dv cost update
+ if (av1_need_dv_costs(cpi)) {
+ if (cpi->td.dv_costs_alloc == NULL) {
+ CHECK_MEM_ERROR(
+ cm, cpi->td.dv_costs_alloc,
+ (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.dv_costs_alloc)));
+ cpi->td.mb.dv_costs = cpi->td.dv_costs_alloc;
+ }
+ av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs);
+ }
+}
+
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
+ // NOTE: The tables below must be of the same size.
+
+  // The functions described below are sampled at the four most significant
+  // bits of (x^2 + 8 / 256).
+
+ // Normalized rate:
+ // This table models the rate for a Laplacian source with given variance
+ // when quantized with a uniform quantizer with given stepsize. The
+ // closed form expression is:
+ // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+ // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+ // and H(x) is the binary entropy function.
+ static const int rate_tab_q10[] = {
+ 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
+ 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
+ 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
+ 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
+ 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
+ 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424,
+ 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87,
+ 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6,
+ 5, 3, 2, 1, 1, 1, 0, 0,
+ };
+ // Normalized distortion:
+ // This table models the normalized distortion for a Laplacian source
+ // with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+ // where x = qpstep / sqrt(variance).
+ // Note the actual distortion is Dn * variance.
+ static const int dist_tab_q10[] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5,
+ 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17,
+ 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54,
+ 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142,
+ 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351,
+ 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659,
+ 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936,
+ 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017,
+ 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
+ };
+ static const int xsq_iq_q10[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32,
+ 40, 48, 56, 64, 72, 80, 88, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 256, 288,
+ 320, 352, 384, 416, 448, 480, 544, 608, 672,
+ 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504,
+ 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296,
+ 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136,
+ 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328,
+ 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736,
+ 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696,
+ 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808,
+ 180192, 196576, 212960, 229344, 245728,
+ };
+ const int tmp = (xsq_q10 >> 2) + 8;
+ const int k = get_msb(tmp) - 3;
+ const int xq = (k << 3) + ((tmp >> k) & 0x7);
+ const int one_q10 = 1 << 10;
+ const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
+ const int b_q10 = one_q10 - a_q10;
+ *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
+ *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
+}
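+
+// Worked example for the interpolation above: xsq_q10 = 1024 (x^2 = 1.0 in
+// Q10) gives tmp = (1024 >> 2) + 8 = 264, k = get_msb(264) - 3 = 5 and
+// xq = (5 << 3) + ((264 >> 5) & 0x7) = 40, so the rate and distortion are
+// interpolated between table entries 40 and 41 with weight
+// a_q10 = ((1024 - 992) << 10) >> 7 = 256, i.e. one quarter of the way
+// from xsq_iq_q10[40] = 992 towards xsq_iq_q10[41] = 1120.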
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
+ unsigned int qstep, int *rate,
+ int64_t *dist) {
+ // This function models the rate and distortion for a Laplacian
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expressions are in:
+ // Hang and Chen, "Source Model for transform video coder and its
+ // application - Part I: Fundamental Theory", IEEE Trans. Circ.
+ // Sys. for Video Tech., April 1997.
+ if (var == 0) {
+ *rate = 0;
+ *dist = 0;
+ } else {
+ int d_q10, r_q10;
+ static const uint32_t MAX_XSQ_Q10 = 245727;
+ const uint64_t xsq_q10_64 =
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
+ const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10);
+ model_rd_norm(xsq_q10, &r_q10, &d_q10);
+ *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT);
+ *dist = (var * (int64_t)d_q10 + 512) >> 10;
+ }
+}
+
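+// Catmull-Rom cubic interpolation through the four samples p[0..3],
+// evaluated at x in [0, 1] between p[1] and p[2].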
+static double interp_cubic(const double *p, double x) {
+ return p[1] + 0.5 * x *
+ (p[2] - p[0] +
+ x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
+ x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
+}
+
+/*
+static double interp_bicubic(const double *p, int p_stride, double x,
+ double y) {
+ double q[4];
+ q[0] = interp_cubic(p, x);
+ q[1] = interp_cubic(p + p_stride, x);
+ q[2] = interp_cubic(p + 2 * p_stride, x);
+ q[3] = interp_cubic(p + 3 * p_stride, x);
+ return interp_cubic(q, y);
+}
+*/
+
+static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 3, 3
+};
+
+static int sse_norm_curvfit_model_cat_lookup(double sse_norm) {
+ return (sse_norm > 16.0);
+}
+
+// Models distortion by sse using a logistic function on
+// l = log2(sse / q^2) as:
+// dbysse = 16 / (1 + k exp(l + c))
+static double get_dbysse_logistic(double l, double c, double k) {
+ const double A = 16.0;
+ const double dbysse = A / (1 + k * exp(l + c));
+ return dbysse;
+}
+
+// Models rate using a clamped linear function on
+// l = log2(sse / q^2) as:
+// rate = max(0, a + b * l)
+static double get_rate_clamplinear(double l, double a, double b) {
+ const double rate = a + b * l;
+ return (rate < 0 ? 0 : rate);
+}
+
+static const uint8_t bsize_surffit_model_cat_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 0, 0, 2, 2, 4, 4
+};
+
+static const double surffit_rate_params[9][4] = {
+ {
+ 638.390212,
+ 2.253108,
+ 166.585650,
+ -3.939401,
+ },
+ {
+ 5.256905,
+ 81.997240,
+ -1.321771,
+ 17.694216,
+ },
+ {
+ -74.193045,
+ 72.431868,
+ -19.033152,
+ 15.407276,
+ },
+ {
+ 416.770113,
+ 14.794188,
+ 167.686830,
+ -6.997756,
+ },
+ {
+ 378.511276,
+ 9.558376,
+ 154.658843,
+ -6.635663,
+ },
+ {
+ 277.818787,
+ 4.413180,
+ 150.317637,
+ -9.893038,
+ },
+ {
+ 142.212132,
+ 11.542038,
+ 94.393964,
+ -5.518517,
+ },
+ {
+ 219.100256,
+ 4.007421,
+ 108.932852,
+ -6.981310,
+ },
+ {
+ 222.261971,
+ 3.251049,
+ 95.972916,
+ -5.609789,
+ },
+};
+
+static const double surffit_dist_params[7] = { 1.475844, 4.328362, -5.680233,
+ -0.500994, 0.554585, 4.839478,
+ -0.695837 };
+
+static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
+ double *rpar) {
+ const int cat = bsize_surffit_model_cat_lookup[bsize];
+ rpar[0] = surffit_rate_params[cat][0] + surffit_rate_params[cat][1] * xm;
+ rpar[1] = surffit_rate_params[cat][2] + surffit_rate_params[cat][3] * xm;
+}
+
+static void dist_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
+ double *dpar) {
+ (void)bsize;
+ const double *params = surffit_dist_params;
+ dpar[0] = params[0] + params[1] / (1 + exp((xm + params[2]) * params[3]));
+ dpar[1] = params[4] + params[5] * exp(params[6] * xm);
+}
+
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+ double yl, double *rate_f, double *distbysse_f) {
+ (void)sse_norm;
+ double rpar[2], dpar[2];
+ rate_surffit_model_params_lookup(bsize, xm, rpar);
+ dist_surffit_model_params_lookup(bsize, xm, dpar);
+
+ *rate_f = get_rate_clamplinear(yl, rpar[0], rpar[1]);
+ *distbysse_f = get_dbysse_logistic(yl, dpar[0], dpar[1]);
+}
+
+static const double interp_rgrid_curv[4][65] = {
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 118.257702, 120.210658, 121.434853, 122.100487,
+ 122.377758, 122.436865, 72.290102, 96.974289, 101.652727,
+ 126.830141, 140.417377, 157.644879, 184.315291, 215.823873,
+ 262.300169, 335.919859, 420.624173, 519.185032, 619.854243,
+ 726.053595, 827.663369, 933.127475, 1037.988755, 1138.839609,
+ 1233.342933, 1333.508064, 1428.760126, 1533.396364, 1616.952052,
+ 1744.539319, 1803.413586, 1951.466618, 1994.227838, 2086.031680,
+ 2148.635443, 2239.068450, 2222.590637, 2338.859809, 2402.929011,
+ 2418.727875, 2435.342670, 2471.159469, 2523.187446, 2591.183827,
+ 2674.905840, 2774.110714, 2888.555675, 3017.997952, 3162.194773,
+ 3320.903365, 3493.880956, 3680.884773, 3881.672045, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 13.087244, 15.919735, 25.930313, 24.412411,
+ 28.567417, 29.924194, 30.857010, 32.742979, 36.382570,
+ 39.210386, 42.265690, 47.378572, 57.014850, 82.740067,
+ 137.346562, 219.968084, 316.781856, 415.643773, 516.706538,
+ 614.914364, 714.303763, 815.512135, 911.210485, 1008.501528,
+ 1109.787854, 1213.772279, 1322.922561, 1414.752579, 1510.505641,
+ 1615.741888, 1697.989032, 1780.123933, 1847.453790, 1913.742309,
+ 1960.828122, 2047.500168, 2085.454095, 2129.230668, 2158.171824,
+ 2182.231724, 2217.684864, 2269.589211, 2337.264824, 2420.618694,
+ 2519.557814, 2633.989178, 2763.819779, 2908.956609, 3069.306660,
+ 3244.776927, 3435.274401, 3640.706076, 3860.978945, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 4.656893, 5.123633, 5.594132, 6.162376,
+ 6.918433, 7.768444, 8.739415, 10.105862, 11.477328,
+ 13.236604, 15.421030, 19.093623, 25.801871, 46.724612,
+ 98.841054, 181.113466, 272.586364, 359.499769, 445.546343,
+ 525.944439, 605.188743, 681.793483, 756.668359, 838.486885,
+ 926.950356, 1015.482542, 1113.353926, 1204.897193, 1288.871992,
+ 1373.464145, 1455.746628, 1527.796460, 1588.475066, 1658.144771,
+ 1710.302500, 1807.563351, 1863.197608, 1927.281616, 1964.450872,
+ 2022.719898, 2100.041145, 2185.205712, 2280.993936, 2387.616216,
+ 2505.282950, 2634.204540, 2774.591385, 2926.653884, 3090.602436,
+ 3266.647443, 3454.999303, 3655.868416, 3869.465182, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.337370, 0.391916, 0.468839, 0.566334,
+ 0.762564, 1.069225, 1.384361, 1.787581, 2.293948,
+ 3.251909, 4.412991, 8.050068, 11.606073, 27.668092,
+ 65.227758, 128.463938, 202.097653, 262.715851, 312.464873,
+ 355.601398, 400.609054, 447.201352, 495.761568, 552.871938,
+ 619.067625, 691.984883, 773.753288, 860.628503, 946.262808,
+ 1019.805896, 1106.061360, 1178.422145, 1244.852258, 1302.173987,
+ 1399.650266, 1548.092912, 1545.928652, 1670.817500, 1694.523823,
+ 1779.195362, 1882.155494, 1990.662097, 2108.325181, 2235.456119,
+ 2372.366287, 2519.367059, 2676.769812, 2844.885918, 3024.026754,
+ 3214.503695, 3416.628115, 3630.711389, 3857.064892, 4096.000000,
+ },
+};
+
+static const double interp_dgrid_curv[3][65] = {
+ {
+ 16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770,
+ 15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870,
+ 15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387,
+ 13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790,
+ 7.487633, 5.688649, 4.267515, 3.196300, 2.434201, 1.834064,
+ 1.369920, 1.035921, 0.775279, 0.574895, 0.427232, 0.314123,
+ 0.233236, 0.171440, 0.128188, 0.092762, 0.067569, 0.049324,
+ 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733,
+ 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848,
+ 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550,
+ 0.000348, 0.000193, 0.000085, 0.000021, 0.000000,
+ },
+ {
+ 16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501,
+ 15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967,
+ 15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212,
+ 13.073692, 12.222005, 11.237799, 9.985848, 8.898823, 7.423519,
+ 5.995325, 4.773152, 3.744032, 2.938217, 2.294526, 1.762412,
+ 1.327145, 1.020728, 0.765535, 0.570548, 0.425833, 0.313825,
+ 0.232959, 0.171324, 0.128174, 0.092750, 0.067558, 0.049319,
+ 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733,
+ 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848,
+ 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550,
+ 0.000348, 0.000193, 0.000085, 0.000021, -0.000000,
+ },
+};
+
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+ double *rate_f, double *distbysse_f) {
+ const double x_start = -15.5;
+ const double x_end = 16.5;
+ const double x_step = 0.5;
+ const double epsilon = 1e-6;
+ const int rcat = bsize_curvfit_model_cat_lookup[bsize];
+ const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm);
+
+ xqr = AOMMAX(xqr, x_start + x_step + epsilon);
+ xqr = AOMMIN(xqr, x_end - x_step - epsilon);
+ const double x = (xqr - x_start) / x_step;
+ const int xi = (int)floor(x);
+ const double xo = x - xi;
+
+ assert(xi > 0);
+
+ const double *prate = &interp_rgrid_curv[rcat][(xi - 1)];
+ *rate_f = interp_cubic(prate, xo);
+ const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)];
+ *distbysse_f = interp_cubic(pdist, xo);
+}
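+
+// For example, xqr = 0.0 maps to x = (0.0 - (-15.5)) / 0.5 = 31.0, so
+// xi = 31, xo = 0.0 and the cubic interpolation reads grid entries 30..33,
+// returning entry 31 exactly.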
+
+static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
+ const int num_4x4_w = mi_size_wide[plane_bsize];
+ const int num_4x4_h = mi_size_high[plane_bsize];
+ const ENTROPY_CONTEXT *const above = pd->above_entropy_context;
+ const ENTROPY_CONTEXT *const left = pd->left_entropy_context;
+
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+}
+
+void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left);
+}
+
+// Special clamping used in the encoder when calculating a prediction
+//
+// Logically, all pixel fetches used for prediction are clamped against the
+// edges of the frame. But doing this directly is slow, so instead we allocate
+// a finite border around the frame and fill it with copies of the outermost
+// pixels.
+//
+// Since this border is finite, we need to clamp the motion vector before
+// prediction in order to avoid out-of-bounds reads. At the same time, this
+// clamp must not change the prediction result.
+//
+// We can balance both of these concerns by calculating how far we would have
+// to go in each direction before the extended prediction region (the current
+// block + AOM_INTERP_EXTEND many pixels around the block) would be mapped
+// so that it touches the frame only at one row or column. This is a special
+// point because any more extreme MV will always lead to the same prediction.
+// So it is safe to clamp at that point.
+//
+// In the worst case, this requires a border of
+// max_block_width + 2*AOM_INTERP_EXTEND = 128 + 2*4 = 136 pixels
+// around the frame edges.
+static INLINE void enc_clamp_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MV *mv) {
+ int bw = xd->width << MI_SIZE_LOG2;
+ int bh = xd->height << MI_SIZE_LOG2;
+
+ int px_to_left_edge = xd->mi_col << MI_SIZE_LOG2;
+ int px_to_right_edge = (cm->mi_params.mi_cols - xd->mi_col) << MI_SIZE_LOG2;
+ int px_to_top_edge = xd->mi_row << MI_SIZE_LOG2;
+ int px_to_bottom_edge = (cm->mi_params.mi_rows - xd->mi_row) << MI_SIZE_LOG2;
+
+ const SubpelMvLimits mv_limits = {
+ .col_min = -GET_MV_SUBPEL(px_to_left_edge + bw + AOM_INTERP_EXTEND),
+ .col_max = GET_MV_SUBPEL(px_to_right_edge + AOM_INTERP_EXTEND),
+ .row_min = -GET_MV_SUBPEL(px_to_top_edge + bh + AOM_INTERP_EXTEND),
+ .row_max = GET_MV_SUBPEL(px_to_bottom_edge + AOM_INTERP_EXTEND)
+ };
+ clamp_mv(mv, &mv_limits);
+}
+
+void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
+ int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
+ const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+ const int_mv ref_mv =
+ av1_get_ref_mv_from_stack(0, ref_frames, 0, &x->mbmi_ext);
+ const int_mv ref_mv1 =
+ av1_get_ref_mv_from_stack(0, ref_frames, 1, &x->mbmi_ext);
+ MV pred_mv[MAX_MV_REF_CANDIDATES + 1];
+ int num_mv_refs = 0;
+ pred_mv[num_mv_refs++] = ref_mv.as_mv;
+ if (ref_mv.as_int != ref_mv1.as_int) {
+ pred_mv[num_mv_refs++] = ref_mv1.as_mv;
+ }
+
+ assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
+
+ const uint8_t *const src_y_ptr = x->plane[0].src.buf;
+ int zero_seen = 0;
+ int best_sad = INT_MAX;
+ int max_mv = 0;
+ // Get the sad for each candidate reference mv.
+ for (int i = 0; i < num_mv_refs; ++i) {
+ MV *this_mv = &pred_mv[i];
+ enc_clamp_mv(&cpi->common, &x->e_mbd, this_mv);
+
+ const int fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
+ const int fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+ max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+
+ if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
+ zero_seen |= (fp_row == 0 && fp_col == 0);
+
+ const uint8_t *const ref_y_ptr =
+ &ref_y_buffer[ref_y_stride * fp_row + fp_col];
+ // Find sad for current vector.
+ const int this_sad = cpi->ppi->fn_ptr[block_size].sdf(
+ src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride);
+ // Note if it is the best so far.
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ }
+ if (i == 0)
+ x->pred_mv0_sad[ref_frame] = this_sad;
+ else if (i == 1)
+ x->pred_mv1_sad[ref_frame] = this_sad;
+ }
+
+  // Record the largest candidate mv magnitude and the best sad found
+  // across the candidate reference mvs.
+ x->max_mv_context[ref_frame] = max_mv;
+ x->pred_mv_sad[ref_frame] = best_sad;
+}
+
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv,
+ const int num_planes) {
+ dst[0].buf = src->y_buffer;
+ dst[0].stride = src->y_stride;
+ dst[1].buf = src->u_buffer;
+ dst[2].buf = src->v_buffer;
+ dst[1].stride = dst[2].stride = src->uv_stride;
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ for (int i = 0; i < num_planes; ++i) {
+ setup_pred_plane(dst + i, xd->mi[0]->bsize, dst[i].buf,
+ i ? src->uv_crop_width : src->y_crop_width,
+ i ? src->uv_crop_height : src->y_crop_height,
+ dst[i].stride, mi_row, mi_col, i ? scale_uv : scale,
+ xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
+ }
+}
+
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
+ int ref_frame) {
+ assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
+ RefCntBuffer *const scaled_buf = cpi->scaled_ref_buf[ref_frame - 1];
+ const RefCntBuffer *const ref_buf =
+ get_ref_frame_buf(&cpi->common, ref_frame);
+ return (scaled_buf != ref_buf && scaled_buf != NULL) ? &scaled_buf->buf
+ : NULL;
+}
+
+int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
+ InterpFilter interp_filter, int dual_filter) {
+ if (interp_filter == SWITCHABLE) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ int inter_filter_cost = 0;
+ for (int dir = 0; dir < 2; ++dir) {
+ if (dir && !dual_filter) break;
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ const InterpFilter filter =
+ av1_extract_interp_filter(mbmi->interp_filters, dir);
+ inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx][filter];
+ }
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+ } else {
+ return 0;
+ }
+}
+
+void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
+ RD_OPT *const rd = &cpi->rd;
+
+ // Set baseline threshold values.
+ av1_zero(rd->thresh_mult);
+
+ rd->thresh_mult[THR_NEARESTMV] = 300;
+ rd->thresh_mult[THR_NEARESTL2] = 300;
+ rd->thresh_mult[THR_NEARESTL3] = 300;
+ rd->thresh_mult[THR_NEARESTB] = 300;
+ rd->thresh_mult[THR_NEARESTA2] = 300;
+ rd->thresh_mult[THR_NEARESTA] = 300;
+ rd->thresh_mult[THR_NEARESTG] = 300;
+
+ rd->thresh_mult[THR_NEWMV] = 1000;
+ rd->thresh_mult[THR_NEWL2] = 1000;
+ rd->thresh_mult[THR_NEWL3] = 1000;
+ rd->thresh_mult[THR_NEWB] = 1000;
+ rd->thresh_mult[THR_NEWA2] = 1100;
+ rd->thresh_mult[THR_NEWA] = 1000;
+ rd->thresh_mult[THR_NEWG] = 1000;
+
+ rd->thresh_mult[THR_NEARMV] = 1000;
+ rd->thresh_mult[THR_NEARL2] = 1000;
+ rd->thresh_mult[THR_NEARL3] = 1000;
+ rd->thresh_mult[THR_NEARB] = 1000;
+ rd->thresh_mult[THR_NEARA2] = 1000;
+ rd->thresh_mult[THR_NEARA] = 1000;
+ rd->thresh_mult[THR_NEARG] = 1000;
+
+ rd->thresh_mult[THR_GLOBALMV] = 2200;
+ rd->thresh_mult[THR_GLOBALL2] = 2000;
+ rd->thresh_mult[THR_GLOBALL3] = 2000;
+ rd->thresh_mult[THR_GLOBALB] = 2400;
+ rd->thresh_mult[THR_GLOBALA2] = 2000;
+ rd->thresh_mult[THR_GLOBALG] = 2000;
+ rd->thresh_mult[THR_GLOBALA] = 2400;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] = 1100;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] = 800;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] = 900;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] = 1000;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] = 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] = 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] = 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] = 2000;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA] = 1530;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA] = 1870;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] = 2750;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A] = 1870;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A] = 1800;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] = 3000;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA] = 1320;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA] = 2040;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] = 2250;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLB] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLB] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLB] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLB] = 1360;
+ rd->thresh_mult[THR_COMP_NEW_NEARLB] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLB] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] = 2250;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2B] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2B] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2B] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2B] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3B] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3B] = 1870;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3B] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3B] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGB] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGB] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGB] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGB] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGB] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGB] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA2] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] = 1800;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] = 1440;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA2] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] = 2750;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLL2] = 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] = 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] = 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL2] = 2640;
+ rd->thresh_mult[THR_COMP_NEW_NEARLL2] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLL2] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] = 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLL3] = 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] = 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] = 1800;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL3] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARLL3] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLL3] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] = 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLG] = 1760;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLG] = 2400;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLG] = 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLG] = 1760;
+ rd->thresh_mult[THR_COMP_NEW_NEARLG] = 2640;
+ rd->thresh_mult[THR_COMP_NEW_NEWLG] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] = 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARBA] = 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWBA] = 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTBA] = 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWBA] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARBA] = 1980;
+ rd->thresh_mult[THR_COMP_NEW_NEWBA] = 2640;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] = 3200;
+
+ rd->thresh_mult[THR_DC] = 1000;
+ rd->thresh_mult[THR_PAETH] = 1000;
+ rd->thresh_mult[THR_SMOOTH] = 2200;
+ rd->thresh_mult[THR_SMOOTH_V] = 2000;
+ rd->thresh_mult[THR_SMOOTH_H] = 2000;
+ rd->thresh_mult[THR_H_PRED] = 2000;
+ rd->thresh_mult[THR_V_PRED] = 1800;
+ rd->thresh_mult[THR_D135_PRED] = 2500;
+ rd->thresh_mult[THR_D203_PRED] = 2000;
+ rd->thresh_mult[THR_D157_PRED] = 2500;
+ rd->thresh_mult[THR_D67_PRED] = 2000;
+ rd->thresh_mult[THR_D113_PRED] = 2500;
+ rd->thresh_mult[THR_D45_PRED] = 2500;
+}
+
+static INLINE void update_thr_fact(int (*factor_buf)[MAX_MODES],
+ THR_MODES best_mode_index,
+ THR_MODES mode_start, THR_MODES mode_end,
+ BLOCK_SIZE min_size, BLOCK_SIZE max_size,
+ int max_rd_thresh_factor) {
+ for (THR_MODES mode = mode_start; mode < mode_end; ++mode) {
+ for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) {
+ int *const fact = &factor_buf[bs][mode];
+ if (mode == best_mode_index) {
+ *fact -= (*fact >> RD_THRESH_LOG_DEC_FACTOR);
+ } else {
+ *fact = AOMMIN(*fact + RD_THRESH_INC, max_rd_thresh_factor);
+ }
+ }
+ }
+}
+
+void av1_update_rd_thresh_fact(
+ const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES],
+ int use_adaptive_rd_thresh, BLOCK_SIZE bsize, THR_MODES best_mode_index,
+ THR_MODES inter_mode_start, THR_MODES inter_mode_end,
+ THR_MODES intra_mode_start, THR_MODES intra_mode_end) {
+ assert(use_adaptive_rd_thresh > 0);
+ const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT;
+
+ const int bsize_is_1_to_4 = bsize > cm->seq_params->sb_size;
+ BLOCK_SIZE min_size, max_size;
+ if (bsize_is_1_to_4) {
+ // This part handles block sizes with 1:4 and 4:1 aspect ratios
+ // TODO(any): Experiment with threshold update for parent/child blocks
+ min_size = bsize;
+ max_size = bsize;
+ } else {
+ min_size = AOMMAX(bsize - 2, BLOCK_4X4);
+ max_size = AOMMIN(bsize + 2, (int)cm->seq_params->sb_size);
+ }
+
+ update_thr_fact(factor_buf, best_mode_index, inter_mode_start, inter_mode_end,
+ min_size, max_size, max_rd_thresh_factor);
+ update_thr_fact(factor_buf, best_mode_index, intra_mode_start, intra_mode_end,
+ min_size, max_size, max_rd_thresh_factor);
+}
+
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth) {
+ const int q = av1_dc_quant_QTX(qindex, qdelta, bit_depth);
+ switch (bit_depth) {
+ case AOM_BITS_8: return 20 * q;
+ case AOM_BITS_10: return 5 * q;
+ case AOM_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2);
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+}
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
new file mode 100644
index 0000000000..b38d9ca542
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.h
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
+
+#include <limits.h>
+
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+#define RD_EPB_SHIFT 6
+
+#define RDCOST(RM, R, D) \
+ (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
+ ((D) * (1 << RDDIV_BITS)))
+
+#define RDCOST_NEG_R(RM, R, D) \
+ (((D) * (1 << RDDIV_BITS)) - \
+ ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT))
+
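+// RDCOST() evaluates round(R * RM / 2^AV1_PROB_COST_SHIFT) + D * 128.
+// Since rates R are stored in units of 2^AV1_PROB_COST_SHIFT per bit, this
+// amounts to rate_in_bits * RM + D * (1 << RDDIV_BITS), i.e. the rd
+// multiplier RM acts as the Lagrangian lambda scaled up by 2^RDDIV_BITS.
+// RDCOST_NEG_R() is the same with the rate term subtracted; it is used via
+// av1_calculate_rd_cost() when a relative rate is negative.
+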
+#define RDCOST_DBL_WITH_NATIVE_BD_DIST(RM, R, D, BD) \
+ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+ ((double)((D) >> (2 * (BD - 8))) * (1 << RDDIV_BITS)))
+
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+// The fractional part of the rd_thresh factor is stored with 5 bits. The
+// maximum factor allowed is two, which is stored as 2 ** (5 + 1) = 64.
+#define RD_THRESH_FAC_FRAC_BITS (5)
+#define RD_THRESH_FAC_FRAC_VAL (1 << (RD_THRESH_FAC_FRAC_BITS))
+#define RD_THRESH_MAX_FACT ((RD_THRESH_FAC_FRAC_VAL) << 1)
+#define RD_THRESH_LOG_DEC_FACTOR (4)
+#define RD_THRESH_INC (1)
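+// For example, each per-mode factor starts at RD_THRESH_FAC_FRAC_VAL
+// (= 32, representing 1.0); the winning mode's factor decays by
+// factor >> RD_THRESH_LOG_DEC_FACTOR while every other factor grows by
+// RD_THRESH_INC, saturating at RD_THRESH_MAX_FACT (= 64, i.e. 2.0). The
+// scaled threshold tested in rd_less_than_thresh() is
+// thresh * factor >> 5.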
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+// Macros for common video resolutions: width x height
+// For example, 720p represents video resolution of 1280x720 pixels.
+#define RESOLUTION_288P (352 * 288)
+#define RESOLUTION_360P (640 * 360)
+#define RESOLUTION_480P (640 * 480)
+#define RESOLUTION_720P (1280 * 720)
+#define RESOLUTION_1080P (1920 * 1080)
+#define RESOLUTION_1440P (2560 * 1440)
+#define RESOLUTION_4K (3840 * 2160)
+
+#define RTC_REFS 4
+static const MV_REFERENCE_FRAME real_time_ref_combos[RTC_REFS][2] = {
+ { LAST_FRAME, NONE_FRAME },
+ { ALTREF_FRAME, NONE_FRAME },
+ { GOLDEN_FRAME, NONE_FRAME },
+ { INTRA_FRAME, NONE_FRAME }
+};
+
+static INLINE int mode_offset(const PREDICTION_MODE mode) {
+ if (mode >= NEARESTMV) {
+ return INTER_OFFSET(mode);
+ } else {
+ switch (mode) {
+ case DC_PRED: return 0;
+ case V_PRED: return 1;
+ case H_PRED: return 2;
+ case SMOOTH_PRED: return 3;
+ default: assert(0); return -1;
+ }
+ }
+}
+
+enum {
+ // Default initialization when we are not using winner mode framework. e.g.
+ // intrabc
+ DEFAULT_EVAL = 0,
+ // Initialization for selecting winner mode
+ MODE_EVAL,
+ // Initialization for winner mode evaluation
+ WINNER_MODE_EVAL,
+ // All mode evaluation types
+ MODE_EVAL_TYPES,
+} UENUM1BYTE(MODE_EVAL_TYPE);
+
+typedef struct RD_OPT {
+ // Thresh_mult is used to set a threshold for the rd score. A higher value
+ // means that we will accept the best mode so far more often. This number
+ // is used in combination with the current block size, and thresh_freq_fact
+ // to pick a threshold.
+ int thresh_mult[MAX_MODES];
+
+ int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES];
+
+ int RDMULT;
+
+ double r0;
+} RD_OPT;
+
+static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->rdcost = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip_txfm = 1;
+ rd_stats->zero_rate = 0;
+#if CONFIG_RD_DEBUG
+ // This may run into problems when monochrome video is
+ // encoded, as there will only be 1 plane
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats->txb_coeff_cost[plane] = 0;
+ }
+#endif
+}
+
+static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats->rate = INT_MAX;
+ rd_stats->dist = INT64_MAX;
+ rd_stats->rdcost = INT64_MAX;
+ rd_stats->sse = INT64_MAX;
+ rd_stats->skip_txfm = 0;
+ rd_stats->zero_rate = 0;
+#if CONFIG_RD_DEBUG
+ // This may run into problems when monochrome video is
+ // encoded, as there will only be 1 plane
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats->txb_coeff_cost[plane] = INT_MAX;
+ }
+#endif
+}
+
+static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
+ const RD_STATS *rd_stats_src) {
+ if (rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX) {
+ // If rd_stats_dst or rd_stats_src has invalid rate, we will make
+ // rd_stats_dst invalid.
+ av1_invalid_rd_stats(rd_stats_dst);
+ return;
+ }
+ rd_stats_dst->rate = (int)AOMMIN(
+ ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX);
+ if (!rd_stats_dst->zero_rate)
+ rd_stats_dst->zero_rate = rd_stats_src->zero_rate;
+ rd_stats_dst->dist += rd_stats_src->dist;
+ if (rd_stats_dst->sse < INT64_MAX && rd_stats_src->sse < INT64_MAX) {
+ rd_stats_dst->sse += rd_stats_src->sse;
+ }
+ rd_stats_dst->skip_txfm &= rd_stats_src->skip_txfm;
+#if CONFIG_RD_DEBUG
+ // This may run into problems when monochrome video is
+ // encoded, as there will only be 1 plane
+ for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
+ }
+#endif
+}
+
+static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist,
+ int rate, int skip_txfm, int64_t sse,
+ int zero_rate) {
+ assert(rd_stats->rate != INT_MAX && rate != INT_MAX);
+ rd_stats->rate += rate;
+ if (!rd_stats->zero_rate) rd_stats->zero_rate = zero_rate;
+ rd_stats->dist += dist;
+ rd_stats->skip_txfm &= skip_txfm;
+ rd_stats->sse += sse;
+}
+
+static INLINE int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) {
+ assert(mult >= 0);
+ if (rate >= 0) {
+ return RDCOST(mult, rate, dist);
+ }
+ return RDCOST_NEG_R(mult, -rate, dist);
+}
+
+static INLINE void av1_rd_cost_update(int mult, RD_STATS *rd_cost) {
+ if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX &&
+ rd_cost->rdcost < INT64_MAX) {
+ rd_cost->rdcost = av1_calculate_rd_cost(mult, rd_cost->rate, rd_cost->dist);
+ } else {
+ av1_invalid_rd_stats(rd_cost);
+ }
+}
+
+static INLINE void av1_rd_stats_subtraction(int mult,
+ const RD_STATS *const left,
+ const RD_STATS *const right,
+ RD_STATS *result) {
+ if (left->rate == INT_MAX || right->rate == INT_MAX ||
+ left->dist == INT64_MAX || right->dist == INT64_MAX ||
+ left->rdcost == INT64_MAX || right->rdcost == INT64_MAX) {
+ av1_invalid_rd_stats(result);
+ } else {
+ result->rate = left->rate - right->rate;
+ result->dist = left->dist - right->dist;
+ result->rdcost = av1_calculate_rd_cost(mult, result->rate, result->dist);
+ }
+}
+
+struct TileInfo;
+struct TileDataEnc;
+struct AV1_COMP;
+struct macroblock;
+
+/*!\brief Compute rdmult based on q index and frame update type
+ *
+ * \param[in] bit_depth bit depth
+ * \param[in] update_type frame update type
+ * \param[in] qindex q index
+ *
+ * \return rdmult
+ */
+int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
+ FRAME_UPDATE_TYPE update_type,
+ int qindex);
+
+int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth,
+ const FRAME_UPDATE_TYPE update_type,
+ const int layer_depth, const int boost_index,
+ const FRAME_TYPE frame_type,
+ const int use_fixed_qp_offsets,
+ const int is_stat_consumption_stage);
+
+void av1_initialize_rd_consts(struct AV1_COMP *cpi);
+
+// Sets the multiplier to convert mv cost to l1 error during motion search.
+void av1_set_sad_per_bit(const struct AV1_COMP *cpi, int *sadperbit,
+ int qindex);
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
+                                  unsigned int qstep, int *rate,
+                                  int64_t *dist);
+
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+ double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+ double yl, double *rate_f, double *distbysse_f);
+
+int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
+ InterpFilter interp_filter, int dual_filter);
+
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
+ int ref_frame);
+
+void av1_init_me_luts(void);
+
+void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx);
+
+void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]);
+
+void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
+
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+ int (*fact)[MAX_MODES], int rd_thresh,
+ BLOCK_SIZE bsize, THR_MODES best_mode_index,
+ THR_MODES inter_mode_start,
+ THR_MODES inter_mode_end,
+ THR_MODES intra_mode_start,
+ THR_MODES intra_mode_end);
+
+static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) {
+ for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ for (int j = 0; j < MAX_MODES; ++j) {
+ x->thresh_freq_fact[i][j] = RD_THRESH_FAC_FRAC_VAL;
+ }
+ }
+}
+
+static INLINE int rd_less_than_thresh(int64_t best_rd, int64_t thresh,
+ int thresh_fact) {
+ return best_rd < (thresh * thresh_fact >> 5) || thresh == INT_MAX;
+}
+
+void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame,
+ BLOCK_SIZE block_size);
+
+// Sets the multiplier to convert mv cost to l2 error during motion search.
+static INLINE void av1_set_error_per_bit(int *errorperbit, int rdmult) {
+ *errorperbit = AOMMAX(rdmult >> RD_EPB_SHIFT, 1);
+}
+
+// Gets the thresholds for R-D optimization of coefficients, which depend on
+// the evaluation stage (default, mode decision, or winner mode processing)
+static INLINE void get_rd_opt_coeff_thresh(
+ const uint32_t (*const coeff_opt_threshold)[2],
+ TxfmSearchParams *txfm_params, int enable_winner_mode_for_coeff_opt,
+ int is_winner_mode) {
+ if (!enable_winner_mode_for_coeff_opt) {
+ // Default initialization of threshold
+ txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[DEFAULT_EVAL][0];
+ txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[DEFAULT_EVAL][1];
+ return;
+ }
+ // TODO(any): Experiment with coeff_opt_dist_threshold values when
+ // enable_winner_mode_for_coeff_opt is ON
+ // TODO(any): Skip the winner mode processing for blocks with lower residual
+ // energy as R-D optimization of coefficients would have been enabled during
+ // mode decision
+
+ // Use conservative threshold during mode decision and perform R-D
+ // optimization of coeffs always for winner modes
+ if (is_winner_mode) {
+ txfm_params->coeff_opt_thresholds[0] =
+ coeff_opt_threshold[WINNER_MODE_EVAL][0];
+ txfm_params->coeff_opt_thresholds[1] =
+ coeff_opt_threshold[WINNER_MODE_EVAL][1];
+ } else {
+ txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[MODE_EVAL][0];
+ txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[MODE_EVAL][1];
+ }
+}
+
+// Used to reset the state of mb rd hash information
+static INLINE void reset_mb_rd_record(MB_RD_RECORD *const mb_rd_record) {
+ if (!mb_rd_record) return;
+
+ // Reset the state for use_mb_rd_hash
+ mb_rd_record->num = mb_rd_record->index_start = 0;
+}
+
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv,
+ const int num_planes);
+
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth);
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
+ FRAME_CONTEXT *fc);
+
+void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc);
+
+void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc,
+ const int num_planes);
+
+void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp,
+ MvCosts *mv_costs);
+
+void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs);
+
+int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta);
+
+int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta);
+
+/*!\brief Adjust current superblock's q_index based on delta q resolution
+ *
+ * \param[in] delta_q_res delta q resolution
+ * \param[in] prev_qindex previous superblock's q index
+ * \param[in] curr_qindex current superblock's q index
+ *
+ * \return the current superblock's adjusted q_index
+ */
+int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
+ int curr_qindex);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
new file mode 100644
index 0000000000..c17fbccf8c
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -0,0 +1,6598 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cfl.h"
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/txb_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/mode_prune_model_weights.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
+#include "av1/encoder/var_based_part.h"
+
+#define LAST_NEW_MV_INDEX 6
+
+// Mode_threshold multiplication factor table for prune_inter_modes_if_skippable
+// The values are kept in Q12 format and the equation used to derive them is
+// (2.5 - ((float)x->qindex / MAXQ) * 1.5); see the illustrative sketch after
+// the table.
+#define MODE_THRESH_QBITS 12
+static const int mode_threshold_mul_factor[QINDEX_RANGE] = {
+ 10240, 10216, 10192, 10168, 10144, 10120, 10095, 10071, 10047, 10023, 9999,
+ 9975, 9951, 9927, 9903, 9879, 9854, 9830, 9806, 9782, 9758, 9734,
+ 9710, 9686, 9662, 9638, 9614, 9589, 9565, 9541, 9517, 9493, 9469,
+ 9445, 9421, 9397, 9373, 9349, 9324, 9300, 9276, 9252, 9228, 9204,
+ 9180, 9156, 9132, 9108, 9083, 9059, 9035, 9011, 8987, 8963, 8939,
+ 8915, 8891, 8867, 8843, 8818, 8794, 8770, 8746, 8722, 8698, 8674,
+ 8650, 8626, 8602, 8578, 8553, 8529, 8505, 8481, 8457, 8433, 8409,
+ 8385, 8361, 8337, 8312, 8288, 8264, 8240, 8216, 8192, 8168, 8144,
+ 8120, 8096, 8072, 8047, 8023, 7999, 7975, 7951, 7927, 7903, 7879,
+ 7855, 7831, 7806, 7782, 7758, 7734, 7710, 7686, 7662, 7638, 7614,
+ 7590, 7566, 7541, 7517, 7493, 7469, 7445, 7421, 7397, 7373, 7349,
+ 7325, 7301, 7276, 7252, 7228, 7204, 7180, 7156, 7132, 7108, 7084,
+ 7060, 7035, 7011, 6987, 6963, 6939, 6915, 6891, 6867, 6843, 6819,
+ 6795, 6770, 6746, 6722, 6698, 6674, 6650, 6626, 6602, 6578, 6554,
+ 6530, 6505, 6481, 6457, 6433, 6409, 6385, 6361, 6337, 6313, 6289,
+ 6264, 6240, 6216, 6192, 6168, 6144, 6120, 6096, 6072, 6048, 6024,
+ 5999, 5975, 5951, 5927, 5903, 5879, 5855, 5831, 5807, 5783, 5758,
+ 5734, 5710, 5686, 5662, 5638, 5614, 5590, 5566, 5542, 5518, 5493,
+ 5469, 5445, 5421, 5397, 5373, 5349, 5325, 5301, 5277, 5253, 5228,
+ 5204, 5180, 5156, 5132, 5108, 5084, 5060, 5036, 5012, 4987, 4963,
+ 4939, 4915, 4891, 4867, 4843, 4819, 4795, 4771, 4747, 4722, 4698,
+ 4674, 4650, 4626, 4602, 4578, 4554, 4530, 4506, 4482, 4457, 4433,
+ 4409, 4385, 4361, 4337, 4313, 4289, 4265, 4241, 4216, 4192, 4168,
+ 4144, 4120, 4096
+};
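+
+// Illustrative sketch only (not used by the encoder): regenerating an entry
+// of the table above from the formula stated in the comment. For example,
+// qindex 0 yields (int)round(2.5 * 4096) = 10240 and qindex MAXQ yields 4096,
+// matching the first and last entries.
+static INLINE int mode_threshold_mul_factor_entry(int qindex) {
+  const double factor = 2.5 - ((double)qindex / MAXQ) * 1.5;
+  return (int)round(factor * (1 << MODE_THRESH_QBITS));
+}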
+
+static const THR_MODES av1_default_mode_order[MAX_MODES] = {
+ THR_NEARESTMV,
+ THR_NEARESTL2,
+ THR_NEARESTL3,
+ THR_NEARESTB,
+ THR_NEARESTA2,
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ THR_NEWMV,
+ THR_NEWL2,
+ THR_NEWL3,
+ THR_NEWB,
+ THR_NEWA2,
+ THR_NEWA,
+ THR_NEWG,
+
+ THR_NEARMV,
+ THR_NEARL2,
+ THR_NEARL3,
+ THR_NEARB,
+ THR_NEARA2,
+ THR_NEARA,
+ THR_NEARG,
+
+ THR_GLOBALMV,
+ THR_GLOBALL2,
+ THR_GLOBALL3,
+ THR_GLOBALB,
+ THR_GLOBALA2,
+ THR_GLOBALA,
+ THR_GLOBALG,
+
+ THR_COMP_NEAREST_NEARESTLA,
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+ THR_COMP_NEAREST_NEARESTGA,
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTLA2,
+ THR_COMP_NEAREST_NEARESTL2A2,
+ THR_COMP_NEAREST_NEARESTL3A2,
+ THR_COMP_NEAREST_NEARESTGA2,
+ THR_COMP_NEAREST_NEARESTLL2,
+ THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG,
+ THR_COMP_NEAREST_NEARESTBA,
+
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_GLOBAL_GLOBALLB,
+
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_GLOBAL_GLOBALLA,
+
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_GLOBAL_GLOBALL2A,
+
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_GLOBAL_GLOBALL3A,
+
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_GLOBAL_GLOBALGA,
+
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_GLOBAL_GLOBALL2B,
+
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_GLOBAL_GLOBALL3B,
+
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_GLOBAL_GLOBALGB,
+
+ THR_COMP_NEAR_NEARLA2,
+ THR_COMP_NEW_NEWLA2,
+ THR_COMP_NEW_NEARESTLA2,
+ THR_COMP_NEAREST_NEWLA2,
+ THR_COMP_NEW_NEARLA2,
+ THR_COMP_NEAR_NEWLA2,
+ THR_COMP_GLOBAL_GLOBALLA2,
+
+ THR_COMP_NEAR_NEARL2A2,
+ THR_COMP_NEW_NEWL2A2,
+ THR_COMP_NEW_NEARESTL2A2,
+ THR_COMP_NEAREST_NEWL2A2,
+ THR_COMP_NEW_NEARL2A2,
+ THR_COMP_NEAR_NEWL2A2,
+ THR_COMP_GLOBAL_GLOBALL2A2,
+
+ THR_COMP_NEAR_NEARL3A2,
+ THR_COMP_NEW_NEWL3A2,
+ THR_COMP_NEW_NEARESTL3A2,
+ THR_COMP_NEAREST_NEWL3A2,
+ THR_COMP_NEW_NEARL3A2,
+ THR_COMP_NEAR_NEWL3A2,
+ THR_COMP_GLOBAL_GLOBALL3A2,
+
+ THR_COMP_NEAR_NEARGA2,
+ THR_COMP_NEW_NEWGA2,
+ THR_COMP_NEW_NEARESTGA2,
+ THR_COMP_NEAREST_NEWGA2,
+ THR_COMP_NEW_NEARGA2,
+ THR_COMP_NEAR_NEWGA2,
+ THR_COMP_GLOBAL_GLOBALGA2,
+
+ THR_COMP_NEAR_NEARLL2,
+ THR_COMP_NEW_NEWLL2,
+ THR_COMP_NEW_NEARESTLL2,
+ THR_COMP_NEAREST_NEWLL2,
+ THR_COMP_NEW_NEARLL2,
+ THR_COMP_NEAR_NEWLL2,
+ THR_COMP_GLOBAL_GLOBALLL2,
+
+ THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEW_NEWLL3,
+ THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEAR_NEWLL3,
+ THR_COMP_GLOBAL_GLOBALLL3,
+
+ THR_COMP_NEAR_NEARLG,
+ THR_COMP_NEW_NEWLG,
+ THR_COMP_NEW_NEARESTLG,
+ THR_COMP_NEAREST_NEWLG,
+ THR_COMP_NEW_NEARLG,
+ THR_COMP_NEAR_NEWLG,
+ THR_COMP_GLOBAL_GLOBALLG,
+
+ THR_COMP_NEAR_NEARBA,
+ THR_COMP_NEW_NEWBA,
+ THR_COMP_NEW_NEARESTBA,
+ THR_COMP_NEAREST_NEWBA,
+ THR_COMP_NEW_NEARBA,
+ THR_COMP_NEAR_NEWBA,
+ THR_COMP_GLOBAL_GLOBALBA,
+
+ THR_DC,
+ THR_PAETH,
+ THR_SMOOTH,
+ THR_SMOOTH_V,
+ THR_SMOOTH_H,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D203_PRED,
+ THR_D157_PRED,
+ THR_D67_PRED,
+ THR_D113_PRED,
+ THR_D45_PRED,
+};
+
+/*!\cond */
+typedef struct SingleInterModeState {
+ int64_t rd;
+ MV_REFERENCE_FRAME ref_frame;
+ int valid;
+} SingleInterModeState;
+
+typedef struct InterModeSearchState {
+ int64_t best_rd;
+ int64_t best_skip_rd[2];
+ MB_MODE_INFO best_mbmode;
+ int best_rate_y;
+ int best_rate_uv;
+ int best_mode_skippable;
+ int best_skip2;
+ THR_MODES best_mode_index;
+ int num_available_refs;
+ int64_t dist_refs[REF_FRAMES];
+ int dist_order_refs[REF_FRAMES];
+ int64_t mode_threshold[MAX_MODES];
+ int64_t best_intra_rd;
+ unsigned int best_pred_sse;
+
+ /*!
+ * \brief Keep track of best intra rd for use in compound mode.
+ */
+ int64_t best_pred_rd[REFERENCE_MODES];
+ // Save a set of single_newmv for each checked ref_mv.
+ int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES];
+ int single_newmv_rate[MAX_REF_MV_SEARCH][REF_FRAMES];
+ int single_newmv_valid[MAX_REF_MV_SEARCH][REF_FRAMES];
+ int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
+ // The rd of simple translation in single inter modes
+ int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
+ int64_t best_single_rd[REF_FRAMES];
+ PREDICTION_MODE best_single_mode[REF_FRAMES];
+
+ // Single search results by [directions][modes][reference frames]
+ SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+ int single_state_cnt[2][SINGLE_INTER_MODE_NUM];
+ SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM]
+ [FWD_REFS];
+ int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
+ MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+ IntraModeSearchState intra_search_state;
+ RD_STATS best_y_rdcost;
+} InterModeSearchState;
+/*!\endcond */
+
+void av1_inter_mode_data_init(TileDataEnc *tile_data) {
+ for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[i];
+ md->ready = 0;
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
+ }
+}
+
+static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ int64_t sse, int *est_residue_cost,
+ int64_t *est_dist) {
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (md->ready) {
+ if (sse < md->dist_mean) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ } else {
+ *est_dist = (int64_t)round(md->dist_mean);
+ const double est_ld = md->a * sse + md->b;
+ // Clamp estimated rate cost by INT_MAX / 2.
+ // TODO(angiebird@google.com): find better solution than clamping.
+ if (fabs(est_ld) < 1e-2) {
+ *est_residue_cost = INT_MAX / 2;
+ } else {
+ double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld);
+ if (est_residue_cost_dbl < 0) {
+ *est_residue_cost = 0;
+ } else {
+ *est_residue_cost =
+ (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2);
+ }
+ }
+ if (*est_residue_cost <= 0) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ }
+ }
+ return 1;
+ }
+ return 0;
+}
+
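+// The model fitted below is a simple linear regression of the per-block
+// ratio ld = (sse - dist) / residue_cost against sse:
+//   ld ~= a * sse + b, with
+//   a = (E[sse * ld] - E[sse] * E[ld]) / (E[sse^2] - E[sse]^2),
+//   b = E[ld] - a * E[sse].
+// Once the model is ready, new batches are blended into the running means
+// with the previous estimate weighted 3:1 over the new batch.
+// get_est_rate_dist() above inverts the model to estimate the residue rate
+// for a given sse without running a transform search.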
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ const int block_idx = inter_mode_data_block_idx(bsize);
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (block_idx == -1) continue;
+ if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) {
+ continue;
+ } else {
+ if (md->ready == 0) {
+ md->dist_mean = md->dist_sum / md->num;
+ md->ld_mean = md->ld_sum / md->num;
+ md->sse_mean = md->sse_sum / md->num;
+ md->sse_sse_mean = md->sse_sse_sum / md->num;
+ md->sse_ld_mean = md->sse_ld_sum / md->num;
+ } else {
+ const double factor = 3;
+ md->dist_mean =
+ (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1);
+ md->ld_mean =
+ (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1);
+ md->sse_mean =
+ (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1);
+ md->sse_sse_mean =
+ (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) /
+ (factor + 1);
+ md->sse_ld_mean =
+ (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) /
+ (factor + 1);
+ }
+
+ const double my = md->ld_mean;
+ const double mx = md->sse_mean;
+ const double dx = sqrt(md->sse_sse_mean);
+ const double dxy = md->sse_ld_mean;
+
+ md->a = (dxy - mx * my) / (dx * dx - mx * mx);
+ md->b = my - md->a * mx;
+ md->ready = 1;
+
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
+ }
+ (void)rdmult;
+ }
+}
+
+static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data,
+ BLOCK_SIZE bsize, int64_t sse,
+ int64_t dist, int residue_cost) {
+ if (residue_cost == 0 || sse == dist) return;
+ const int block_idx = inter_mode_data_block_idx(bsize);
+ if (block_idx == -1) return;
+ InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize];
+ if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) {
+ const double ld = (sse - dist) * 1. / residue_cost;
+ ++rd_model->num;
+ rd_model->dist_sum += dist;
+ rd_model->ld_sum += ld;
+ rd_model->sse_sum += sse;
+ rd_model->sse_sse_sum += (double)sse * (double)sse;
+ rd_model->sse_ld_sum += sse * ld;
+ }
+}
+
+static AOM_INLINE void inter_modes_info_push(InterModesInfo *inter_modes_info,
+ int mode_rate, int64_t sse,
+ int64_t rd, RD_STATS *rd_cost,
+ RD_STATS *rd_cost_y,
+ RD_STATS *rd_cost_uv,
+ const MB_MODE_INFO *mbmi) {
+ const int num = inter_modes_info->num;
+ assert(num < MAX_INTER_MODES);
+ inter_modes_info->mbmi_arr[num] = *mbmi;
+ inter_modes_info->mode_rate_arr[num] = mode_rate;
+ inter_modes_info->sse_arr[num] = sse;
+ inter_modes_info->est_rd_arr[num] = rd;
+ inter_modes_info->rd_cost_arr[num] = *rd_cost;
+ inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y;
+ inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv;
+ ++inter_modes_info->num;
+}
+
+static int compare_rd_idx_pair(const void *a, const void *b) {
+  const RdIdxPair *pair_a = (const RdIdxPair *)a;
+  const RdIdxPair *pair_b = (const RdIdxPair *)b;
+  if (pair_a->rd == pair_b->rd) {
+    // To avoid inconsistency in qsort() ordering when two elements are equal,
+    // use idx as the tie-breaker. Refer aomedia:2928
+    if (pair_a->idx == pair_b->idx)
+      return 0;
+    else if (pair_a->idx > pair_b->idx)
+      return 1;
+    else
+      return -1;
+  } else if (pair_a->rd > pair_b->rd) {
+    return 1;
+  } else {
+    return -1;
+  }
+}
+
+static AOM_INLINE void inter_modes_info_sort(
+ const InterModesInfo *inter_modes_info, RdIdxPair *rd_idx_pair_arr) {
+ if (inter_modes_info->num == 0) {
+ return;
+ }
+ for (int i = 0; i < inter_modes_info->num; ++i) {
+ rd_idx_pair_arr[i].idx = i;
+ rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i];
+ }
+ qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]),
+ compare_rd_idx_pair);
+}
+
+// Similar to get_horver_correlation, but also takes the first row/column into
+// account when computing the horizontal/vertical correlation.
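+// For each direction, the result is the Pearson correlation between a pixel x
+// and its left neighbor y (or top neighbor z) over all adjacent pairs:
+//   hcorr = cov(x, y) / sqrt(var(x) * var(y))
+// Negative correlations are clamped to 0, and 1.0 is returned when either
+// variance is degenerate.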
+void av1_get_horver_correlation_full_c(const int16_t *diff, int stride,
+ int width, int height, float *hcorr,
+ float *vcorr) {
+ // The following notation is used:
+ // x - current pixel
+ // y - left neighbor pixel
+ // z - top neighbor pixel
+ int64_t x_sum = 0, x2_sum = 0, xy_sum = 0, xz_sum = 0;
+ int64_t x_firstrow = 0, x_finalrow = 0, x_firstcol = 0, x_finalcol = 0;
+ int64_t x2_firstrow = 0, x2_finalrow = 0, x2_firstcol = 0, x2_finalcol = 0;
+
+ // First, process horizontal correlation on just the first row
+ x_sum += diff[0];
+ x2_sum += diff[0] * diff[0];
+ x_firstrow += diff[0];
+ x2_firstrow += diff[0] * diff[0];
+ for (int j = 1; j < width; ++j) {
+ const int16_t x = diff[j];
+ const int16_t y = diff[j - 1];
+ x_sum += x;
+ x_firstrow += x;
+ x2_sum += x * x;
+ x2_firstrow += x * x;
+ xy_sum += x * y;
+ }
+
+ // Process vertical correlation in the first column
+ x_firstcol += diff[0];
+ x2_firstcol += diff[0] * diff[0];
+ for (int i = 1; i < height; ++i) {
+ const int16_t x = diff[i * stride];
+ const int16_t z = diff[(i - 1) * stride];
+ x_sum += x;
+ x_firstcol += x;
+ x2_sum += x * x;
+ x2_firstcol += x * x;
+ xz_sum += x * z;
+ }
+
+  // Now process horizontal and vertical correlation through the rest of the
+  // unit
+ for (int i = 1; i < height; ++i) {
+ for (int j = 1; j < width; ++j) {
+ const int16_t x = diff[i * stride + j];
+ const int16_t y = diff[i * stride + j - 1];
+ const int16_t z = diff[(i - 1) * stride + j];
+ x_sum += x;
+ x2_sum += x * x;
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+ }
+
+ for (int j = 0; j < width; ++j) {
+ x_finalrow += diff[(height - 1) * stride + j];
+ x2_finalrow +=
+ diff[(height - 1) * stride + j] * diff[(height - 1) * stride + j];
+ }
+ for (int i = 0; i < height; ++i) {
+ x_finalcol += diff[i * stride + width - 1];
+ x2_finalcol += diff[i * stride + width - 1] * diff[i * stride + width - 1];
+ }
+
+ int64_t xhor_sum = x_sum - x_finalcol;
+ int64_t xver_sum = x_sum - x_finalrow;
+ int64_t y_sum = x_sum - x_firstcol;
+ int64_t z_sum = x_sum - x_firstrow;
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ const float num_hor = (float)(height * (width - 1));
+ const float num_ver = (float)((height - 1) * width);
+
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
+
+static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x,
+ int64_t *sse_y) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ int64_t total_sse = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+ unsigned int sse;
+
+ cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, &sse);
+ total_sse += sse;
+ if (!plane && sse_y) *sse_y = sse;
+ }
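+  // Scale the pixel-domain SSE up to the distortion scale used by the RD
+  // code (cf. the matching sse_y << 4 at the call sites in motion_mode_rd()).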
+ total_sse <<= 4;
+ return total_sse;
+}
+
+int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += coeff[i] * coeff[i];
+ }
+
+ *ssz = sqcoeff;
+ return error;
+}
+
+int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff,
+ intptr_t block_size) {
+ int64_t error = 0;
+
+ for (int i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ }
+
+ return error;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, intptr_t block_size,
+ int64_t *ssz, int bd) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+ int shift = 2 * (bd - 8);
+ int rounding = shift > 0 ? 1 << (shift - 1) : 0;
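+  // The accumulated values are normalized back to the 8-bit scale below:
+  // e.g. for bd == 10, shift == 4, so error and sqcoeff are divided by 16
+  // with rounding.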
+
+ for (i = 0; i < block_size; i++) {
+ const int64_t diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
+#endif
+
+static int conditional_skipintra(PREDICTION_MODE mode,
+ PREDICTION_MODE best_intra_mode) {
+ if (mode == D113_PRED && best_intra_mode != V_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ if (mode == D67_PRED && best_intra_mode != V_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D203_PRED && best_intra_mode != H_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D157_PRED && best_intra_mode != H_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ return 0;
+}
+
+static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
+ int16_t mode_context) {
+ if (is_inter_compound_mode(mode)) {
+ return mode_costs
+ ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
+ }
+
+ int mode_cost = 0;
+ int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+
+ assert(is_inter_mode(mode));
+
+ if (mode == NEWMV) {
+ mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+
+ if (mode == GLOBALMV) {
+ mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+ return mode_cost;
+ }
+ }
+}
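+
+// The single-reference branch of cost_mv_ref() walks a binary tree, consuming
+// a fresh context from mode_context at each level:
+//   NEWMV?    yes -> newmv_mode_cost[ctx][0]
+//             no  -> newmv_mode_cost[ctx][1], then
+//   GLOBALMV? yes -> + zeromv_mode_cost[ctx][0]
+//             no  -> + zeromv_mode_cost[ctx][1], then
+//   NEARESTMV vs NEARMV -> + refmv_mode_cost[ctx][mode != NEARESTMV]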
+
+static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode,
+ int ref_idx) {
+ return ref_idx ? compound_ref1_mode(this_mode)
+ : compound_ref0_mode(this_mode);
+}
+
+static AOM_INLINE void estimate_ref_frame_costs(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs,
+ int segment_id, unsigned int *ref_costs_single,
+ unsigned int (*ref_costs_comp)[REF_FRAMES]) {
+ int seg_ref_active =
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ if (seg_ref_active) {
+ memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
+ int ref_frame;
+ for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
+ memset(ref_costs_comp[ref_frame], 0,
+ REF_FRAMES * sizeof((*ref_costs_comp)[0]));
+ } else {
+ int intra_inter_ctx = av1_get_intra_inter_context(xd);
+ ref_costs_single[INTRA_FRAME] =
+ mode_costs->intra_inter_cost[intra_inter_ctx][0];
+ unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ ref_costs_single[i] = base_cost;
+
+ const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd);
+ const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd);
+ const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd);
+ const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd);
+ const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd);
+ const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd);
+
+ // Determine cost of a single ref frame, where frame types are represented
+ // by a tree:
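+    //   Level 0 (ctx_p1): forward (LAST/LAST2/LAST3/GOLDEN) vs
+    //                     backward (BWDREF/ALTREF2/ALTREF)
+    //   Level 1 (ctx_p3): LAST/LAST2 vs LAST3/GOLDEN
+    //           (ctx_p2): BWDREF/ALTREF2 vs ALTREF
+    //   Level 2 (ctx_p4): LAST vs LAST2, (ctx_p5): LAST3 vs GOLDEN,
+    //           (ctx_p6): BWDREF vs ALTREF2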
+ // Level 0: add cost whether this ref is a forward or backward ref
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1];
+ ref_costs_single[ALTREF2_FRAME] +=
+ mode_costs->single_ref_cost[ctx_p1][0][1];
+ ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1];
+
+ // Level 1: if this ref is forward ref,
+ // add cost whether it is last/last2 or last3/golden
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0];
+ ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0];
+ ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1];
+
+ // Level 1: if this ref is backward ref
+ // then add cost whether this ref is altref or backward ref
+ ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][0];
+ ref_costs_single[ALTREF2_FRAME] +=
+ mode_costs->single_ref_cost[ctx_p2][1][0];
+ ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][1];
+
+ // Level 2: further add cost whether this ref is last or last2
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][0];
+ ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][1];
+
+ // Level 2: last3 or golden
+ ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][0];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][1];
+
+ // Level 2: bwdref or altref2
+ ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p6][5][0];
+ ref_costs_single[ALTREF2_FRAME] +=
+ mode_costs->single_ref_cost[ctx_p6][5][1];
+
+ if (cm->current_frame.reference_mode != SINGLE_REFERENCE) {
+ // Similar to single ref, determine cost of compound ref frames.
+ // cost_compound_refs = cost_first_ref + cost_second_ref
+ const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd);
+ const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd);
+ const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd);
+ const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd);
+ const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd);
+
+ const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+ unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 };
+
+ ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
+ ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
+ ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
+ ref_bicomp_costs[ALTREF_FRAME] = 0;
+
+ // cost of first ref frame
+ ref_bicomp_costs[LAST_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+ ref_bicomp_costs[LAST2_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+ ref_bicomp_costs[LAST3_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
+ ref_bicomp_costs[GOLDEN_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
+
+ ref_bicomp_costs[LAST_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][0];
+ ref_bicomp_costs[LAST2_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][1];
+
+ ref_bicomp_costs[LAST3_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][0];
+ ref_bicomp_costs[GOLDEN_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][1];
+
+ // cost of second ref frame
+ ref_bicomp_costs[BWDREF_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+ ref_bicomp_costs[ALTREF2_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+ ref_bicomp_costs[ALTREF_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
+
+ ref_bicomp_costs[BWDREF_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+ ref_bicomp_costs[ALTREF2_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
+
+ // cost: if one ref frame is forward ref, the other ref is backward ref
+ int ref0, ref1;
+ for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+ for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
+ ref_costs_comp[ref0][ref1] =
+ ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1];
+ }
+ }
+
+      // cost: if both ref frames are on the same side.
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd);
+ const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
+ const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
+ ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
+ ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
+ ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
+ ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
+ } else {
+ int ref0, ref1;
+ for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+ for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
+ ref_costs_comp[ref0][ref1] = 512;
+ }
+ ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512;
+ ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
+ ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
+ ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
+ }
+ }
+}
+
+static AOM_INLINE void store_coding_context(
+#if CONFIG_INTERNAL_STATS
+ MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index,
+#else
+ MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+#endif // CONFIG_INTERNAL_STATS
+ int skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ // Take a snapshot of the coding context so it can be
+ // restored if we decide to encode this way
+ ctx->rd_stats.skip_txfm = x->txfm_search_info.skip_txfm;
+ ctx->skippable = skippable;
+#if CONFIG_INTERNAL_STATS
+ ctx->best_mode_index = mode_index;
+#endif // CONFIG_INTERNAL_STATS
+ ctx->mic = *xd->mi[0];
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+}
+
+static AOM_INLINE void setup_buffer_ref_mvs_inter(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ BLOCK_SIZE block_size, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref_frame);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, ref_frame);
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
+ assert(yv12 != NULL);
+
+ if (scaled_ref_frame) {
+ // Setup pred block based on scaled reference, because av1_mv_pred() doesn't
+ // support scaling.
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, NULL, NULL,
+ num_planes);
+ } else {
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
+ }
+
+ // Gets an initial list of candidate vectors from neighbours and orders them
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+  // Encoder-side-only further refinement: test the top few candidates in full
+  // and choose the best one as the center point for subsequent searches.
+  // The current implementation doesn't support scaling.
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12_mb[ref_frame][0].stride,
+ ref_frame, block_size);
+
+ // Go back to unscaled reference.
+ if (scaled_ref_frame) {
+ // We had temporarily setup pred block based on scaled reference above. Go
+ // back to unscaled reference now, for subsequent use.
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
+ }
+}
+
+#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
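+
+// The margins are expressed in 1/8-pel units (hence the << 3), matching the
+// units of the xd->mb_to_*_edge values used in clamp_mv2() below.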
+
+// TODO(jingning): this mv clamping function should be block size dependent.
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+ const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+ xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_bottom_edge +
+ RIGHT_BOTTOM_MARGIN };
+ clamp_mv(mv, &mv_limits);
+}
+
+/* If the current mode produces the same mv as an already-searched mode with a
+ * lower cost, skip the current mode. */
+static int skip_repeated_mv(const AV1_COMMON *const cm,
+ const MACROBLOCK *const x,
+ PREDICTION_MODE this_mode,
+ const MV_REFERENCE_FRAME ref_frames[2],
+ InterModeSearchState *search_state) {
+ const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
+ const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ PREDICTION_MODE compare_mode = MB_MODE_COUNT;
+ if (!is_comp_pred) {
+ if (this_mode == NEARMV) {
+ if (ref_mv_count == 0) {
+ // NEARMV has the same motion vector as NEARESTMV
+ compare_mode = NEARESTMV;
+ }
+ if (ref_mv_count == 1 &&
+ cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+ // NEARMV has the same motion vector as GLOBALMV
+ compare_mode = GLOBALMV;
+ }
+ }
+ if (this_mode == GLOBALMV) {
+ if (ref_mv_count == 0 &&
+ cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+ // GLOBALMV has the same motion vector as NEARESTMV
+ compare_mode = NEARESTMV;
+ }
+ if (ref_mv_count == 1) {
+ // GLOBALMV has the same motion vector as NEARMV
+ compare_mode = NEARMV;
+ }
+ }
+
+ if (compare_mode != MB_MODE_COUNT) {
+ // Use modelled_rd to check whether compare mode was searched
+ if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] !=
+ INT64_MAX) {
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames);
+ const int compare_cost =
+ cost_mv_ref(&x->mode_costs, compare_mode, mode_ctx);
+ const int this_cost = cost_mv_ref(&x->mode_costs, this_mode, mode_ctx);
+
+ // Only skip if the mode cost is larger than compare mode cost
+ if (this_cost > compare_cost) {
+ search_state->modelled_rd[this_mode][0][ref_frames[0]] =
+ search_state->modelled_rd[compare_mode][0][ref_frames[0]];
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv,
+ const AV1_COMMON *cm,
+ const MACROBLOCK *x) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ *out_mv = in_mv;
+ lower_mv_precision(&out_mv->as_mv, cm->features.allow_high_precision_mv,
+ cm->features.cur_frame_force_integer_mv);
+ clamp_mv2(&out_mv->as_mv, xd);
+ return av1_is_fullmv_in_range(&x->mv_limits,
+ get_fullmv_from_mv(&out_mv->as_mv));
+}
+
+// To use a single newmv directly for compound modes, the mv needs to be
+// clamped to the valid mv range. Without this, the encoder can generate
+// out-of-range mvs, as has been seen in 8k encoding.
+static INLINE void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv,
+ int ref_idx) {
+ const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
+ SubpelMvLimits mv_limits;
+
+ av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, &ref_mv.as_mv);
+ clamp_mv(&mv->as_mv, &mv_limits);
+}
+
+static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, int_mv *cur_mv,
+ int *const rate_mv, HandleInterModeArgs *const args,
+ inter_mode_info *mode_info) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int refs[2] = { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+
+ if (is_comp_pred) {
+ const int valid_mv0 = args->single_newmv_valid[ref_mv_idx][refs[0]];
+ const int valid_mv1 = args->single_newmv_valid[ref_mv_idx][refs[1]];
+ if (this_mode == NEW_NEWMV) {
+ if (valid_mv0) {
+ cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+ clamp_mv_in_range(x, &cur_mv[0], 0);
+ }
+ if (valid_mv1) {
+ cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+ clamp_mv_in_range(x, &cur_mv[1], 1);
+ }
+ *rate_mv = 0;
+ for (int i = 0; i < 2; ++i) {
+ const int_mv ref_mv = av1_get_ref_mv(x, i);
+ *rate_mv += av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv,
+ x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ if (valid_mv1) {
+ cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+ clamp_mv_in_range(x, &cur_mv[1], 1);
+ }
+ const int_mv ref_mv = av1_get_ref_mv(x, 1);
+ *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv,
+ x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ } else {
+ assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
+ if (valid_mv0) {
+ cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+ clamp_mv_in_range(x, &cur_mv[0], 0);
+ }
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv,
+ x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+ } else {
+ // Single ref case.
+ const int ref_idx = 0;
+ int search_range = INT_MAX;
+
+ if (cpi->sf.mv_sf.reduce_search_range && mbmi->ref_mv_idx > 0) {
+ const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+ int min_mv_diff = INT_MAX;
+ int best_match = -1;
+ MV prev_ref_mv[2] = { { 0 } };
+ for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) {
+ prev_ref_mv[idx] = av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame,
+ idx, &x->mbmi_ext)
+ .as_mv;
+ const int ref_mv_diff = AOMMAX(abs(ref_mv.row - prev_ref_mv[idx].row),
+ abs(ref_mv.col - prev_ref_mv[idx].col));
+
+ if (min_mv_diff > ref_mv_diff) {
+ min_mv_diff = ref_mv_diff;
+ best_match = idx;
+ }
+ }
+
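+      // Derive a reduced full-pel search range when the current ref_mv lies
+      // within 16 pixels (16 << 3 in 1/8-pel units) of the closest previously
+      // searched ref_mv, based on how far that earlier search moved from its
+      // own ref_mv.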
+ if (min_mv_diff < (16 << 3)) {
+ if (args->single_newmv_valid[best_match][refs[0]]) {
+ search_range = min_mv_diff;
+ search_range +=
+ AOMMAX(abs(args->single_newmv[best_match][refs[0]].as_mv.row -
+ prev_ref_mv[best_match].row),
+ abs(args->single_newmv[best_match][refs[0]].as_mv.col -
+ prev_ref_mv[best_match].col));
+ // Get full pixel search range.
+ search_range = (search_range + 4) >> 3;
+ }
+ }
+ }
+
+ int_mv best_mv;
+ av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range,
+ mode_info, &best_mv, args);
+ if (best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+ args->single_newmv[ref_mv_idx][refs[0]] = best_mv;
+ args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
+ args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
+ cur_mv[0].as_int = best_mv.as_int;
+
+ // Return after single_newmv is set.
+ if (mode_info[mbmi->ref_mv_idx].skip) return INT64_MAX;
+ }
+
+ return 0;
+}
+
+static INLINE void update_mode_start_end_index(
+ const AV1_COMP *const cpi, const MB_MODE_INFO *const mbmi,
+ int *mode_index_start, int *mode_index_end, int last_motion_mode_allowed,
+ int interintra_allowed, int eval_motion_mode) {
+ *mode_index_start = (int)SIMPLE_TRANSLATION;
+ *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed;
+ if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
+ if (!eval_motion_mode) {
+ *mode_index_end = (int)SIMPLE_TRANSLATION;
+ } else {
+ // Set the start index appropriately to process motion modes other than
+ // simple translation
+ *mode_index_start = 1;
+ }
+ }
+ if (cpi->sf.inter_sf.extra_prune_warped && mbmi->bsize > BLOCK_16X16)
+ *mode_index_end = SIMPLE_TRANSLATION;
+}
+
+/*!\brief AV1 motion mode search
+ *
+ * \ingroup inter_mode_search
+ * Function to search over and determine the motion mode. It will update
+ * mbmi->motion_mode to one of SIMPLE_TRANSLATION, OBMC_CAUSAL, or
+ * WARPED_CAUSAL and determine any necessary side information for the selected
+ * motion mode. It will also perform the full transform search, unless the
+ * input parameter do_tx_search indicates to do an estimation of the RD rather
+ * than an RD corresponding to a full transform search. It will return the
+ * RD for the final motion_mode.
+ * It also does the RD search for the given inter mode, computing all
+ * information relevant to the input mode: the best MV, the compound
+ * parameters (if the mode is a compound mode), and the interpolation
+ * filter parameters.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] bsize Current block size.
+ * \param[in,out] rd_stats Struct to keep track of the overall RD
+ * information.
+ * \param[in,out] rd_stats_y Struct to keep track of the RD information
+ * for only the Y plane.
+ * \param[in,out] rd_stats_uv Struct to keep track of the RD information
+ * for only the UV planes.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ * \param[in,out] ref_skip_rd A length 2 array, where ref_skip_rd[0] is the
+ * best total RD for a skip mode so far, and
+ * ref_skip_rd[1] is the best RD for a skip mode
+ * so far in luma. This is used as a speed feature
+ * to skip the transform search if the computed
+ * skip RD for the current mode is not better
+ * than the best ref_skip_rd so far.
+ * \param[in,out] rate_mv The rate associated with the motion vectors.
+ * This will be modified if a motion search is
+ * done in the motion mode search.
+ * \param[in,out] orig_dst A prediction buffer to hold a computed
+ * prediction. This will eventually hold the
+ * final prediction, and the tmp_dst info will
+ * be copied here.
+ * \param[in,out] best_est_rd Estimated RD for motion mode search if
+ * do_tx_search (see below) is 0.
+ * \param[in] do_tx_search Parameter to indicate whether or not to do
+ * a full transform search. This will compute
+ * an estimated RD for the modes without the
+ * transform search and later perform the full
+ * transform search on the best candidates.
+ * \param[in] inter_modes_info InterModesInfo struct to hold inter mode
+ * information to perform a full transform
+ * search only on winning candidates searched
+ * with an estimate for transform coding RD.
+ * \param[in] eval_motion_mode Boolean whether or not to evaluate
+ * motion modes other than SIMPLE_TRANSLATION.
+ * \param[out] yrd Stores the rdcost corresponding to encoding
+ * the luma plane.
+ * \return Returns INT64_MAX if the determined motion mode is invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * motion mode search is a success.
+ */
+static int64_t motion_mode_rd(
+ const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, HandleInterModeArgs *const args, int64_t ref_best_rd,
+ int64_t *ref_skip_rd, int *rate_mv, const BUFFER_SET *orig_dst,
+ int64_t *best_est_rd, int do_tx_search, InterModesInfo *inter_modes_info,
+ int eval_motion_mode, int64_t *yrd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FeatureFlags *const features = &cm->features;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int rate2_nocoeff = rd_stats->rate;
+ int best_xskip_txfm = 0;
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int rate_mv0 = *rate_mv;
+ const int interintra_allowed = cm->seq_params->enable_interintra_compound &&
+ is_interintra_allowed(mbmi) &&
+ mbmi->compound_idx;
+ WARP_SAMPLE_INFO *const warp_sample_info =
+ &x->warp_sample_info[mbmi->ref_frame[0]];
+ int *pts0 = warp_sample_info->pts;
+ int *pts_inref0 = warp_sample_info->pts_inref;
+
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+ av1_invalid_rd_stats(&best_rd_stats);
+  mbmi->num_proj_ref = 1;  // Assume num_proj_ref >= 1.
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+ *yrd = INT64_MAX;
+ if (features->switchable_motion_mode) {
+ // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+ // is allowed.
+ last_motion_mode_allowed = motion_mode_allowed(
+ xd->global_motion, xd, mbmi, features->allow_warped_motion);
+ }
+
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ // Collect projection samples used in least squares approximation of
+ // the warped motion parameters if WARPED_CAUSAL is going to be searched.
+ if (warp_sample_info->num < 0) {
+ warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
+ }
+ mbmi->num_proj_ref = warp_sample_info->num;
+ }
+ const int total_samples = mbmi->num_proj_ref;
+ if (total_samples == 0) {
+ // Do not search WARPED_CAUSAL if there are no samples to use to determine
+ // warped parameters.
+ last_motion_mode_allowed = OBMC_CAUSAL;
+ }
+
+ const MB_MODE_INFO base_mbmi = *mbmi;
+ MB_MODE_INFO best_mbmi;
+ const int interp_filter = features->interp_filter;
+ const int switchable_rate =
+ av1_is_interp_needed(xd)
+ ? av1_get_switchable_rate(x, xd, interp_filter,
+ cm->seq_params->enable_dual_filter)
+ : 0;
+ int64_t best_rd = INT64_MAX;
+ int best_rate_mv = rate_mv0;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int mode_index_start, mode_index_end;
+ const int txfm_rd_gate_level =
+ get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ TX_SEARCH_MOTION_MODE, eval_motion_mode);
+
+ // Modify the start and end index according to speed features. For example,
+ // if SIMPLE_TRANSLATION has already been searched according to
+ // the motion_mode_for_winner_cand speed feature, update the mode_index_start
+ // to avoid searching it again.
+ update_mode_start_end_index(cpi, mbmi, &mode_index_start, &mode_index_end,
+ last_motion_mode_allowed, interintra_allowed,
+ eval_motion_mode);
+ // Main function loop. This loops over all of the possible motion modes and
+ // computes RD to determine the best one. This process includes computing
+ // any necessary side information for the motion mode and performing the
+ // transform search.
+ for (int mode_index = mode_index_start; mode_index <= mode_index_end;
+ mode_index++) {
+ if (args->skip_motion_mode && mode_index) continue;
+ int tmp_rate2 = rate2_nocoeff;
+ const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
+ int tmp_rate_mv = rate_mv0;
+
+ *mbmi = base_mbmi;
+ if (is_interintra_mode) {
+ // Only use SIMPLE_TRANSLATION for interintra
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ } else {
+ mbmi->motion_mode = (MOTION_MODE)mode_index;
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ }
+
+ // Do not search OBMC if the probability of selecting it is below a
+ // predetermined threshold for this update_type and block size.
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int use_actual_frame_probs = 1;
+ int prune_obmc;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+#endif
+ if (use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+ if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || prune_obmc) &&
+ mbmi->motion_mode == OBMC_CAUSAL)
+ continue;
+
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) {
+ // SIMPLE_TRANSLATION mode: no need to recalculate.
+ // The prediction is calculated before motion_mode_rd() is called in
+ // handle_inter_mode()
+ } else if (mbmi->motion_mode == OBMC_CAUSAL) {
+ const uint32_t cur_mv = mbmi->mv[0].as_int;
+ // OBMC_CAUSAL not allowed for compound prediction
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
+ av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL,
+ &mbmi->mv[0], NULL);
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ }
+ if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) {
+ // Build the predictor according to the current motion vector if it has
+ // not already been built
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ 0, av1_num_planes(cm) - 1);
+ }
+ // Build the inter predictor by blending the predictor corresponding to
+ // this MV, and the neighboring blocks using the OBMC model
+ av1_build_obmc_inter_prediction(
+ cm, xd, args->above_pred_buf, args->above_pred_stride,
+ args->left_pred_buf, args->left_pred_stride);
+#if !CONFIG_REALTIME_ONLY
+ } else if (mbmi->motion_mode == WARPED_CAUSAL) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ mbmi->motion_mode = WARPED_CAUSAL;
+ mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+ mbmi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+
+ memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+ memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+ // Select the samples according to motion vector difference
+ if (mbmi->num_proj_ref > 1) {
+ mbmi->num_proj_ref = av1_selectSamples(
+ &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize);
+ }
+
+ // Compute the warped motion parameters with a least squares fit
+ // using the collected samples
+ if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+ mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+ &mbmi->wm_params, mi_row, mi_col)) {
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
+ // Refine MV for NEWMV mode
+ const int_mv mv0 = mbmi->mv[0];
+ const WarpedMotionParams wm_params0 = mbmi->wm_params;
+ const int num_proj_ref0 = mbmi->num_proj_ref;
+
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+ &ref_mv.as_mv, NULL);
+
+ // Refine MV in a small range.
+ av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
+ total_samples, cpi->sf.mv_sf.warp_search_method,
+ cpi->sf.mv_sf.warp_search_iters);
+
+ if (mv0.as_int != mbmi->mv[0].as_int) {
+ // Keep the refined MV and WM parameters.
+ tmp_rate_mv = av1_mv_bit_cost(
+ &mbmi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ } else {
+ // Restore the old MV and WM parameters.
+ mbmi->mv[0] = mv0;
+ mbmi->wm_params = wm_params0;
+ mbmi->num_proj_ref = num_proj_ref0;
+ }
+ }
+
+ // Build the warped predictor
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ } else {
+ continue;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+ } else if (is_interintra_mode) {
+ const int ret =
+ av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd,
+ &tmp_rate_mv, &tmp_rate2, orig_dst);
+ if (ret < 0) continue;
+ }
+
+ // If we are searching newmv and the mv is the same as refmv, skip the
+ // current mode
+ if (!av1_check_newmv_joint_nonzero(cm, x)) continue;
+
+ // Update rd_stats for the current motion mode
+ txfm_info->skip_txfm = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip_txfm = 1;
+ rd_stats->rate = tmp_rate2;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate;
+ if (interintra_allowed) {
+ rd_stats->rate +=
+ mode_costs->interintra_cost[size_group_lookup[bsize]]
+ [mbmi->ref_frame[1] == INTRA_FRAME];
+ }
+ if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) &&
+ (mbmi->ref_frame[1] != INTRA_FRAME)) {
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ rd_stats->rate +=
+ mode_costs->motion_mode_cost[bsize][mbmi->motion_mode];
+ } else {
+ rd_stats->rate +=
+ mode_costs->motion_mode_cost1[bsize][mbmi->motion_mode];
+ }
+ }
+
+ int64_t this_yrd = INT64_MAX;
+
+ if (!do_tx_search) {
+ // Avoid doing a transform search here to speed up the overall mode
+ // search. It will be done later in the mode search if the current
+ // motion mode seems promising.
+ int64_t curr_sse = -1;
+ int64_t sse_y = -1;
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ int64_t est_rd = 0;
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ curr_sse = get_sse(cpi, x, &sse_y);
+ const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
+ &est_residue_cost, &est_dist);
+ (void)has_est_rd;
+ assert(has_est_rd);
+ } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 ||
+ cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD](
+ cpi, bsize, x, xd, 0, num_planes - 1, &est_residue_cost, &est_dist,
+ NULL, &curr_sse, NULL, NULL, NULL);
+ sse_y = x->pred_sse[xd->mi[0]->ref_frame[0]];
+ }
+ est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist);
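+ // Prune this motion mode if its estimated RD is more than 1.25x the best
+ // estimate so far (i.e. est_rd * 0.80 > *best_est_rd).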
+ if (est_rd * 0.80 > *best_est_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ continue;
+ }
+ const int mode_rate = rd_stats->rate;
+ rd_stats->rate += est_residue_cost;
+ rd_stats->dist = est_dist;
+ rd_stats->rdcost = est_rd;
+ if (rd_stats->rdcost < *best_est_rd) {
+ *best_est_rd = rd_stats->rdcost;
+ assert(sse_y >= 0);
+ ref_skip_rd[1] = txfm_rd_gate_level
+ ? RDCOST(x->rdmult, mode_rate, (sse_y << 4))
+ : INT64_MAX;
+ }
+ if (cm->current_frame.reference_mode == SINGLE_REFERENCE) {
+ if (!is_comp_pred) {
+ assert(curr_sse >= 0);
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, rd_stats, rd_stats_y,
+ rd_stats_uv, mbmi);
+ }
+ } else {
+ assert(curr_sse >= 0);
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, rd_stats, rd_stats_y,
+ rd_stats_uv, mbmi);
+ }
+ mbmi->skip_txfm = 0;
+ } else {
+ // Perform full transform search
+ int64_t skip_rd = INT64_MAX;
+ int64_t skip_rdy = INT64_MAX;
+ if (txfm_rd_gate_level) {
+ // Check if the mode is good enough based on skip RD
+ int64_t sse_y = INT64_MAX;
+ int64_t curr_sse = get_sse(cpi, x, &sse_y);
+ skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse);
+ skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4));
+ int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd,
+ txfm_rd_gate_level, 0);
+ if (!eval_txfm) continue;
+ }
+
+ // Do transform search
+ const int mode_rate = rd_stats->rate;
+ if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
+ rd_stats->rate, ref_best_rd)) {
+ if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
+ return INT64_MAX;
+ }
+ continue;
+ }
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int y_rate =
+ rd_stats->skip_txfm
+ ? x->mode_costs.skip_txfm_cost[skip_ctx][1]
+ : (rd_stats_y->rate + x->mode_costs.skip_txfm_cost[skip_ctx][0]);
+ this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y->dist);
+
+ const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (curr_rd < ref_best_rd) {
+ ref_best_rd = curr_rd;
+ ref_skip_rd[0] = skip_rd;
+ ref_skip_rd[1] = skip_rdy;
+ }
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ inter_mode_data_push(
+ tile_data, mbmi->bsize, rd_stats->sse, rd_stats->dist,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]);
+ }
+ }
+
+ if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ mbmi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+ }
+ }
+
+ const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (mode_index == 0) {
+ args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
+ }
+ if (mode_index == 0 || tmp_rd < best_rd) {
+ // Update best_rd data if this is the best motion mode so far
+ best_mbmi = *mbmi;
+ best_rd = tmp_rd;
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rate_mv = tmp_rate_mv;
+ *yrd = this_yrd;
+ if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
+ memcpy(best_blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
+ best_xskip_txfm = mbmi->skip_txfm;
+ }
+ }
+ // Update RD and mbmi stats for selected motion mode
+ mbmi->ref_frame[1] = ref_frame_1;
+ *rate_mv = best_rate_mv;
+ if (best_rd == INT64_MAX || !av1_check_newmv_joint_nonzero(cm, x)) {
+ av1_invalid_rd_stats(rd_stats);
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return INT64_MAX;
+ }
+ *mbmi = best_mbmi;
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
+ memcpy(txfm_info->blk_skip, best_blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
+ txfm_info->skip_txfm = best_xskip_txfm;
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 0;
+}
+
+static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t best_rd) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int64_t total_sse = 0;
+ int64_t this_rd = INT64_MAX;
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+ rd_stats->rate = x->mode_costs.skip_mode_cost[skip_mode_ctx][1];
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ // Call av1_enc_build_inter_predictor() for one plane at a time.
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ plane, plane);
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ av1_subtract_plane(x, plane_bsize, plane);
+
+ int64_t sse =
+ av1_pixel_diff_dist(x, plane, 0, 0, plane_bsize, plane_bsize, NULL);
+ if (is_cur_buf_hbd(xd)) sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
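+ // The << 4 below scales the plane SSE to the distortion precision used
+ // with RDCOST here (matching the (sse_y << 4) terms used for the skip-RD
+ // estimates in motion_mode_rd above).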
+ sse <<= 4;
+ total_sse += sse;
+ // When the current rd cost exceeds best_rd, skip evaluating the
+ // remaining planes.
+ this_rd = RDCOST(x->rdmult, rd_stats->rate, total_sse);
+ if (this_rd > best_rd) break;
+ }
+
+ rd_stats->dist = rd_stats->sse = total_sse;
+ rd_stats->rdcost = this_rd;
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 0;
+}
+
+// Check NEARESTMV, NEARMV and GLOBALMV ref mvs for duplicates and skip the
+// relevant mode
+// Note(rachelbarker): This speed feature currently does not interact correctly
+// with global motion. The issue is that, when global motion is used, GLOBALMV
+// produces a different prediction to NEARESTMV/NEARMV even if the motion
+// vectors are the same. Thus GLOBALMV should not be pruned in this case.
+static INLINE int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext,
+ int ref_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ PREDICTION_MODE single_mode) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ assert(single_mode != NEWMV);
+ if (single_mode == NEARESTMV) {
+ return 0;
+ } else if (single_mode == NEARMV) {
+ // when ref_mv_count == 0, NEARESTMV and NEARMV are the same as GLOBALMV
+ // when ref_mv_count == 1, NEARMV is the same as GLOBALMV
+ if (ref_mv_count < 2) return 1;
+ } else if (single_mode == GLOBALMV) {
+ // when ref_mv_count == 0, GLOBALMV is the same as NEARESTMV
+ if (ref_mv_count == 0) return 1;
+ // when ref_mv_count == 1, NEARMV is the same as GLOBALMV (NEARMV is
+ // pruned in that case, so GLOBALMV is kept)
+ else if (ref_mv_count == 1)
+ return 0;
+
+ int stack_size = AOMMIN(USABLE_REF_MV_STACK_SIZE, ref_mv_count);
+ // Check GLOBALMV is matching with any mv in ref_mv_stack
+ for (int ref_mv_idx = 0; ref_mv_idx < stack_size; ref_mv_idx++) {
+ int_mv this_mv;
+
+ if (ref_idx == 0)
+ this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ else
+ this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+
+ if (this_mv.as_int == mbmi_ext->global_mvs[ref_frame[ref_idx]].as_int)
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+ int ref_idx, int ref_mv_idx,
+ int skip_repeated_ref_mv,
+ const MV_REFERENCE_FRAME *ref_frame,
+ const MB_MODE_INFO_EXT *mbmi_ext) {
+ const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx);
+ assert(is_inter_singleref_mode(single_mode));
+ if (single_mode == NEWMV) {
+ this_mv->as_int = INVALID_MV;
+ } else if (single_mode == GLOBALMV) {
+ if (skip_repeated_ref_mv &&
+ check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+ return 0;
+ *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+ } else {
+ assert(single_mode == NEARMV || single_mode == NEARESTMV);
+ const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const int ref_mv_offset = single_mode == NEARESTMV ? 0 : ref_mv_idx + 1;
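+ // NEARESTMV always reads stack entry 0; NEARMV with index k reads entry
+ // k + 1.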
+ if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) {
+ assert(ref_mv_offset >= 0);
+ if (ref_idx == 0) {
+ *this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv;
+ } else {
+ *this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv;
+ }
+ } else {
+ if (skip_repeated_ref_mv &&
+ check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+ return 0;
+ *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+ }
+ }
+ return 1;
+}
+
+// Skip NEARESTMV and NEARMV modes based on refmv weight computed in ref mv list
+// population
+static INLINE int skip_nearest_near_mv_using_refmv_weight(
+ const MACROBLOCK *const x, const PREDICTION_MODE this_mode,
+ const int8_t ref_frame_type, PREDICTION_MODE best_mode) {
+ if (this_mode != NEARESTMV && this_mode != NEARMV) return 0;
+ // Do not skip the mode if the current block has not yet obtained a valid
+ // inter mode.
+ if (!is_inter_mode(best_mode)) return 0;
+
+ const MACROBLOCKD *xd = &x->e_mbd;
+ // Do not skip the mode unless both the top and left neighboring blocks
+ // are available.
+ if (!xd->left_available || !xd->up_available) return 0;
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const uint16_t *const ref_mv_weight = mbmi_ext->weight[ref_frame_type];
+ const int ref_mv_count =
+ AOMMIN(MAX_REF_MV_SEARCH, mbmi_ext->ref_mv_count[ref_frame_type]);
+
+ if (ref_mv_count == 0) return 0;
+ // If the ref mv list has at least one nearest candidate, do not prune
+ // NEARESTMV
+ if (this_mode == NEARESTMV && ref_mv_weight[0] >= REF_CAT_LEVEL) return 0;
+
+ // Count number of ref mvs populated from nearest candidates
+ int nearest_refmv_count = 0;
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_mv_count; ref_mv_idx++) {
+ if (ref_mv_weight[ref_mv_idx] >= REF_CAT_LEVEL) nearest_refmv_count++;
+ }
+
+ // nearest_refmv_count indicates how closely the block's motion
+ // characteristics match those of its spatial neighbors. A small
+ // nearest_refmv_count relative to ref_mv_count means little correlation
+ // with the spatial neighbors, and hence little chance of NEARESTMV or
+ // NEARMV becoming the best mode, since these modes work well for blocks
+ // that share motion characteristics with their neighbors. Thus, NEARMV is
+ // pruned when nearest_refmv_count is relatively small compared to
+ // ref_mv_count, and NEARESTMV is pruned if none of the ref mvs were
+ // populated from a nearest candidate.
+ const int prune_thresh = 1 + (ref_mv_count >= 2);
+ if (nearest_refmv_count < prune_thresh) return 1;
+ return 0;
+}
+
+// This function updates the non-NEWMV mvs for the current prediction mode
+static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
+ const AV1_COMMON *cm, const MACROBLOCK *x,
+ int skip_repeated_ref_mv) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+
+ int ret = 1;
+ for (int i = 0; i < is_comp_pred + 1; ++i) {
+ int_mv this_mv;
+ this_mv.as_int = INVALID_MV;
+ ret = get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx,
+ skip_repeated_ref_mv, mbmi->ref_frame, &x->mbmi_ext);
+ if (!ret) return 0;
+ const PREDICTION_MODE single_mode = get_single_mode(this_mode, i);
+ if (single_mode == NEWMV) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ cur_mv[i] =
+ (i == 0) ? x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+ .this_mv
+ : x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+ .comp_mv;
+ } else {
+ ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x);
+ }
+ }
+ return ret;
+}
+
+static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ const int (*const drl_mode_cost0)[2],
+ int8_t ref_frame_type) {
+ int cost = 0;
+ if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+ for (int idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx];
+ if (mbmi->ref_mv_idx == idx) return cost;
+ }
+ }
+ return cost;
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ for (int idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)];
+ if (mbmi->ref_mv_idx == (idx - 1)) return cost;
+ }
+ }
+ return cost;
+ }
+ return cost;
+}
+
+static INLINE int is_single_newmv_valid(const HandleInterModeArgs *const args,
+ const MB_MODE_INFO *const mbmi,
+ PREDICTION_MODE this_mode) {
+ for (int ref_idx = 0; ref_idx < 2; ++ref_idx) {
+ const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx);
+ const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx];
+ if (single_mode == NEWMV &&
+ args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int get_drl_refmv_count(const MACROBLOCK *const x,
+ const MV_REFERENCE_FRAME *ref_frame,
+ PREDICTION_MODE mode) {
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0;
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV);
+ const int has_drl =
+ (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1);
+ const int ref_set =
+ has_drl ? AOMMIN(MAX_REF_MV_SEARCH, ref_mv_count - has_nearmv) : 1;
+
+ return ref_set;
+}
+
+// Checks if a particular ref_mv_idx should be pruned.
+static int prune_ref_mv_idx_using_qindex(const int reduce_inter_modes,
+ const int qindex,
+ const int ref_mv_idx) {
+ if (reduce_inter_modes >= 3) return 1;
+ // Q-index logic based pruning is enabled only for
+ // reduce_inter_modes = 2.
+ assert(reduce_inter_modes == 2);
+ // When reduce_inter_modes=2, pruning happens as below based on q index.
+ // For q index range between 0 and 85: prune if ref_mv_idx >= 1.
+ // For q index range between 86 and 170: prune if ref_mv_idx == 2.
+ // For q index range between 171 and 255: no pruning.
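+ // For example, assuming QINDEX_RANGE == 256, qindex = 100 gives
+ // min_prune_ref_mv_idx = 100 * 3 / 256 + 1 = 2, so only ref_mv_idx >= 2 is
+ // pruned, matching the 86-170 range above.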
+ const int min_prune_ref_mv_idx = (qindex * 3 / QINDEX_RANGE) + 1;
+ return (ref_mv_idx >= min_prune_ref_mv_idx);
+}
+
+// Whether this reference motion vector can be skipped, based on initial
+// heuristics.
+static bool ref_mv_idx_early_breakout(
+ const SPEED_FEATURES *const sf,
+ const RefFrameDistanceInfo *const ref_frame_dist_info, MACROBLOCK *x,
+ const HandleInterModeArgs *const args, int64_t ref_best_rd,
+ int ref_mv_idx) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ const int is_comp_pred = has_second_ref(mbmi);
+ if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) {
+ if (mbmi->ref_frame[0] == LAST2_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[1] == LAST2_FRAME ||
+ mbmi->ref_frame[1] == LAST3_FRAME) {
+ const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+ if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
+ REF_CAT_LEVEL) {
+ return true;
+ }
+ }
+ // TODO(any): Experiment with reduce_inter_modes for compound prediction
+ if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred &&
+ have_newmv_in_inter_mode(mbmi->mode)) {
+ if (mbmi->ref_frame[0] != ref_frame_dist_info->nearest_past_ref &&
+ mbmi->ref_frame[0] != ref_frame_dist_info->nearest_future_ref) {
+ const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+ const int do_prune = prune_ref_mv_idx_using_qindex(
+ sf->inter_sf.reduce_inter_modes, x->qindex, ref_mv_idx);
+ if (do_prune &&
+ (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
+ REF_CAT_LEVEL)) {
+ return true;
+ }
+ }
+ }
+ }
+
+ mbmi->ref_mv_idx = ref_mv_idx;
+ if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) {
+ return true;
+ }
+ size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost;
+ const int drl_cost = get_drl_cost(
+ mbmi, mbmi_ext, x->mode_costs.drl_mode_cost0, ref_frame_type);
+ est_rd_rate += drl_cost;
+ if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+ return true;
+ }
+ return false;
+}
+
+// Compute the estimated RD cost for the motion vector with simple translation.
+static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ HandleInterModeArgs *args,
+ int ref_mv_idx, int64_t ref_best_rd,
+ BLOCK_SIZE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ const AV1_COMMON *cm = &cpi->common;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const ModeCosts *mode_costs = &x->mode_costs;
+
+ struct macroblockd_plane *p = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+ { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+ };
+ av1_init_rd_stats(rd_stats);
+
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+ mbmi->ref_frame[1] = NONE_FRAME;
+ }
+ int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->ref_mv_idx = ref_mv_idx;
+
+ rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+ const int drl_cost =
+ get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
+ rd_stats->rate += drl_cost;
+
+ int_mv cur_mv[2];
+ if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) {
+ return INT64_MAX;
+ }
+ assert(have_nearmv_in_inter_mode(mbmi->mode));
+ for (int i = 0; i < is_comp_pred + 1; ++i) {
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+ const int ref_mv_cost = cost_mv_ref(mode_costs, mbmi->mode, mode_ctx);
+ rd_stats->rate += ref_mv_cost;
+
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) {
+ return INT64_MAX;
+ }
+
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->num_proj_ref = 0;
+ if (is_comp_pred) {
+ // Only compound_average
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ }
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ int est_rate;
+ int64_t est_dist;
+ model_rd_sb_fn[MODELRD_CURVFIT](cpi, bsize, x, xd, 0, 0, &est_rate, &est_dist,
+ NULL, NULL, NULL, NULL, NULL);
+ return RDCOST(x->rdmult, rd_stats->rate + est_rate, est_dist);
+}
+
+// Represents a set of integers, from 0 to sizeof(int) * 8 - 1, as bits in
+// an integer. 0 for the i-th bit means that integer is excluded, 1 means
+// it is included.
+static INLINE void mask_set_bit(int *mask, int index) { *mask |= (1 << index); }
+
+static INLINE bool mask_check_bit(int mask, int index) {
+ return (mask >> index) & 0x1;
+}
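+
+// Illustrative example: starting from mask == 0, mask_set_bit(&mask, 0) and
+// mask_set_bit(&mask, 2) yield mask == 5 (binary 101), so
+// mask_check_bit(mask, 1) is false while mask_check_bit(mask, 2) is true.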
+
+// Before performing the full MV search in handle_inter_mode, do a simple
+// translation search and see if we can eliminate any motion vectors.
+// Returns an integer where, if the i-th bit is set, the i-th motion vector
+// should be searched. This is only set for NEARMV.
+static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ HandleInterModeArgs *const args,
+ int64_t ref_best_rd, BLOCK_SIZE bsize,
+ const int ref_set) {
+ // If there is only one ref mv, do not prune it; it is better to evaluate
+ // it than to prune it.
+ if (ref_set == 1) return 1;
+ AV1_COMMON *const cm = &cpi->common;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+ // Only search indices if they have some chance of being good.
+ int good_indices = 0;
+ for (int i = 0; i < ref_set; ++i) {
+ if (ref_mv_idx_early_breakout(&cpi->sf, &cpi->ref_frame_dist_info, x, args,
+ ref_best_rd, i)) {
+ continue;
+ }
+ mask_set_bit(&good_indices, i);
+ }
+
+ // Only prune in NEARMV mode if the speed feature is set and the block size
+ // is large enough. If these conditions are not met, return all good indices
+ // found so far.
+ if (!cpi->sf.inter_sf.prune_mode_search_simple_translation)
+ return good_indices;
+ if (!have_nearmv_in_inter_mode(this_mode)) return good_indices;
+ if (num_pels_log2_lookup[bsize] <= 6) return good_indices;
+ // Do not prune when there is internal resizing. TODO(elliottk) fix this
+ // so b/2384 can be resolved.
+ if (av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[0])) ||
+ (mbmi->ref_frame[1] > 0 &&
+ av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[1])))) {
+ return good_indices;
+ }
+
+ // Calculate the RD cost for the motion vectors using simple translation.
+ int64_t idx_rdcost[] = { INT64_MAX, INT64_MAX, INT64_MAX };
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ // If this index is bad, ignore it.
+ if (!mask_check_bit(good_indices, ref_mv_idx)) {
+ continue;
+ }
+ idx_rdcost[ref_mv_idx] = simple_translation_pred_rd(
+ cpi, x, rd_stats, args, ref_mv_idx, ref_best_rd, bsize);
+ }
+ // Find the index with the best RD cost.
+ int best_idx = 0;
+ for (int i = 1; i < MAX_REF_MV_SEARCH; ++i) {
+ if (idx_rdcost[i] < idx_rdcost[best_idx]) {
+ best_idx = i;
+ }
+ }
+ // Only include indices that are good and within a % of the best.
+ const double dth = has_second_ref(mbmi) ? 1.05 : 1.001;
+ // If the simple translation cost is not within this multiple of
+ // ref_best_rd, skip it. Note that the cutoff is derived experimentally.
+ const double ref_dth = 5;
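+ // e.g. with a single reference, an index is kept only if its simple
+ // translation RD is within 0.1% of the best index (dth = 1.001) and below
+ // 5x ref_best_rd (ref_dth = 5).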
+ int result = 0;
+ for (int i = 0; i < ref_set; ++i) {
+ if (mask_check_bit(good_indices, i) &&
+ (1.0 * idx_rdcost[i]) / idx_rdcost[best_idx] < dth &&
+ (1.0 * idx_rdcost[i]) / ref_best_rd < ref_dth) {
+ mask_set_bit(&result, i);
+ }
+ }
+ return result;
+}
+
+/*!\brief Motion mode information for inter mode search speedup.
+ *
+ * Used in a speed feature to search motion modes other than
+ * SIMPLE_TRANSLATION only on winning candidates.
+ */
+typedef struct motion_mode_candidate {
+ /*!
+ * Mode info for the motion mode candidate.
+ */
+ MB_MODE_INFO mbmi;
+ /*!
+ * Rate describing the cost of the motion vectors for this candidate.
+ */
+ int rate_mv;
+ /*!
+ * Rate before motion mode search and transform coding is applied.
+ */
+ int rate2_nocoeff;
+ /*!
+ * An integer value 0 or 1 which indicates whether or not to skip the motion
+ * mode search and default to SIMPLE_TRANSLATION as a speed feature for this
+ * candidate.
+ */
+ int skip_motion_mode;
+ /*!
+ * Total RD cost for this candidate.
+ */
+ int64_t rd_cost;
+} motion_mode_candidate;
+
+/*!\cond */
+typedef struct motion_mode_best_st_candidate {
+ motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES];
+ int num_motion_mode_cand;
+} motion_mode_best_st_candidate;
+
+// Checks if the current reference frame matches either of the neighbouring
+// blocks' (top/left) reference frames
+static AOM_INLINE int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi,
+ MB_MODE_INFO *nb_mbmi) {
+ MV_REFERENCE_FRAME nb_ref_frames[2] = { nb_mbmi->ref_frame[0],
+ nb_mbmi->ref_frame[1] };
+ MV_REFERENCE_FRAME cur_ref_frames[2] = { cur_mbmi->ref_frame[0],
+ cur_mbmi->ref_frame[1] };
+ const int is_cur_comp_pred = has_second_ref(cur_mbmi);
+ int match_found = 0;
+
+ for (int i = 0; i < (is_cur_comp_pred + 1); i++) {
+ if ((cur_ref_frames[i] == nb_ref_frames[0]) ||
+ (cur_ref_frames[i] == nb_ref_frames[1]))
+ match_found = 1;
+ }
+ return match_found;
+}
+
+static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols,
+ MACROBLOCKD *xd) {
+ if (!xd->up_available) return 1;
+ const int mi_col = xd->mi_col;
+ MB_MODE_INFO **cur_mbmi = xd->mi;
+ // prev_row_mi points into the mi array, starting at the beginning of the
+ // previous row.
+ MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
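+ // (xd->mi points at the current block: stepping back mi_col entries
+ // reaches column 0 of the current row, and one more stride reaches the row
+ // above.)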
+ const int end_col = AOMMIN(mi_col + xd->width, total_mi_cols);
+ uint8_t mi_step;
+ for (int above_mi_col = mi_col; above_mi_col < end_col;
+ above_mi_col += mi_step) {
+ MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
+ mi_step = mi_size_wide[above_mi[0]->bsize];
+ int match_found = 0;
+ if (is_inter_block(*above_mi))
+ match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *above_mi);
+ if (match_found) return 1;
+ }
+ return 0;
+}
+
+static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows,
+ MACROBLOCKD *xd) {
+ if (!xd->left_available) return 1;
+ const int mi_row = xd->mi_row;
+ MB_MODE_INFO **cur_mbmi = xd->mi;
+ // prev_col_mi points into the mi array, starting at the top of the
+ // previous column
+ MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
+ const int end_row = AOMMIN(mi_row + xd->height, total_mi_rows);
+ uint8_t mi_step;
+ for (int left_mi_row = mi_row; left_mi_row < end_row;
+ left_mi_row += mi_step) {
+ MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
+ mi_step = mi_size_high[left_mi[0]->bsize];
+ int match_found = 0;
+ if (is_inter_block(*left_mi))
+ match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *left_mi);
+ if (match_found) return 1;
+ }
+ return 0;
+}
+/*!\endcond */
+
+/*! \brief Struct used to hold TPL data to
+ * narrow down parts of the inter mode search.
+ */
+typedef struct {
+ /*!
+ * The best inter cost out of all of the reference frames.
+ */
+ int64_t best_inter_cost;
+ /*!
+ * The inter cost for each reference frame.
+ */
+ int64_t ref_inter_cost[INTER_REFS_PER_FRAME];
+} PruneInfoFromTpl;
+
+#if !CONFIG_REALTIME_ONLY
+// TODO(Remya): Check if get_tpl_stats_b() can be reused
+static AOM_INLINE void get_block_level_tpl_stats(
+ AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs,
+ PruneInfoFromTpl *inter_cost_info_from_tpl) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ if (!av1_tpl_stats_ready(tpl_data, tpl_idx)) return;
+ const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+ const int tpl_stride = tpl_frame->stride;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows);
+ row += row_step) {
+ for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr);
+ col += col_step_sr) {
+ const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+
+ // Sums up the inter cost of corresponding ref frames
+ for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) {
+ inter_cost_info_from_tpl->ref_inter_cost[ref_idx] +=
+ this_stats->pred_error[ref_idx];
+ }
+ }
+ }
+
+ // Computes the best inter cost (minimum inter_cost)
+ int64_t best_inter_cost = INT64_MAX;
+ for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) {
+ const int64_t cur_inter_cost =
+ inter_cost_info_from_tpl->ref_inter_cost[ref_idx];
+ // For invalid ref frames, cur_inter_cost is 0; this has to be handled
+ // when calculating the minimum inter_cost
+ if (cur_inter_cost != 0 && (cur_inter_cost < best_inter_cost) &&
+ valid_refs[ref_idx])
+ best_inter_cost = cur_inter_cost;
+ }
+ inter_cost_info_from_tpl->best_inter_cost = best_inter_cost;
+}
+#endif
+
+static AOM_INLINE int prune_modes_based_on_tpl_stats(
+ PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx,
+ const PREDICTION_MODE this_mode, int prune_mode_level) {
+ const int have_newmv = have_newmv_in_inter_mode(this_mode);
+ if ((prune_mode_level < 2) && have_newmv) return 0;
+
+ const int64_t best_inter_cost = inter_cost_info_from_tpl->best_inter_cost;
+ if (best_inter_cost == INT64_MAX) return 0;
+
+ const int prune_level = prune_mode_level - 1;
+ int64_t cur_inter_cost;
+
+ const int is_globalmv =
+ (this_mode == GLOBALMV) || (this_mode == GLOBAL_GLOBALMV);
+ const int prune_index = is_globalmv ? MAX_REF_MV_SEARCH : ref_mv_idx;
+
+ // Thresholds used for pruning:
+ // A lower value indicates more aggressive pruning and a higher value more
+ // conservative pruning; the value is chosen based on ref_mv_idx and the
+ // speed feature. 'prune_index' 0, 1, 2 corresponds to ref_mv indices 0, 1
+ // and 2; prune_index 3 corresponds to GLOBALMV/GLOBAL_GLOBALMV.
+ static const int tpl_inter_mode_prune_mul_factor[3][MAX_REF_MV_SEARCH + 1] = {
+ { 6, 6, 6, 4 }, { 6, 4, 4, 4 }, { 5, 4, 4, 4 }
+ };
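+ // For example, prune_level 0 with prune_index 0 gives a factor of 6, so
+ // the mode is pruned once cur_inter_cost > (6 * best_inter_cost) >> 2,
+ // i.e. 1.5x the best inter cost.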
+
+ const int is_comp_pred = (refs[1] > INTRA_FRAME);
+ if (!is_comp_pred) {
+ cur_inter_cost = inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1];
+ } else {
+ const int64_t inter_cost_ref0 =
+ inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1];
+ const int64_t inter_cost_ref1 =
+ inter_cost_info_from_tpl->ref_inter_cost[refs[1] - 1];
+ // Choose maximum inter_cost among inter_cost_ref0 and inter_cost_ref1 for
+ // more aggressive pruning
+ cur_inter_cost = AOMMAX(inter_cost_ref0, inter_cost_ref1);
+ }
+
+ // Prune the mode if cur_inter_cost is greater than threshold times
+ // best_inter_cost
+ if (cur_inter_cost >
+ ((tpl_inter_mode_prune_mul_factor[prune_level][prune_index] *
+ best_inter_cost) >>
+ 2))
+ return 1;
+ return 0;
+}
+
+/*!\brief High level function to select parameters for compound mode.
+ *
+ * \ingroup inter_mode_search
+ * The main search functionality is done in the call to av1_compound_type_rd().
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ * \param[in,out] cur_mv Current motion vector.
+ * \param[in] bsize Current block size.
+ * \param[in,out] compmode_interinter_cost RD of the selected interinter
+ * compound mode.
+ * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all
+ * allocated buffers for the compound
+ * predictors and masks in the compound type
+ * search.
+ * \param[in,out] orig_dst A prediction buffer to hold a computed
+ * prediction. This will eventually hold the
+ * final prediction, and the tmp_dst info will
+ * be copied here.
+ * \param[in] tmp_dst A temporary prediction buffer to hold a
+ * computed prediction.
+ * \param[in,out] rate_mv The rate associated with the motion vectors.
+ * This will be modified if a motion search is
+ * done in the motion mode search.
+ * \param[in,out] rd_stats Struct to keep track of the overall RD
+ * information.
+ * \param[in,out] skip_rd An array of length 2 where skip_rd[0] is the
+ * best total RD for a skip mode so far, and
+ * skip_rd[1] is the best RD for a skip mode so
+ * far in luma. This is used as a speed feature
+ * to skip the transform search if the computed
+ * skip RD for the current mode is not better
+ * than the best skip_rd so far.
+ * \param[in,out] skip_build_pred Indicates whether or not to build the inter
+ * predictor. If this is 0, the inter predictor
+ * has already been built and thus we can avoid
+ * repeating computation.
+ * \return Returns 1 if this mode is worse than one already seen and 0 if it is
+ * a viable candidate.
+ */
+static int process_compound_inter_mode(
+ AV1_COMP *const cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+ int64_t ref_best_rd, int_mv *cur_mv, BLOCK_SIZE bsize,
+ int *compmode_interinter_cost, const CompoundTypeRdBuffers *rd_buffers,
+ const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, int *rate_mv,
+ RD_STATS *rd_stats, int64_t *skip_rd, int *skip_build_pred) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const AV1_COMMON *cm = &cpi->common;
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params->enable_masked_compound;
+ int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
+ (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
+
+ const int num_planes = av1_num_planes(cm);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int is_luma_interp_done = 0;
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+ int64_t best_rd_compound;
+ int64_t rd_thresh;
+ const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT;
+ const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE;
+ rd_thresh = get_rd_thresh_from_best_rd(ref_best_rd, (1 << comp_type_rd_shift),
+ comp_type_rd_scale);
+ // Select compound type and any parameters related to that type
+ // (for example, the mask parameters if it is a masked mode) and compute
+ // the RD
+ *compmode_interinter_cost = av1_compound_type_rd(
+ cpi, x, args, bsize, cur_mv, mode_search_mask, masked_compound_used,
+ orig_dst, tmp_dst, rd_buffers, rate_mv, &best_rd_compound, rd_stats,
+ ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
+ if (ref_best_rd < INT64_MAX &&
+ (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
+ ref_best_rd) {
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 1;
+ }
+
+ // Build only uv predictor for COMPOUND_AVERAGE.
+ // Note there is no need to call av1_enc_build_inter_predictor
+ // for luma if COMPOUND_AVERAGE is selected because it is the first
+ // candidate in av1_compound_type_rd, which means it used the dst_buf
+ // rather than the tmp_buf.
+ if (mbmi->interinter_comp.type == COMPOUND_AVERAGE && is_luma_interp_done) {
+ if (num_planes > 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_U, num_planes - 1);
+ }
+ *skip_build_pred = 1;
+ }
+ return 0;
+}
+
+// Speed feature to prune out MVs that are similar to previous MVs if they
+// don't achieve the best RD advantage.
+static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx,
+ int_mv save_mv[MAX_REF_MV_SEARCH - 1][2],
+ MB_MODE_INFO *mbmi, int pruning_factor) {
+ int i;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const int thr = (1 + is_comp_pred) << (pruning_factor + 1);
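+ // e.g. for a single-ref block with pruning_factor 1, thr = 1 << 2 = 4, so
+ // an MV whose summed row + col distance from a previously stored MV is at
+ // most 4 (in 1/8-pel units) is treated as a repeat below.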
+
+ // Skip the evaluation if an MV match is found.
+ if (ref_mv_idx > 0) {
+ for (int idx = 0; idx < ref_mv_idx; ++idx) {
+ if (save_mv[idx][0].as_int == INVALID_MV) continue;
+
+ int mv_diff = 0;
+ for (i = 0; i < 1 + is_comp_pred; ++i) {
+ mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) +
+ abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col);
+ }
+
+ // If this mode is not the best one, and the current MV is similar to a
+ // previously stored MV, terminate this ref_mv_idx evaluation.
+ if (best_ref_mv_idx == -1 && mv_diff <= thr) return 1;
+ }
+ }
+
+ if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) {
+ for (i = 0; i < is_comp_pred + 1; ++i)
+ save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int;
+ }
+
+ return 0;
+}
+
+/*!\brief Prunes ZeroMV Search Using Best NEWMV's SSE
+ *
+ * \ingroup inter_mode_search
+ *
+ * Compares the sse of zero mv and the best sse found in single new_mv. If the
+ * sse of the zero_mv is higher, returns 1 to signal zero_mv can be skipped.
+ * Else returns 0.
+ *
+ * Note that the SSE here comes from single_motion_search, so it is
+ * interpolated with the filter used in motion search, not the actual
+ * interpolation filter used in encoding.
+ *
+ * \param[in] fn_ptr A table of function pointers to compute SSE.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] bsize The current block_size.
+ * \param[in] args The args to handle_inter_mode, used to track
+ * the best SSE.
+ * \param[in] prune_zero_mv_with_sse The argument holds speed feature
+ * prune_zero_mv_with_sse value
+ * \return Returns 1 if zero_mv is pruned, 0 otherwise.
+ */
+static AOM_INLINE int prune_zero_mv_with_sse(
+ const aom_variance_fn_ptr_t *fn_ptr, const MACROBLOCK *x, BLOCK_SIZE bsize,
+ const HandleInterModeArgs *args, int prune_zero_mv_with_sse) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+
+ const int is_comp_pred = has_second_ref(mbmi);
+ const MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+
+ for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
+ if (xd->global_motion[refs[idx]].wmtype != IDENTITY) {
+ // Pruning logic only works for IDENTITY type models
+ // Note: In theory we could apply similar logic for TRANSLATION
+ // type models, but we do not code these due to a spec bug
+ // (see comments in gm_get_motion_vector() in av1/common/mv.h)
+ assert(xd->global_motion[refs[idx]].wmtype != TRANSLATION);
+ return 0;
+ }
+
+ // Don't prune if we have invalid data
+ assert(mbmi->mv[idx].as_int == 0);
+ if (args->best_single_sse_in_refs[refs[idx]] == INT32_MAX) {
+ return 0;
+ }
+ }
+
+ // Sum up the sse of ZEROMV and best NEWMV
+ unsigned int this_sse_sum = 0;
+ unsigned int best_sse_sum = 0;
+ for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
+ const struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ const struct macroblockd_plane *pd = xd->plane;
+ const struct buf_2d *src_buf = &p->src;
+ const struct buf_2d *ref_buf = &pd->pre[idx];
+ const uint8_t *src = src_buf->buf;
+ const uint8_t *ref = ref_buf->buf;
+ const int src_stride = src_buf->stride;
+ const int ref_stride = ref_buf->stride;
+
+ unsigned int this_sse;
+ fn_ptr[bsize].vf(ref, ref_stride, src, src_stride, &this_sse);
+ this_sse_sum += this_sse;
+
+ const unsigned int best_sse = args->best_single_sse_in_refs[refs[idx]];
+ best_sse_sum += best_sse;
+ }
+
+ const double mul = prune_zero_mv_with_sse > 1 ? 1.00 : 1.25;
+ if ((double)this_sse_sum > (mul * (double)best_sse_sum)) {
+ return 1;
+ }
+
+ return 0;
+}
+
+/*!\brief Searches for interpolation filter in realtime mode during winner eval
+ *
+ * \ingroup inter_mode_search
+ *
+ * Does a simple interpolation filter search during winner mode evaluation.
+ * This is currently only used by realtime mode, as \ref
+ * av1_interpolation_filter_search is not called during realtime encoding.
+ *
+ * This function only searches over two possible filters. EIGHTTAP_REGULAR is
+ * always searched. For low-res clips (<= 240p), MULTITAP_SHARP is also
+ * searched; for higher-res clips (> 240p), EIGHTTAP_SMOOTH is searched
+ * instead.
+ *
+ * \param[in] cpi Pointer to the compressor. Used for feature
+ * flags.
+ * \param[in,out] x Pointer to macroblock. This is primarily
+ * used to access the buffers.
+ * \param[in] mi_row The current row in mi unit (4X4 pixels).
+ * \param[in] mi_col The current col in mi unit (4X4 pixels).
+ * \param[in] bsize The current block_size.
+ * \return Returns true if a predictor is built in xd->dst, false otherwise.
+ */
+static AOM_INLINE bool fast_interp_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ static const InterpFilters filters_ref_set[3] = {
+ { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR },
+ { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
+ { MULTITAP_SHARP, MULTITAP_SHARP }
+ };
+
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ int64_t best_cost = INT64_MAX;
+ int best_filter_index = -1;
+ // dst_bufs[0] stores the new predictor, and dst_bufs[1] stores the best
+ // one found so far.
+ const int num_planes = av1_num_planes(cm);
+ const int is_240p_or_lesser = AOMMIN(cm->width, cm->height) <= 240;
+ assert(is_inter_mode(mi->mode));
+ assert(mi->motion_mode == SIMPLE_TRANSLATION);
+ assert(!is_inter_compound_mode(mi->mode));
+
+ if (!av1_is_interp_needed(xd)) {
+ return false;
+ }
+
+ struct macroblockd_plane *pd = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+ { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+ };
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
+ const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+ tmp_buf + 2 * MAX_SB_SQUARE },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
+ const BUFFER_SET *dst_bufs[2] = { &orig_dst, &tmp_dst };
+
+ for (int i = 0; i < 3; ++i) {
+ if (is_240p_or_lesser) {
+ if (filters_ref_set[i].x_filter == EIGHTTAP_SMOOTH) {
+ continue;
+ }
+ } else {
+ if (filters_ref_set[i].x_filter == MULTITAP_SHARP) {
+ continue;
+ }
+ }
+ int64_t cost;
+ RD_STATS tmp_rd = { 0 };
+
+ mi->interp_filters.as_filters = filters_ref_set[i];
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+
+ model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model
+ ? MODELRD_LEGACY
+ : MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, AOM_PLANE_Y, AOM_PLANE_Y, &tmp_rd.rate, &tmp_rd.dist,
+ &tmp_rd.skip_txfm, &tmp_rd.sse, NULL, NULL, NULL);
+
+ tmp_rd.rate += av1_get_switchable_rate(x, xd, cm->features.interp_filter,
+ cm->seq_params->enable_dual_filter);
+ cost = RDCOST(x->rdmult, tmp_rd.rate, tmp_rd.dist);
+ if (cost < best_cost) {
+ best_filter_index = i;
+ best_cost = cost;
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ }
+ }
+ assert(best_filter_index >= 0);
+
+ mi->interp_filters.as_filters = filters_ref_set[best_filter_index];
+
+ const bool is_best_pred_in_orig = &orig_dst == dst_bufs[1];
+
+ if (is_best_pred_in_orig) {
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ } else {
+ // Note that xd->pd's buffers are kept in sync with dst_bufs[0]. So if
+ // is_best_pred_in_orig is false, that means the current buffer is the
+ // original one.
+ assert(&orig_dst == dst_bufs[0]);
+ assert(xd->plane[AOM_PLANE_Y].dst.buf == orig_dst.plane[AOM_PLANE_Y]);
+ const int width = block_size_wide[bsize];
+ const int height = block_size_high[bsize];
+#if CONFIG_AV1_HIGHBITDEPTH
+ const bool is_hbd = is_cur_buf_hbd(xd);
+ if (is_hbd) {
+ aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(tmp_dst.plane[AOM_PLANE_Y]),
+ tmp_dst.stride[AOM_PLANE_Y],
+ CONVERT_TO_SHORTPTR(orig_dst.plane[AOM_PLANE_Y]),
+ orig_dst.stride[AOM_PLANE_Y], width, height);
+ } else {
+ aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y],
+ orig_dst.plane[AOM_PLANE_Y],
+ orig_dst.stride[AOM_PLANE_Y], width, height);
+ }
+#else
+ aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y],
+ orig_dst.plane[AOM_PLANE_Y], orig_dst.stride[AOM_PLANE_Y],
+ width, height);
+#endif
+ }
+
+ // Build the UV planes of the predictor (the Y plane was built above).
+ if (num_planes > 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_U, AOM_PLANE_V);
+ }
+
+ return true;
+}
+
+/*!\brief AV1 inter mode RD computation
+ *
+ * \ingroup inter_mode_search
+ * Do the RD search for a given inter mode and compute all information relevant
+ * to the input mode. It will compute the best MV,
+ * compound parameters (if the mode is a compound mode) and interpolation filter
+ * parameters.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[in] bsize Current block size.
+ * \param[in,out] rd_stats Struct to keep track of the overall RD
+ * information.
+ * \param[in,out] rd_stats_y Struct to keep track of the RD information
+ * for only the Y plane.
+ * \param[in,out] rd_stats_uv Struct to keep track of the RD information
+ * for only the UV planes.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ * \param[in] tmp_buf Temporary buffer used to hold predictors
+ * built in this search.
+ * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all
+ * allocated buffers for the compound
+ * predictors and masks in the compound type
+ * search.
+ * \param[in,out] best_est_rd Estimated RD for motion mode search if
+ * do_tx_search (see below) is 0.
+ * \param[in] do_tx_search Parameter to indicate whether or not to do
+ * a full transform search. This will compute
+ * an estimated RD for the modes without the
+ * transform search and later perform the full
+ * transform search on the best candidates.
+ * \param[in,out] inter_modes_info InterModesInfo struct to hold inter mode
+ * information to perform a full transform
+ * search only on winning candidates searched
+ * with an estimate for transform coding RD.
+ * \param[in,out] motion_mode_cand A motion_mode_candidate struct to store
+ * motion mode information used in a speed
+ * feature to search motion modes other than
+ * SIMPLE_TRANSLATION only on winning
+ * candidates.
+ * \param[in,out] skip_rd A length 2 array, where skip_rd[0] is the
+ * best total RD for a skip mode so far, and
+ * skip_rd[1] is the best RD for a skip mode so
+ * far in luma. This is used as a speed feature
+ * to skip the transform search if the computed
+ * skip RD for the current mode is not better
+ * than the best skip_rd so far.
+ * \param[in] inter_cost_info_from_tpl A PruneInfoFromTpl struct used to
+ * narrow down the search based on data
+ * collected in the TPL model.
+ * \param[out] yrd Stores the rdcost corresponding to encoding
+ * the luma plane.
+ *
+ * \return The RD cost for the mode being searched.
+ */
+static int64_t handle_inter_mode(
+ AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, HandleInterModeArgs *args, int64_t ref_best_rd,
+ uint8_t *const tmp_buf, const CompoundTypeRdBuffers *rd_buffers,
+ int64_t *best_est_rd, const int do_tx_search,
+ InterModesInfo *inter_modes_info, motion_mode_candidate *motion_mode_cand,
+ int64_t *skip_rd, PruneInfoFromTpl *inter_cost_info_from_tpl,
+ int64_t *yrd) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+#if CONFIG_REALTIME_ONLY
+ const int prune_modes_based_on_tpl = 0;
+#else // CONFIG_REALTIME_ONLY
+ const TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const int prune_modes_based_on_tpl =
+ cpi->sf.inter_sf.prune_inter_modes_based_on_tpl &&
+ av1_tpl_stats_ready(tpl_data, cpi->gf_frame_index);
+#endif // CONFIG_REALTIME_ONLY
+ int i;
+ // Reference frames for this mode
+ const int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ int rate_mv = 0;
+ int64_t rd = INT64_MAX;
+ // Do first prediction into the destination buffer. Do the next
+ // prediction into a temporary buffer. Then keep track of which one
+ // of these currently holds the best predictor, and use the other
+ // one for future predictions. In the end, copy from tmp_buf to
+ // dst if necessary.
+ struct macroblockd_plane *pd = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+ { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+ };
+ const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+ tmp_buf + 2 * MAX_SB_SQUARE },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
+
+ int64_t ret_val = INT64_MAX;
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ int64_t best_rd = INT64_MAX;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ int64_t best_yrd = INT64_MAX;
+ MB_MODE_INFO best_mbmi = *mbmi;
+ int best_xskip_txfm = 0;
+ int64_t newmv_ret_val = INT64_MAX;
+ inter_mode_info mode_info[MAX_REF_MV_SEARCH];
+
+ // Do not prune the mode based on inter cost from tpl if the current ref frame
+ // is the winner ref in neighbouring blocks.
+ int ref_match_found_in_above_nb = 0;
+ int ref_match_found_in_left_nb = 0;
+ if (prune_modes_based_on_tpl) {
+ ref_match_found_in_above_nb =
+ find_ref_match_in_above_nbs(cm->mi_params.mi_cols, xd);
+ ref_match_found_in_left_nb =
+ find_ref_match_in_left_nbs(cm->mi_params.mi_rows, xd);
+ }
+
+ // First, perform a simple translation search for each of the indices. If
+ // an index performs well, it will be fully searched in the main loop
+ // of this function.
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+ // Save MV results from first 2 ref_mv_idx.
+ int_mv save_mv[MAX_REF_MV_SEARCH - 1][2];
+ int best_ref_mv_idx = -1;
+ const int idx_mask =
+ ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, bsize, ref_set);
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int ref_mv_cost = cost_mv_ref(mode_costs, this_mode, mode_ctx);
+ const int base_rate =
+ args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
+
+ for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) {
+ save_mv[i][0].as_int = INVALID_MV;
+ save_mv[i][1].as_int = INVALID_MV;
+ }
+ args->start_mv_cnt = 0;
+
+ // Main loop of this function. This will iterate over all of the ref mvs
+ // in the dynamic reference list and do the following:
+ // 1.) Get the current MV. Create newmv MV if necessary
+ // 2.) Search compound type and parameters if applicable
+ // 3.) Do interpolation filter search
+ // 4.) Build the inter predictor
+ // 5.) Pick the motion mode (SIMPLE_TRANSLATION, OBMC_CAUSAL,
+ // WARPED_CAUSAL)
+ // 6.) Update stats if best so far
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ mbmi->ref_mv_idx = ref_mv_idx;
+
+ mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV;
+ mode_info[ref_mv_idx].full_mv_bestsme = INT_MAX;
+ const int drl_cost = get_drl_cost(
+ mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
+ mode_info[ref_mv_idx].drl_cost = drl_cost;
+ mode_info[ref_mv_idx].skip = 0;
+
+ if (!mask_check_bit(idx_mask, ref_mv_idx)) {
+ // MV did not perform well in simple translation search. Skip it.
+ continue;
+ }
+ if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb &&
+ !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) {
+ // Skip mode if TPL model indicates it will not be beneficial.
+ if (prune_modes_based_on_tpl_stats(
+ inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode,
+ cpi->sf.inter_sf.prune_inter_modes_based_on_tpl))
+ continue;
+ }
+ av1_init_rd_stats(rd_stats);
+
+ // Initialize compound mode data
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
+
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ // Compute cost for signalling this DRL index
+ rd_stats->rate = base_rate;
+ rd_stats->rate += drl_cost;
+
+ int rs = 0;
+ int compmode_interinter_cost = 0;
+
+ int_mv cur_mv[2];
+
+ // TODO(Cherma): Extend this speed feature to support compound mode
+ int skip_repeated_ref_mv =
+ is_comp_pred ? 0 : cpi->sf.inter_sf.skip_repeated_ref_mv;
+ // Generate the current mv according to the prediction mode
+ if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) {
+ continue;
+ }
+
+ // The above call to build_cur_mv does not handle NEWMV modes. Build
+ // the mv here if we have NEWMV for any predictors.
+ if (have_newmv_in_inter_mode(this_mode)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_newmv_time);
+#endif
+ newmv_ret_val =
+ handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_newmv_time);
+#endif
+
+ if (newmv_ret_val != 0) continue;
+
+ if (is_inter_singleref_mode(this_mode) &&
+ cur_mv[0].as_int != INVALID_MV) {
+ const MV_REFERENCE_FRAME ref = refs[0];
+ const unsigned int this_sse = x->pred_sse[ref];
+ if (this_sse < args->best_single_sse_in_refs[ref]) {
+ args->best_single_sse_in_refs[ref] = this_sse;
+ }
+
+ if (cpi->sf.rt_sf.skip_newmv_mode_based_on_sse) {
+ const int th_idx = cpi->sf.rt_sf.skip_newmv_mode_based_on_sse - 1;
+ const int pix_idx = num_pels_log2_lookup[bsize] - 4;
+ const double scale_factor[3][11] = {
+ { 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9 },
+ { 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 1, 1, 1, 1, 1 },
+ { 0.7, 0.7, 0.7, 0.7, 1, 1, 1, 1, 1, 1, 1 }
+ };
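+ // scale_factor appears to be indexed by the speed-feature strength
+ // (th_idx) and the log2 block size (pix_idx); larger blocks and stronger
+ // settings use factors closer to 1, making NEWMV easier to skip here.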
+ assert(pix_idx >= 0);
+ assert(th_idx <= 2);
+ if (args->best_pred_sse < scale_factor[th_idx][pix_idx] * this_sse)
+ continue;
+ }
+ }
+
+ rd_stats->rate += rate_mv;
+ }
+ // Copy the motion vector for this mode into mbmi struct
+ for (i = 0; i < is_comp_pred + 1; ++i) {
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+ continue;
+ }
+
+ // Skip the rest of the search if prune_ref_mv_idx_search speed feature
+ // is enabled, and the current MV is similar to a previous one.
+ if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred &&
+ prune_ref_mv_idx_search(ref_mv_idx, best_ref_mv_idx, save_mv, mbmi,
+ cpi->sf.inter_sf.prune_ref_mv_idx_search))
+ continue;
+
+ if (cpi->sf.gm_sf.prune_zero_mv_with_sse &&
+ (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) {
+ if (prune_zero_mv_with_sse(cpi->ppi->fn_ptr, x, bsize, args,
+ cpi->sf.gm_sf.prune_zero_mv_with_sse)) {
+ continue;
+ }
+ }
+
+ int skip_build_pred = 0;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Handle a compound predictor; continue if it is determined that this
+ // cannot be the best compound mode
+ if (is_comp_pred) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, compound_type_rd_time);
+#endif
+ const int not_best_mode = process_compound_inter_mode(
+ cpi, x, args, ref_best_rd, cur_mv, bsize, &compmode_interinter_cost,
+ rd_buffers, &orig_dst, &tmp_dst, &rate_mv, rd_stats, skip_rd,
+ &skip_build_pred);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, compound_type_rd_time);
+#endif
+ if (not_best_mode) continue;
+ }
+
+ if (!args->skip_ifs) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, interpolation_filter_search_time);
+#endif
+ // Determine the interpolation filter for this mode
+ ret_val = av1_interpolation_filter_search(
+ x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
+ &skip_build_pred, args, ref_best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, interpolation_filter_search_time);
+#endif
+ if (args->modelled_rd != NULL && !is_comp_pred) {
+ args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+ }
+ if (ret_val != 0) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
+ ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+
+ // Compute modelled RD if enabled
+ if (args->modelled_rd != NULL) {
+ if (is_comp_pred) {
+ const int mode0 = compound_ref0_mode(this_mode);
+ const int mode1 = compound_ref1_mode(this_mode);
+ const int64_t mrd =
+ AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
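+          // Prune the compound mode if its modelled RD exceeds 4/3 of the
+          // better single-ref modelled RD ((rd >> 3) * 6 == rd * 3 / 4).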
+ if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ }
+ }
+ }
+
+ rd_stats->rate += compmode_interinter_cost;
+ if (skip_build_pred != 1) {
+ // Build this inter predictor if it has not been previously built
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0,
+ av1_num_planes(cm) - 1);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, motion_mode_rd_time);
+#endif
+ int rate2_nocoeff = rd_stats->rate;
+ // Determine the motion mode. This will be one of SIMPLE_TRANSLATION,
+ // OBMC_CAUSAL or WARPED_CAUSAL
+ int64_t this_yrd;
+ ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
+ rd_stats_uv, args, ref_best_rd, skip_rd, &rate_mv,
+ &orig_dst, best_est_rd, do_tx_search,
+ inter_modes_info, 0, &this_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, motion_mode_rd_time);
+#endif
+ assert(
+ IMPLIES(!av1_check_newmv_joint_nonzero(cm, x), ret_val == INT64_MAX));
+
+ if (ret_val != INT64_MAX) {
+ int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ const THR_MODES mode_enum = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // Collect mode stats for multiwinner mode processing
+ store_winner_mode_stats(&cpi->common, x, mbmi, rd_stats, rd_stats_y,
+ rd_stats_uv, mode_enum, NULL, bsize, tmp_rd,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type,
+ do_tx_search);
+ if (tmp_rd < best_rd) {
+ best_yrd = this_yrd;
+ // Update the best rd stats if we found the best mode so far
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rd_stats_uv = *rd_stats_uv;
+ best_rd = tmp_rd;
+ best_mbmi = *mbmi;
+ best_xskip_txfm = txfm_info->skip_txfm;
+ memcpy(best_blk_skip, txfm_info->blk_skip,
+ sizeof(best_blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map,
+ xd->height * xd->width);
+ motion_mode_cand->rate_mv = rate_mv;
+ motion_mode_cand->rate2_nocoeff = rate2_nocoeff;
+ }
+
+ if (tmp_rd < ref_best_rd) {
+ ref_best_rd = tmp_rd;
+ best_ref_mv_idx = ref_mv_idx;
+ }
+ }
+ restore_dst_buf(xd, orig_dst, num_planes);
+ }
+
+ if (best_rd == INT64_MAX) return INT64_MAX;
+
+  // Re-instate the status of the best choice.
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ *rd_stats_uv = best_rd_stats_uv;
+ *yrd = best_yrd;
+ *mbmi = best_mbmi;
+ txfm_info->skip_txfm = best_xskip_txfm;
+ assert(IMPLIES(mbmi->comp_group_idx == 1,
+ mbmi->interinter_comp.type != COMPOUND_AVERAGE));
+ memcpy(txfm_info->blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
+
+ rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+
+ return rd_stats->rdcost;
+}
+
+/*!\brief Search for the best intrabc predictor
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function performs a motion search to find the best intrabc predictor.
+ *
+ * \returns The best overall rdcost (including the non-intrabc mode search
+ * performed before this function).
+ */
+static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+ PICK_MODE_CONTEXT *ctx,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc ||
+ !cpi->sf.mv_sf.use_intrabc || cpi->sf.rt_sf.use_nonrd_pick_mode)
+ return INT64_MAX;
+ const int num_planes = av1_num_planes(cm);
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TileInfo *tile = &xd->tile;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int sb_row = mi_row >> cm->seq_params->mib_size_log2;
+ const int sb_col = mi_col >> cm->seq_params->mib_size_log2;
+
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+ int_mv nearestmv, nearmv;
+ av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv,
+ 0);
+
+ if (nearestmv.as_int == INVALID_MV) {
+ nearestmv.as_int = 0;
+ }
+ if (nearmv.as_int == INVALID_MV) {
+ nearmv.as_int = 0;
+ }
+
+ int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
+ if (dv_ref.as_int == 0) {
+ av1_find_ref_dv(&dv_ref, tile, cm->seq_params->mib_size, mi_row);
+ }
+ // Ref DV should not have sub-pel.
+ assert((dv_ref.as_mv.col & 7) == 0);
+ assert((dv_ref.as_mv.row & 7) == 0);
+ mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref;
+
+ struct buf_2d yv12_mb[MAX_MB_PLANE];
+ av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, NULL, NULL, num_planes);
+ for (int i = 0; i < num_planes; ++i) {
+ xd->plane[i].pre[0] = yv12_mb[i];
+ }
+
+ enum IntrabcMotionDirection {
+ IBC_MOTION_ABOVE,
+ IBC_MOTION_LEFT,
+ IBC_MOTION_DIRECTIONS
+ };
+
+ MB_MODE_INFO best_mbmi = *mbmi;
+ RD_STATS best_rdstats = *rd_stats;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+
+ FULLPEL_MOTION_SEARCH_PARAMS fullms_params;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
+ const search_site_config *lookahead_search_sites =
+ cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
+ const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv);
+ av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize,
+ &dv_ref.as_mv, start_mv,
+ lookahead_search_sites, search_method,
+ /*fine_search_interval=*/0);
+ const IntraBCMVCosts *const dv_costs = x->dv_costs;
+ av1_set_ms_to_intra_mode(&fullms_params, dv_costs);
+
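+  // Search two regions: the already-coded superblock rows above, and the
+  // already-coded area to the left within the current superblock row. The MV
+  // limits are set per direction inside the loop.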
+ for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
+ dir < IBC_MOTION_DIRECTIONS; ++dir) {
+ switch (dir) {
+ case IBC_MOTION_ABOVE:
+ fullms_params.mv_limits.col_min =
+ (tile->mi_col_start - mi_col) * MI_SIZE;
+ fullms_params.mv_limits.col_max =
+ (tile->mi_col_end - mi_col) * MI_SIZE - w;
+ fullms_params.mv_limits.row_min =
+ (tile->mi_row_start - mi_row) * MI_SIZE;
+ fullms_params.mv_limits.row_max =
+ (sb_row * cm->seq_params->mib_size - mi_row) * MI_SIZE - h;
+ break;
+ case IBC_MOTION_LEFT:
+ fullms_params.mv_limits.col_min =
+ (tile->mi_col_start - mi_col) * MI_SIZE;
+ fullms_params.mv_limits.col_max =
+ (sb_col * cm->seq_params->mib_size - mi_col) * MI_SIZE - w;
+ // TODO(aconverse@google.com): Minimize the overlap between above and
+ // left areas.
+ fullms_params.mv_limits.row_min =
+ (tile->mi_row_start - mi_row) * MI_SIZE;
+ int bottom_coded_mi_edge =
+ AOMMIN((sb_row + 1) * cm->seq_params->mib_size, tile->mi_row_end);
+ fullms_params.mv_limits.row_max =
+ (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
+ break;
+ default: assert(0);
+ }
+    const FullMvLimits tmp_mv_limits = fullms_params.mv_limits;
+    av1_set_mv_search_range(&fullms_params.mv_limits, &dv_ref.as_mv);
+    // av1_set_mv_search_range() intersects the limits with the valid MV
+    // window, so it may only tighten them.
+    assert(fullms_params.mv_limits.col_min >= tmp_mv_limits.col_min);
+    assert(fullms_params.mv_limits.col_max <= tmp_mv_limits.col_max);
+    assert(fullms_params.mv_limits.row_min >= tmp_mv_limits.row_min);
+    assert(fullms_params.mv_limits.row_max <= tmp_mv_limits.row_max);
+
+ if (fullms_params.mv_limits.col_max < fullms_params.mv_limits.col_min ||
+ fullms_params.mv_limits.row_max < fullms_params.mv_limits.row_min) {
+ continue;
+ }
+
+ const int step_param = cpi->mv_search_params.mv_step_param;
+ IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info;
+ int_mv best_mv, best_hash_mv;
+ FULLPEL_MV_STATS best_mv_stats;
+
+ int bestsme =
+ av1_full_pixel_search(start_mv, &fullms_params, step_param, NULL,
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
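+    // Also try hash-based block matching and keep whichever of the two
+    // searches gives the lower error.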
+ const int hashsme = av1_intrabc_hash_search(
+ cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv);
+ if (hashsme < bestsme) {
+ best_mv = best_hash_mv;
+ bestsme = hashsme;
+ }
+
+ if (bestsme == INT_MAX) continue;
+ const MV dv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ if (!av1_is_fullmv_in_range(&fullms_params.mv_limits,
+ get_fullmv_from_mv(&dv)))
+ continue;
+ if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
+ cm->seq_params->mib_size_log2))
+ continue;
+
+ // DV should not have sub-pel.
+ assert((dv.col & 7) == 0);
+ assert((dv.row & 7) == 0);
+ memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->use_intrabc = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->mv[0].as_mv = dv;
+ mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+ mbmi->skip_txfm = 0;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+
+ // TODO(aconverse@google.com): The full motion field defining discount
+ // in MV_COST_WEIGHT is too large. Explore other values.
+ const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv,
+ dv_costs->dv_costs, MV_COST_WEIGHT_SUB);
+ const int rate_mode = x->mode_costs.intrabc_cost[1];
+ RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv;
+ if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y,
+ &rd_stats_uv, rate_mode + rate_mv, INT64_MAX))
+ continue;
+ rd_stats_yuv.rdcost =
+ RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist);
+ if (rd_stats_yuv.rdcost < best_rd) {
+ best_rd = rd_stats_yuv.rdcost;
+ best_mbmi = *mbmi;
+ best_rdstats = rd_stats_yuv;
+ memcpy(best_blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+      av1_copy_array(best_tx_type_map, xd->tx_type_map,
+                     xd->height * xd->width);
+ }
+ }
+ *mbmi = best_mbmi;
+ *rd_stats = best_rdstats;
+ memcpy(txfm_info->blk_skip, best_blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+#if CONFIG_RD_DEBUG
+ mbmi->rd_stats = *rd_stats;
+#endif
+ return best_rd;
+}
+
+// TODO(chiyotsai@google.com): We are using the underlying struct types (e.g.
+// struct AV1_COMP, struct macroblock) instead of their typedefs here because
+// Doxygen doesn't know about the typedefs yet, so using the typedefs would
+// prevent Doxygen from finding this function and generating the callgraph.
+// Once documentation for AV1_COMP and MACROBLOCK is added to Doxygen, we can
+// revert to using the typedefs.
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
+ uint8_t y_skip_txfm = 0, uv_skip_txfm = 0;
+ int64_t dist_y = 0, dist_uv = 0;
+
+ ctx->rd_stats.skip_txfm = 0;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->use_intrabc = 0;
+ mbmi->mv[0].as_int = 0;
+ mbmi->skip_mode = 0;
+
+ const int64_t intra_yrd =
+ av1_rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
+ &y_skip_txfm, bsize, best_rd, ctx);
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ if (intra_yrd < best_rd) {
+ // Search intra modes for uv planes if needed
+ if (num_planes > 1) {
+ // Set up the tx variables for reproducing the y predictions in case we
+ // need it for chroma-from-luma.
+ if (xd->is_chroma_ref && store_cfl_required_rdo(cm, x)) {
+ memcpy(txfm_info->blk_skip, ctx->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
+ }
+ const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+ av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+ &dist_uv, &uv_skip_txfm, bsize,
+ max_uv_tx_size);
+ }
+
+ // Intra block is always coded as non-skip
+ rd_cost->rate =
+ rate_y + rate_uv +
+ x->mode_costs.skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
+ rd_cost->dist = dist_y + dist_uv;
+ rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+ rd_cost->skip_txfm = 0;
+ } else {
+ rd_cost->rate = INT_MAX;
+ }
+
+ if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
+ best_rd = rd_cost->rdcost;
+ if (rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd) < best_rd) {
+ ctx->rd_stats.skip_txfm = mbmi->skip_txfm;
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ assert(rd_cost->rate != INT_MAX);
+ }
+ if (rd_cost->rate == INT_MAX) return;
+
+ ctx->mic = *xd->mi[0];
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+}
+
+static AOM_INLINE void calc_target_weighted_pred(
+ const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+ const uint8_t *above, int above_stride, const uint8_t *left,
+ int left_stride);
+
+static AOM_INLINE void rd_pick_skip_mode(
+ RD_STATS *rd_cost, InterModeSearchState *search_state,
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ x->compound_idx = 1; // COMPOUND_AVERAGE
+ RD_STATS skip_mode_rd_stats;
+ av1_invalid_rd_stats(&skip_mode_rd_stats);
+
+ if (skip_mode_info->ref_frame_idx_0 == INVALID_IDX ||
+ skip_mode_info->ref_frame_idx_1 == INVALID_IDX) {
+ return;
+ }
+
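+  // skip_mode implies compound NEAREST_NEARESTMV prediction from the two
+  // reference frames signalled at frame level in skip_mode_info.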
+ const MV_REFERENCE_FRAME ref_frame =
+ LAST_FRAME + skip_mode_info->ref_frame_idx_0;
+ const MV_REFERENCE_FRAME second_ref_frame =
+ LAST_FRAME + skip_mode_info->ref_frame_idx_1;
+ const PREDICTION_MODE this_mode = NEAREST_NEARESTMV;
+ const THR_MODES mode_index =
+ get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame);
+
+ if (mode_index == THR_INVALID) {
+ return;
+ }
+
+ if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp ||
+ cpi->sf.inter_sf.disable_onesided_comp) &&
+ cpi->all_one_sided_refs) {
+ return;
+ }
+
+ mbmi->mode = this_mode;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = ref_frame;
+ mbmi->ref_frame[1] = second_ref_frame;
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (x->mbmi_ext.ref_mv_count[ref_frame_type] == UINT8_MAX) {
+ MB_MODE_INFO_EXT *mbmi_ext = &x->mbmi_ext;
+ if (mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
+ mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
+ return;
+ }
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_type);
+ }
+
+ assert(this_mode == NEAREST_NEARESTMV);
+ if (!build_cur_mv(mbmi->mv, this_mode, cm, x, 0)) {
+ return;
+ }
+
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = x->compound_idx;
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->ref_mv_idx = 0;
+ mbmi->skip_mode = mbmi->skip_txfm = 1;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ BUFFER_SET orig_dst;
+ for (int i = 0; i < num_planes; i++) {
+ orig_dst.plane[i] = xd->plane[i].dst.buf;
+ orig_dst.stride[i] = xd->plane[i].dst.stride;
+ }
+
+ // Compare the use of skip_mode with the best intra/inter mode obtained.
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+ int64_t best_intra_inter_mode_cost = INT64_MAX;
+ if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) {
+ const ModeCosts *mode_costs = &x->mode_costs;
+ best_intra_inter_mode_cost = RDCOST(
+ x->rdmult, rd_cost->rate + mode_costs->skip_mode_cost[skip_mode_ctx][0],
+ rd_cost->dist);
+ // Account for non-skip mode rate in total rd stats
+ rd_cost->rate += mode_costs->skip_mode_cost[skip_mode_ctx][0];
+ av1_rd_cost_update(x->rdmult, rd_cost);
+ }
+
+ // Obtain the rdcost for skip_mode.
+ skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, &orig_dst,
+ best_intra_inter_mode_cost);
+
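+  // Accept skip_mode only if it is not worse than the best mode found so far;
+  // in lossless segments it must additionally reconstruct exactly (zero
+  // distortion).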
+ if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost &&
+ (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) {
+ assert(mode_index != THR_INVALID);
+    search_state->best_mbmode = *mbmi;
+    search_state->best_mbmode.skip_mode = 1;
+ memset(search_state->best_mbmode.inter_tx_size,
+ search_state->best_mbmode.tx_size,
+ sizeof(search_state->best_mbmode.inter_tx_size));
+ set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height,
+ search_state->best_mbmode.skip_txfm && is_inter_block(mbmi),
+ xd);
+ search_state->best_mode_index = mode_index;
+
+ // Update rd_cost
+ rd_cost->rate = skip_mode_rd_stats.rate;
+ rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist;
+ rd_cost->rdcost = skip_mode_rd_stats.rdcost;
+
+ search_state->best_rd = rd_cost->rdcost;
+ search_state->best_skip2 = 1;
+ search_state->best_mode_skippable = 1;
+
+ x->txfm_search_info.skip_txfm = 1;
+ }
+}
+
+// Get winner mode stats of given mode index
+static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats(
+ MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost,
+ int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index,
+ RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv,
+ THR_MODES *winner_mode_index, MULTI_WINNER_MODE_TYPE multi_winner_mode_type,
+ int mode_idx) {
+ MB_MODE_INFO *winner_mbmi;
+ if (multi_winner_mode_type) {
+ assert(mode_idx >= 0 && mode_idx < x->winner_mode_count);
+ WinnerModeStats *winner_mode_stat = &x->winner_mode_stats[mode_idx];
+ winner_mbmi = &winner_mode_stat->mbmi;
+
+ *winner_rd_cost = &winner_mode_stat->rd_cost;
+ *winner_rate_y = winner_mode_stat->rate_y;
+ *winner_rate_uv = winner_mode_stat->rate_uv;
+ *winner_mode_index = winner_mode_stat->mode_index;
+ } else {
+ winner_mbmi = best_mbmode;
+ *winner_rd_cost = best_rd_cost;
+ *winner_rate_y = best_rate_y;
+ *winner_rate_uv = best_rate_uv;
+ *winner_mode_index = *best_mode_index;
+ }
+ return winner_mbmi;
+}
+
+// Speed feature: fast intra/inter transform type search, used for speed >= 2.
+// When this speed feature is on, only DCT is used during the rd mode search.
+// After the mode is determined, this function is called to select the
+// transform types and obtain an accurate rdcost.
+static AOM_INLINE void refine_winner_mode_tx(
+ const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, THR_MODES *best_mode_index,
+ MB_MODE_INFO *best_mbmode, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
+ int best_rate_y, int best_rate_uv, int *best_skip2, int winner_mode_count) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int64_t best_rd;
+ const int num_planes = av1_num_planes(cm);
+
+ if (!is_winner_mode_processing_enabled(cpi, x, best_mbmode,
+ rd_cost->skip_txfm))
+ return;
+
+ // Set params for winner mode evaluation
+ set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+
+ // No best mode identified so far
+ if (*best_mode_index == THR_INVALID) return;
+
+ best_rd = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+ for (int mode_idx = 0; mode_idx < winner_mode_count; mode_idx++) {
+ RD_STATS *winner_rd_stats = NULL;
+ int winner_rate_y = 0, winner_rate_uv = 0;
+ THR_MODES winner_mode_index = 0;
+
+ // TODO(any): Combine best mode and multi-winner mode processing paths
+ // Get winner mode stats for current mode index
+ MB_MODE_INFO *winner_mbmi = get_winner_mode_stats(
+ x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index,
+ &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, mode_idx);
+
+ if (xd->lossless[winner_mbmi->segment_id] == 0 &&
+ winner_mode_index != THR_INVALID &&
+ is_winner_mode_processing_enabled(cpi, x, winner_mbmi,
+ rd_cost->skip_txfm)) {
+ RD_STATS rd_stats = *winner_rd_stats;
+ int skip_blk = 0;
+ RD_STATS rd_stats_y, rd_stats_uv;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+
+ *mbmi = *winner_mbmi;
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (has_second_ref(mbmi))
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ if (is_inter_mode(mbmi->mode)) {
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ bool is_predictor_built = false;
+ const PREDICTION_MODE prediction_mode = mbmi->mode;
+ // Do interpolation filter search for realtime mode if applicable.
+ if (cpi->sf.winner_mode_sf.winner_mode_ifs &&
+ cpi->oxcf.mode == REALTIME &&
+ cm->current_frame.reference_mode == SINGLE_REFERENCE &&
+ is_inter_mode(prediction_mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION &&
+ !is_inter_compound_mode(prediction_mode)) {
+ is_predictor_built =
+ fast_interp_search(cpi, x, mi_row, mi_col, bsize);
+ }
+ if (!is_predictor_built) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ }
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd);
+
+ av1_subtract_plane(x, bsize, 0);
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ !xd->lossless[mbmi->segment_id]) {
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ assert(rd_stats_y.rate != INT_MAX);
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ memset(mbmi->inter_tx_size, mbmi->tx_size,
+ sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->height * xd->width; ++i)
+ set_blk_skip(txfm_info->blk_skip, 0, i, rd_stats_y.skip_txfm);
+ }
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ }
+
+ if (num_planes > 1) {
+ av1_txfm_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ } else {
+ av1_init_rd_stats(&rd_stats_uv);
+ }
+
+ const ModeCosts *mode_costs = &x->mode_costs;
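+      // Re-evaluate the skip_txfm decision: compare coding the residual
+      // (non-skip rate + distortion) against signalling skip (skip rate +
+      // sse) and pick the cheaper option.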
+ if (is_inter_mode(mbmi->mode) &&
+ RDCOST(x->rdmult,
+ mode_costs->skip_txfm_cost[skip_ctx][0] + rd_stats_y.rate +
+ rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, mode_costs->skip_txfm_cost[skip_ctx][1],
+ (rd_stats_y.sse + rd_stats_uv.sse))) {
+ skip_blk = 1;
+ rd_stats_y.rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ } else {
+ skip_blk = 0;
+ rd_stats_y.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
+ }
+ int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate -
+ winner_rate_y - winner_rate_uv;
+ int64_t this_rd =
+ RDCOST(x->rdmult, this_rate, (rd_stats_y.dist + rd_stats_uv.dist));
+ if (best_rd > this_rd) {
+ *best_mbmode = *mbmi;
+ *best_mode_index = winner_mode_index;
+ av1_copy_array(ctx->blk_skip, txfm_info->blk_skip, ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ rd_cost->rate = this_rate;
+ rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+ rd_cost->sse = rd_stats_y.sse + rd_stats_uv.sse;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ *best_skip2 = skip_blk;
+ }
+ }
+ }
+}
+
+/*!\cond */
+typedef struct {
+ // Mask for each reference frame, specifying which prediction modes to NOT try
+ // during search.
+ uint32_t pred_modes[REF_FRAMES];
+ // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of
+ // reference frames (i, j).
+ // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1
+ // (NONE_FRAME).
+ bool ref_combo[REF_FRAMES][REF_FRAMES + 1];
+} mode_skip_mask_t;
+/*!\endcond */
+
+// Update 'ref_combo' mask to disable given 'ref' in single and compound modes.
+static AOM_INLINE void disable_reference(
+ MV_REFERENCE_FRAME ref, bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+ for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
+ ref_combo[ref][ref2 + 1] = true;
+ }
+}
+
+// Update 'ref_combo' mask to disable all inter references except ALTREF.
+static AOM_INLINE void disable_inter_references_except_altref(
+ bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+ disable_reference(LAST_FRAME, ref_combo);
+ disable_reference(LAST2_FRAME, ref_combo);
+ disable_reference(LAST3_FRAME, ref_combo);
+ disable_reference(GOLDEN_FRAME, ref_combo);
+ disable_reference(BWDREF_FRAME, ref_combo);
+ disable_reference(ALTREF2_FRAME, ref_combo);
+}
+
+static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = {
+ { LAST_FRAME, NONE_FRAME }, { ALTREF_FRAME, NONE_FRAME },
+ { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, NONE_FRAME },
+ { INTRA_FRAME, NONE_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME },
+ { LAST_FRAME, GOLDEN_FRAME }, { LAST_FRAME, INTRA_FRAME },
+ { LAST_FRAME, BWDREF_FRAME }, { LAST_FRAME, LAST3_FRAME },
+ { GOLDEN_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, INTRA_FRAME },
+ { BWDREF_FRAME, NONE_FRAME }, { BWDREF_FRAME, ALTREF_FRAME },
+ { ALTREF_FRAME, INTRA_FRAME }, { BWDREF_FRAME, INTRA_FRAME },
+};
+
+typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET;
+
+static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask,
+ REF_SET ref_set) {
+ if (ref_set == REF_SET_FULL) {
+ // Everything available by default.
+ memset(mask, 0, sizeof(*mask));
+ } else {
+ // All modes available by default.
+ memset(mask->pred_modes, 0, sizeof(mask->pred_modes));
+ // All references disabled first.
+ for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) {
+ for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
+ mask->ref_combo[ref1][ref2 + 1] = true;
+ }
+ }
+ const MV_REFERENCE_FRAME(*ref_set_combos)[2];
+ int num_ref_combos;
+
+ // Then enable reduced set of references explicitly.
+ switch (ref_set) {
+ case REF_SET_REDUCED:
+ ref_set_combos = reduced_ref_combos;
+ num_ref_combos =
+ (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]);
+ break;
+ case REF_SET_REALTIME:
+ ref_set_combos = real_time_ref_combos;
+ num_ref_combos =
+ (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]);
+ break;
+ default: assert(0); num_ref_combos = 0;
+ }
+
+ for (int i = 0; i < num_ref_combos; ++i) {
+ const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i];
+ mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false;
+ }
+ }
+}
+
+static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ unsigned char segment_id = mbmi->segment_id;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf;
+ REF_SET ref_set = REF_SET_FULL;
+
+ if (sf->rt_sf.use_real_time_ref_set)
+ ref_set = REF_SET_REALTIME;
+ else if (cpi->oxcf.ref_frm_cfg.enable_reduced_reference_set)
+ ref_set = REF_SET_REDUCED;
+
+ default_skip_mask(mask, ref_set);
+
+ int min_pred_mv_sad = INT_MAX;
+ MV_REFERENCE_FRAME ref_frame;
+ if (ref_set == REF_SET_REALTIME) {
+ // For real-time encoding, we only look at a subset of ref frames. So the
+ // threshold for pruning should be computed from this subset as well.
+ const int num_rt_refs =
+ sizeof(real_time_ref_combos) / sizeof(*real_time_ref_combos);
+ for (int r_idx = 0; r_idx < num_rt_refs; r_idx++) {
+ const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0];
+ if (ref != INTRA_FRAME) {
+ min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref]);
+ }
+ }
+ } else {
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+ min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) {
+ // Skip checking missing reference in both single and compound reference
+ // modes.
+ disable_reference(ref_frame, mask->ref_combo);
+ } else {
+ // Skip fixed mv modes for poor references
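+      // A reference is considered poor when its pred_mv_sad is more than 4x
+      // the best reference's SAD ((sad >> 2) > min_pred_mv_sad).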
+ if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) {
+ mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ }
+ }
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ // Reference not used for the segment.
+ disable_reference(ref_frame, mask->ref_combo);
+ }
+ }
+ // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature
+ // is disabled for this segment. This is to prevent the possibility that we
+ // end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref &&
+ (cpi->oxcf.algo_cfg.arnr_max_frames == 0)) {
+ disable_inter_references_except_altref(mask->ref_combo);
+
+ mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+ const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
+ int_mv near_mv, nearest_mv, global_mv;
+ get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames,
+ &x->mbmi_ext);
+ get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext);
+ get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext);
+
+ if (near_mv.as_int != global_mv.as_int)
+ mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV);
+ if (nearest_mv.as_int != global_mv.as_int)
+ mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV);
+ }
+ }
+
+ if (cpi->rc.is_src_frame_alt_ref) {
+ if (inter_sf->alt_ref_search_fp &&
+ (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME])) {
+ mask->pred_modes[ALTREF_FRAME] = 0;
+ disable_inter_references_except_altref(mask->ref_combo);
+ disable_reference(INTRA_FRAME, mask->ref_combo);
+ }
+ }
+
+ if (inter_sf->alt_ref_search_fp) {
+ if (!cm->show_frame && x->best_pred_mv_sad[0] < INT_MAX) {
+ int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 3);
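+      // sad_thresh is 1.125x the best pred_mv_sad among past references.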
+ // Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if
+ // those are past frames
+ MV_REFERENCE_FRAME start_frame =
+ inter_sf->alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME;
+ for (ref_frame = start_frame; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
+ 0) {
+          // Prune the inter modes only when the relative dist of ALTREF2 and
+          // ALTREF is comparable to (within 1.5x of) the relative dist of
+          // LAST_FRAME.
+ if (inter_sf->alt_ref_search_fp == 1 &&
+ (abs(cpi->ref_frame_dist_info
+ .ref_relative_dist[ref_frame - LAST_FRAME]) >
+ 1.5 * abs(cpi->ref_frame_dist_info
+ .ref_relative_dist[LAST_FRAME - LAST_FRAME]))) {
+ continue;
+ }
+ if (x->pred_mv_sad[ref_frame] > sad_thresh)
+ mask->pred_modes[ref_frame] |= INTER_ALL;
+ }
+ }
+ }
+ }
+
+ if (sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) {
+ if (x->best_pred_mv_sad[0] < INT_MAX) {
+ int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 1);
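+      // sad_thresh is 1.5x the best pred_mv_sad among past references.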
+ const int prune_ref_list[2] = { GOLDEN_FRAME, ALTREF_FRAME };
+
+ // Conservatively skip the modes w.r.t. GOLDEN and ALTREF references
+ for (int ref_idx = 0; ref_idx < 2; ref_idx++) {
+ ref_frame = prune_ref_list[ref_idx];
+ if (x->pred_mv_sad[ref_frame] > sad_thresh)
+ mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ }
+ }
+ }
+
+ if (bsize > sf->part_sf.max_intra_bsize) {
+ disable_reference(INTRA_FRAME, mask->ref_combo);
+ }
+
+ if (!cpi->oxcf.tool_cfg.enable_global_motion) {
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ mask->pred_modes[ref_frame] |= (1 << GLOBALMV);
+ mask->pred_modes[ref_frame] |= (1 << GLOBAL_GLOBALMV);
+ }
+ }
+
+ mask->pred_modes[INTRA_FRAME] |=
+ ~(uint32_t)sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
+
+  // Prune reference frames which are not the closest to the current frame and
+  // which have a large pred_mv_sad.
+ if (inter_sf->prune_single_ref) {
+ assert(inter_sf->prune_single_ref > 0 && inter_sf->prune_single_ref < 3);
+ const double prune_threshes[2] = { 1.20, 1.05 };
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const RefFrameDistanceInfo *const ref_frame_dist_info =
+ &cpi->ref_frame_dist_info;
+ const int is_closest_ref =
+ (ref_frame == ref_frame_dist_info->nearest_past_ref) ||
+ (ref_frame == ref_frame_dist_info->nearest_future_ref);
+
+ if (!is_closest_ref) {
+ const int dir =
+ (ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
+ ? 0
+ : 1;
+ if (x->best_pred_mv_sad[dir] < INT_MAX &&
+ x->pred_mv_sad[ref_frame] >
+ prune_threshes[inter_sf->prune_single_ref - 1] *
+ x->best_pred_mv_sad[dir])
+ mask->pred_modes[ref_frame] |= INTER_SINGLE_ALL;
+ }
+ }
+ }
+}
+
+static AOM_INLINE void init_neighbor_pred_buf(
+ const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args,
+ int is_hbd) {
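+  // The above/left OBMC prediction buffers pack plane 0 (Y) at offset 0,
+  // plane 1 (U) at MAX_SB_SQUARE / 2 and plane 2 (V) at MAX_SB_SQUARE
+  // samples; for high bit-depth the offsets are scaled to bytes
+  // (sizeof(uint16_t)) before conversion to byte pointers.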
+ if (is_hbd) {
+ const int len = sizeof(uint16_t);
+ args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred);
+ args->above_pred_buf[1] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred +
+ (MAX_SB_SQUARE >> 1) * len);
+ args->above_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + MAX_SB_SQUARE * len);
+ args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred);
+ args->left_pred_buf[1] =
+ CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1) * len);
+ args->left_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + MAX_SB_SQUARE * len);
+ } else {
+ args->above_pred_buf[0] = obmc_buffer->above_pred;
+ args->above_pred_buf[1] = obmc_buffer->above_pred + (MAX_SB_SQUARE >> 1);
+ args->above_pred_buf[2] = obmc_buffer->above_pred + MAX_SB_SQUARE;
+ args->left_pred_buf[0] = obmc_buffer->left_pred;
+ args->left_pred_buf[1] = obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1);
+ args->left_pred_buf[2] = obmc_buffer->left_pred + MAX_SB_SQUARE;
+ }
+}
+
+static AOM_INLINE int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x,
+ MV_REFERENCE_FRAME ref_frame) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_frame);
+
+ if ((cpi->prune_ref_frame_mask >> ref_frame) & 1) return 1;
+
+ if (prune_ref_by_selective_ref_frame(cpi, x, rf,
+ cm->cur_frame->ref_display_order_hint)) {
+ return 1;
+ }
+
+ return 0;
+}
+
+static AOM_INLINE int is_ref_frame_used_by_compound_ref(
+ int ref_frame, int skip_ref_frame_mask) {
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_frame || rf[1] == ref_frame) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+static AOM_INLINE int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame,
+ const MB_MODE_INFO *mi_cache) {
+ if (!mi_cache) {
+ return 0;
+ }
+
+ if (ref_frame < REF_FRAMES) {
+ return (ref_frame == mi_cache->ref_frame[0] ||
+ ref_frame == mi_cache->ref_frame[1]);
+ }
+
+ // if we are here, then the current mode is compound.
+ MV_REFERENCE_FRAME cached_ref_type = av1_ref_frame_type(mi_cache->ref_frame);
+ return ref_frame == cached_ref_type;
+}
+
+// Please add/modify parameter settings in this function so that it stays
+// consistent and easy to read and maintain.
+static AOM_INLINE void set_params_rd_pick_inter_mode(
+ const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+ BLOCK_SIZE bsize, mode_skip_mask_t *mode_skip_mask, int skip_ref_frame_mask,
+ unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES],
+ struct buf_2d (*yv12_mb)[MAX_MB_PLANE]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ unsigned char segment_id = mbmi->segment_id;
+
+ init_neighbor_pred_buf(&x->obmc_buffer, args, is_cur_buf_hbd(&x->e_mbd));
+ av1_collect_neighbors_ref_counts(xd);
+ estimate_ref_frame_costs(cm, xd, &x->mode_costs, segment_id, ref_costs_single,
+ ref_costs_comp);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ x->best_pred_mv_sad[0] = INT_MAX;
+ x->best_pred_mv_sad[1] = INT_MAX;
+
+ for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+ ++ref_frame) {
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ // Skip the ref frame if the mask says skip and the ref is not used by
+ // compound ref.
+ if (skip_ref_frame_mask & (1 << ref_frame) &&
+ !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask) &&
+ !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) {
+ continue;
+ }
+ assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
+ setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb);
+ }
+ if (cpi->sf.inter_sf.alt_ref_search_fp ||
+ cpi->sf.inter_sf.prune_single_ref ||
+ cpi->sf.rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) {
+ // Store the best pred_mv_sad across all past frames
+ if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
+ 0)
+ x->best_pred_mv_sad[0] =
+ AOMMIN(x->best_pred_mv_sad[0], x->pred_mv_sad[ref_frame]);
+ else
+ // Store the best pred_mv_sad across all future frames
+ x->best_pred_mv_sad[1] =
+ AOMMIN(x->best_pred_mv_sad[1], x->pred_mv_sad[ref_frame]);
+ }
+ }
+
+ if (!cpi->sf.rt_sf.use_real_time_ref_set && is_comp_ref_allowed(bsize)) {
+ // No second reference on RT ref set, so no need to initialize
+ for (MV_REFERENCE_FRAME ref_frame = EXTREF_FRAME;
+ ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+ mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
+ if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) &&
+ (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) {
+ continue;
+ }
+
+ if (skip_ref_frame_mask & (1 << ref_frame) &&
+ !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) {
+ continue;
+ }
+      // Ref mv list population is not required when compound references are
+      // pruned.
+ if (prune_ref_frame(cpi, x, ref_frame)) continue;
+
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+ }
+ }
+
+ av1_count_overlappable_neighbors(cm, xd);
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int use_actual_frame_probs = 1;
+ int prune_obmc;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+#endif
+ if (use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+ if (cpi->oxcf.motion_mode_cfg.enable_obmc && !prune_obmc) {
+ if (check_num_overlappable_neighbors(mbmi) &&
+ is_motion_variation_allowed_bsize(bsize)) {
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ av1_build_prediction_by_above_preds(cm, xd, args->above_pred_buf,
+ dst_width1, dst_height1,
+ args->above_pred_stride);
+ av1_build_prediction_by_left_preds(cm, xd, args->left_pred_buf,
+ dst_width2, dst_height2,
+ args->left_pred_stride);
+ const int num_planes = av1_num_planes(cm);
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row,
+ mi_col, 0, num_planes);
+ calc_target_weighted_pred(
+ cm, x, xd, args->above_pred_buf[0], args->above_pred_stride[0],
+ args->left_pred_buf[0], args->left_pred_stride[0]);
+ }
+ }
+
+ init_mode_skip_mask(mode_skip_mask, cpi, x, bsize);
+
+ // Set params for mode evaluation
+ set_mode_eval_params(cpi, x, MODE_EVAL);
+
+ x->comp_rd_stats_idx = 0;
+
+ for (int idx = 0; idx < REF_FRAMES; idx++) {
+ args->best_single_sse_in_refs[idx] = INT32_MAX;
+ }
+}
+
+static AOM_INLINE void init_single_inter_mode_search_state(
+ InterModeSearchState *search_state) {
+ for (int dir = 0; dir < 2; ++dir) {
+ for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+ SingleInterModeState *state;
+
+ state = &search_state->single_state[dir][mode][ref_frame];
+ state->ref_frame = NONE_FRAME;
+ state->rd = INT64_MAX;
+
+ state = &search_state->single_state_modelled[dir][mode][ref_frame];
+ state->ref_frame = NONE_FRAME;
+ state->rd = INT64_MAX;
+
+ search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME;
+ }
+ }
+ }
+
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ search_state->best_single_rd[ref_frame] = INT64_MAX;
+ search_state->best_single_mode[ref_frame] = PRED_MODE_INVALID;
+ }
+ av1_zero(search_state->single_state_cnt);
+ av1_zero(search_state->single_state_modelled_cnt);
+}
+
+static AOM_INLINE void init_inter_mode_search_state(
+ InterModeSearchState *search_state, const AV1_COMP *cpi,
+ const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) {
+ init_intra_mode_search_state(&search_state->intra_search_state);
+ av1_invalid_rd_stats(&search_state->best_y_rdcost);
+
+ search_state->best_rd = best_rd_so_far;
+ search_state->best_skip_rd[0] = INT64_MAX;
+ search_state->best_skip_rd[1] = INT64_MAX;
+
+ av1_zero(search_state->best_mbmode);
+
+ search_state->best_rate_y = INT_MAX;
+
+ search_state->best_rate_uv = INT_MAX;
+
+ search_state->best_mode_skippable = 0;
+
+ search_state->best_skip2 = 0;
+
+ search_state->best_mode_index = THR_INVALID;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const unsigned char segment_id = mbmi->segment_id;
+
+ search_state->num_available_refs = 0;
+ memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs));
+ memset(search_state->dist_order_refs, -1,
+ sizeof(search_state->dist_order_refs));
+
+ for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+ search_state->mode_threshold[i] = 0;
+ const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+ for (int i = LAST_NEW_MV_INDEX + 1; i < SINGLE_REF_MODE_END; ++i)
+ search_state->mode_threshold[i] =
+ ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >>
+ RD_THRESH_FAC_FRAC_BITS;
+
+ search_state->best_intra_rd = INT64_MAX;
+
+ search_state->best_pred_sse = UINT_MAX;
+
+ av1_zero(search_state->single_newmv);
+ av1_zero(search_state->single_newmv_rate);
+ av1_zero(search_state->single_newmv_valid);
+ for (int i = SINGLE_INTER_MODE_START; i < SINGLE_INTER_MODE_END; ++i) {
+ for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) {
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
+ search_state->simple_rd[i][j][ref_frame] = INT64_MAX;
+ }
+ }
+ }
+
+ for (int i = 0; i < REFERENCE_MODES; ++i) {
+ search_state->best_pred_rd[i] = INT64_MAX;
+ }
+
+ if (cpi->common.current_frame.reference_mode != SINGLE_REFERENCE) {
+ for (int i = SINGLE_REF_MODE_END; i < THR_INTER_MODE_END; ++i)
+ search_state->mode_threshold[i] =
+ ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >>
+ RD_THRESH_FAC_FRAC_BITS;
+
+ for (int i = COMP_INTER_MODE_START; i < COMP_INTER_MODE_END; ++i) {
+ for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) {
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
+ search_state->simple_rd[i][j][ref_frame] = INT64_MAX;
+ }
+ }
+ }
+
+ init_single_inter_mode_search_state(search_state);
+ }
+}
+
+static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask,
+ const MV_REFERENCE_FRAME *ref_frame,
+ const PREDICTION_MODE this_mode) {
+ if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) {
+ return true;
+ }
+
+ return mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1];
+}
+
+static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ PREDICTION_MODE curr_mode,
+ const MV_REFERENCE_FRAME *ref_frames) {
+ const int comp_pred = ref_frames[1] > INTRA_FRAME;
+ if (comp_pred) {
+ if (!is_comp_ref_allowed(bsize)) return 1;
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frames[1]])) {
+ return 1;
+ }
+
+ const AV1_COMMON *const cm = &cpi->common;
+ if (frame_is_intra_only(cm)) return 1;
+
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ if (current_frame->reference_mode == SINGLE_REFERENCE) return 1;
+
+ const struct segmentation *const seg = &cm->seg;
+ const unsigned char segment_id = x->e_mbd.mi[0]->segment_id;
+    // Do not allow compound prediction if the segment level reference frame
+    // feature is in use, as in this case there can only be one reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+ }
+
+ if (ref_frames[0] > INTRA_FRAME && ref_frames[1] == INTRA_FRAME) {
+ // Mode must be compatible
+ if (!is_interintra_allowed_bsize(bsize)) return 1;
+ if (!is_interintra_allowed_mode(curr_mode)) return 1;
+ }
+
+ return 0;
+}
+
+static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mib_size) {
+ const int sb_size_mask = mib_size - 1;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int mi_row_in_sb = mi_row & sb_size_mask;
+ const int mi_col_in_sb = mi_col & sb_size_mask;
+ const int mi_w = mi_size_wide[bsize];
+ const int mi_h = mi_size_high[bsize];
+ int picked_ref_frames_mask = 0;
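+  // picked_ref_frames_mask is stored as a 32x32 grid of mi units (32 being
+  // the superblock dimension MAX_MIB_SIZE); OR together the masks of every
+  // mi unit covered by this block.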
+ for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_h; ++i) {
+ for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_w; ++j) {
+ picked_ref_frames_mask |= x->picked_ref_frames_mask[i * 32 + j];
+ }
+ }
+ return picked_ref_frames_mask;
+}
+
+// Check if the reference frame pair of the current block matches that of the
+// given block.
+static INLINE int match_ref_frame_pair(const MB_MODE_INFO *mbmi,
+ const MV_REFERENCE_FRAME *ref_frames) {
+ return ((ref_frames[0] == mbmi->ref_frame[0]) &&
+ (ref_frames[1] == mbmi->ref_frame[1]));
+}
+
+// Return values:
+//   0: do not skip this mode
+//   1: skip this mode completely
+//   2: skip the motion mode search for this mode, but still try its simple
+//      translation mode
+static int inter_mode_search_order_independent_skip(
+ const AV1_COMP *cpi, const MACROBLOCK *x, mode_skip_mask_t *mode_skip_mask,
+ InterModeSearchState *search_state, int skip_ref_frame_mask,
+ PREDICTION_MODE mode, const MV_REFERENCE_FRAME *ref_frame) {
+ if (mask_says_skip(mode_skip_mask, ref_frame, mode)) {
+ return 1;
+ }
+
+ const int ref_type = av1_ref_frame_type(ref_frame);
+ if (!cpi->sf.rt_sf.use_real_time_ref_set)
+ if (prune_ref_frame(cpi, x, ref_type)) return 1;
+
+  // This is only used in the motion vector unit test.
+ if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test &&
+ ref_frame[0] == INTRA_FRAME)
+ return 1;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ if (skip_repeated_mv(cm, x, mode, ref_frame, search_state)) {
+ return 1;
+ }
+
+ // Reuse the prediction mode in cache
+ if (x->use_mb_mode_cache) {
+ const MB_MODE_INFO *cached_mi = x->mb_mode_cache;
+ const PREDICTION_MODE cached_mode = cached_mi->mode;
+ const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame;
+ const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME;
+
+ // If the cached mode is intra, then we just need to match the mode.
+ if (is_mode_intra(cached_mode) && mode != cached_mode) {
+ return 1;
+ }
+
+ // If the cached mode is single inter mode, then we match the mode and
+ // reference frame.
+ if (cached_mode_is_single) {
+ if (mode != cached_mode || ref_frame[0] != cached_frame[0]) {
+ return 1;
+ }
+ } else {
+ // If the cached mode is compound, then we need to consider several cases.
+ const int mode_is_single = ref_frame[1] <= INTRA_FRAME;
+ if (mode_is_single) {
+ // If the mode is single, we know the modes can't match. But we might
+ // still want to search it if compound mode depends on the current mode.
+ int skip_motion_mode_only = 0;
+ if (cached_mode == NEW_NEARMV || cached_mode == NEW_NEARESTMV) {
+ skip_motion_mode_only = (ref_frame[0] == cached_frame[0]);
+ } else if (cached_mode == NEAR_NEWMV || cached_mode == NEAREST_NEWMV) {
+ skip_motion_mode_only = (ref_frame[0] == cached_frame[1]);
+ } else if (cached_mode == NEW_NEWMV) {
+ skip_motion_mode_only = (ref_frame[0] == cached_frame[0] ||
+ ref_frame[0] == cached_frame[1]);
+ }
+
+ return 1 + skip_motion_mode_only;
+ } else {
+ // If both modes are compound, then everything must match.
+ if (mode != cached_mode || ref_frame[0] != cached_frame[0] ||
+ ref_frame[1] != cached_frame[1]) {
+ return 1;
+ }
+ }
+ }
+ }
+
+ const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  // If no valid mode has been found so far in PARTITION_NONE, and finding a
+  // valid partition is required, do not skip any mode.
+ if (search_state->best_rd == INT64_MAX && mbmi->partition == PARTITION_NONE &&
+ x->must_find_valid_partition)
+ return 0;
+
+ const SPEED_FEATURES *const sf = &cpi->sf;
+  // Prune NEARMV and NEAR_NEARMV based on q index and the reference frames of
+  // the left and above neighbor blocks.
+ if (sf->inter_sf.prune_nearmv_using_neighbors &&
+ (mode == NEAR_NEARMV || mode == NEARMV)) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ if (search_state->best_rd != INT64_MAX && xd->left_available &&
+ xd->up_available) {
+ const int thresholds[PRUNE_NEARMV_MAX][3] = { { 1, 0, 0 },
+ { 1, 1, 0 },
+ { 2, 1, 0 } };
+ const int qindex_sub_range = x->qindex * 3 / QINDEX_RANGE;
+
+ assert(sf->inter_sf.prune_nearmv_using_neighbors <= PRUNE_NEARMV_MAX &&
+ qindex_sub_range < 3);
+ const int num_ref_frame_pair_match_thresh =
+ thresholds[sf->inter_sf.prune_nearmv_using_neighbors - 1]
+ [qindex_sub_range];
+
+ assert(num_ref_frame_pair_match_thresh <= 2 &&
+ num_ref_frame_pair_match_thresh >= 0);
+ int num_ref_frame_pair_match = 0;
+
+ num_ref_frame_pair_match = match_ref_frame_pair(xd->left_mbmi, ref_frame);
+ num_ref_frame_pair_match +=
+ match_ref_frame_pair(xd->above_mbmi, ref_frame);
+
+ // Pruning based on ref frame pair match with neighbors.
+ if (num_ref_frame_pair_match < num_ref_frame_pair_match_thresh) return 1;
+ }
+ }
+
+ int skip_motion_mode = 0;
+ if (mbmi->partition != PARTITION_NONE) {
+ int skip_ref = skip_ref_frame_mask & (1 << ref_type);
+ if (ref_type <= ALTREF_FRAME && skip_ref) {
+      // Since the compound ref modes depend on the motion estimation results
+      // of two single ref modes (the best mvs of the single ref modes are used
+      // as the start points), if the current single ref mode is marked to be
+      // skipped, we need to check whether it will be used by a compound ref
+      // mode.
+ if (is_ref_frame_used_by_compound_ref(ref_type, skip_ref_frame_mask)) {
+        // Found a non-skipped compound ref mode which uses the current single
+        // ref, so this single ref cannot be skipped completely. Skip only its
+        // motion mode search and still try its simple translation mode.
+ skip_motion_mode = 1;
+ skip_ref = 0;
+ }
+ }
+ // If we are reusing the prediction from cache, and the current frame is
+ // required by the cache, then we cannot prune it.
+ if (is_ref_frame_used_in_cache(ref_type, x->mb_mode_cache)) {
+ skip_ref = 0;
+ // If the cache only needs the current reference type for compound
+ // prediction, then we can skip motion mode search.
+ skip_motion_mode = (ref_type <= ALTREF_FRAME &&
+ x->mb_mode_cache->ref_frame[1] > INTRA_FRAME);
+ }
+ if (skip_ref) return 1;
+ }
+
+ if (ref_frame[0] == INTRA_FRAME) {
+ if (mode != DC_PRED) {
+ // Disable intra modes other than DC_PRED for blocks with low variance
+ // Threshold for intra skipping based on source variance
+ // TODO(debargha): Specialize the threshold for super block sizes
+ const unsigned int skip_intra_var_thresh = 64;
+ if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+ x->source_variance < skip_intra_var_thresh)
+ return 1;
+ }
+ }
+
+ if (skip_motion_mode) return 2;
+
+ return 0;
+}
+
+static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode,
+ const MV_REFERENCE_FRAME *ref_frames,
+ const AV1_COMMON *cm) {
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ mbmi->ref_mv_idx = 0;
+ mbmi->mode = curr_mode;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = ref_frames[0];
+ mbmi->ref_frame[1] = ref_frames[1];
+ pmi->palette_size[0] = 0;
+ pmi->palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+}
+
+static AOM_INLINE void collect_single_states(MACROBLOCK *x,
+ InterModeSearchState *search_state,
+ const MB_MODE_INFO *const mbmi) {
+ int i, j;
+ const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1;
+ const int mode_offset = INTER_OFFSET(this_mode);
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+
+ // Simple rd
+ int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame];
+ for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ const int64_t rd =
+ search_state->simple_rd[this_mode][ref_mv_idx][ref_frame];
+ if (rd < simple_rd) simple_rd = rd;
+ }
+
+ // Insertion sort of single_state
+ const SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 };
+ SingleInterModeState *state_s = search_state->single_state[dir][mode_offset];
+ i = search_state->single_state_cnt[dir][mode_offset];
+ for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j)
+ state_s[j] = state_s[j - 1];
+ state_s[j] = this_state_s;
+ search_state->single_state_cnt[dir][mode_offset]++;
+
+ // Modelled rd
+ int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame];
+ for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ const int64_t rd =
+ search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame];
+ if (rd < modelled_rd) modelled_rd = rd;
+ }
+
+ // Insertion sort of single_state_modelled
+ const SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 };
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode_offset];
+ i = search_state->single_state_modelled_cnt[dir][mode_offset];
+ for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j)
+ state_m[j] = state_m[j - 1];
+ state_m[j] = this_state_m;
+ search_state->single_state_modelled_cnt[dir][mode_offset]++;
+}
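+
+// A minimal illustrative sketch, not upstream code: the hypothetical helper
+// below isolates the ascending insertion performed twice in
+// collect_single_states(), operating on plain rd values for clarity.
+static INLINE void example_sorted_insert(int64_t *rds, int *cnt,
+                                         int64_t new_rd) {
+  int j;
+  // Shift larger rds up by one slot, then place the new value.
+  for (j = *cnt; j > 0 && rds[j - 1] > new_rd; --j) rds[j] = rds[j - 1];
+  rds[j] = new_rd;
+  (*cnt)++;
+}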
+
+static AOM_INLINE void analyze_single_states(
+ const AV1_COMP *cpi, InterModeSearchState *search_state) {
+ const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result;
+ assert(prune_level >= 1);
+ int i, j, dir, mode;
+
+ for (dir = 0; dir < 2; ++dir) {
+ int64_t best_rd;
+ SingleInterModeState(*state)[FWD_REFS];
+ const int prune_factor = prune_level >= 2 ? 6 : 5;
+
+ // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
+ // reference frames for all the modes (NEARESTMV and NEARMV may not
+ // have the same motion vectors). Always keep the best of each mode
+ // because it might form the best possible combination with another mode.
+ state = search_state->single_state[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+
+ state = search_state->single_state_modelled[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode]; ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+ }
+
+ // Ordering by simple rd first, then by modelled rd
+ for (dir = 0; dir < 2; ++dir) {
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ const int state_cnt_s = search_state->single_state_cnt[dir][mode];
+ const int state_cnt_m =
+ search_state->single_state_modelled_cnt[dir][mode];
+ SingleInterModeState *state_s = search_state->single_state[dir][mode];
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode];
+ int count = 0;
+ const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m);
+ for (i = 0; i < state_cnt_s; ++i) {
+ if (state_s[i].rd == INT64_MAX) break;
+ if (state_s[i].valid) {
+ search_state->single_rd_order[dir][mode][count++] =
+ state_s[i].ref_frame;
+ }
+ }
+ if (count >= max_candidates) continue;
+
+ for (i = 0; i < state_cnt_m && count < max_candidates; ++i) {
+ if (state_m[i].rd == INT64_MAX) break;
+ if (!state_m[i].valid) continue;
+ const int ref_frame = state_m[i].ref_frame;
+ int match = 0;
+ // Check if this ref_frame already exists in the order list.
+ for (j = 0; j < count; ++j) {
+ if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
+ match = 1;
+ break;
+ }
+ }
+ if (match) continue;
+ // Check if this ref_frame was pruned in the simple-rd list.
+ int valid = 1;
+ for (j = 0; j < state_cnt_s; ++j) {
+ if (ref_frame == state_s[j].ref_frame) {
+ valid = state_s[j].valid;
+ break;
+ }
+ }
+ if (valid) {
+ search_state->single_rd_order[dir][mode][count++] = ref_frame;
+ }
+ }
+ }
+ }
+}
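+
+// A minimal illustrative sketch, not upstream code. The pruning test in
+// analyze_single_states(), (rd >> 3) * prune_factor > best_rd, is roughly
+// rd > best_rd * 8 / prune_factor: prune_factor == 5 keeps states within
+// about 1.6x of best_rd, and prune_factor == 6 within about 1.33x. The
+// helper below is hypothetical.
+static INLINE int example_single_state_pruned(int64_t rd, int64_t best_rd,
+                                              int prune_factor) {
+  return rd != INT64_MAX && (rd >> 3) * prune_factor > best_rd;
+}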
+
+static int compound_skip_get_candidates(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const int dir, const PREDICTION_MODE mode) {
+ const int mode_offset = INTER_OFFSET(mode);
+ const SingleInterModeState *state =
+ search_state->single_state[dir][mode_offset];
+ const SingleInterModeState *state_modelled =
+ search_state->single_state_modelled[dir][mode_offset];
+
+ int max_candidates = 0;
+ for (int i = 0; i < FWD_REFS; ++i) {
+ if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break;
+ max_candidates++;
+ }
+
+ int candidates = max_candidates;
+ if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 2) {
+ candidates = AOMMIN(2, max_candidates);
+ }
+ if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 3) {
+ if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX &&
+ state[0].ref_frame == state_modelled[0].ref_frame)
+ candidates = 1;
+ if (mode == NEARMV || mode == GLOBALMV) candidates = 1;
+ }
+
+ if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 4) {
+ // Limit the number of candidates to 1 in each direction for compound
+ // prediction
+ candidates = AOMMIN(1, candidates);
+ }
+ return candidates;
+}
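+
+// Summarizing the logic above: level 1 keeps every ordered candidate;
+// level 2 caps the list at two; level 3 additionally drops to one candidate
+// when the simple-rd and modelled-rd winners agree, and always for
+// NEARMV/GLOBALMV; level 4 allows at most one candidate per direction.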
+
+static int compound_skip_by_single_states(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+ const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+ const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+ const int mode[2] = { compound_ref0_mode(this_mode),
+ compound_ref1_mode(this_mode) };
+ const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+ const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1,
+ refs[1] <= GOLDEN_FRAME ? 0 : 1 };
+ int ref_searched[2] = { 0, 0 };
+ int ref_mv_match[2] = { 1, 1 };
+ int i, j;
+
+ for (i = 0; i < 2; ++i) {
+ const SingleInterModeState *state =
+ search_state->single_state[mode_dir[i]][mode_offset[i]];
+ const int state_cnt =
+ search_state->single_state_cnt[mode_dir[i]][mode_offset[i]];
+ for (j = 0; j < state_cnt; ++j) {
+ if (state[j].ref_frame == refs[i]) {
+ ref_searched[i] = 1;
+ break;
+ }
+ }
+ }
+
+ const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+ for (i = 0; i < 2; ++i) {
+ if (!ref_searched[i] || (mode[i] != NEARESTMV && mode[i] != NEARMV)) {
+ continue;
+ }
+ const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+ int_mv single_mv;
+ int_mv comp_mv;
+ get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs,
+ &x->mbmi_ext);
+ get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, &x->mbmi_ext);
+ if (single_mv.as_int != comp_mv.as_int) {
+ ref_mv_match[i] = 0;
+ break;
+ }
+ }
+ }
+
+ for (i = 0; i < 2; ++i) {
+ if (!ref_searched[i] || !ref_mv_match[i]) continue;
+ const int candidates =
+ compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+ const MV_REFERENCE_FRAME *ref_order =
+ search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+ int match = 0;
+ for (j = 0; j < candidates; ++j) {
+ if (refs[i] == ref_order[j]) {
+ match = 1;
+ break;
+ }
+ }
+ if (!match) return 1;
+ }
+
+ return 0;
+}
+
+// Check if the ref frames of the current block match those of the given block.
+static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi,
+ const MV_REFERENCE_FRAME *ref_frames,
+ int *const is_ref_match) {
+ if (is_inter_block(mbmi)) {
+ is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[0];
+ is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[0];
+ if (has_second_ref(mbmi)) {
+ is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[1];
+ is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[1];
+ }
+ }
+}
+
+// Prune compound mode using ref frames of neighbor blocks.
+static INLINE int compound_skip_using_neighbor_refs(
+ MACROBLOCKD *const xd, const PREDICTION_MODE this_mode,
+ const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) {
+ // Exclude non-extended compound modes from pruning
+ if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+ this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
+ return 0;
+
+ if (prune_ext_comp_using_neighbors >= 3) return 1;
+
+ int is_ref_match[2] = { 0 }; // 0 - match for forward refs
+ // 1 - match for backward refs
+ // Check if the ref frames of this block match the left neighbor's.
+ if (xd->left_available)
+ match_ref_frame(xd->left_mbmi, ref_frames, is_ref_match);
+
+ // Check if the ref frames of this block match the above neighbor's.
+ if (xd->up_available)
+ match_ref_frame(xd->above_mbmi, ref_frames, is_ref_match);
+
+ // Combine ref frame match with neighbors in forward and backward refs.
+ const int track_ref_match = is_ref_match[0] + is_ref_match[1];
+
+ // Pruning based on ref frame match with neighbors.
+ if (track_ref_match >= prune_ext_comp_using_neighbors) return 0;
+ return 1;
+}
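+
+// A minimal illustrative sketch, not upstream code: the hypothetical helper
+// below restates the decision above with plain match flags in place of real
+// neighbor blocks. A return value of 1 means the compound mode is pruned.
+static INLINE int example_prune_by_neighbor_matches(int fwd_match,
+                                                    int bwd_match,
+                                                    int prune_level) {
+  // Level >= 3 rejects every extended compound mode unconditionally.
+  if (prune_level >= 3) return 1;
+  // Otherwise require at least prune_level matches across both directions.
+  return (fwd_match + bwd_match) < prune_level;
+}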
+
+// Update best single mode for the given reference frame based on simple rd.
+static INLINE void update_best_single_mode(InterModeSearchState *search_state,
+ const PREDICTION_MODE this_mode,
+ const MV_REFERENCE_FRAME ref_frame,
+ int64_t this_rd) {
+ if (this_rd < search_state->best_single_rd[ref_frame]) {
+ search_state->best_single_rd[ref_frame] = this_rd;
+ search_state->best_single_mode[ref_frame] = this_mode;
+ }
+}
+
+// Prune compound mode using best single mode for the same reference.
+static INLINE int skip_compound_using_best_single_mode_ref(
+ const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames,
+ const PREDICTION_MODE *best_single_mode,
+ int prune_comp_using_best_single_mode_ref) {
+ // Exclude non-extended compound modes from pruning
+ if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+ this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
+ return 0;
+
+ assert(this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV);
+ const PREDICTION_MODE comp_mode_ref0 = compound_ref0_mode(this_mode);
+ // Get ref frame direction corresponding to NEWMV
+ // 0 - NEWMV corresponding to forward direction
+ // 1 - NEWMV corresponding to backward direction
+ const int newmv_dir = comp_mode_ref0 != NEWMV;
+
+ // Avoid pruning the compound mode when the ref frame corresponding to
+ // NEWMV has NEWMV as its single mode winner.
+ // Example: For an extended-compound mode,
+ // {mode, {fwd_frame, bwd_frame}} = {NEAR_NEWMV, {LAST_FRAME, ALTREF_FRAME}}
+ // - Ref frame corresponding to NEWMV is ALTREF_FRAME
+ // - Avoid pruning this mode, if best single mode corresponding to ref frame
+ // ALTREF_FRAME is NEWMV
+ const PREDICTION_MODE single_mode = best_single_mode[ref_frames[newmv_dir]];
+ if (single_mode == NEWMV) return 0;
+
+ // Avoid pruning the compound mode when best single mode is not available
+ if (prune_comp_using_best_single_mode_ref == 1)
+ if (single_mode == MB_MODE_COUNT) return 0;
+ return 1;
+}
+
+static int compare_int64(const void *a, const void *b) {
+ int64_t a64 = *((int64_t *)a);
+ int64_t b64 = *((int64_t *)b);
+ if (a64 < b64) {
+ return -1;
+ } else if (a64 == b64) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+static INLINE void update_search_state(
+ InterModeSearchState *search_state, RD_STATS *best_rd_stats_dst,
+ PICK_MODE_CONTEXT *ctx, const RD_STATS *new_best_rd_stats,
+ const RD_STATS *new_best_rd_stats_y, const RD_STATS *new_best_rd_stats_uv,
+ THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int skip_txfm =
+ mbmi->skip_txfm && !is_mode_intra(av1_mode_defs[new_best_mode].mode);
+ const TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ search_state->best_rd = new_best_rd_stats->rdcost;
+ search_state->best_mode_index = new_best_mode;
+ *best_rd_stats_dst = *new_best_rd_stats;
+ search_state->best_mbmode = *mbmi;
+ search_state->best_skip2 = skip_txfm;
+ search_state->best_mode_skippable = new_best_rd_stats->skip_txfm;
+ // When !txfm_search_done, new_best_rd_stats won't provide the correct
+ // rate_y and rate_uv because the av1_txfm_search process is replaced by
+ // rd estimation. Therefore, we should avoid updating best_rate_y and
+ // best_rate_uv here; these two values will be updated when
+ // av1_txfm_search is called.
+ if (txfm_search_done) {
+ search_state->best_rate_y =
+ new_best_rd_stats_y->rate +
+ x->mode_costs.skip_txfm_cost[skip_ctx]
+ [new_best_rd_stats->skip_txfm || skip_txfm];
+ search_state->best_rate_uv = new_best_rd_stats_uv->rate;
+ }
+ search_state->best_y_rdcost = *new_best_rd_stats_y;
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+}
+
+// Find the best RD for a reference frame (among single reference modes)
+// and store +10% of it in the 0-th element in ref_frame_rd.
+static AOM_INLINE void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) {
+ assert(ref_frame_rd[0] == INT64_MAX);
+ int64_t ref_copy[REF_FRAMES - 1];
+ memcpy(ref_copy, ref_frame_rd + 1,
+ sizeof(ref_frame_rd[0]) * (REF_FRAMES - 1));
+ qsort(ref_copy, REF_FRAMES - 1, sizeof(int64_t), compare_int64);
+
+ int64_t cutoff = ref_copy[0];
+ // The cut-off is within 10% of the best.
+ if (cutoff != INT64_MAX) {
+ assert(cutoff < INT64_MAX / 200);
+ cutoff = (110 * cutoff) / 100;
+ }
+ ref_frame_rd[0] = cutoff;
+}
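+
+// A minimal illustrative sketch, not upstream code, exercising find_top_ref()
+// on made-up rd values: the best single-ref rd is 1000, so the stored
+// cut-off, later consumed by in_single_ref_cutoff(), is
+// (110 * 1000) / 100 = 1100.
+static INLINE int64_t example_single_ref_cutoff(void) {
+  int64_t rd[REF_FRAMES] = { INT64_MAX, 1500, 1000,      INT64_MAX,
+                             2000,      1200, INT64_MAX, INT64_MAX };
+  find_top_ref(rd);
+  return rd[0];  // 1100
+}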
+
+// Check if either frame is within the cutoff.
+static INLINE bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES],
+ MV_REFERENCE_FRAME frame1,
+ MV_REFERENCE_FRAME frame2) {
+ assert(frame2 > 0);
+ return ref_frame_rd[frame1] <= ref_frame_rd[0] ||
+ ref_frame_rd[frame2] <= ref_frame_rd[0];
+}
+
+static AOM_INLINE void evaluate_motion_mode_for_winner_candidates(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *const rd_cost,
+ HandleInterModeArgs *const args, TileDataEnc *const tile_data,
+ PICK_MODE_CONTEXT *const ctx,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
+ const motion_mode_best_st_candidate *const best_motion_mode_cands,
+ int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd,
+ InterModeSearchState *const search_state, int64_t *yrd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ InterModesInfo *const inter_modes_info = x->inter_modes_info;
+ const int num_best_cand = best_motion_mode_cands->num_motion_mode_cand;
+
+ for (int cand = 0; cand < num_best_cand; cand++) {
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+ av1_init_rd_stats(&rd_stats_y);
+ av1_init_rd_stats(&rd_stats_uv);
+ int rate_mv;
+
+ rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv;
+ args->skip_motion_mode =
+ best_motion_mode_cands->motion_mode_cand[cand].skip_motion_mode;
+ *mbmi = best_motion_mode_cands->motion_mode_cand[cand].mbmi;
+ rd_stats.rate =
+ best_motion_mode_cands->motion_mode_cand[cand].rate2_nocoeff;
+
+ // Skip compound candidates; only single-reference modes are evaluated here.
+ if (!is_inter_singleref_mode(mbmi->mode)) continue;
+
+ x->txfm_search_info.skip_txfm = 0;
+ struct macroblockd_plane *pd = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+ { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+ };
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // Initialize the motion mode to simple translation; the calculation of
+ // the switchable rate depends on it.
+ mbmi->motion_mode = 0;
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ int64_t skip_rd[2] = { search_state->best_skip_rd[0],
+ search_state->best_skip_rd[1] };
+ int64_t this_yrd = INT64_MAX;
+ int64_t ret_value = motion_mode_rd(
+ cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, args,
+ search_state->best_rd, skip_rd, &rate_mv, &orig_dst, best_est_rd,
+ do_tx_search, inter_modes_info, 1, &this_yrd);
+
+ if (ret_value != INT64_MAX) {
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+ const THR_MODES mode_enum = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // Collect mode stats for multiwinner mode processing
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
+ mode_enum, NULL, bsize, rd_stats.rdcost,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search);
+ if (rd_stats.rdcost < search_state->best_rd) {
+ *yrd = this_yrd;
+ update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_enum, x, do_tx_search);
+ if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0];
+ }
+ }
+ }
+}
+
+/*!\cond */
+// Arguments for speed feature pruning of inter mode search
+typedef struct {
+ int *skip_motion_mode;
+ mode_skip_mask_t *mode_skip_mask;
+ InterModeSearchState *search_state;
+ int skip_ref_frame_mask;
+ int reach_first_comp_mode;
+ int mode_thresh_mul_fact;
+ int num_single_modes_processed;
+ int prune_cpd_using_sr_stats_ready;
+} InterModeSFArgs;
+/*!\endcond */
+
+static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+ int64_t *ref_frame_rd, int midx,
+ InterModeSFArgs *args, int is_low_temp_var) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ // Get the actual prediction mode we are trying in this iteration
+ const THR_MODES mode_enum = av1_default_mode_order[midx];
+ const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+ const PREDICTION_MODE this_mode = mode_def->mode;
+ const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame;
+ const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
+ const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+
+ if (ref_frame == INTRA_FRAME) return 1;
+
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ if (sf->inter_sf.skip_arf_compound && update_type == ARF_UPDATE &&
+ comp_pred) {
+ return 1;
+ }
+
+ // This is for real time encoding.
+ if (is_low_temp_var && !comp_pred && ref_frame != LAST_FRAME &&
+ this_mode != NEARESTMV)
+ return 1;
+
+ // Check if this mode should be skipped because it is incompatible with the
+ // current frame
+ if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames))
+ return 1;
+ const int ret = inter_mode_search_order_independent_skip(
+ cpi, x, args->mode_skip_mask, args->search_state,
+ args->skip_ref_frame_mask, this_mode, mode_def->ref_frame);
+ if (ret == 1) return 1;
+ *(args->skip_motion_mode) = (ret == 2);
+
+ // We've reached the first compound prediction mode; get stats from the
+ // single reference predictors to help with pruning.
+ // Disable this pruning logic if interpolation filter search was skipped for
+ // single prediction modes as it can result in aggressive pruning of compound
+ // prediction modes due to the absence of modelled_rd populated by
+ // av1_interpolation_filter_search().
+ // TODO(Remya): Check the impact of the sf
+ // 'prune_comp_search_by_single_result' if compound prediction modes are
+ // enabled in future for REALTIME encode.
+ if (!sf->interp_sf.skip_interp_filter_search &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
+ args->reach_first_comp_mode == 0) {
+ analyze_single_states(cpi, args->search_state);
+ args->reach_first_comp_mode = 1;
+ }
+
+ // Prune aggressively when best mode is skippable.
+ int mul_fact = args->search_state->best_mode_skippable
+ ? args->mode_thresh_mul_fact
+ : (1 << MODE_THRESH_QBITS);
+ int64_t mode_threshold =
+ (args->search_state->mode_threshold[mode_enum] * mul_fact) >>
+ MODE_THRESH_QBITS;
+
+ if (args->search_state->best_rd < mode_threshold) return 1;
+
+ // Skip this compound mode based on the RD results from the single prediction
+ // modes
+ if (!sf->interp_sf.skip_interp_filter_search &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
+ if (compound_skip_by_single_states(cpi, args->search_state, this_mode,
+ ref_frame, second_ref_frame, x))
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) {
+ // After we are done with the single reference modes, find the best RD
+ // among the reference frames. Only search compound modes whose reference
+ // frames have an RD within the cut-off (10% above that best RD).
+ if (!args->prune_cpd_using_sr_stats_ready &&
+ args->num_single_modes_processed == NUM_SINGLE_REF_MODES) {
+ find_top_ref(ref_frame_rd);
+ args->prune_cpd_using_sr_stats_ready = 1;
+ }
+ if (args->prune_cpd_using_sr_stats_ready &&
+ !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame))
+ return 1;
+ }
+
+ // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes
+ if (sf->inter_sf.skip_ext_comp_nearmv_mode &&
+ (this_mode == NEW_NEARMV || this_mode == NEAR_NEWMV)) {
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_ext_comp_using_neighbors && comp_pred) {
+ if (compound_skip_using_neighbor_refs(
+ xd, this_mode, ref_frames,
+ sf->inter_sf.prune_ext_comp_using_neighbors))
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_comp_using_best_single_mode_ref && comp_pred) {
+ if (skip_compound_using_best_single_mode_ref(
+ this_mode, ref_frames, args->search_state->best_single_mode,
+ sf->inter_sf.prune_comp_using_best_single_mode_ref))
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_nearest_near_mv_using_refmv_weight && !comp_pred) {
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+ if (skip_nearest_near_mv_using_refmv_weight(
+ x, this_mode, ref_frame_type,
+ args->search_state->best_mbmode.mode)) {
+ // Ensure the mode is pruned only when the current block has obtained a
+ // valid inter mode.
+ assert(is_inter_mode(args->search_state->best_mbmode.mode));
+ return 1;
+ }
+ }
+
+ if (sf->rt_sf.prune_inter_modes_with_golden_ref &&
+ ref_frame == GOLDEN_FRAME && !comp_pred) {
+ const int subgop_size = AOMMIN(cpi->ppi->gf_group.size, FIXED_GF_INTERVAL);
+ if (cpi->rc.frames_since_golden > (subgop_size >> 2) &&
+ args->search_state->best_mbmode.ref_frame[0] != GOLDEN_FRAME) {
+ if ((bsize > BLOCK_16X16 && this_mode == NEWMV) || this_mode == NEARMV)
+ return 1;
+ }
+ }
+
+ return 0;
+}
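+
+// A minimal illustrative sketch, not upstream code: the hypothetical helper
+// below isolates the fixed-point threshold scaling used in skip_inter_mode().
+// mul_fact == (1 << MODE_THRESH_QBITS) leaves the threshold unchanged, while
+// larger values raise it and therefore prune more modes.
+static INLINE int64_t example_scaled_mode_threshold(int64_t threshold,
+                                                    int mul_fact) {
+  return (threshold * mul_fact) >> MODE_THRESH_QBITS;
+}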
+
+static void record_best_compound(REFERENCE_MODE reference_mode,
+ RD_STATS *rd_stats, int comp_pred, int rdmult,
+ InterModeSearchState *search_state,
+ int compmode_cost) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rd_stats->rate - compmode_cost;
+ hybrid_rate = rd_stats->rate;
+ } else {
+ single_rate = rd_stats->rate;
+ hybrid_rate = rd_stats->rate + compmode_cost;
+ }
+
+ single_rd = RDCOST(rdmult, single_rate, rd_stats->dist);
+ hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist);
+
+ if (!comp_pred) {
+ if (single_rd < search_state->best_pred_rd[SINGLE_REFERENCE])
+ search_state->best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ } else {
+ if (single_rd < search_state->best_pred_rd[COMPOUND_REFERENCE])
+ search_state->best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+ }
+ if (hybrid_rd < search_state->best_pred_rd[REFERENCE_MODE_SELECT])
+ search_state->best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+}
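+
+// A worked example of the rate bookkeeping above, with made-up numbers:
+// given rd_stats->rate = 100 and compmode_cost = 10, REFERENCE_MODE_SELECT
+// already includes the flag cost in the rate, so single_rate = 90 and
+// hybrid_rate = 100; in the fixed-reference case single_rate = 100 and
+// hybrid_rate = 110. The per-category minima are then tracked via RDCOST.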
+
+// Does a transform search over a list of the best inter mode candidates.
+// This is called if the original mode search computed an RD estimate
+// for the transform search rather than doing a full search.
+static void tx_search_best_inter_candidates(
+ AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+ int64_t best_rd_so_far, BLOCK_SIZE bsize,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int mi_row, int mi_col,
+ InterModeSearchState *search_state, RD_STATS *rd_cost,
+ PICK_MODE_CONTEXT *ctx, int64_t *yrd) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int num_planes = av1_num_planes(cm);
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ InterModesInfo *inter_modes_info = x->inter_modes_info;
+ inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+ search_state->best_rd = best_rd_so_far;
+ search_state->best_mode_index = THR_INVALID;
+ // Initialize best mode stats for winner mode processing
+ x->winner_mode_count = 0;
+ store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
+ NULL, bsize, best_rd_so_far,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, 0);
+ inter_modes_info->num =
+ inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search
+ ? inter_modes_info->num
+ : cpi->sf.rt_sf.num_inter_modes_for_tx_search;
+ const int64_t top_est_rd =
+ inter_modes_info->num > 0
+ ? inter_modes_info
+ ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
+ : INT64_MAX;
+ *yrd = INT64_MAX;
+ int64_t best_rd_in_this_partition = INT64_MAX;
+ int num_inter_mode_cands = inter_modes_info->num;
+ int newmv_mode_evaled = 0;
+ int max_allowed_cands = INT_MAX;
+ if (cpi->sf.inter_sf.limit_inter_mode_cands) {
+ // Once a NEWMV mode has been evaluated, stop the search after more than
+ // max_allowed_cands candidates have been examined.
+ const int num_allowed_cands[5] = { INT_MAX, 10, 9, 6, 2 };
+ assert(cpi->sf.inter_sf.limit_inter_mode_cands <= 4);
+ max_allowed_cands =
+ num_allowed_cands[cpi->sf.inter_sf.limit_inter_mode_cands];
+ }
+
+ int num_mode_thresh = INT_MAX;
+ if (cpi->sf.inter_sf.limit_txfm_eval_per_mode) {
+ // Bound the number of transform searches per prediction mode once the
+ // total number of candidates exceeds a threshold.
+ const int num_mode_thresh_ary[4] = { INT_MAX, 4, 3, 0 };
+ assert(cpi->sf.inter_sf.limit_txfm_eval_per_mode <= 3);
+ num_mode_thresh =
+ num_mode_thresh_ary[cpi->sf.inter_sf.limit_txfm_eval_per_mode];
+ }
+
+ int num_tx_cands = 0;
+ int num_tx_search_modes[INTER_MODE_END - INTER_MODE_START] = { 0 };
+ // Iterate over best inter mode candidates and perform tx search
+ for (int j = 0; j < num_inter_mode_cands; ++j) {
+ const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+ *mbmi = inter_modes_info->mbmi_arr[data_idx];
+ const PREDICTION_MODE prediction_mode = mbmi->mode;
+ int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+ if (curr_est_rd * 0.80 > top_est_rd) break;
+
+ if (num_tx_cands > num_mode_thresh) {
+ if ((prediction_mode != NEARESTMV &&
+ num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 1) ||
+ (prediction_mode == NEARESTMV &&
+ num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 2))
+ continue;
+ }
+
+ txfm_info->skip_txfm = 0;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ bool is_predictor_built = false;
+
+ // Initialize RD stats
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+ const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+ int64_t skip_rd = INT64_MAX;
+ const int txfm_rd_gate_level = get_txfm_rd_gate_level(
+ cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_DEFAULT,
+ /*eval_motion_mode=*/0);
+ if (txfm_rd_gate_level) {
+ // Check if the mode is good enough based on skip RD
+ int64_t curr_sse = inter_modes_info->sse_arr[data_idx];
+ skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse);
+ int eval_txfm = check_txfm_eval(x, bsize, search_state->best_skip_rd[0],
+ skip_rd, txfm_rd_gate_level, 0);
+ if (!eval_txfm) continue;
+ }
+
+ // Build the prediction for this mode
+ if (!is_predictor_built) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ }
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ av1_build_obmc_inter_predictors_sb(cm, xd);
+ }
+
+ num_tx_cands++;
+ if (have_newmv_in_inter_mode(prediction_mode)) newmv_mode_evaled = 1;
+ num_tx_search_modes[prediction_mode - INTER_MODE_START]++;
+ int64_t this_yrd = INT64_MAX;
+ // Do the transform search
+ if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+ mode_rate, search_state->best_rd)) {
+ continue;
+ } else {
+ const int y_rate =
+ rd_stats.skip_txfm
+ ? mode_costs->skip_txfm_cost[skip_ctx][1]
+ : (rd_stats_y.rate + mode_costs->skip_txfm_cost[skip_ctx][0]);
+ this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y.dist);
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ inter_mode_data_push(
+ tile_data, mbmi->bsize, rd_stats.sse, rd_stats.dist,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]);
+ }
+ }
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+ if (rd_stats.rdcost < best_rd_in_this_partition) {
+ best_rd_in_this_partition = rd_stats.rdcost;
+ *yrd = this_yrd;
+ }
+
+ const THR_MODES mode_enum = get_prediction_mode_idx(
+ prediction_mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum,
+ NULL, bsize, rd_stats.rdcost,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+
+ if (rd_stats.rdcost < search_state->best_rd) {
+ update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_enum, x, txfm_search_done);
+ search_state->best_skip_rd[0] = skip_rd;
+ // Limit the total number of modes to be evaluated if the first candidate
+ // is valid and uses transform skip or is compound.
+ if (cpi->sf.inter_sf.inter_mode_txfm_breakout) {
+ if (!j && (search_state->best_mbmode.skip_txfm || rd_stats.skip_txfm)) {
+ // Evaluate more candidates at high quantizers where occurrence of
+ // transform skip is high.
+ const int max_cands_cap[5] = { 2, 3, 5, 7, 9 };
+ const int qindex_band = (5 * x->qindex) >> QINDEX_BITS;
+ num_inter_mode_cands =
+ AOMMIN(max_cands_cap[qindex_band], inter_modes_info->num);
+ } else if (!j && has_second_ref(&search_state->best_mbmode)) {
+ const int aggr = cpi->sf.inter_sf.inter_mode_txfm_breakout - 1;
+ // Evaluate more candidates at low quantizers where occurrence of
+ // single reference mode is high.
+ const int max_cands_cap_cmp[2][4] = { { 10, 7, 5, 4 },
+ { 10, 7, 5, 3 } };
+ const int qindex_band_cmp = (4 * x->qindex) >> QINDEX_BITS;
+ num_inter_mode_cands = AOMMIN(
+ max_cands_cap_cmp[aggr][qindex_band_cmp], inter_modes_info->num);
+ }
+ }
+ }
+ // Break once the number of evaluated candidates exceeds max_allowed_cands,
+ // provided a newmv mode has already been evaluated.
+ if ((num_tx_cands > max_allowed_cands) && newmv_mode_evaled) break;
+ }
+}
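+
+// A minimal illustrative sketch, not upstream code, of the quantizer banding
+// used by the inter_mode_txfm_breakout logic above, assuming QINDEX_BITS is
+// 8 so that qindex values 0..255 map to bands 0..4.
+static INLINE int example_qindex_band(int qindex) {
+  return (5 * qindex) >> QINDEX_BITS;  // e.g. qindex 100 -> 1, 240 -> 4.
+}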
+
+// Indicates number of winner simple translation modes to be used
+static const unsigned int num_winner_motion_modes[3] = { 0, 10, 3 };
+
+// Adds a motion mode to the candidate list for motion_mode_for_winner_cand
+// speed feature. This list consists of modes that have only searched
+// SIMPLE_TRANSLATION. The final list will be used to search other motion
+// modes after the initial RD search.
+static void handle_winner_cand(
+ MB_MODE_INFO *const mbmi,
+ motion_mode_best_st_candidate *best_motion_mode_cands,
+ int max_winner_motion_mode_cand, int64_t this_rd,
+ motion_mode_candidate *motion_mode_cand, int skip_motion_mode) {
+ // Number of current motion mode candidates in list
+ const int num_motion_mode_cand = best_motion_mode_cands->num_motion_mode_cand;
+ int valid_motion_mode_cand_loc = num_motion_mode_cand;
+
+ // find the best location to insert new motion mode candidate
+ for (int j = 0; j < num_motion_mode_cand; j++) {
+ if (this_rd < best_motion_mode_cands->motion_mode_cand[j].rd_cost) {
+ valid_motion_mode_cand_loc = j;
+ break;
+ }
+ }
+
+ // Insert motion mode if location is found
+ if (valid_motion_mode_cand_loc < max_winner_motion_mode_cand) {
+ if (num_motion_mode_cand > 0 &&
+ valid_motion_mode_cand_loc < max_winner_motion_mode_cand - 1)
+ memmove(
+ &best_motion_mode_cands
+ ->motion_mode_cand[valid_motion_mode_cand_loc + 1],
+ &best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc],
+ (AOMMIN(num_motion_mode_cand, max_winner_motion_mode_cand - 1) -
+ valid_motion_mode_cand_loc) *
+ sizeof(best_motion_mode_cands->motion_mode_cand[0]));
+ motion_mode_cand->mbmi = *mbmi;
+ motion_mode_cand->rd_cost = this_rd;
+ motion_mode_cand->skip_motion_mode = skip_motion_mode;
+ best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc] =
+ *motion_mode_cand;
+ best_motion_mode_cands->num_motion_mode_cand =
+ AOMMIN(max_winner_motion_mode_cand,
+ best_motion_mode_cands->num_motion_mode_cand + 1);
+ }
+}
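+
+// A worked example of the insertion above, with made-up numbers: given
+// max_winner_motion_mode_cand == 3 and current rd_costs { 100, 200, 300 },
+// a new candidate with rd 150 lands in slot 1, 200 shifts to slot 2, the
+// old 300 falls off the end, and num_motion_mode_cand stays at 3.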
+
+/*!\brief Search intra modes in interframes
+ *
+ * \ingroup intra_mode_search
+ *
+ * This function searches for the best intra mode when the current frame is an
+ * interframe. This function, however, does *not* handle luma palette mode.
+ * Palette mode is currently handled by \ref av1_search_palette_mode.
+ *
+ * This function first iterates through the luma mode candidates to find the
+ * best luma intra mode. Once the best luma mode is found, it then searches
+ * for the best chroma mode. Because palette mode is currently not handled
+ * here, a cache of the uv mode is stored in
+ * InterModeSearchState::intra_search_state so it can be reused later by \ref
+ * av1_search_palette_mode.
+ *
+ * \param[in,out] search_state Struct keep track of the prediction mode
+ * search state in interframe.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to struct holding all the data for
+ * the current prediction block.
+ * \param[out] rd_cost Stores the best rd_cost among all the
+ * prediction modes searched.
+ * \param[in] bsize Current block size.
+ * \param[in,out] ctx Structure holding the number of 4x4 blks used
+ * to copy the tx_type and txfm_skip arrays for
+ * the Y plane only.
+ * \param[in] sf_args Stores the list of intra mode candidates
+ * to be searched.
+ * \param[in] intra_ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] yrd_threshold The rdcost threshold for luma intra mode to
+ * terminate chroma intra mode search.
+ *
+ * \remark If a new best mode is found, search_state and rd_costs are updated
+ * correspondingly. While x is also modified, it is only used as a temporary
+ * buffer, and the final decisions are stored in search_state.
+ */
+static AOM_INLINE void search_intra_modes_in_interframe(
+ InterModeSearchState *search_state, const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ const InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost,
+ int64_t yrd_threshold) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ IntraModeSearchState *intra_search_state = &search_state->intra_search_state;
+
+ int is_best_y_mode_intra = 0;
+ RD_STATS best_intra_rd_stats_y;
+ int64_t best_rd_y = INT64_MAX;
+ int best_mode_cost_y = -1;
+ MB_MODE_INFO best_mbmi = *xd->mi[0];
+ THR_MODES best_mode_enum = THR_INVALID;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int num_4x4 = bsize_to_num_blk(bsize);
+
+ // Performs luma search
+ int64_t best_model_rd = INT64_MAX;
+ int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+ for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+ top_intra_model_rd[i] = INT64_MAX;
+ }
+ for (int mode_idx = 0; mode_idx < LUMA_MODE_COUNT; ++mode_idx) {
+ if (sf->intra_sf.skip_intra_in_interframe &&
+ search_state->intra_search_state.skip_intra_modes)
+ break;
+ set_y_mode_and_delta_angle(
+ mode_idx, mbmi, sf->intra_sf.prune_luma_odd_delta_angles_in_intra);
+ assert(mbmi->mode < INTRA_MODE_END);
+
+ // Use intra_y_mode_mask speed feature to skip intra mode evaluation.
+ if (sf_args->mode_skip_mask->pred_modes[INTRA_FRAME] & (1 << mbmi->mode))
+ continue;
+
+ const THR_MODES mode_enum =
+ get_prediction_mode_idx(mbmi->mode, INTRA_FRAME, NONE_FRAME);
+ if ((!intra_mode_cfg->enable_smooth_intra ||
+ cpi->sf.intra_sf.disable_smooth_intra) &&
+ (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+ mbmi->mode == SMOOTH_V_PRED))
+ continue;
+ if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED)
+ continue;
+ if (av1_is_directional_mode(mbmi->mode) &&
+ !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) &&
+ mbmi->angle_delta[PLANE_TYPE_Y] != 0)
+ continue;
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+ assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME);
+ assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME);
+ init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm);
+ x->txfm_search_info.skip_txfm = 0;
+
+ if (this_mode != DC_PRED) {
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+ if (search_state->best_mode_index != THR_INVALID &&
+ search_state->best_mbmode.ref_frame[0] > INTRA_FRAME)
+ continue;
+ }
+ if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(
+ this_mode, search_state->intra_search_state.best_intra_mode))
+ continue;
+ }
+ }
+
+ RD_STATS intra_rd_stats_y;
+ int mode_cost_y;
+ int64_t intra_rd_y = INT64_MAX;
+ const int is_luma_result_valid = av1_handle_intra_y_mode(
+ intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx,
+ &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y,
+ &best_model_rd, top_intra_model_rd);
+ if (is_luma_result_valid && intra_rd_y < yrd_threshold) {
+ is_best_y_mode_intra = 1;
+ if (intra_rd_y < best_rd_y) {
+ best_intra_rd_stats_y = intra_rd_stats_y;
+ best_mode_cost_y = mode_cost_y;
+ best_rd_y = intra_rd_y;
+ best_mbmi = *mbmi;
+ best_mode_enum = mode_enum;
+ memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(best_blk_skip[0]) * num_4x4);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, num_4x4);
+ }
+ }
+ }
+
+ if (!is_best_y_mode_intra) {
+ return;
+ }
+
+ assert(best_rd_y < INT64_MAX);
+
+ // Restores the best luma mode
+ *mbmi = best_mbmi;
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * num_4x4);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, num_4x4);
+
+ // Performs chroma search
+ RD_STATS intra_rd_stats, intra_rd_stats_uv;
+ av1_init_rd_stats(&intra_rd_stats);
+ av1_init_rd_stats(&intra_rd_stats_uv);
+ const int num_planes = av1_num_planes(cm);
+ if (num_planes > 1) {
+ const int intra_uv_mode_valid = av1_search_intra_uv_modes_in_interframe(
+ intra_search_state, cpi, x, bsize, &intra_rd_stats,
+ &best_intra_rd_stats_y, &intra_rd_stats_uv, search_state->best_rd);
+
+ if (!intra_uv_mode_valid) {
+ return;
+ }
+ }
+
+ // Merge the luma and chroma rd stats
+ assert(best_mode_cost_y >= 0);
+ intra_rd_stats.rate = best_intra_rd_stats_y.rate + best_mode_cost_y;
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
+ // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
+ // in the tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ best_intra_rd_stats_y.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const PREDICTION_MODE mode = mbmi->mode;
+ if (num_planes > 1 && xd->is_chroma_ref) {
+ const int uv_mode_cost =
+ mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode];
+ intra_rd_stats.rate +=
+ intra_rd_stats_uv.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
+ }
+
+ // Intra block is always coded as non-skip
+ intra_rd_stats.skip_txfm = 0;
+ intra_rd_stats.dist = best_intra_rd_stats_y.dist + intra_rd_stats_uv.dist;
+ // Add in the cost of the no skip flag.
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ intra_rd_stats.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
+ // Calculate the final RD estimate for this mode.
+ const int64_t this_rd =
+ RDCOST(x->rdmult, intra_rd_stats.rate, intra_rd_stats.dist);
+ // Keep record of best intra rd
+ if (this_rd < search_state->best_intra_rd) {
+ search_state->best_intra_rd = this_rd;
+ intra_search_state->best_intra_mode = mode;
+ }
+
+ for (int i = 0; i < REFERENCE_MODES; ++i) {
+ search_state->best_pred_rd[i] =
+ AOMMIN(search_state->best_pred_rd[i], this_rd);
+ }
+
+ intra_rd_stats.rdcost = this_rd;
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, &intra_rd_stats, &best_intra_rd_stats_y,
+ &intra_rd_stats_uv, best_mode_enum, NULL, bsize, intra_rd_stats.rdcost,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (intra_rd_stats.rdcost < search_state->best_rd) {
+ update_search_state(search_state, rd_cost, ctx, &intra_rd_stats,
+ &best_intra_rd_stats_y, &intra_rd_stats_uv,
+ best_mode_enum, x, txfm_search_done);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Prepare inter_cost and intra_cost from TPL stats, which are used as ML
+// features in intra mode pruning.
+static AOM_INLINE void calculate_cost_from_tpl_data(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int64_t *inter_cost, int64_t *intra_cost) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // Only consider full SB.
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int tpl_bsize_1d = cpi->ppi->tpl_data.tpl_bsize_1d;
+ const int len = (block_size_wide[sb_size] / tpl_bsize_1d) *
+ (block_size_high[sb_size] / tpl_bsize_1d);
+ SuperBlockEnc *sb_enc = &x->sb_enc;
+ if (sb_enc->tpl_data_count == len) {
+ const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d);
+ const int tpl_stride = sb_enc->tpl_stride;
+ const int tplw = mi_size_wide[tpl_bsize];
+ const int tplh = mi_size_high[tpl_bsize];
+ const int nw = mi_size_wide[bsize] / tplw;
+ const int nh = mi_size_high[bsize] / tplh;
+ if (nw >= 1 && nh >= 1) {
+ const int of_h = mi_row % mi_size_high[sb_size];
+ const int of_w = mi_col % mi_size_wide[sb_size];
+ const int start = of_h / tplh * tpl_stride + of_w / tplw;
+
+ for (int k = 0; k < nh; k++) {
+ for (int l = 0; l < nw; l++) {
+ *inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l];
+ *intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l];
+ }
+ }
+ *inter_cost /= nw * nh;
+ *intra_cost /= nw * nh;
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
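+
+// A worked example of the TPL indexing above, with made-up dimensions: for a
+// 128x128 superblock (32 mi units), a TPL unit of 16x16 (tplw == tplh == 4)
+// and a block at of_h == 8, of_w == 12, accumulation starts at
+// (8 / 4) * tpl_stride + (12 / 4), i.e. row 2, column 3 of the superblock's
+// TPL grid.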
+
+// When the speed feature skip_intra_in_interframe > 0, enable the ML model
+// to prune the intra mode search.
+static AOM_INLINE void skip_intra_modes_in_interframe(
+ AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize,
+ InterModeSearchState *search_state, const SPEED_FEATURES *const sf,
+ int64_t inter_cost, int64_t intra_cost) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int comp_pred = search_state->best_mbmode.ref_frame[1] > INTRA_FRAME;
+ if (sf->rt_sf.prune_intra_mode_based_on_mv_range &&
+ bsize > sf->part_sf.max_intra_bsize && !comp_pred) {
+ const MV best_mv = search_state->best_mbmode.mv[0].as_mv;
+ const int mv_thresh = 16 << sf->rt_sf.prune_intra_mode_based_on_mv_range;
+ if (abs(best_mv.row) < mv_thresh && abs(best_mv.col) < mv_thresh &&
+ x->source_variance > 128) {
+ search_state->intra_search_state.skip_intra_modes = 1;
+ return;
+ }
+ }
+
+ const unsigned int src_var_thresh_intra_skip = 1;
+ const int skip_intra_in_interframe = sf->intra_sf.skip_intra_in_interframe;
+ if (!(skip_intra_in_interframe &&
+ (x->source_variance > src_var_thresh_intra_skip)))
+ return;
+
+ // Prune intra search based on the best inter mode being transform skip.
+ if ((skip_intra_in_interframe >= 2) && search_state->best_mbmode.skip_txfm) {
+ const int qindex_thresh[2] = { 200, MAXQ };
+ const int ind = (skip_intra_in_interframe >= 3) ? 1 : 0;
+ if (!have_newmv_in_inter_mode(search_state->best_mbmode.mode) &&
+ (x->qindex <= qindex_thresh[ind])) {
+ search_state->intra_search_state.skip_intra_modes = 1;
+ return;
+ } else if ((skip_intra_in_interframe >= 4) &&
+ (inter_cost < 0 || intra_cost < 0)) {
+ search_state->intra_search_state.skip_intra_modes = 1;
+ return;
+ }
+ }
+ // Use ML model to prune intra search.
+ if (inter_cost >= 0 && intra_cost >= 0) {
+ const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480)
+ ? &av1_intrap_nn_config
+ : &av1_intrap_hd_nn_config;
+ float nn_features[6];
+ float scores[2] = { 0.0f };
+
+ nn_features[0] = (float)search_state->best_mbmode.skip_txfm;
+ nn_features[1] = (float)mi_size_wide_log2[bsize];
+ nn_features[2] = (float)mi_size_high_log2[bsize];
+ nn_features[3] = (float)intra_cost;
+ nn_features[4] = (float)inter_cost;
+ const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+ const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd);
+ nn_features[5] = (float)(ac_q_max / ac_q);
+
+ av1_nn_predict(nn_features, nn_config, 1, scores);
+
+ // For two parameters, the max prob returned from av1_nn_softmax equals
+ // 1.0 / (1.0 + e^(-|diff_score|)). Here the scores are used directly to
+ // avoid calling av1_nn_softmax.
+ const float thresh[5] = { 1.4f, 1.4f, 1.4f, 1.4f, 1.4f };
+ assert(skip_intra_in_interframe <= 5);
+ if (scores[1] > scores[0] + thresh[skip_intra_in_interframe - 1]) {
+ search_state->intra_search_state.skip_intra_modes = 1;
+ }
+ }
+}
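+
+// A worked note on the thresholds above: for two outputs, a score margin of
+// 1.4 corresponds to a softmax probability of about
+// 1.0 / (1.0 + e^(-1.4)) ~= 0.80, so the intra search is skipped only when
+// the model is roughly 80% confident that inter prediction wins.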
+
+static AOM_INLINE bool skip_interp_filter_search(const AV1_COMP *cpi,
+ int is_single_pred) {
+ const MODE encoding_mode = cpi->oxcf.mode;
+ if (encoding_mode == REALTIME) {
+ return (cpi->common.current_frame.reference_mode == SINGLE_REFERENCE &&
+ (cpi->sf.interp_sf.skip_interp_filter_search ||
+ cpi->sf.winner_mode_sf.winner_mode_ifs));
+ } else if (encoding_mode == GOOD) {
+ // Skip interpolation filter search for single prediction modes.
+ return (cpi->sf.interp_sf.skip_interp_filter_search && is_single_pred);
+ }
+ return false;
+}
+
+static AOM_INLINE int get_block_temp_var(const AV1_COMP *cpi,
+ const MACROBLOCK *x,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+
+ if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION ||
+ !sf->rt_sf.short_circuit_low_temp_var ||
+ !sf->rt_sf.prune_inter_modes_using_temp_var) {
+ return 0;
+ }
+
+ const int mi_row = x->e_mbd.mi_row;
+ const int mi_col = x->e_mbd.mi_col;
+ int is_low_temp_var = 0;
+
+ if (cm->seq_params->sb_size == BLOCK_64X64)
+ is_low_temp_var = av1_get_force_skip_low_temp_var_small_sb(
+ &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+ else
+ is_low_temp_var = av1_get_force_skip_low_temp_var(
+ &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+
+ return is_low_temp_var;
+}
+
+// TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb.
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ AV1_COMMON *const cm = &cpi->common;
+ const FeatureFlags *const features = &cm->features;
+ const int num_planes = av1_num_planes(cm);
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int i;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *comp_inter_cost =
+ mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
+
+ InterModeSearchState search_state;
+ init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far);
+ INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+ };
+ HandleInterModeArgs args = { { NULL },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+ { NULL },
+ { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 },
+ NULL,
+ NULL,
+ NULL,
+ search_state.modelled_rd,
+ INT_MAX,
+ INT_MAX,
+ search_state.simple_rd,
+ 0,
+ false,
+ interintra_modes,
+ { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } },
+ { { 0, 0 } },
+ { 0 },
+ 0,
+ 0,
+ -1,
+ -1,
+ -1,
+ { 0 },
+ { 0 },
+ UINT_MAX };
+ // Currently, is_low_temp_var is used in real time encoding.
+ const int is_low_temp_var = get_block_temp_var(cpi, x, bsize);
+
+ for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1;
+ // Indicates the appropriate number of simple translation winner modes for
+ // exhaustive motion mode evaluation
+ const int max_winner_motion_mode_cand =
+ num_winner_motion_modes[sf->winner_mode_sf.motion_mode_for_winner_cand];
+ assert(max_winner_motion_mode_cand <= MAX_WINNER_MOTION_MODES);
+ motion_mode_candidate motion_mode_cand;
+ motion_mode_best_st_candidate best_motion_mode_cands;
+ // Initializing the number of motion mode candidates to zero.
+ best_motion_mode_cands.num_motion_mode_cand = 0;
+ for (i = 0; i < MAX_WINNER_MOTION_MODES; ++i)
+ best_motion_mode_cands.motion_mode_cand[i].rd_cost = INT64_MAX;
+
+ for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+ av1_invalid_rd_stats(rd_cost);
+
+ for (i = 0; i < REF_FRAMES; ++i) {
+ x->warp_sample_info[i].num = -1;
+ }
+
+ // Ref frames that are selected by square partition blocks.
+ int picked_ref_frames_mask = 0;
+ if (sf->inter_sf.prune_ref_frame_for_rect_partitions &&
+ mbmi->partition != PARTITION_NONE) {
+ // prune_ref_frame_for_rect_partitions = 1 implies pruning only for
+ // extended partition blocks; prune_ref_frame_for_rect_partitions >= 2
+ // implies pruning for vert, horiz and extended partition blocks.
+ if ((mbmi->partition != PARTITION_VERT &&
+ mbmi->partition != PARTITION_HORZ) ||
+ sf->inter_sf.prune_ref_frame_for_rect_partitions >= 2) {
+ picked_ref_frames_mask =
+ fetch_picked_ref_frames_mask(x, bsize, cm->seq_params->mib_size);
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
+ // Skip ref frames that were never selected by square blocks.
+ const int skip_ref_frame_mask =
+ picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
+ mode_skip_mask_t mode_skip_mask;
+ unsigned int ref_costs_single[REF_FRAMES];
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ // init params, set frame modes, speed features
+ set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask,
+ skip_ref_frame_mask, ref_costs_single,
+ ref_costs_comp, yv12_mb);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
+
+ int64_t best_est_rd = INT64_MAX;
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ // If do_tx_search is 0, only estimated RD should be computed.
+ // If do_tx_search is 1, all modes have TX search performed.
+ const int do_tx_search =
+ !((sf->inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
+ (sf->inter_sf.inter_mode_rd_model_estimation == 2 &&
+ num_pels_log2_lookup[bsize] > 8));
+ InterModesInfo *inter_modes_info = x->inter_modes_info;
+ inter_modes_info->num = 0;
+
+ // Temporary buffers used by handle_inter_mode().
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
+
+ // The best RD found for each reference frame, among single reference modes.
+ // Note that the 0-th element will contain a cut-off that is later used
+ // to determine if we should skip a compound mode.
+ int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX, INT64_MAX };
+
+ // Prepared stats used later to check if we could skip intra mode eval.
+ int64_t inter_cost = -1;
+ int64_t intra_cost = -1;
+ // Need to tweak the threshold for hdres speed 0 & 1.
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Obtain the relevant tpl stats for pruning inter modes
+ PruneInfoFromTpl inter_cost_info_from_tpl;
+#if !CONFIG_REALTIME_ONLY
+ if (sf->inter_sf.prune_inter_modes_based_on_tpl) {
+ // x->tpl_keep_ref_frame[id] = 1 => no pruning in
+ // prune_ref_by_selective_ref_frame()
+ // x->tpl_keep_ref_frame[id] = 0 => ref frame can be pruned in
+ // prune_ref_by_selective_ref_frame()
+ // Populating valid_refs[idx] = 1 ensures that
+ // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a
+ // pruned ref frame.
+ int valid_refs[INTER_REFS_PER_FRAME];
+ for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
+ const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME };
+ valid_refs[frame - 1] =
+ x->tpl_keep_ref_frame[frame] ||
+ !prune_ref_by_selective_ref_frame(
+ cpi, x, refs, cm->cur_frame->ref_display_order_hint);
+ }
+ av1_zero(inter_cost_info_from_tpl);
+ get_block_level_tpl_stats(cpi, bsize, mi_row, mi_col, valid_refs,
+ &inter_cost_info_from_tpl);
+ }
+
+ const int do_pruning =
+ (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1;
+ if (do_pruning && sf->intra_sf.skip_intra_in_interframe &&
+ cpi->oxcf.algo_cfg.enable_tpl_model)
+ calculate_cost_from_tpl_data(cpi, x, bsize, mi_row, mi_col, &inter_cost,
+ &intra_cost);
+#endif // !CONFIG_REALTIME_ONLY
+
+ // Initialize best mode stats for winner mode processing.
+ const int max_winner_mode_count =
+ winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type];
+ zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats);
+ x->winner_mode_count = 0;
+ store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
+ NULL, bsize, best_rd_so_far,
+ sf->winner_mode_sf.multi_winner_mode_type, 0);
+
+ int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS);
+ if (sf->inter_sf.prune_inter_modes_if_skippable) {
+ // Higher multiplication factor values for lower quantizers.
+ mode_thresh_mul_fact = mode_threshold_mul_factor[x->qindex];
+ }
+
+ // Initialize arguments for mode loop speed features
+ InterModeSFArgs sf_args = { &args.skip_motion_mode,
+ &mode_skip_mask,
+ &search_state,
+ skip_ref_frame_mask,
+ 0,
+ mode_thresh_mul_fact,
+ 0,
+ 0 };
+ int64_t best_inter_yrd = INT64_MAX;
+
+ // This is the main loop of this function. It loops over all possible inter
+ // modes and calls handle_inter_mode() to compute the RD for each.
+ // Here midx is just an iterator index that should not be used by itself
+ // except to keep track of the number of modes searched. It should be used
+ // with av1_default_mode_order to get the enum that defines the mode, which
+ // can be used with av1_mode_defs to get the prediction mode and the ref
+ // frames.
+ // TODO(yunqing, any): Setting mode_start and mode_end outside the for-loop
+ // brings a good speedup for the real time case. If we decide to use compound
+ // modes in real time, maybe we can modify the av1_default_mode_order table.
+ THR_MODES mode_start = THR_INTER_MODE_START;
+ THR_MODES mode_end = THR_INTER_MODE_END;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ if (current_frame->reference_mode == SINGLE_REFERENCE) {
+ mode_start = SINGLE_REF_MODE_START;
+ mode_end = SINGLE_REF_MODE_END;
+ }
+
+ for (THR_MODES midx = mode_start; midx < mode_end; ++midx) {
+ // Get the actual prediction mode we are trying in this iteration
+ const THR_MODES mode_enum = av1_default_mode_order[midx];
+ const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+ const PREDICTION_MODE this_mode = mode_def->mode;
+ const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame;
+
+ const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
+ const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
+ const int is_single_pred =
+ ref_frame > INTRA_FRAME && second_ref_frame == NONE_FRAME;
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+
+ init_mbmi(mbmi, this_mode, ref_frames, cm);
+
+ txfm_info->skip_txfm = 0;
+ sf_args.num_single_modes_processed += is_single_pred;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, skip_inter_mode_time);
+#endif
+ // Apply speed features to decide if this inter mode can be skipped
+ const int is_skip_inter_mode = skip_inter_mode(
+ cpi, x, bsize, ref_frame_rd, midx, &sf_args, is_low_temp_var);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, skip_inter_mode_time);
+#endif
+ if (is_skip_inter_mode) continue;
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->ref_mv_idx = 0;
+
+ const int64_t ref_best_rd = search_state.best_rd;
+ RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+
+ const int ref_frame_cost = comp_pred
+ ? ref_costs_comp[ref_frame][second_ref_frame]
+ : ref_costs_single[ref_frame];
+ const int compmode_cost =
+ is_comp_ref_allowed(mbmi->bsize) ? comp_inter_cost[comp_pred] : 0;
+ const int real_compmode_cost =
+ cm->current_frame.reference_mode == REFERENCE_MODE_SELECT
+ ? compmode_cost
+ : 0;
+ // Point to variables that are maintained between loop iterations
+ args.single_newmv = search_state.single_newmv;
+ args.single_newmv_rate = search_state.single_newmv_rate;
+ args.single_newmv_valid = search_state.single_newmv_valid;
+ args.single_comp_cost = real_compmode_cost;
+ args.ref_frame_cost = ref_frame_cost;
+ args.best_pred_sse = search_state.best_pred_sse;
+ args.skip_ifs = skip_interp_filter_search(cpi, is_single_pred);
+
+ int64_t skip_rd[2] = { search_state.best_skip_rd[0],
+ search_state.best_skip_rd[1] };
+ int64_t this_yrd = INT64_MAX;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_inter_mode_time);
+#endif
+ int64_t this_rd = handle_inter_mode(
+ cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &args,
+ ref_best_rd, tmp_buf, &x->comp_rd_buffer, &best_est_rd, do_tx_search,
+ inter_modes_info, &motion_mode_cand, skip_rd, &inter_cost_info_from_tpl,
+ &this_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_inter_mode_time);
+#endif
+ if (current_frame->reference_mode != SINGLE_REFERENCE) {
+ if (!args.skip_ifs &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 &&
+ is_inter_singleref_mode(this_mode)) {
+ collect_single_states(x, &search_state, mbmi);
+ }
+
+ if (sf->inter_sf.prune_comp_using_best_single_mode_ref > 0 &&
+ is_inter_singleref_mode(this_mode))
+ update_best_single_mode(&search_state, this_mode, ref_frame, this_rd);
+ }
+
+ if (this_rd == INT64_MAX) continue;
+
+ if (mbmi->skip_txfm) {
+ rd_stats_y.rate = 0;
+ rd_stats_uv.rate = 0;
+ }
+
+ if (sf->inter_sf.prune_compound_using_single_ref && is_single_pred &&
+ this_rd < ref_frame_rd[ref_frame]) {
+ ref_frame_rd[ref_frame] = this_rd;
+ }
+
+    // Did this mode help, i.e., is it the new best mode?
+ if (this_rd < search_state.best_rd) {
+ assert(IMPLIES(comp_pred,
+ cm->current_frame.reference_mode != SINGLE_REFERENCE));
+ search_state.best_pred_sse = x->pred_sse[ref_frame];
+ best_inter_yrd = this_yrd;
+ update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_enum, x, do_tx_search);
+ if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0];
+ // skip_rd[0] is the best total rd for a skip mode so far.
+ // skip_rd[1] is the best total rd for a skip mode so far in luma.
+ // When do_tx_search = 1, both skip_rd[0] and skip_rd[1] are updated.
+ // When do_tx_search = 0, skip_rd[1] is updated.
+ search_state.best_skip_rd[1] = skip_rd[1];
+ }
+ if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
+ // Add this mode to motion mode candidate list for motion mode search
+ // if using motion_mode_for_winner_cand speed feature
+ handle_winner_cand(mbmi, &best_motion_mode_cands,
+ max_winner_motion_mode_cand, this_rd,
+ &motion_mode_cand, args.skip_motion_mode);
+ }
+
+ /* keep record of best compound/single-only prediction */
+ record_best_compound(cm->current_frame.reference_mode, &rd_stats, comp_pred,
+ x->rdmult, &search_state, compmode_cost);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
+ if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
+    // For the single ref winner candidates, evaluate other motion modes
+    // (i.e., non-simple translation).
+ evaluate_motion_mode_for_winner_candidates(
+ cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb,
+ &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd,
+ &search_state, &best_inter_yrd);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, do_tx_search_time);
+#endif
+ if (do_tx_search != 1) {
+    // A full tx search has not yet been done; do the tx search for the
+    // top mode candidates.
+ tx_search_best_inter_candidates(cpi, tile_data, x, best_rd_so_far, bsize,
+ yv12_mb, mi_row, mi_col, &search_state,
+ rd_cost, ctx, &best_inter_yrd);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, do_tx_search_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_intra_mode_time);
+#endif
+  // Gate intra mode evaluation if the best inter mode is skip, except when the
+  // source variance is extremely low; also gate based on the max intra bsize.
+ skip_intra_modes_in_interframe(cm, x, bsize, &search_state, sf, inter_cost,
+ intra_cost);
+
+ const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
+ search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx,
+ &sf_args, intra_ref_frame_cost,
+ best_inter_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_intra_mode_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, refine_winner_mode_tx_time);
+#endif
+ int winner_mode_count =
+ sf->winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1;
+ // In effect only when fast tx search speed features are enabled.
+ refine_winner_mode_tx(
+ cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index,
+ &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
+ search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, refine_winner_mode_tx_time);
+#endif
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ // Only try palette mode when the best mode so far is an intra mode.
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(features->allow_screen_content_tools, mbmi->bsize) &&
+ !is_inter_mode(search_state.best_mbmode.mode) && rd_cost->rate != INT_MAX;
+ RD_STATS this_rd_cost;
+ int this_skippable = 0;
+ if (try_palette) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_search_palette_mode_time);
+#endif
+ this_skippable = av1_search_palette_mode(
+ &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost,
+ ctx, &this_rd_cost, search_state.best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_search_palette_mode_time);
+#endif
+ if (this_rd_cost.rdcost < search_state.best_rd) {
+ search_state.best_mode_index = THR_DC;
+ mbmi->mv[0].as_int = 0;
+ rd_cost->rate = this_rd_cost.rate;
+ rd_cost->dist = this_rd_cost.dist;
+ rd_cost->rdcost = this_rd_cost.rdcost;
+ search_state.best_rd = rd_cost->rdcost;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = 0;
+ search_state.best_mode_skippable = this_skippable;
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+ }
+
+ search_state.best_mbmode.skip_mode = 0;
+ if (cm->current_frame.skip_mode_info.skip_mode_flag &&
+ is_comp_ref_allowed(bsize)) {
+ const struct segmentation *const seg = &cm->seg;
+ unsigned char segment_id = mbmi->segment_id;
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, yv12_mb);
+ }
+ }
+
+ // Make sure that the ref_mv_idx is only nonzero when we're
+ // using a mode which can support ref_mv_idx
+ if (search_state.best_mbmode.ref_mv_idx != 0 &&
+ !(search_state.best_mbmode.mode == NEWMV ||
+ search_state.best_mbmode.mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) {
+ search_state.best_mbmode.ref_mv_idx = 0;
+ }
+
+ if (search_state.best_mode_index == THR_INVALID ||
+ search_state.best_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ const InterpFilter interp_filter = features->interp_filter;
+ assert((interp_filter == SWITCHABLE) ||
+ (interp_filter ==
+ search_state.best_mbmode.interp_filters.as_filters.y_filter) ||
+ !is_inter_block(&search_state.best_mbmode));
+ assert((interp_filter == SWITCHABLE) ||
+ (interp_filter ==
+ search_state.best_mbmode.interp_filters.as_filters.x_filter) ||
+ !is_inter_block(&search_state.best_mbmode));
+
+ if (!cpi->rc.is_src_frame_alt_ref && sf->inter_sf.adaptive_rd_thresh) {
+ av1_update_rd_thresh_fact(
+ cm, x->thresh_freq_fact, sf->inter_sf.adaptive_rd_thresh, bsize,
+ search_state.best_mode_index, mode_start, mode_end, THR_DC, MAX_MODES);
+ }
+
+ // macroblock modes
+ *mbmi = search_state.best_mbmode;
+ txfm_info->skip_txfm |= search_state.best_skip2;
+
+ // Note: this section is needed since the mode may have been forced to
+ // GLOBALMV by the all-zero mode handling of ref-mv.
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
+ // Correct the interp filters for GLOBALMV
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ int_interpfilters filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+ assert(mbmi->interp_filters.as_int == filters.as_int);
+ (void)filters;
+ }
+ }
+
+ txfm_info->skip_txfm |= search_state.best_mode_skippable;
+
+ assert(search_state.best_mode_index != THR_INVALID);
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context(x, ctx, search_state.best_mode_index,
+ search_state.best_mode_skippable);
+#else
+ store_coding_context(x, ctx, search_state.best_mode_skippable);
+#endif // CONFIG_INTERNAL_STATS
+
+ if (mbmi->palette_mode_info.palette_size[1] > 0) {
+ assert(try_palette);
+ av1_restore_uv_color_map(cpi, x);
+ }
+}
+
+void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
+ TileDataEnc *tile_data, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FeatureFlags *const features = &cm->features;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ unsigned char segment_id = mbmi->segment_id;
+ const int comp_pred = 0;
+ int i;
+ unsigned int ref_costs_single[REF_FRAMES];
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *comp_inter_cost =
+ mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
+ InterpFilter best_filter = SWITCHABLE;
+ int64_t this_rd = INT64_MAX;
+ int rate2 = 0;
+ const int64_t distortion2 = 0;
+ (void)mi_row;
+ (void)mi_col;
+ (void)tile_data;
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ estimate_ref_frame_costs(cm, xd, mode_costs, segment_id, ref_costs_single,
+ ref_costs_comp);
+
+ for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX;
+
+ rd_cost->rate = INT_MAX;
+
+ assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->mode = GLOBALMV;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->uv_mode = UV_DC_PRED;
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME))
+ mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ else
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->mv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
+ features->allow_high_precision_mv, bsize, mi_col,
+ mi_row, features->cur_frame_force_integer_mv)
+ .as_int;
+ mbmi->tx_size = max_txsize_lookup[bsize];
+ x->txfm_search_info.skip_txfm = 1;
+
+ mbmi->ref_mv_idx = 0;
+
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ av1_count_overlappable_neighbors(cm, xd);
+ if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
+ // Select the samples according to motion vector difference
+ if (mbmi->num_proj_ref > 1) {
+ mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
+ }
+ }
+
+ const InterpFilter interp_filter = features->interp_filter;
+ set_default_interp_filters(mbmi, interp_filter);
+
+ if (interp_filter != SWITCHABLE) {
+ best_filter = interp_filter;
+ } else {
+ best_filter = EIGHTTAP_REGULAR;
+ if (av1_is_interp_needed(xd)) {
+ int rs;
+ int best_rs = INT_MAX;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ mbmi->interp_filters = av1_broadcast_interp_filter(i);
+ rs = av1_get_switchable_rate(x, xd, interp_filter,
+ cm->seq_params->enable_dual_filter);
+ if (rs < best_rs) {
+ best_rs = rs;
+ best_filter = mbmi->interp_filters.as_filters.y_filter;
+ }
+ }
+ }
+ }
+ // Set the appropriate filter
+ mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
+ rate2 += av1_get_switchable_rate(x, xd, interp_filter,
+ cm->seq_params->enable_dual_filter);
+
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT)
+ rate2 += comp_inter_cost[comp_pred];
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rate2 += ref_costs_single[LAST_FRAME];
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+
+ if (this_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ assert((interp_filter == SWITCHABLE) ||
+ (interp_filter == mbmi->interp_filters.as_filters.y_filter));
+
+ if (cpi->sf.inter_sf.adaptive_rd_thresh) {
+ av1_update_rd_thresh_fact(cm, x->thresh_freq_fact,
+ cpi->sf.inter_sf.adaptive_rd_thresh, bsize,
+ THR_GLOBALMV, THR_INTER_MODE_START,
+ THR_INTER_MODE_END, THR_DC, MAX_MODES);
+ }
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context(x, ctx, THR_GLOBALMV, 0);
+#else
+ store_coding_context(x, ctx, 0);
+#endif // CONFIG_INTERNAL_STATS
+}
+
+/*!\cond */
+struct calc_target_weighted_pred_ctxt {
+ const OBMCBuffer *obmc_buffer;
+ const uint8_t *tmp;
+ int tmp_stride;
+ int overlap;
+};
+/*!\endcond */
+
+static INLINE void calc_target_weighted_pred_above(
+ MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) {
+ (void)nb_mi;
+ (void)num_planes;
+ (void)rel_mi_row;
+ (void)dir;
+
+ struct calc_target_weighted_pred_ctxt *ctxt =
+ (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+ const int bw = xd->width << MI_SIZE_LOG2;
+ const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+ int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_col * MI_SIZE);
+ int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_col * MI_SIZE);
+ const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ if (!is_hbd) {
+ for (int row = 0; row < ctxt->overlap; ++row) {
+ const uint8_t m0 = mask1d[row];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ for (int col = 0; col < op_mi_size * MI_SIZE; ++col) {
+ wsrc[col] = m1 * tmp[col];
+ mask[col] = m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp += ctxt->tmp_stride;
+ }
+ } else {
+ const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+ for (int row = 0; row < ctxt->overlap; ++row) {
+ const uint8_t m0 = mask1d[row];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ for (int col = 0; col < op_mi_size * MI_SIZE; ++col) {
+ wsrc[col] = m1 * tmp16[col];
+ mask[col] = m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp16 += ctxt->tmp_stride;
+ }
+ }
+}
+
+static INLINE void calc_target_weighted_pred_left(
+ MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) {
+ (void)nb_mi;
+ (void)num_planes;
+ (void)rel_mi_col;
+ (void)dir;
+
+ struct calc_target_weighted_pred_ctxt *ctxt =
+ (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+ const int bw = xd->width << MI_SIZE_LOG2;
+ const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+ int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_row * MI_SIZE * bw);
+ int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_row * MI_SIZE * bw);
+ const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ if (!is_hbd) {
+ for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+ for (int col = 0; col < ctxt->overlap; ++col) {
+ const uint8_t m0 = mask1d[col];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+ (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+ mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp += ctxt->tmp_stride;
+ }
+ } else {
+ const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+ for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+ for (int col = 0; col < ctxt->overlap; ++col) {
+ const uint8_t m0 = mask1d[col];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+ (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+ mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp16 += ctxt->tmp_stride;
+ }
+ }
+}
+
+// This function has a structure similar to av1_build_obmc_inter_prediction
+//
+// The OBMC predictor is computed as:
+//
+// PObmc(x,y) =
+// AOM_BLEND_A64(Mh(x),
+// AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+// PLeft(x, y))
+//
+// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+//  AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * PObmc(x,y) =
+// Mh(x) * Mv(y) * P(x,y) +
+// Mh(x) * Cv(y) * Pabove(x,y) +
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where :
+//
+// Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
+//  Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+// wsrc(x, y) =
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
+//      Mh(x) * Cv(y) * Pabove(x,y) -
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+//   error(x, y) =
+//     (wsrc(x, y) - mask(x, y) * P(x, y)) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
+//
+// (An illustrative sketch of this computation follows the function below.)
+//
+static AOM_INLINE void calc_target_weighted_pred(
+ const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+ const uint8_t *above, int above_stride, const uint8_t *left,
+ int left_stride) {
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ const int bw = xd->width << MI_SIZE_LOG2;
+ const int bh = xd->height << MI_SIZE_LOG2;
+ const OBMCBuffer *obmc_buffer = &x->obmc_buffer;
+ int32_t *mask_buf = obmc_buffer->mask;
+ int32_t *wsrc_buf = obmc_buffer->wsrc;
+
+ const int is_hbd = is_cur_buf_hbd(xd);
+ const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+
+ // plane 0 should not be sub-sampled
+ assert(xd->plane[0].subsampling_x == 0);
+ assert(xd->plane[0].subsampling_y == 0);
+
+ av1_zero_array(wsrc_buf, bw * bh);
+ for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+ // handle above row
+ if (xd->up_available) {
+ const int overlap =
+ AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+ struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, above,
+ above_stride, overlap };
+ foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ calc_target_weighted_pred_above, &ctxt);
+ }
+
+ for (int i = 0; i < bw * bh; ++i) {
+ wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+ mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+ }
+
+ // handle left column
+ if (xd->left_available) {
+ const int overlap =
+ AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+ struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, left,
+ left_stride, overlap };
+ foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ calc_target_weighted_pred_left, &ctxt);
+ }
+
+ if (!is_hbd) {
+ const uint8_t *src = x->plane[0].src.buf;
+
+ for (int row = 0; row < bh; ++row) {
+ for (int col = 0; col < bw; ++col) {
+ wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+ }
+ wsrc_buf += bw;
+ src += x->plane[0].src.stride;
+ }
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+ for (int row = 0; row < bh; ++row) {
+ for (int col = 0; col < bw; ++col) {
+ wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+ }
+ wsrc_buf += bw;
+ src += x->plane[0].src.stride;
+ }
+ }
+}
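+
+// An illustrative, non-library sketch of how the 'wsrc' and 'mask' buffers
+// built above can approximate the OBMC prediction error for an arbitrary
+// low-bitdepth candidate predictor P, following the derivation in the
+// comment before calc_target_weighted_pred(). The common
+// AOM_BLEND_A64_MAX_ALPHA scale factors cancel when comparing candidates, so
+// the scaled residual can be accumulated directly. Kept under #if 0: it is
+// an editorial example, not part of libaom.
+#if 0
+static int64_t obmc_approx_sse(const int32_t *wsrc, const int32_t *mask,
+                               const uint8_t *pred, int pred_stride, int bw,
+                               int bh) {
+  int64_t sse = 0;
+  for (int row = 0; row < bh; ++row) {
+    for (int col = 0; col < bw; ++col) {
+      // Scaled residual: wsrc(x, y) - mask(x, y) * P(x, y).
+      const int64_t diff = (int64_t)wsrc[col] - (int64_t)mask[col] * pred[col];
+      sse += diff * diff;
+    }
+    wsrc += bw;
+    mask += bw;
+    pred += pred_stride;
+  }
+  return sse;  // Scaled relative to the true SSE by AOM_BLEND_A64_MAX_ALPHA ** 4.
+}
+#endif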
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
new file mode 100644
index 0000000000..efb797e5b5
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
+
+#include <stdbool.h>
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define COMP_TYPE_RD_THRESH_SCALE 11
+#define COMP_TYPE_RD_THRESH_SHIFT 4
+#define MAX_WINNER_MOTION_MODES 10
+
+struct TileInfo;
+struct macroblock;
+struct RD_STATS;
+
+/*!\brief AV1 intra mode selection for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * Top level function for rd-based intra mode selection during intra frame
+ * encoding. This function will first search for the best luma prediction by
+ * calling av1_rd_pick_intra_sby_mode, then it searches for chroma prediction
+ * with av1_rd_pick_intra_sbuv_mode. If applicable, this function ends the
+ * search with an evaluation for intrabc.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock.
+ * \param[in] rd_cost Struct to keep track of the RD information.
+ * \param[in] bsize Current block size.
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process.
+ * \param[in] best_rd Best RD seen for this block so far.
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+/*!\brief AV1 inter mode selection.
+ *
+ * \ingroup inter_mode_search
+ * \callgraph
+ * Top level function for inter mode selection. This function will loop over
+ * all possible inter modes and select the best one for the current block by
+ * computing the RD cost. The mode search and RD are computed in
+ * handle_inter_mode(), which is called from this function within the main
+ * loop.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ data/contexts/models for the tile during
+ encoding
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ * \param[in] best_rd_so_far Best RD seen for this block so far
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+
+/*!\brief AV1 intra mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Top level function for Non-RD optimized intra mode selection.
+ * This function will loop over a subset of intra modes and select the best one
+ * based on the calculated modelled RD cost. Only 4 intra modes are checked, as
+ * specified in \c intra_mode_list. When calculating the RD cost, a Hadamard
+ * transform of the residual is used to estimate the rate. Estimation of the RD
+ * cost is performed in \c av1_estimate_block_intra, which is called from this
+ * function.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This function will loop over a subset of inter modes and select the best one
+ * based on the calculated modelled RD cost. When deciding which modes to
+ * check, this function applies heuristics based on previously checked modes,
+ * block residual variance, block size, and other factors to prune certain
+ * modes and reference frames. Currently only single reference frame modes
+ * are checked. Additional heuristics are applied to decide if intra modes
+ * need to be checked.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ data/contexts/models for the tile during
+ encoding
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx);
+
+void av1_rd_pick_inter_mode_sb_seg_skip(
+ const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
+
+static INLINE int coded_to_superres_mi(int mi_col, int denom) {
+ return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR;
+}
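+
+// A brief worked example (not part of the library). With SCALE_NUMERATOR == 8,
+// a superres denominator of 16 corresponds to a 2x horizontal upscale, and a
+// coded-resolution mi_col maps to the unscaled-frame mi_col with rounding:
+//   coded_to_superres_mi(10, 16) == (10 * 16 + 4) / 8 == 20
+//   coded_to_superres_mi(10, 8)  == (10 * 8 + 4) / 8  == 10  // no superres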
+
+static INLINE int av1_encoder_get_relative_dist(int a, int b) {
+ assert(a >= 0 && b >= 0);
+ return (a - b);
+}
+
+// This function returns the number of MIs in a superblock.
+static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) {
+ const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize];
+ int sb_mi_rows =
+ (mi_size_wide[cm->seq_params->sb_size] + mi_alloc_size_1d - 1) /
+ mi_alloc_size_1d;
+ assert(mi_size_wide[cm->seq_params->sb_size] ==
+ mi_size_high[cm->seq_params->sb_size]);
+ int sb_mi_size = sb_mi_rows * sb_mi_rows;
+
+ return sb_mi_size;
+}
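+
+// A worked example (not part of the library). For a 128x128 superblock
+// (mi_size_wide == 32) with an mi_alloc_bsize of BLOCK_16X16
+// (mi_alloc_size_1d == 4), sb_mi_rows == (32 + 3) / 4 == 8 and the function
+// returns 8 * 8 == 64.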
+
+// This function prunes the mode if either of the reference frames falls in
+// the pruning list.
+static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
+ const unsigned int *const ref_display_order_hint,
+ const unsigned int frame_display_order_hint,
+ const int *ref_frame_list) {
+ for (int i = 0; i < 2; i++) {
+ if (ref_frame_list[i] == NONE_FRAME) continue;
+
+ if (ref_frame[0] == ref_frame_list[i] ||
+ ref_frame[1] == ref_frame_list[i]) {
+ if (av1_encoder_get_relative_dist(
+ ref_display_order_hint[ref_frame_list[i] - LAST_FRAME],
+ frame_display_order_hint) < 0)
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static INLINE int has_closest_ref_frames(const MV_REFERENCE_FRAME *ref_frame,
+ int8_t closest_past_ref,
+ int8_t closest_future_ref) {
+ int has_closest_past_ref =
+ (ref_frame[0] == closest_past_ref) || (ref_frame[1] == closest_past_ref);
+ int has_closest_future_ref = (ref_frame[0] == closest_future_ref) ||
+ (ref_frame[1] == closest_future_ref);
+ return (has_closest_past_ref && has_closest_future_ref);
+}
+
+static INLINE int has_best_pred_mv_sad(const MV_REFERENCE_FRAME *ref_frame,
+ const MACROBLOCK *const x) {
+ int has_best_past_pred_mv_sad = 0;
+ int has_best_future_pred_mv_sad = 0;
+ if (x->best_pred_mv_sad[0] < INT_MAX && x->best_pred_mv_sad[1] < INT_MAX) {
+ has_best_past_pred_mv_sad =
+ (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[0]) ||
+ (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[0]);
+ has_best_future_pred_mv_sad =
+ (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[1]) ||
+ (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[1]);
+ }
+ return (has_best_past_pred_mv_sad && has_best_future_pred_mv_sad);
+}
+
+static INLINE int prune_ref_by_selective_ref_frame(
+ const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const MV_REFERENCE_FRAME *const ref_frame,
+ const unsigned int *const ref_display_order_hint) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ if (!sf->inter_sf.selective_ref_frame) return 0;
+
+ const int comp_pred = ref_frame[1] > INTRA_FRAME;
+
+ if (sf->inter_sf.selective_ref_frame >= 2 ||
+ (sf->inter_sf.selective_ref_frame == 1 && comp_pred)) {
+ int ref_frame_list[2] = { LAST3_FRAME, LAST2_FRAME };
+
+ if (x != NULL) {
+ // Disable pruning if either tpl suggests that we keep the frame or
+ // the pred_mv gives us the best sad
+ if (x->tpl_keep_ref_frame[LAST3_FRAME] ||
+ x->pred_mv_sad[LAST3_FRAME] == x->best_pred_mv_sad[0]) {
+ ref_frame_list[0] = NONE_FRAME;
+ }
+ if (x->tpl_keep_ref_frame[LAST2_FRAME] ||
+ x->pred_mv_sad[LAST2_FRAME] == x->best_pred_mv_sad[0]) {
+ ref_frame_list[1] = NONE_FRAME;
+ }
+ }
+
+ if (prune_ref(ref_frame, ref_display_order_hint,
+ ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME],
+ ref_frame_list))
+ return 1;
+ }
+
+ if (sf->inter_sf.selective_ref_frame >= 3) {
+ int ref_frame_list[2] = { ALTREF2_FRAME, BWDREF_FRAME };
+
+ if (x != NULL) {
+ // Disable pruning if either tpl suggests that we keep the frame or
+ // the pred_mv gives us the best sad
+ if (x->tpl_keep_ref_frame[ALTREF2_FRAME] ||
+ x->pred_mv_sad[ALTREF2_FRAME] == x->best_pred_mv_sad[0]) {
+ ref_frame_list[0] = NONE_FRAME;
+ }
+ if (x->tpl_keep_ref_frame[BWDREF_FRAME] ||
+ x->pred_mv_sad[BWDREF_FRAME] == x->best_pred_mv_sad[0]) {
+ ref_frame_list[1] = NONE_FRAME;
+ }
+ }
+
+ if (prune_ref(ref_frame, ref_display_order_hint,
+ ref_display_order_hint[LAST_FRAME - LAST_FRAME],
+ ref_frame_list))
+ return 1;
+ }
+
+ if (x != NULL && sf->inter_sf.prune_comp_ref_frames && comp_pred) {
+ int closest_ref_frames = has_closest_ref_frames(
+ ref_frame, cpi->ref_frame_dist_info.nearest_past_ref,
+ cpi->ref_frame_dist_info.nearest_future_ref);
+ if (closest_ref_frames == 0) {
+ // Prune reference frames which are not the closest to the current frame.
+ if (sf->inter_sf.prune_comp_ref_frames >= 2) {
+ return 1;
+ } else if (sf->inter_sf.prune_comp_ref_frames == 1) {
+        // Prune reference frames with non-minimum pred_mv_sad.
+ if (has_best_pred_mv_sad(ref_frame, x) == 0) return 1;
+ }
+ }
+ }
+
+ return 0;
+}
+
+// This function will copy the best reference mode information from
+// MB_MODE_INFO_EXT to MB_MODE_INFO_EXT_FRAME.
+static INLINE void av1_copy_mbmi_ext_to_mbmi_ext_frame(
+ MB_MODE_INFO_EXT_FRAME *mbmi_ext_best,
+ const MB_MODE_INFO_EXT *const mbmi_ext, uint8_t ref_frame_type) {
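+  // Note: the sizeof expressions below are unevaluated, and indexing the 2-D
+  // source arrays with a single subscript yields one row; each memcpy
+  // therefore copies a full per-ref-frame-type row of the source array.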
+ memcpy(mbmi_ext_best->ref_mv_stack, mbmi_ext->ref_mv_stack[ref_frame_type],
+ sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+ memcpy(mbmi_ext_best->weight, mbmi_ext->weight[ref_frame_type],
+ sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+ mbmi_ext_best->mode_context = mbmi_ext->mode_context[ref_frame_type];
+ mbmi_ext_best->ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ memcpy(mbmi_ext_best->global_mvs, mbmi_ext->global_mvs,
+ sizeof(mbmi_ext->global_mvs));
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/rdopt_data_defs.h b/third_party/aom/av1/encoder/rdopt_data_defs.h
new file mode 100644
index 0000000000..ca7ef810f3
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt_data_defs.h
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
+#define AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static const THR_MODES intra_to_mode_idx[INTRA_MODE_NUM] = {
+ THR_DC, // DC_PRED,
+ THR_V_PRED, // V_PRED,
+ THR_H_PRED, // H_PRED,
+ THR_D45_PRED, // D45_PRED,
+ THR_D135_PRED, // D135_PRED,
+ THR_D113_PRED, // D113_PRED,
+ THR_D157_PRED, // D157_PRED,
+ THR_D203_PRED, // D203_PRED,
+ THR_D67_PRED, // D67_PRED,
+ THR_SMOOTH, // SMOOTH_PRED,
+ THR_SMOOTH_V, // SMOOTH_V_PRED,
+ THR_SMOOTH_H, // SMOOTH_H_PRED,
+ THR_PAETH, // PAETH_PRED,
+};
+
+/* clang-format off */
+static const THR_MODES single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM]
+ [REF_FRAMES] = {
+ // NEARESTMV,
+ { THR_INVALID, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3,
+ THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, },
+ // NEARMV,
+ { THR_INVALID, THR_NEARMV, THR_NEARL2, THR_NEARL3,
+ THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, },
+ // GLOBALMV,
+ { THR_INVALID, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3,
+ THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, },
+ // NEWMV,
+ { THR_INVALID, THR_NEWMV, THR_NEWL2, THR_NEWL3,
+ THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, },
+};
+/* clang-format on */
+
+/* clang-format off */
+static const THR_MODES comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES]
+ [REF_FRAMES] = {
+ // NEAREST_NEARESTMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEAR_NEARMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEAREST_NEWMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEW_NEARESTMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEAR_NEWMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3,
+ THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB,
+ THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWL2B,
+ THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWL3B,
+ THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWGB,
+ THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEW_NEARMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB,
+ THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARGB,
+ THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // GLOBAL_GLOBALMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3,
+ THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB,
+ THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALL2B,
+ THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALL3B,
+ THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALGB,
+ THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEW_NEWMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3,
+ THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB,
+ THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWL2B,
+ THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWL3B,
+ THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWGB,
+ THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
diff --git a/third_party/aom/av1/encoder/rdopt_utils.h b/third_party/aom/av1/encoder/rdopt_utils.h
new file mode 100644
index 0000000000..b6bc4927e3
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt_utils.h
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_RDOPT_UTILS_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/common/cfl.h"
+#include "av1/common/pred_common.h"
+#include "av1/encoder/rdopt_data_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_REF_MV_SEARCH 3
+#define MAX_TX_RD_GATE_LEVEL 5
+#define INTER_INTRA_RD_THRESH_SCALE 9
+#define INTER_INTRA_RD_THRESH_SHIFT 4
+
+typedef struct {
+ PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+// This array defines the mapping from the enums in THR_MODES to the actual
+// prediction modes and reference frames.
+static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = {
+ { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { NEWMV, { LAST_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { NEARMV, { LAST_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
+ { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
+ { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
+ { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
+ { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
+ { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  // TODO(zoeliu): May need to reconsider the order of the modes to check.
+
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+ { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },
+
+ { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ // intra modes
+ { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
+};
+
+// Number of winner modes allowed for different values of the speed feature
+// multi_winner_mode_type.
+static const int winner_mode_count_allowed[MULTI_WINNER_MODE_LEVELS] = {
+ 1, // MULTI_WINNER_MODE_OFF
+ 2, // MULTI_WINNER_MODE_FAST
+ 3 // MULTI_WINNER_MODE_DEFAULT
+};
+
+static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst,
+ const int num_planes) {
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].dst.buf = dst.plane[i];
+ xd->plane[i].dst.stride = dst.stride[i];
+ }
+}
+
+static AOM_INLINE void swap_dst_buf(MACROBLOCKD *xd,
+ const BUFFER_SET *dst_bufs[2],
+ int num_planes) {
+ const BUFFER_SET *buf0 = dst_bufs[0];
+ dst_bufs[0] = dst_bufs[1];
+ dst_bufs[1] = buf0;
+ restore_dst_buf(xd, *dst_bufs[0], num_planes);
+}
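+
+// A hypothetical usage sketch (not part of the library) of the two-buffer
+// ping-pong idiom above. dst_bufs[0] always names the buffer the MACROBLOCKD
+// currently writes into; dst_bufs[1] holds the best prediction so far.
+// 'tmp_buf' and 'orig_dst' are placeholder BUFFER_SETs; kept under #if 0.
+#if 0
+  const BUFFER_SET *dst_bufs[2] = { &tmp_buf, &orig_dst };
+  // A new best prediction now sits in dst_bufs[0]: keep it and point
+  // subsequent predictions at the other buffer.
+  swap_dst_buf(xd, dst_bufs, num_planes);
+#endif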
+
+/* clang-format on */
+// Calculate rd threshold based on ref best rd and relevant scaling factors
+static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd,
+ int mul_factor,
+ int div_factor) {
+ int64_t rd_thresh = ref_best_rd;
+ if (div_factor != 0) {
+ rd_thresh = ref_best_rd < (div_factor * (INT64_MAX / mul_factor))
+ ? ((ref_best_rd / div_factor) * mul_factor)
+ : INT64_MAX;
+ }
+ return rd_thresh;
+}
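+
+// A brief worked example (not part of the library). With ref_best_rd == 1000,
+// mul_factor == 11 and div_factor == 16, the threshold is
+// (1000 / 16) * 11 == 682. The comparison against
+// div_factor * (INT64_MAX / mul_factor) guarantees the divide-then-multiply
+// cannot overflow; beyond that bound the threshold saturates to INT64_MAX.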
+
+static AOM_INLINE THR_MODES
+get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME second_ref_frame) {
+ if (this_mode < INTRA_MODE_END) {
+ assert(ref_frame == INTRA_FRAME);
+ assert(second_ref_frame == NONE_FRAME);
+ return intra_to_mode_idx[this_mode - INTRA_MODE_START];
+ }
+ if (this_mode >= SINGLE_INTER_MODE_START &&
+ this_mode < SINGLE_INTER_MODE_END) {
+ assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+ return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
+ [ref_frame];
+ }
+ if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END &&
+ second_ref_frame != NONE_FRAME) {
+ assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+ assert((second_ref_frame > INTRA_FRAME) &&
+ (second_ref_frame <= ALTREF_FRAME));
+ return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame]
+ [second_ref_frame];
+ }
+ assert(0);
+ return THR_INVALID;
+}
+
+static AOM_INLINE int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
+ if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+ bsize == BLOCK_4X16 || bsize == BLOCK_16X4) {
+ return -1;
+ }
+ return 1;
+}
+
+// Get the transform block's dimensions, with the visible width/height cropped
+// to the frame boundary (derived from the MI units).
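+// For example, a 16x16 transform block whose bottom 8 rows lie below the
+// frame border reports *height = 16 but *visible_height = 8.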
+static AOM_INLINE void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
+ BLOCK_SIZE plane_bsize, int blk_row,
+ int blk_col, BLOCK_SIZE tx_bsize,
+ int *width, int *height,
+ int *visible_width,
+ int *visible_height) {
+ assert(tx_bsize <= plane_bsize);
+ const int txb_height = block_size_high[tx_bsize];
+ const int txb_width = block_size_wide[tx_bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ // TODO(aconverse@google.com): Investigate using crop_width/height here rather
+ // than the MI size
+ if (xd->mb_to_bottom_edge >= 0) {
+ *visible_height = txb_height;
+ } else {
+ const int block_height = block_size_high[plane_bsize];
+ const int block_rows =
+ (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height;
+ *visible_height =
+ clamp(block_rows - (blk_row << MI_SIZE_LOG2), 0, txb_height);
+ }
+ if (height) *height = txb_height;
+
+ if (xd->mb_to_right_edge >= 0) {
+ *visible_width = txb_width;
+ } else {
+ const int block_width = block_size_wide[plane_bsize];
+ const int block_cols =
+ (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width;
+ *visible_width =
+ clamp(block_cols - (blk_col << MI_SIZE_LOG2), 0, txb_width);
+ }
+ if (width) *width = txb_width;
+}
+
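+// Number of 4x4 units covered by the block; for example, BLOCK_16X16 yields
+// 1 << (8 - 4) = 16.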
+static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
+ int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2);
+ return num_blk;
+}
+
+static INLINE int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int64_t best_skip_rd, int64_t skip_rd,
+ int level, int is_luma_only) {
+ int eval_txfm = 1;
+  // Derive the aggressiveness factor for gating the transform search.
+  // A lower value indicates more aggressiveness. Be more conservative (high
+  // value) for (i) low quantizers and (ii) regions where prediction is poor.
+ const int scale[MAX_TX_RD_GATE_LEVEL + 1] = { INT_MAX, 4, 3, 2, 2, 1 };
+ const int qslope = 2 * (!is_luma_only);
+ const int level_to_qindex_map[MAX_TX_RD_GATE_LEVEL + 1] = { 0, 0, 0,
+ 80, 100, 140 };
+ int aggr_factor = 4;
+ assert(level <= MAX_TX_RD_GATE_LEVEL);
+ const int pred_qindex_thresh = level_to_qindex_map[level];
+ if (!is_luma_only && level <= 2) {
+ aggr_factor = 4 * AOMMAX(1, ROUND_POWER_OF_TWO((MAXQ - x->qindex) * qslope,
+ QINDEX_BITS));
+ }
+ if ((best_skip_rd >
+ (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS))) &&
+ (x->qindex >= pred_qindex_thresh))
+ aggr_factor *= scale[level];
+  // For level setting 1, be more conservative for the non-luma-only case even
+  // when prediction is good.
+ else if ((level <= 1) && !is_luma_only)
+ aggr_factor = (aggr_factor >> 2) * 6;
+
+  // Be more conservative for luma-only cases (called from compound type rd),
+  // since best_skip_rd is computed after the interpolation filter search while
+  // skip_rd is computed before it (with 8-bit prediction signals blended for
+  // WEDGE/DIFFWTD rather than 16-bit).
+ const int luma_mul[MAX_TX_RD_GATE_LEVEL + 1] = {
+ INT_MAX, 32, 29, 17, 17, 17
+ };
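+  // For example, on the luma-only path at level 1 (mul_factor = 32):
+  // rd_thresh = best_skip_rd * 4 * 32 >> 6 = 2 * best_skip_rd when the
+  // variance/qindex gate does not fire, and 8 * best_skip_rd when it does
+  // (aggr_factor scaled by scale[1] = 4).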
+ int mul_factor = is_luma_only ? luma_mul[level] : 16;
+ int64_t rd_thresh =
+ (best_skip_rd == INT64_MAX)
+ ? best_skip_rd
+ : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 6);
+ if (skip_rd > rd_thresh) eval_txfm = 0;
+ return eval_txfm;
+}
+
+static TX_MODE select_tx_mode(
+ const AV1_COMMON *cm, const TX_SIZE_SEARCH_METHOD tx_size_search_method) {
+ if (cm->features.coded_lossless) return ONLY_4X4;
+ if (tx_size_search_method == USE_LARGESTALL) {
+ return TX_MODE_LARGEST;
+ } else {
+ assert(tx_size_search_method == USE_FULL_RD ||
+ tx_size_search_method == USE_FAST_RD);
+ return TX_MODE_SELECT;
+ }
+}
+
+// Checks the conditions to disable winner mode processing
+static INLINE int bypass_winner_mode_processing(const MACROBLOCK *const x,
+ const SPEED_FEATURES *sf,
+ int use_txfm_skip,
+ int actual_txfm_skip,
+ PREDICTION_MODE best_mode) {
+ const int prune_winner_mode_eval_level =
+ sf->winner_mode_sf.prune_winner_mode_eval_level;
+
+  // Disable winner mode processing for blocks with low source variance.
+  // The aggressiveness of this pruning logic reduces as qindex increases.
+  // The threshold decreases linearly from 64 (at qindex 0) to 17 (at qindex
+  // 255).
+ if (prune_winner_mode_eval_level == 1) {
+ const unsigned int src_var_thresh = 64 - 48 * x->qindex / (MAXQ + 1);
+ if (x->source_variance < src_var_thresh) return 1;
+ } else if (prune_winner_mode_eval_level == 2) {
+    // Skip winner mode processing of blocks whose transform turns out to be
+    // skipped based on the eob alone, except for NEWMV mode.
+ if (!have_newmv_in_inter_mode(best_mode) && actual_txfm_skip) return 1;
+ } else if (prune_winner_mode_eval_level == 3) {
+    // Skip winner mode processing of blocks whose transform turns out to be
+    // skipped, except for NEWMV mode; the skip criterion depends on the
+    // quantizer.
+    // At high quantizers: take the conservative approach of considering
+    // transform skip based on the eob alone.
+    // At low quantizers: consider transform skip based on either the eob or
+    // the RD cost evaluation.
+ const int is_txfm_skip =
+ x->qindex > 127 ? actual_txfm_skip : actual_txfm_skip || use_txfm_skip;
+
+ if (!have_newmv_in_inter_mode(best_mode) && is_txfm_skip) return 1;
+ } else if (prune_winner_mode_eval_level >= 4) {
+ // Do not skip winner mode evaluation at low quantizers if normal mode's
+ // transform search was too aggressive.
+ if (sf->rd_sf.perform_coeff_opt >= 5 && x->qindex <= 70) return 0;
+
+ if (use_txfm_skip || actual_txfm_skip) return 1;
+ }
+
+ return 0;
+}
+
+// Checks the conditions to enable winner mode processing
+static INLINE int is_winner_mode_processing_enabled(const struct AV1_COMP *cpi,
+ const MACROBLOCK *const x,
+ MB_MODE_INFO *const mbmi,
+ int actual_txfm_skip) {
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const PREDICTION_MODE best_mode = mbmi->mode;
+
+ if (bypass_winner_mode_processing(x, sf, mbmi->skip_txfm, actual_txfm_skip,
+ best_mode))
+ return 0;
+
+  // TODO(any): Move block-independent condition checks to the frame level.
+ if (is_inter_block(mbmi)) {
+ if (is_inter_mode(best_mode) &&
+ (sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != INT_MAX) &&
+ !cpi->oxcf.txfm_cfg.use_inter_dct_only)
+ return 1;
+ } else {
+ if (sf->tx_sf.tx_type_search.fast_intra_tx_type_search &&
+ !cpi->oxcf.txfm_cfg.use_intra_default_tx_only &&
+ !cpi->oxcf.txfm_cfg.use_intra_dct_only)
+ return 1;
+ }
+
+ // Check speed feature related to winner mode processing
+ if (sf->winner_mode_sf.enable_winner_mode_for_coeff_opt &&
+ cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT &&
+ cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT)
+ return 1;
+ if (sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch) return 1;
+
+ return 0;
+}
+
+static INLINE void set_tx_size_search_method(
+ const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params,
+ TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_size_srch,
+ int is_winner_mode) {
+ // Populate transform size search method/transform mode appropriately
+ txfm_params->tx_size_search_method =
+ winner_mode_params->tx_size_search_methods[DEFAULT_EVAL];
+ if (enable_winner_mode_for_tx_size_srch) {
+ if (is_winner_mode)
+ txfm_params->tx_size_search_method =
+ winner_mode_params->tx_size_search_methods[WINNER_MODE_EVAL];
+ else
+ txfm_params->tx_size_search_method =
+ winner_mode_params->tx_size_search_methods[MODE_EVAL];
+ }
+ txfm_params->tx_mode_search_type =
+ select_tx_mode(cm, txfm_params->tx_size_search_method);
+}
+
+static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf,
+ TxfmSearchParams *txfm_params,
+ int winner_mode_tx_type_pruning,
+ int is_winner_mode) {
+  // Populate the transform type pruning mode appropriately.
+ txfm_params->prune_2d_txfm_mode = sf->tx_sf.tx_type_search.prune_2d_txfm_mode;
+ if (!winner_mode_tx_type_pruning) return;
+
+ const int prune_mode[4][2] = { { TX_TYPE_PRUNE_3, TX_TYPE_PRUNE_0 },
+ { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 },
+ { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 },
+ { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_3 } };
+ txfm_params->prune_2d_txfm_mode =
+ prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode];
+}
+
+static INLINE void set_tx_domain_dist_params(
+ const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params,
+ int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) {
+ if (txfm_params->use_qm_dist_metric) {
+ // QM-weighted PSNR is computed in transform space, so we need to forcibly
+ // enable the use of tx domain distortion.
+ txfm_params->use_transform_domain_distortion = 1;
+ txfm_params->tx_domain_dist_threshold = 0;
+ return;
+ }
+
+ if (!enable_winner_mode_for_tx_domain_dist) {
+ txfm_params->use_transform_domain_distortion =
+ winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL];
+ txfm_params->tx_domain_dist_threshold =
+ winner_mode_params->tx_domain_dist_threshold[DEFAULT_EVAL];
+ return;
+ }
+
+ if (is_winner_mode) {
+ txfm_params->use_transform_domain_distortion =
+ winner_mode_params->use_transform_domain_distortion[WINNER_MODE_EVAL];
+ txfm_params->tx_domain_dist_threshold =
+ winner_mode_params->tx_domain_dist_threshold[WINNER_MODE_EVAL];
+ } else {
+ txfm_params->use_transform_domain_distortion =
+ winner_mode_params->use_transform_domain_distortion[MODE_EVAL];
+ txfm_params->tx_domain_dist_threshold =
+ winner_mode_params->tx_domain_dist_threshold[MODE_EVAL];
+ }
+}
+
+// This function sets mode parameters for different mode evaluation stages
+static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi,
+ MACROBLOCK *x,
+ MODE_EVAL_TYPE mode_eval_type) {
+ const AV1_COMMON *cm = &cpi->common;
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params;
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+
+ txfm_params->use_qm_dist_metric =
+ cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR;
+
+ switch (mode_eval_type) {
+ case DEFAULT_EVAL:
+ txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
+ txfm_params->use_default_intra_tx_type = 0;
+ txfm_params->skip_txfm_level =
+ winner_mode_params->skip_txfm_level[DEFAULT_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[DEFAULT_EVAL];
+ // Set default transform domain distortion type
+ set_tx_domain_dist_params(winner_mode_params, txfm_params, 0, 0);
+
+ // Get default threshold for R-D optimization of coefficients
+ get_rd_opt_coeff_thresh(winner_mode_params->coeff_opt_thresholds,
+ txfm_params, 0, 0);
+
+ // Set default transform size search method
+ set_tx_size_search_method(cm, winner_mode_params, txfm_params, 0, 0);
+ // Set default transform type prune
+ set_tx_type_prune(sf, txfm_params, 0, 0);
+ break;
+ case MODE_EVAL:
+ txfm_params->use_default_intra_tx_type =
+ (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search ||
+ cpi->oxcf.txfm_cfg.use_intra_default_tx_only);
+ txfm_params->default_inter_tx_type_prob_thresh =
+ cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh;
+ txfm_params->skip_txfm_level =
+ winner_mode_params->skip_txfm_level[MODE_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[MODE_EVAL];
+ // Set transform domain distortion type for mode evaluation
+ set_tx_domain_dist_params(
+ winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0);
+
+ // Get threshold for R-D optimization of coefficients during mode
+ // evaluation
+ get_rd_opt_coeff_thresh(
+ winner_mode_params->coeff_opt_thresholds, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0);
+
+ // Set the transform size search method for mode evaluation
+ set_tx_size_search_method(
+ cm, winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0);
+ // Set transform type prune for mode evaluation
+ set_tx_type_prune(sf, txfm_params,
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
+ 0);
+ break;
+ case WINNER_MODE_EVAL:
+ txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
+ txfm_params->use_default_intra_tx_type = 0;
+ txfm_params->skip_txfm_level =
+ winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[WINNER_MODE_EVAL];
+
+ // Set transform domain distortion type for winner mode evaluation
+ set_tx_domain_dist_params(
+ winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1);
+
+ // Get threshold for R-D optimization of coefficients for winner mode
+ // evaluation
+ get_rd_opt_coeff_thresh(
+ winner_mode_params->coeff_opt_thresholds, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+
+ // Set the transform size search method for winner mode evaluation
+ set_tx_size_search_method(
+ cm, winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+ // Set default transform type prune mode for winner mode evaluation
+ set_tx_type_prune(sf, txfm_params,
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
+ 1);
+ break;
+ default: assert(0);
+ }
+
+  // An rd record collected at a specific mode evaluation stage cannot be used
+  // across other evaluation stages, as the transform parameters are different.
+  // Hence, reset the mb rd record whenever the mode evaluation stage changes.
+ if (txfm_params->mode_eval_type != mode_eval_type)
+ reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+
+ txfm_params->mode_eval_type = mode_eval_type;
+}
+
+// Similar to store_cfl_required(), but for use during the RDO process,
+// where we haven't yet determined whether this block uses CfL.
+static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
+ const MACROBLOCK *x) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+  if (cm->seq_params->monochrome || x->skip_chroma_rd) return CFL_DISALLOWED;
+
+ if (!xd->is_chroma_ref) {
+ // For non-chroma-reference blocks, we should always store the luma pixels,
+ // in case the corresponding chroma-reference block uses CfL.
+ // Note that this can only happen for block sizes which are <8 on
+ // their shortest side, as otherwise they would be chroma reference
+ // blocks.
+ return CFL_ALLOWED;
+ }
+
+ // For chroma reference blocks, we should store data in the encoder iff we're
+ // allowed to try out CfL.
+ return is_cfl_allowed(xd);
+}
+
+static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+}
+
+// Store best mode stats for winner mode processing
+static INLINE void store_winner_mode_stats(
+ const AV1_COMMON *const cm, MACROBLOCK *x, const MB_MODE_INFO *mbmi,
+ RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv,
+ THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd,
+ int multi_winner_mode_type, int txfm_search_done) {
+ WinnerModeStats *winner_mode_stats = x->winner_mode_stats;
+ int mode_idx = 0;
+ int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0;
+  // Mode stats are not required when multi-winner mode processing is disabled.
+  if (multi_winner_mode_type == MULTI_WINNER_MODE_OFF) return;
+  // Ignore a mode whose rd cost is the maximum (INT64_MAX).
+  if (this_rd == INT64_MAX) return;
+  // TODO(any): Winner mode processing is currently not applicable for palette
+  // mode in inter frames. Clean up the following code once support is added.
+  if (!frame_is_intra_only(cm) && is_palette_mode) return;
+
+ int max_winner_mode_count = winner_mode_count_allowed[multi_winner_mode_type];
+ assert(x->winner_mode_count >= 0 &&
+ x->winner_mode_count <= max_winner_mode_count);
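+  // The stats array is kept sorted by rd in ascending order: a new candidate
+  // is inserted at its sorted position, and when the array is full the
+  // worst (largest-rd) entry falls off the end.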
+
+ if (x->winner_mode_count) {
+ // Find the mode which has higher rd cost than this_rd
+ for (mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++)
+ if (winner_mode_stats[mode_idx].rd > this_rd) break;
+
+ if (mode_idx == max_winner_mode_count) {
+ // No mode has higher rd cost than this_rd
+ return;
+ } else if (mode_idx < max_winner_mode_count - 1) {
+ // Create a slot for current mode and move others to the next slot
+ memmove(
+ &winner_mode_stats[mode_idx + 1], &winner_mode_stats[mode_idx],
+ (max_winner_mode_count - mode_idx - 1) * sizeof(*winner_mode_stats));
+ }
+ }
+ // Add a mode stat for winner mode processing
+ winner_mode_stats[mode_idx].mbmi = *mbmi;
+ winner_mode_stats[mode_idx].rd = this_rd;
+ winner_mode_stats[mode_idx].mode_index = mode_index;
+
+ // Update rd stats required for inter frame
+ if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END;
+ const int skip_txfm = mbmi->skip_txfm && !is_intra_mode;
+
+ winner_mode_stats[mode_idx].rd_cost = *rd_cost;
+ if (txfm_search_done) {
+ winner_mode_stats[mode_idx].rate_y =
+ rd_cost_y->rate +
+ x->mode_costs
+ .skip_txfm_cost[skip_ctx][rd_cost->skip_txfm || skip_txfm];
+ winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate;
+ }
+ }
+
+ if (color_map) {
+ // Store color_index_map for palette mode
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int block_width, block_height;
+ av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width,
+ &block_height, NULL, NULL);
+ memcpy(winner_mode_stats[mode_idx].color_index_map, color_map,
+ block_width * block_height * sizeof(color_map[0]));
+ }
+
+ x->winner_mode_count =
+ AOMMIN(x->winner_mode_count + 1, max_winner_mode_count);
+}
+
+unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi,
+ const MACROBLOCKD *xd,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bsize, int plane,
+ int use_hbd);
+
+unsigned int av1_get_perpixel_variance_facade(const struct AV1_COMP *cpi,
+ const MACROBLOCKD *xd,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bsize, int plane);
+
+static INLINE int is_mode_intra(PREDICTION_MODE mode) {
+ return mode < INTRA_MODE_END;
+}
+
+// This function copies the usable entries (ref_mv_stack[ref_frame][4] and
+// weight[ref_frame][4]) out of the full ref_mv_stack[ref_frame][8] and
+// weight[ref_frame][8] arrays in xd.
+static INLINE void av1_copy_usable_ref_mv_stack_and_weight(
+ const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext,
+ MV_REFERENCE_FRAME ref_frame) {
+ memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame],
+ USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0]));
+ memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame],
+ USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0]));
+}
+
+// Get transform rd gate level for the given transform search case.
+static INLINE int get_txfm_rd_gate_level(
+ const int is_masked_compound_enabled,
+ const int txfm_rd_gate_level[TX_SEARCH_CASES], BLOCK_SIZE bsize,
+ TX_SEARCH_CASE tx_search_case, int eval_motion_mode) {
+ assert(tx_search_case < TX_SEARCH_CASES);
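+  // The motion-mode case applies only to blocks with more than 256 pixels
+  // (num_pels_log2_lookup[bsize] > 8), and only when motion modes are not
+  // being explicitly evaluated.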
+ if (tx_search_case == TX_SEARCH_MOTION_MODE && !eval_motion_mode &&
+ num_pels_log2_lookup[bsize] > 8)
+ return txfm_rd_gate_level[TX_SEARCH_MOTION_MODE];
+ // Enable aggressive gating of transform search only when masked compound type
+ // is enabled.
+ else if (tx_search_case == TX_SEARCH_COMP_TYPE_MODE &&
+ is_masked_compound_enabled)
+ return txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE];
+
+ return txfm_rd_gate_level[TX_SEARCH_DEFAULT];
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_UTILS_H_
diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c
new file mode 100644
index 0000000000..9b964113a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static AOM_INLINE void enc_calc_subpel_params(
+ const MV *const src_mv, InterPredParams *const inter_pred_params,
+ uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
+ struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
+ init_subpel_params(src_mv, inter_pred_params, subpel_params, pre_buf->width,
+ pre_buf->height);
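+  // pos_x/pos_y are in units of 1/1024 pel (SCALE_SUBPEL_BITS == 10), so
+  // shifting down by SCALE_SUBPEL_BITS yields the integer-pel anchor of the
+  // prediction within the reference buffer.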
+ *pre = pre_buf->buf0 +
+ (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (subpel_params->pos_x >> SCALE_SUBPEL_BITS);
+ *src_stride = pre_buf->stride;
+}
+
+#define IS_DEC 0
+#include "av1/common/reconinter_template.inc"
+#undef IS_DEC
+
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+ const MV *src_mv,
+ InterPredParams *inter_pred_params) {
+ build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params);
+}
+
+static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, const MB_MODE_INFO *mi,
+ int bw, int bh, int mi_x, int mi_y) {
+ build_inter_predictors(cm, xd, plane, mi, /*build_for_obmc=*/0, bw, bh, mi_x,
+ mi_y);
+}
+
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ InterPredParams inter_pred_params;
+
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+ const MV mv = xd->mi[0]->mv[0].as_mv;
+ const struct scale_factors *const sf = xd->block_ref_scale_factors[0];
+
+ av1_init_inter_params(&inter_pred_params, pd->width, pd->height, mi_y, mi_x,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), false, sf, pd->pre,
+ xd->mi[0]->interp_filters);
+
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ 0, AOM_PLANE_Y, xd->tmp_conv_dst, MAX_SB_SIZE, false, xd->bd);
+
+ inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0;
+ av1_enc_build_one_inter_predictor(dst, dst_buf->stride, &mv,
+ &inter_pred_params);
+}
+
+void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params) {
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *pre_buf = &pd->pre[0];
+ const uint8_t *src =
+ pre_buf->buf0 +
+ (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (subpel_params->pos_x >> SCALE_SUBPEL_BITS);
+ uint8_t *const dst = dst_buf->buf;
+ int src_stride = pre_buf->stride;
+ int dst_stride = dst_buf->stride;
+ inter_pred_params->ref_frame_buf = *pre_buf;
+
+ // Initialize interp filter for single reference mode.
+ init_interp_filter_params(inter_pred_params->interp_filter_params,
+ &mbmi->interp_filters.as_filters, pd->width,
+ pd->height, /*is_intrabc=*/0);
+
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride, inter_pred_params,
+ subpel_params);
+}
+
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+ int plane_from, int plane_to) {
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ enc_build_inter_predictors(cm, xd, plane, xd->mi[0], xd->plane[plane].width,
+ xd->plane[plane].height, mi_x, mi_y);
+
+ if (is_interintra_pred(xd->mi[0])) {
+ BUFFER_SET default_ctx = {
+ { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
+ { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+ xd->plane[2].dst.stride }
+ };
+ if (!ctx) {
+ ctx = &default_ctx;
+ }
+ av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf,
+ xd->plane[plane].dst.stride, ctx, plane,
+ bsize);
+ }
+ }
+}
+
+static void setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
+ int mi_col_offset, MB_MODE_INFO *ref_mbmi,
+ struct build_prediction_ctxt *ctxt,
+ const int num_planes) {
+ const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->bsize);
+ const int ref_mi_row = xd->mi_row + mi_row_offset;
+ const int ref_mi_col = xd->mi_col + mi_col_offset;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane],
+ ctxt->tmp_width[plane], ctxt->tmp_height[plane],
+ ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset,
+ NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+
+ const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0];
+
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(ctxt->cm, frame);
+
+ xd->block_ref_scale_factors[0] = sf;
+ if (!av1_is_valid_scale(sf))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+
+ av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf,
+ num_planes);
+}
+
+static INLINE void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row,
+ int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *above_mbmi,
+ void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt,
+ num_planes);
+
+ const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2;
+ const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2;
+
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+
+ InterPredParams inter_pred_params;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = 0, bh = 0;
+
+ if (dir) {
+ // prepare left reference block size
+ bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+ block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+ bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y;
+ } else {
+ // prepare above reference block size
+ bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
+ bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+ block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+ }
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, dir)) continue;
+
+ const struct buf_2d *const pre_buf = &pd->pre[0];
+ const MV mv = above_mbmi->mv[0].as_mv;
+
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0,
+ xd->block_ref_scale_factors[0], pre_buf,
+ above_mbmi->interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, j, xd->bd);
+
+ av1_enc_build_one_inter_predictor(pd->dst.buf, pd->dst.stride, &mv,
+ &inter_pred_params);
+ }
+}
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->up_available) return;
+ struct build_prediction_ctxt ctxt = {
+ cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, NULL
+ };
+ BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ foreach_overlappable_nb_above(cm, xd,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ build_obmc_prediction, &ctxt);
+}
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->left_available) return;
+ struct build_prediction_ctxt ctxt = {
+ cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, NULL
+ };
+ BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ foreach_overlappable_nb_left(cm, xd,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ build_obmc_prediction, &ctxt);
+}
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) {
+ const int num_planes = av1_num_planes(cm);
+ uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+ av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1,
+ dst_stride1);
+ av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2,
+ dst_stride2);
+ av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row,
+ mi_col, 0, num_planes);
+ av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
+ dst_stride2);
+}
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
+ uint8_t *ext_dst[], int ext_dst_stride[]) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ WarpTypesAllowed warp_types;
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+
+ InterPredParams inter_pred_params;
+
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0,
+ xd->block_ref_scale_factors[ref], &pd->pre[ref],
+ mi->interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
+
+ uint8_t *const dst = get_buf_by_bd(xd, ext_dst[plane]);
+ const MV mv = mi->mv[ref].as_mv;
+
+ av1_enc_build_one_inter_predictor(dst, ext_dst_stride[plane], &mv,
+ &inter_pred_params);
+ }
+}
+
+static void build_masked_compound(
+ uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
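+  // subw/subh are 1 iff w/h are half the nominal luma block dimensions, i.e.
+  // the plane is subsampled in that direction; aom_blend_a64_mask then
+  // downsamples the luma-resolution mask to match.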
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, block_size_wide[sb_type], w, h, subw, subh);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void build_masked_compound_highbd(
+ uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w, int bd) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, block_size_wide[sb_type], w, h,
+ subw, subh, bd);
+}
+#endif
+
+static void build_wedge_inter_predictor_from_buf(
+ MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
+ int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ mbmi->interinter_comp.seg_mask = xd->seg_mask;
+ const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ if (is_compound && is_masked_compound_type(comp_data->type)) {
+ if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ av1_build_compound_diffwtd_mask_highbd(
+ comp_data->seg_mask, comp_data->mask_type,
+ CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
+ } else {
+ av1_build_compound_diffwtd_mask(
+ comp_data->seg_mask, comp_data->mask_type, ext_dst0,
+ ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+ }
+#else
+ (void)is_hbd;
+ av1_build_compound_diffwtd_mask(comp_data->seg_mask, comp_data->mask_type,
+ ext_dst0, ext_dst_stride0, ext_dst1,
+ ext_dst_stride1, h, w);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ }
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ build_masked_compound_highbd(
+ dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, mbmi->bsize,
+ h, w, xd->bd);
+ } else {
+ build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+ ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize,
+ h, w);
+ }
+#else
+ build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+ ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, h,
+ w);
+#endif
+ } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_SHORTPTR(dst), dst_buf->stride, w, h);
+ } else {
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
+ }
+#else
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
+#endif
+ }
+}
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[],
+ int ext_dst_stride0[],
+ uint8_t *ext_dst1[],
+ int ext_dst_stride1[]) {
+ int plane;
+ assert(bsize < BLOCK_SIZES_ALL);
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ build_wedge_inter_predictor_from_buf(
+ xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
+ ext_dst1[plane], ext_dst_stride1[plane]);
+ }
+}
+
+// Get pred block from up-sampled reference.
+void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+  // xd is expected to be NULL only in tests.
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
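+  // Three cases follow: integer-pel motion needs only a row copy; pure-
+  // horizontal or pure-vertical subpel needs a single convolve pass; mixed
+  // subpel does a horizontal pass into a temp buffer then a vertical pass.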
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ for (int i = 0; i < height; i++) {
+ memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+ comp_pred += width;
+ ref += ref_stride;
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+ -1, width, height);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+ 16, width, height);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t,
+ temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+ width, intermediate_height);
+ aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
+ MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+ width, height);
+ }
+}
+
+void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ int i, j;
+
+ aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
+void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ int subpel_search) {
+ if (subpel_x_q3 | subpel_y_q3) {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+ ref = comp_pred;
+ ref_stride = width;
+ }
+ aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+ mask_stride, invert_mask);
+}
+
+void aom_dist_wtd_comp_avg_upsampled_pred_c(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+ int i, j;
+ const int fwd_offset = jcp_param->fwd_offset;
+ const int bck_offset = jcp_param->bck_offset;
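+  // fwd_offset and bck_offset sum to 1 << DIST_PRECISION_BITS (i.e. 16), so
+  // the loop below computes a distance-weighted average of the two
+  // predictions.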
+
+ aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+ comp_pred[j] = (uint8_t)tmp;
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
+ const struct AV1Common *const cm, int mi_row,
+ int mi_col, const MV *const mv,
+ uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
+  // xd is expected to be NULL only in tests.
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (int i = 0; i < height; i++) {
+ memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+ comp_pred += width;
+ ref += ref_stride;
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
+ 16, NULL, -1, width, height, bd);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
+ kernel, 16, width, height, bd);
+ } else {
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+    // The bound must match the number of rows in the temp buffer above.
+    assert(intermediate_height <= (MAX_SB_SIZE + 16) + 16);
+ aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, CONVERT_TO_BYTEPTR(temp),
+ MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+ intermediate_height, bd);
+ aom_highbd_convolve8_vert_c(
+ CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
+ }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_c(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, int subpel_search) {
+ int i, j;
+
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+ int subpel_search) {
+ int i, j;
+ const int fwd_offset = jcp_param->fwd_offset;
+ const int bck_offset = jcp_param->bck_offset;
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+ comp_pred[j] = (uint16_t)tmp;
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
+void aom_highbd_comp_mask_upsampled_pred(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int bd, int subpel_search) {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
+ mask, mask_stride, invert_mask);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h
new file mode 100644
index 0000000000..16932f37a0
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_
+#define AOM_AV1_ENCODER_RECONINTER_ENC_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/warped_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ int subpel_search);
+
+void aom_highbd_comp_mask_upsampled_pred(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int bd, int subpel_search);
+
+// Build single or compound reference inter predictors for all planes.
+// Can also build inter-intra predictors, masked predictors, etc.
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+ int plane_from, int plane_to);
+
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col);
+
+void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params);
+
+// Build one inter predictor. It is called to build the predictor for the
+// single reference case, or for just the 1st or 2nd reference in the compound
+// reference case. Can build both regular and masked predictors.
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+ const MV *src_mv,
+ InterPredParams *inter_pred_params);
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive.
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
+ uint8_t *ext_dst[], int ext_dst_stride[]);
+
+// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive.
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[],
+ int ext_dst_stride0[],
+ uint8_t *ext_dst1[],
+ int ext_dst_stride1[]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_
diff --git a/third_party/aom/av1/encoder/saliency_map.c b/third_party/aom/av1/encoder/saliency_map.c
new file mode 100644
index 0000000000..30019bbec0
--- /dev/null
+++ b/third_party/aom/av1/encoder/saliency_map.c
@@ -0,0 +1,1414 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/saliency_map.h"
+
+// The Gabor filter is generated with the following parameters:
+// ksize = 9
+// sigma = 1
+// theta = y * np.pi / 4, where y is in {0, 1, 2, 3}, i.e., 0, 45, 90, and 135
+// degrees
+// lambda1 = 1
+// gamma = 0.8
+// phi = 0
+static const double kGaborFilter[4][9][9] = { // [angle: 0, 45, 90, 135
+ // degree][ksize][ksize]
+ { { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03,
+ 3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 },
+ { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02,
+ 3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 },
+ { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01,
+ 1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 },
+ { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01,
+ 4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 },
+ { 3.3546262e-04, 1.1108996e-02, 1.3533528e-01, 6.0653067e-01, 1.0000000e+00,
+ 6.0653067e-01, 1.3533528e-01, 1.1108996e-02, 3.3546262e-04 },
+ { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01,
+ 4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 },
+ { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01,
+ 1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 },
+ { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02,
+ 3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 },
+ { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03,
+ 3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 } },
+
+ { { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04,
+ 6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05,
+ 3.5712848e-05 },
+ { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03,
+ 1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03,
+ -8.1631159e-05 },
+ { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02,
+ -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03,
+ -9.9486928e-04 },
+ { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01,
+ -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02,
+ 1.3962291e-03 },
+ { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01,
+ 1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02,
+ 6.6981313e-04 },
+ { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01,
+ -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03,
+ -4.4602581e-04 },
+ { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02,
+ -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03,
+ 3.0079011e-06 },
+ { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02,
+ 1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06,
+ 3.8760313e-06 },
+ { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03,
+ 6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06,
+ -6.2165498e-08 } },
+
+ { { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04,
+ 2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 },
+ { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02,
+ 8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 },
+ { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01,
+ 9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 },
+ { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01,
+ 4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 },
+ { 5.9760227e-03, 5.6134764e-02, 2.7803731e-01, 7.2614902e-01, 1.0000000e+00,
+ 7.2614902e-01, 2.7803731e-01, 5.6134764e-02, 5.9760227e-03 },
+ { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01,
+ 4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 },
+ { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01,
+ 9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 },
+ { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02,
+ 8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 },
+ { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04,
+ 2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 } },
+
+ { { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03,
+ 6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06,
+ -6.2165498e-08 },
+ { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02,
+ 1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06,
+ 3.8760313e-06 },
+ { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02,
+ -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03,
+ 3.0079011e-06 },
+ { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01,
+ -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03,
+ -4.4602581e-04 },
+ { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01,
+ 1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02,
+ 6.6981313e-04 },
+ { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01,
+ -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02,
+ 1.3962291e-03 },
+ { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02,
+ -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03,
+ -9.9486928e-04 },
+ { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03,
+ 1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03,
+ -8.1631159e-05 },
+ { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04,
+ 6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05,
+ 3.5712848e-05 } }
+};
+
+// This function extracts the red/green/blue channels and calculates the
+// intensity = (r+g+b)/3. Note that it only handles the 8-bit case for now.
+// TODO(linzhen): add high bitdepth support.
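+// The conversion below is approximately the standard analog YUV-to-RGB
+// transform (e.g. R = clamp(Y + 1.370 * (V - 128))); the final divide by 256
+// normalizes each channel and the intensity to [0, 1).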
+static void get_color_intensity(const YV12_BUFFER_CONFIG *src,
+ int subsampling_x, int subsampling_y,
+ double *cr, double *cg, double *cb,
+ double *intensity) {
+ const uint8_t *y = src->buffers[0];
+ const uint8_t *u = src->buffers[1];
+ const uint8_t *v = src->buffers[2];
+
+ const int y_height = src->crop_heights[0];
+ const int y_width = src->crop_widths[0];
+ const int y_stride = src->strides[0];
+ const int c_stride = src->strides[1];
+
+ for (int i = 0; i < y_height; ++i) {
+ for (int j = 0; j < y_width; ++j) {
+ cr[i * y_width + j] =
+ fclamp((double)y[i * y_stride + j] +
+ 1.370 * (double)(v[(i >> subsampling_y) * c_stride +
+ (j >> subsampling_x)] -
+ 128),
+ 0, 255);
+ cg[i * y_width + j] =
+ fclamp((double)y[i * y_stride + j] -
+ 0.698 * (double)(u[(i >> subsampling_y) * c_stride +
+ (j >> subsampling_x)] -
+ 128) -
+ 0.337 * (double)(v[(i >> subsampling_y) * c_stride +
+ (j >> subsampling_x)] -
+ 128),
+ 0, 255);
+ cb[i * y_width + j] =
+ fclamp((double)y[i * y_stride + j] +
+ 1.732 * (double)(u[(i >> subsampling_y) * c_stride +
+ (j >> subsampling_x)] -
+ 128),
+ 0, 255);
+
+ intensity[i * y_width + j] =
+ (cr[i * y_width + j] + cg[i * y_width + j] + cb[i * y_width + j]) /
+ 3.0;
+ assert(intensity[i * y_width + j] >= 0 &&
+ intensity[i * y_width + j] <= 255);
+
+ intensity[i * y_width + j] /= 256;
+ cr[i * y_width + j] /= 256;
+ cg[i * y_width + j] /= 256;
+ cb[i * y_width + j] /= 256;
+ }
+ }
+}
+
+static INLINE double convolve_map(const double *filter, const double *map,
+ const int size) {
+ double result = 0;
+ for (int i = 0; i < size; ++i) {
+ result += filter[i] * map[i]; // symmetric filter is used
+ }
+ return result;
+}
+
+// This function decimates the map by a factor of two and applies a Gaussian
+// filter on top of the downsampled map.
+static INLINE void decimate_map(const double *map, int height, int width,
+ int stride, double *downsampled_map) {
+ const int new_width = width / 2;
+ const int window_size = 5;
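+  // The 5x5 Gaussian kernel below is the separable outer product of the
+  // binomial filter [1, 4, 6, 4, 1] / 16 with itself.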
+ const double gaussian_filter[25] = {
+ 1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16,
+ 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32,
+ 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256,
+ 1. / 64, 3. / 128, 1. / 64, 1. / 256
+ };
+
+ double map_region[25];
+ for (int y = 0; y < height - 1; y += 2) {
+ for (int x = 0; x < width - 1; x += 2) {
+ int i = 0;
+ for (int yy = y - window_size / 2; yy <= y + window_size / 2; ++yy) {
+ for (int xx = x - window_size / 2; xx <= x + window_size / 2; ++xx) {
+ int yvalue = clamp(yy, 0, height - 1);
+ int xvalue = clamp(xx, 0, width - 1);
+ map_region[i++] = map[yvalue * stride + xvalue];
+ }
+ }
+ downsampled_map[(y / 2) * new_width + (x / 2)] =
+ convolve_map(gaussian_filter, map_region, window_size * window_size);
+ }
+ }
+}
+
+// This function upscales the map from in_level size to out_level size. Note
+// that the map at "level-1" is 2x the size of the map at "level".
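+// For example, upscaling from level 2 to level 0 doubles the width and height
+// twice, using nearest-neighbor replication.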
+static INLINE int upscale_map(const double *input, int in_level, int out_level,
+ int height[9], int width[9], double *output) {
+ for (int level = in_level; level > out_level; level--) {
+ const int cur_width = width[level];
+ const int cur_height = height[level];
+ const int cur_stride = width[level];
+
+ double *original = (level == in_level) ? (double *)input : output;
+
+ assert(level > 0);
+
+ const int h_upscale = height[level - 1];
+ const int w_upscale = width[level - 1];
+ const int s_upscale = width[level - 1];
+
+ double *upscale = aom_malloc(h_upscale * w_upscale * sizeof(*upscale));
+
+ if (!upscale) {
+ return 0;
+ }
+
+ for (int i = 0; i < h_upscale; ++i) {
+ for (int j = 0; j < w_upscale; ++j) {
+ const int ii = clamp((i >> 1), 0, cur_height - 1);
+ const int jj = clamp((j >> 1), 0, cur_width - 1);
+ upscale[j + i * s_upscale] = (double)original[jj + ii * cur_stride];
+ }
+ }
+ memcpy(output, upscale, h_upscale * w_upscale * sizeof(double));
+ aom_free(upscale);
+ }
+
+ return 1;
+}
+
+// This function calculates the differences between a fine scale c and a
+// coarser scale s, yielding the feature maps. c \in {2, 3, 4}, and s = c +
+// delta, where delta \in {3, 4}.
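+// The six output maps are indexed j = 0..5 with (c, s) = (2,5), (2,6), (3,6),
+// (3,7), (4,7), (4,8).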
+static int center_surround_diff(const double *input[9], int height[9],
+ int width[9], saliency_feature_map *output[6]) {
+ int j = 0;
+ for (int k = 2; k < 5; ++k) {
+ int cur_height = height[k];
+ int cur_width = width[k];
+
+ if (upscale_map(input[k + 3], k + 3, k, height, width, output[j]->buf) ==
+ 0) {
+ return 0;
+ }
+
+ for (int r = 0; r < cur_height; ++r) {
+ for (int c = 0; c < cur_width; ++c) {
+ output[j]->buf[r * cur_width + c] =
+ fabs((double)(input[k][r * cur_width + c] -
+ output[j]->buf[r * cur_width + c]));
+ }
+ }
+
+ if (upscale_map(input[k + 4], k + 4, k, height, width,
+ output[j + 1]->buf) == 0) {
+ return 0;
+ }
+
+ for (int r = 0; r < cur_height; ++r) {
+ for (int c = 0; c < cur_width; ++c) {
+ output[j + 1]->buf[r * cur_width + c] =
+ fabs(input[k][r * cur_width + c] -
+ output[j + 1]->buf[r * cur_width + c]);
+ }
+ }
+
+ j += 2;
+ }
+ return 1;
+}
+
+// For color channels, the difference is calculated based on "color
+// double-opponency". For example, the RG feature map is constructed between a
+// fine scale c of the R-G component and a coarser scale s of the G-R
+// component.
+static int center_surround_diff_rgb(const double *input_1[9],
+ const double *input_2[9], int height[9],
+ int width[9],
+ saliency_feature_map *output[6]) {
+ int j = 0;
+ for (int k = 2; k < 5; ++k) {
+ int cur_height = height[k];
+ int cur_width = width[k];
+
+ if (upscale_map(input_2[k + 3], k + 3, k, height, width, output[j]->buf) ==
+ 0) {
+ return 0;
+ }
+
+ for (int r = 0; r < cur_height; ++r) {
+ for (int c = 0; c < cur_width; ++c) {
+ output[j]->buf[r * cur_width + c] =
+ fabs((double)(input_1[k][r * cur_width + c] -
+ output[j]->buf[r * cur_width + c]));
+ }
+ }
+
+ if (upscale_map(input_2[k + 4], k + 4, k, height, width,
+ output[j + 1]->buf) == 0) {
+ return 0;
+ }
+
+ for (int r = 0; r < cur_height; ++r) {
+ for (int c = 0; c < cur_width; ++c) {
+ output[j + 1]->buf[r * cur_width + c] =
+ fabs(input_1[k][r * cur_width + c] -
+ output[j + 1]->buf[r * cur_width + c]);
+ }
+ }
+
+ j += 2;
+ }
+ return 1;
+}
+
+// This function generates Gaussian pyramid images with indices from 0 to 8,
+// and constructs the feature maps by calculating the center-surround
+// differences.
+static int gaussian_pyramid(const double *src, int width[9], int height[9],
+ saliency_feature_map *dst[6]) {
+ double *gaussian_map[9]; // scale = 9
+ gaussian_map[0] =
+ (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[0]));
+ if (!gaussian_map[0]) {
+ return 0;
+ }
+
+ memcpy(gaussian_map[0], src, width[0] * height[0] * sizeof(double));
+
+ for (int i = 1; i < 9; ++i) {
+ int stride = width[i - 1];
+ int new_width = width[i];
+ int new_height = height[i];
+
+ gaussian_map[i] =
+ (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[i]));
+
+ if (!gaussian_map[i]) {
+ for (int l = 0; l < i; ++l) {
+ aom_free(gaussian_map[l]);
+ }
+ return 0;
+ }
+
+ memset(gaussian_map[i], 0, new_width * new_height * sizeof(double));
+
+ decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride,
+ gaussian_map[i]);
+ }
+
+ if (center_surround_diff((const double **)gaussian_map, height, width, dst) ==
+ 0) {
+ for (int l = 0; l < 9; ++l) {
+ aom_free(gaussian_map[l]);
+ }
+ return 0;
+ }
+
+ for (int i = 0; i < 9; ++i) {
+ aom_free(gaussian_map[i]);
+ }
+ return 1;
+}
+
+static int gaussian_pyramid_rgb(double *src_1, double *src_2, int width[9],
+ int height[9], saliency_feature_map *dst[6]) {
+ double *gaussian_map[2][9]; // scale = 9
+ double *src[2];
+
+ src[0] = src_1;
+ src[1] = src_2;
+
+ for (int k = 0; k < 2; ++k) {
+ gaussian_map[k][0] = (double *)aom_malloc(width[0] * height[0] *
+ sizeof(*gaussian_map[k][0]));
+    if (!gaussian_map[k][0]) {
+      // Free all pyramid levels allocated for earlier channels.
+      for (int l = 0; l < k; ++l) {
+        for (int m = 0; m < 9; ++m) {
+          aom_free(gaussian_map[l][m]);
+        }
+      }
+      return 0;
+    }
+ memcpy(gaussian_map[k][0], src[k], width[0] * height[0] * sizeof(double));
+
+ for (int i = 1; i < 9; ++i) {
+ int stride = width[i - 1];
+ int new_width = width[i];
+ int new_height = height[i];
+
+ gaussian_map[k][i] = (double *)aom_malloc(new_width * new_height *
+ sizeof(*gaussian_map[k][i]));
+      if (!gaussian_map[k][i]) {
+        // Free every map allocated so far.
+        for (int l = 0; l < k; ++l) {
+          for (int m = 0; m < 9; ++m) {
+            aom_free(gaussian_map[l][m]);
+          }
+        }
+        for (int m = 0; m < i; ++m) {
+          aom_free(gaussian_map[k][m]);
+        }
+        return 0;
+      }
+ memset(gaussian_map[k][i], 0, new_width * new_height * sizeof(double));
+ decimate_map(gaussian_map[k][i - 1], height[i - 1], width[i - 1], stride,
+ gaussian_map[k][i]);
+ }
+ }
+
+ if (center_surround_diff_rgb((const double **)gaussian_map[0],
+ (const double **)gaussian_map[1], height, width,
+ dst) == 0) {
+ for (int l = 0; l < 2; ++l) {
+ for (int i = 0; i < 9; ++i) {
+ aom_free(gaussian_map[l][i]);
+ }
+ }
+ return 0;
+ }
+
+ for (int l = 0; l < 2; ++l) {
+ for (int i = 0; i < 9; ++i) {
+ aom_free(gaussian_map[l][i]);
+ }
+ }
+ return 1;
+}
+
+static int get_feature_map_intensity(double *intensity, int width[9],
+                                     int height[9],
+                                     saliency_feature_map *i_map[6]) {
+  return gaussian_pyramid(intensity, width, height, i_map);
+}
+
+static int get_feature_map_rgb(double *cr, double *cg, double *cb, int width[9],
+ int height[9], saliency_feature_map *rg_map[6],
+ saliency_feature_map *by_map[6]) {
+ double *rg_mat = aom_malloc(height[0] * width[0] * sizeof(*rg_mat));
+ double *by_mat = aom_malloc(height[0] * width[0] * sizeof(*by_mat));
+ double *gr_mat = aom_malloc(height[0] * width[0] * sizeof(*gr_mat));
+ double *yb_mat = aom_malloc(height[0] * width[0] * sizeof(*yb_mat));
+
+ if (!rg_mat || !by_mat || !gr_mat || !yb_mat) {
+ aom_free(rg_mat);
+ aom_free(by_mat);
+ aom_free(gr_mat);
+ aom_free(yb_mat);
+ return 0;
+ }
+
+ double r, g, b, y;
+ for (int i = 0; i < height[0]; ++i) {
+ for (int j = 0; j < width[0]; ++j) {
+ r = AOMMAX(0, cr[i * width[0] + j] -
+ (cg[i * width[0] + j] + cb[i * width[0] + j]) / 2);
+ g = AOMMAX(0, cg[i * width[0] + j] -
+ (cr[i * width[0] + j] + cb[i * width[0] + j]) / 2);
+ b = AOMMAX(0, cb[i * width[0] + j] -
+ (cr[i * width[0] + j] + cg[i * width[0] + j]) / 2);
+ y = AOMMAX(0, (cr[i * width[0] + j] + cg[i * width[0] + j]) / 2 -
+ fabs(cr[i * width[0] + j] - cg[i * width[0] + j]) / 2 -
+ cb[i * width[0] + j]);
+
+ rg_mat[i * width[0] + j] = r - g;
+ by_mat[i * width[0] + j] = b - y;
+ gr_mat[i * width[0] + j] = g - r;
+ yb_mat[i * width[0] + j] = y - b;
+ }
+ }
+
+ if (gaussian_pyramid_rgb(rg_mat, gr_mat, width, height, rg_map) == 0 ||
+ gaussian_pyramid_rgb(by_mat, yb_mat, width, height, by_map) == 0) {
+ aom_free(rg_mat);
+ aom_free(by_mat);
+ aom_free(gr_mat);
+ aom_free(yb_mat);
+ return 0;
+ }
+
+ aom_free(rg_mat);
+ aom_free(by_mat);
+ aom_free(gr_mat);
+ aom_free(yb_mat);
+ return 1;
+}
+
+static INLINE void filter2d(const double *input, const double kernel[9][9],
+ int width, int height, double *output) {
+ const int window_size = 9;
+ double map_section[81];
+ for (int y = 0; y <= height - 1; ++y) {
+ for (int x = 0; x <= width - 1; ++x) {
+ int i = 0;
+ for (int yy = y - window_size / 2; yy <= y + window_size / 2; ++yy) {
+ for (int xx = x - window_size / 2; xx <= x + window_size / 2; ++xx) {
+ int yvalue = clamp(yy, 0, height - 1);
+ int xvalue = clamp(xx, 0, width - 1);
+ map_section[i++] = input[yvalue * width + xvalue];
+ }
+ }
+
+ output[y * width + x] = 0;
+ for (int k = 0; k < window_size; ++k) {
+ for (int l = 0; l < window_size; ++l) {
+ output[y * width + x] +=
+ kernel[k][l] * map_section[k * window_size + l];
+ }
+ }
+ }
+ }
+}
+
+static int get_feature_map_orientation(const double *intensity, int width[9],
+ int height[9],
+ saliency_feature_map *dst[24]) {
+ double *gaussian_map[9];
+
+ gaussian_map[0] =
+ (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[0]));
+ if (!gaussian_map[0]) {
+ return 0;
+ }
+ memcpy(gaussian_map[0], intensity, width[0] * height[0] * sizeof(double));
+
+ for (int i = 1; i < 9; ++i) {
+ int stride = width[i - 1];
+ int new_width = width[i];
+ int new_height = height[i];
+
+ gaussian_map[i] =
+ (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[i]));
+ if (!gaussian_map[i]) {
+ for (int l = 0; l < i; ++l) {
+ aom_free(gaussian_map[l]);
+ }
+ return 0;
+ }
+ memset(gaussian_map[i], 0, new_width * new_height * sizeof(double));
+ decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride,
+ gaussian_map[i]);
+ }
+
+  // [angle: 0, 45, 90, 135 degrees][pyramid level]
+  double *tempGaborOutput[4][9];
+
+ for (int i = 2; i < 9; ++i) {
+ const int cur_height = height[i];
+ const int cur_width = width[i];
+ for (int j = 0; j < 4; ++j) {
+ tempGaborOutput[j][i] = (double *)aom_malloc(
+ cur_height * cur_width * sizeof(*tempGaborOutput[j][i]));
+      if (!tempGaborOutput[j][i]) {
+        for (int l = 0; l < 9; ++l) {
+          aom_free(gaussian_map[l]);
+        }
+        // Free only the Gabor outputs allocated so far; later entries of
+        // tempGaborOutput are still uninitialized.
+        for (int g = 2; g < i; ++g) {
+          for (int h = 0; h < 4; ++h) {
+            aom_free(tempGaborOutput[h][g]);
+          }
+        }
+        for (int h = 0; h < j; ++h) {
+          aom_free(tempGaborOutput[h][i]);
+        }
+        return 0;
+      }
+ filter2d(gaussian_map[i], kGaborFilter[j], cur_width, cur_height,
+ tempGaborOutput[j][i]);
+ }
+ }
+
+ for (int i = 0; i < 9; ++i) {
+ aom_free(gaussian_map[i]);
+ }
+
+  // [angle: 0, 45, 90, 135 degrees][feature map index]
+  saliency_feature_map *tmp[4][6];
+
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ tmp[j][i] = dst[j * 6 + i];
+ }
+ }
+
+ for (int j = 0; j < 4; ++j) {
+ if (center_surround_diff((const double **)tempGaborOutput[j], height, width,
+ tmp[j]) == 0) {
+ for (int h = 0; h < 4; ++h) {
+ for (int g = 2; g < 9; ++g) {
+ aom_free(tempGaborOutput[h][g]);
+ }
+ }
+ return 0;
+ }
+ }
+
+ for (int i = 2; i < 9; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ aom_free(tempGaborOutput[j][i]);
+ }
+ }
+
+ return 1;
+}
+
+static INLINE void find_min_max(const saliency_feature_map *input,
+ double *max_value, double *min_value) {
+ assert(input && input->buf);
+ *min_value = DBL_MAX;
+ *max_value = 0.0;
+
+ for (int i = 0; i < input->height; ++i) {
+ for (int j = 0; j < input->width; ++j) {
+ assert(input->buf[i * input->width + j] >= 0.0);
+ *min_value = fmin(input->buf[i * input->width + j], *min_value);
+ *max_value = fmax(input->buf[i * input->width + j], *max_value);
+ }
+ }
+}
+
+static INLINE double average_local_max(const saliency_feature_map *input,
+ int stepsize) {
+ int numlocal = 0;
+ double lmaxmean = 0, lmax = 0, dummy = 0;
+ saliency_feature_map local_map;
+ local_map.height = stepsize;
+ local_map.width = stepsize;
+ local_map.buf =
+ (double *)aom_malloc(stepsize * stepsize * sizeof(*local_map.buf));
+
+ if (!local_map.buf) {
+ return -1;
+ }
+
+ for (int y = 0; y < input->height - stepsize; y += stepsize) {
+ for (int x = 0; x < input->width - stepsize; x += stepsize) {
+ for (int i = 0; i < stepsize; ++i) {
+ for (int j = 0; j < stepsize; ++j) {
+ local_map.buf[i * stepsize + j] =
+ input->buf[(y + i) * input->width + x + j];
+ }
+ }
+
+ find_min_max(&local_map, &lmax, &dummy);
+ lmaxmean += lmax;
+ numlocal++;
+ }
+ }
+
+  aom_free(local_map.buf);
+
+  // numlocal can be 0 if the input is smaller than stepsize; report an error
+  // in that case rather than dividing by zero.
+  if (numlocal == 0) return -1;
+  return lmaxmean / numlocal;
+}
+
+// Linearly normalize the values in the map to [0,1].
+static void minmax_normalize(saliency_feature_map *input) {
+ double max_value, min_value;
+ find_min_max(input, &max_value, &min_value);
+
+ for (int i = 0; i < input->height; ++i) {
+ for (int j = 0; j < input->width; ++j) {
+ if (max_value != min_value) {
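+        // Equivalent to (x - min) / (max - min).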
+ input->buf[i * input->width + j] =
+ input->buf[i * input->width + j] / (max_value - min_value) +
+ min_value / (min_value - max_value);
+ } else {
+ input->buf[i * input->width + j] -= min_value;
+ }
+ }
+ }
+}
+
+// This function promotes meaningful "activation spots" in the map and
+// suppresses homogeneous areas.
+static int normalization_operator(saliency_feature_map *input, int stepsize) {
+ minmax_normalize(input);
+ double lmaxmean = average_local_max(input, stepsize);
+ if (lmaxmean < 0) {
+ return 0;
+ }
+ double normCoeff = (1 - lmaxmean) * (1 - lmaxmean);
+
+ for (int i = 0; i < input->height; ++i) {
+ for (int j = 0; j < input->width; ++j) {
+ input->buf[i * input->width + j] *= normCoeff;
+ }
+ }
+
+ return 1;
+}
+
+// Normalize the values in feature maps to [0,1], and then upscale all maps to
+// the original frame size.
+static int normalize_fm(saliency_feature_map *input[6], int width[9],
+ int height[9], int num_fm,
+ saliency_feature_map *output[6]) {
+ // Feature maps (FM) are generated by function "center_surround_diff()". The
+ // difference is between a fine scale c and a coarser scale s, where c \in {2,
+ // 3, 4}, and s = c + delta, where delta \in {3, 4}, and the FM size is scale
+ // c. Specifically, i=0: c=2 and s=5, i=1: c=2 and s=6, i=2: c=3 and s=6, i=3:
+ // c=3 and s=7, i=4: c=4 and s=7, i=5: c=4 and s=8.
+ for (int i = 0; i < num_fm; ++i) {
+    if (normalization_operator(input[i], 8) == 0) {
+ return 0;
+ }
+
+ // Upscale FM to original frame size
+ if (upscale_map(input[i]->buf, (i / 2) + 2, 0, height, width,
+ output[i]->buf) == 0) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+// Combine feature maps with the same category (intensity, color, or
+// orientation) into one conspicuity map.
+static int normalized_map(saliency_feature_map *input[6], int width[9],
+ int height[9], saliency_feature_map *output) {
+ int num_fm = 6;
+
+ saliency_feature_map *n_input[6];
+ for (int i = 0; i < 6; ++i) {
+ n_input[i] = (saliency_feature_map *)aom_malloc(sizeof(*n_input[i]));
+    if (!n_input[i]) {
+      // Free the entries allocated in earlier iterations before bailing out.
+      for (int l = 0; l < i; ++l) {
+        aom_free(n_input[l]->buf);
+        aom_free(n_input[l]);
+      }
+      return 0;
+    }
+    n_input[i]->buf =
+        (double *)aom_malloc(width[0] * height[0] * sizeof(*n_input[i]->buf));
+    if (!n_input[i]->buf) {
+      aom_free(n_input[i]);
+      for (int l = 0; l < i; ++l) {
+        aom_free(n_input[l]->buf);
+        aom_free(n_input[l]);
+      }
+      return 0;
+    }
+ n_input[i]->height = height[0];
+ n_input[i]->width = width[0];
+ }
+
+ if (normalize_fm(input, width, height, num_fm, n_input) == 0) {
+ for (int i = 0; i < num_fm; ++i) {
+ aom_free(n_input[i]->buf);
+ aom_free(n_input[i]);
+ }
+ return 0;
+ }
+
+ // Add up all normalized feature maps with the same category into one map.
+ for (int i = 0; i < num_fm; ++i) {
+ for (int r = 0; r < height[0]; ++r) {
+ for (int c = 0; c < width[0]; ++c) {
+ output->buf[r * width[0] + c] += n_input[i]->buf[r * width[0] + c];
+ }
+ }
+ }
+
+ for (int i = 0; i < num_fm; ++i) {
+ aom_free(n_input[i]->buf);
+ aom_free(n_input[i]);
+ }
+
+  normalization_operator(output, 8);
+ return 1;
+}
+
+static int normalized_map_rgb(saliency_feature_map *rg_map[6],
+ saliency_feature_map *by_map[6], int width[9],
+ int height[9], saliency_feature_map *output) {
+ saliency_feature_map *color_cm[2]; // 0: color_cm_rg, 1: color_cm_by
+ for (int i = 0; i < 2; ++i) {
+ color_cm[i] = aom_malloc(sizeof(*color_cm[i]));
+    if (!color_cm[i]) {
+      // Free the entries allocated in earlier iterations before bailing out.
+      for (int l = 0; l < i; ++l) {
+        aom_free(color_cm[l]->buf);
+        aom_free(color_cm[l]);
+      }
+      return 0;
+    }
+    color_cm[i]->buf =
+        (double *)aom_malloc(width[0] * height[0] * sizeof(*color_cm[i]->buf));
+    if (!color_cm[i]->buf) {
+      aom_free(color_cm[i]);
+      for (int l = 0; l < i; ++l) {
+        aom_free(color_cm[l]->buf);
+        aom_free(color_cm[l]);
+      }
+      return 0;
+    }
+
+ color_cm[i]->width = width[0];
+ color_cm[i]->height = height[0];
+ memset(color_cm[i]->buf, 0,
+ width[0] * height[0] * sizeof(*color_cm[i]->buf));
+ }
+
+ if (normalized_map(rg_map, width, height, color_cm[0]) == 0 ||
+ normalized_map(by_map, width, height, color_cm[1]) == 0) {
+ for (int i = 0; i < 2; ++i) {
+ aom_free(color_cm[i]->buf);
+ aom_free(color_cm[i]);
+ }
+ return 0;
+ }
+
+ for (int r = 0; r < height[0]; ++r) {
+ for (int c = 0; c < width[0]; ++c) {
+ output->buf[r * width[0] + c] = color_cm[0]->buf[r * width[0] + c] +
+ color_cm[1]->buf[r * width[0] + c];
+ }
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ aom_free(color_cm[i]->buf);
+ aom_free(color_cm[i]);
+ }
+
+  normalization_operator(output, 8);
+ return 1;
+}
+
+static int normalized_map_orientation(saliency_feature_map *orientation_map[24],
+ int width[9], int height[9],
+ saliency_feature_map *output) {
+ int num_fms_per_angle = 6;
+
+ saliency_feature_map *ofm[4][6];
+ for (int i = 0; i < num_fms_per_angle; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ ofm[j][i] = orientation_map[j * num_fms_per_angle + i];
+ }
+ }
+
+ // extract conspicuity map for each angle
+ saliency_feature_map *nofm = aom_malloc(sizeof(*nofm));
+ if (!nofm) {
+ return 0;
+ }
+ nofm->buf = (double *)aom_malloc(width[0] * height[0] * sizeof(*nofm->buf));
+ if (!nofm->buf) {
+ aom_free(nofm);
+ return 0;
+ }
+ nofm->height = height[0];
+ nofm->width = width[0];
+
+ for (int i = 0; i < 4; ++i) {
+ memset(nofm->buf, 0, width[0] * height[0] * sizeof(*nofm->buf));
+ if (normalized_map(ofm[i], width, height, nofm) == 0) {
+ aom_free(nofm->buf);
+ aom_free(nofm);
+ return 0;
+ }
+
+ for (int r = 0; r < height[0]; ++r) {
+ for (int c = 0; c < width[0]; ++c) {
+ output->buf[r * width[0] + c] += nofm->buf[r * width[0] + c];
+ }
+ }
+ }
+
+ aom_free(nofm->buf);
+ aom_free(nofm);
+
+  normalization_operator(output, 8);
+ return 1;
+}
+
+// Set the pixel-level saliency mask based on the Itti-Koch algorithm.
+int av1_set_saliency_map(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ int frm_width = cm->width;
+ int frm_height = cm->height;
+
+ int pyr_height[9];
+ int pyr_width[9];
+
+ pyr_height[0] = frm_height;
+ pyr_width[0] = frm_width;
+
+ for (int i = 1; i < 9; ++i) {
+ pyr_width[i] = pyr_width[i - 1] / 2;
+ pyr_height[i] = pyr_height[i - 1] / 2;
+ }
+
+ double *cr = aom_malloc(frm_width * frm_height * sizeof(*cr));
+ double *cg = aom_malloc(frm_width * frm_height * sizeof(*cg));
+ double *cb = aom_malloc(frm_width * frm_height * sizeof(*cb));
+ double *intensity = aom_malloc(frm_width * frm_height * sizeof(*intensity));
+
+ if (!cr || !cg || !cb || !intensity) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ return 0;
+ }
+
+ // Extract red / green / blue channels and intensity component
+ get_color_intensity(cpi->source, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y, cr, cg, cb, intensity);
+
+ // Feature Map Extraction
+ // intensity map
+ saliency_feature_map *i_map[6];
+ for (int i = 0; i < 6; ++i) {
+ int cur_height = pyr_height[(i / 2) + 2];
+ int cur_width = pyr_width[(i / 2) + 2];
+
+ i_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*i_map[i]));
+ if (!i_map[i]) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+      for (int l = 0; l < i; ++l) {
+        aom_free(i_map[l]->buf);
+        aom_free(i_map[l]);
+      }
+ return 0;
+ }
+ i_map[i]->buf =
+ (double *)aom_malloc(cur_height * cur_width * sizeof(*i_map[i]->buf));
+ if (!i_map[i]->buf) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < i; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(i_map[l]);
+ }
+ return 0;
+ }
+ i_map[i]->height = cur_height;
+ i_map[i]->width = cur_width;
+ }
+
+ if (get_feature_map_intensity(intensity, pyr_width, pyr_height, i_map) == 0) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(i_map[l]);
+ }
+ return 0;
+ }
+
+ // RGB map
+ saliency_feature_map *rg_map[6], *by_map[6];
+ for (int i = 0; i < 6; ++i) {
+ int cur_height = pyr_height[(i / 2) + 2];
+ int cur_width = pyr_width[(i / 2) + 2];
+ rg_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*rg_map[i]));
+ by_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*by_map[i]));
+ if (!rg_map[i] || !by_map[i]) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+      for (int l = 0; l < 6; ++l) {
+        aom_free(i_map[l]->buf);
+        aom_free(i_map[l]);
+      }
+      // Entries past index i are still uninitialized; free only up to i.
+      for (int l = 0; l < i; ++l) {
+        aom_free(rg_map[l]->buf);
+        aom_free(rg_map[l]);
+        aom_free(by_map[l]->buf);
+        aom_free(by_map[l]);
+      }
+      aom_free(rg_map[i]);
+      aom_free(by_map[i]);
+ return 0;
+ }
+ rg_map[i]->buf =
+ (double *)aom_malloc(cur_height * cur_width * sizeof(*rg_map[i]->buf));
+ by_map[i]->buf =
+ (double *)aom_malloc(cur_height * cur_width * sizeof(*by_map[i]->buf));
+ if (!by_map[i]->buf || !rg_map[i]->buf) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(i_map[l]);
+ }
+      // Use l <= i so the current, partially initialized entry is freed too;
+      // aom_free(NULL) is a no-op.
+      for (int l = 0; l <= i; ++l) {
+        aom_free(rg_map[l]->buf);
+        aom_free(by_map[l]->buf);
+        aom_free(rg_map[l]);
+        aom_free(by_map[l]);
+      }
+ return 0;
+ }
+ rg_map[i]->height = cur_height;
+ rg_map[i]->width = cur_width;
+ by_map[i]->height = cur_height;
+ by_map[i]->width = cur_width;
+ }
+
+ if (get_feature_map_rgb(cr, cg, cb, pyr_width, pyr_height, rg_map, by_map) ==
+ 0) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+ return 0;
+ }
+
+ // Orientation map
+ saliency_feature_map *orientation_map[24];
+ for (int i = 0; i < 24; ++i) {
+ int cur_height = pyr_height[((i % 6) / 2) + 2];
+ int cur_width = pyr_width[((i % 6) / 2) + 2];
+
+ orientation_map[i] =
+ (saliency_feature_map *)aom_malloc(sizeof(*orientation_map[i]));
+ if (!orientation_map[i]) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+      for (int h = 0; h < i; ++h) {
+        aom_free(orientation_map[h]->buf);
+        aom_free(orientation_map[h]);
+      }
+ return 0;
+ }
+
+ orientation_map[i]->buf = (double *)aom_malloc(
+ cur_height * cur_width * sizeof(*orientation_map[i]->buf));
+ if (!orientation_map[i]->buf) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+
+      for (int h = 0; h < i; ++h) {
+        aom_free(orientation_map[h]->buf);
+        aom_free(orientation_map[h]);
+      }
+      aom_free(orientation_map[i]);
+ return 0;
+ }
+
+ orientation_map[i]->height = cur_height;
+ orientation_map[i]->width = cur_width;
+ }
+
+ if (get_feature_map_orientation(intensity, pyr_width, pyr_height,
+ orientation_map) == 0) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+ for (int h = 0; h < 24; ++h) {
+ aom_free(orientation_map[h]->buf);
+ aom_free(orientation_map[h]);
+ }
+ return 0;
+ }
+
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+
+ saliency_feature_map
+ *normalized_maps[3]; // 0: intensity, 1: color, 2: orientation
+
+ for (int i = 0; i < 3; ++i) {
+ normalized_maps[i] = aom_malloc(sizeof(*normalized_maps[i]));
+ if (!normalized_maps[i]) {
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+
+ for (int h = 0; h < 24; ++h) {
+ aom_free(orientation_map[h]->buf);
+ aom_free(orientation_map[h]);
+ }
+
+      for (int l = 0; l < i; ++l) {
+        aom_free(normalized_maps[l]->buf);
+        aom_free(normalized_maps[l]);
+      }
+ return 0;
+ }
+ normalized_maps[i]->buf = (double *)aom_malloc(
+ frm_width * frm_height * sizeof(*normalized_maps[i]->buf));
+ if (!normalized_maps[i]->buf) {
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+ for (int h = 0; h < 24; ++h) {
+ aom_free(orientation_map[h]->buf);
+ aom_free(orientation_map[h]);
+ }
+ for (int l = 0; l < i; ++l) {
+ aom_free(normalized_maps[l]->buf);
+ aom_free(normalized_maps[l]);
+ }
+ return 0;
+ }
+ normalized_maps[i]->width = frm_width;
+ normalized_maps[i]->height = frm_height;
+ memset(normalized_maps[i]->buf, 0,
+ frm_width * frm_height * sizeof(*normalized_maps[i]->buf));
+ }
+
+ // Conspicuity map generation
+ if (normalized_map(i_map, pyr_width, pyr_height, normalized_maps[0]) == 0 ||
+ normalized_map_rgb(rg_map, by_map, pyr_width, pyr_height,
+ normalized_maps[1]) == 0 ||
+ normalized_map_orientation(orientation_map, pyr_width, pyr_height,
+ normalized_maps[2]) == 0) {
+ for (int i = 0; i < 6; ++i) {
+ aom_free(i_map[i]->buf);
+ aom_free(rg_map[i]->buf);
+ aom_free(by_map[i]->buf);
+ aom_free(i_map[i]);
+ aom_free(rg_map[i]);
+ aom_free(by_map[i]);
+ }
+
+ for (int i = 0; i < 24; ++i) {
+ aom_free(orientation_map[i]->buf);
+ aom_free(orientation_map[i]);
+ }
+
+ for (int i = 0; i < 3; ++i) {
+ aom_free(normalized_maps[i]->buf);
+ aom_free(normalized_maps[i]);
+ }
+ return 0;
+ }
+
+ for (int i = 0; i < 6; ++i) {
+ aom_free(i_map[i]->buf);
+ aom_free(rg_map[i]->buf);
+ aom_free(by_map[i]->buf);
+ aom_free(i_map[i]);
+ aom_free(rg_map[i]);
+ aom_free(by_map[i]);
+ }
+
+ for (int i = 0; i < 24; ++i) {
+ aom_free(orientation_map[i]->buf);
+ aom_free(orientation_map[i]);
+ }
+
+ // Pixel level saliency map
+ saliency_feature_map *combined_saliency_map =
+ aom_malloc(sizeof(*combined_saliency_map));
+ if (!combined_saliency_map) {
+ for (int i = 0; i < 3; ++i) {
+ aom_free(normalized_maps[i]->buf);
+ aom_free(normalized_maps[i]);
+ }
+ return 0;
+ }
+
+ combined_saliency_map->buf = (double *)aom_malloc(
+ frm_width * frm_height * sizeof(*combined_saliency_map->buf));
+ if (!combined_saliency_map->buf) {
+ for (int i = 0; i < 3; ++i) {
+ aom_free(normalized_maps[i]->buf);
+ aom_free(normalized_maps[i]);
+ }
+
+ aom_free(combined_saliency_map);
+ return 0;
+ }
+ combined_saliency_map->height = frm_height;
+ combined_saliency_map->width = frm_width;
+
+ double w_intensity, w_color, w_orient;
+
+ w_intensity = w_color = w_orient = (double)1 / 3;
+
+ for (int r = 0; r < frm_height; ++r) {
+ for (int c = 0; c < frm_width; ++c) {
+ combined_saliency_map->buf[r * frm_width + c] =
+ (w_intensity * normalized_maps[0]->buf[r * frm_width + c] +
+ w_color * normalized_maps[1]->buf[r * frm_width + c] +
+ w_orient * normalized_maps[2]->buf[r * frm_width + c]);
+ }
+ }
+
+ for (int r = 0; r < frm_height; ++r) {
+ for (int c = 0; c < frm_width; ++c) {
+ int index = r * frm_width + c;
+ cpi->saliency_map[index] =
+ (uint8_t)(combined_saliency_map->buf[index] * 255);
+ }
+ }
+
+ for (int i = 0; i < 3; ++i) {
+ aom_free(normalized_maps[i]->buf);
+ aom_free(normalized_maps[i]);
+ }
+
+ aom_free(combined_saliency_map->buf);
+ aom_free(combined_saliency_map);
+
+ return 1;
+}
+
+// Set superblock level saliency mask for rdmult scaling
+int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio) {
+ AV1_COMMON *cm = &cpi->common;
+
+ saliency_feature_map *sb_saliency_map =
+ aom_malloc(sizeof(saliency_feature_map));
+
+ if (sb_saliency_map == NULL) {
+ return 0;
+ }
+
+ const BLOCK_SIZE bsize = cm->seq_params->sb_size;
+ const int num_mi_w = mi_size_wide[bsize];
+ const int num_mi_h = mi_size_high[bsize];
+ const int block_width = block_size_wide[bsize];
+ const int block_height = block_size_high[bsize];
+ const int num_sb_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_sb_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+
+ sb_saliency_map->height = num_sb_rows;
+ sb_saliency_map->width = num_sb_cols;
+ sb_saliency_map->buf = (double *)aom_malloc(num_sb_rows * num_sb_cols *
+ sizeof(*sb_saliency_map->buf));
+
+ if (sb_saliency_map->buf == NULL) {
+ aom_free(sb_saliency_map);
+ return 0;
+ }
+
+ for (int row = 0; row < num_sb_rows; ++row) {
+ for (int col = 0; col < num_sb_cols; ++col) {
+ const int index = row * num_sb_cols + col;
+ double total_pixel = 0;
+ double total_weight = 0;
+
+ for (int i = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j++) {
+ if ((row * block_height + i) >= cpi->common.height ||
+ (col * block_width + j) >= cpi->common.width)
+ continue;
+ total_pixel++;
+ total_weight +=
+ cpi->saliency_map[(row * block_height + i) * cpi->common.width +
+ col * block_width + j];
+ }
+ }
+
+ assert(total_pixel > 0);
+
+ // Calculate the superblock level saliency map from pixel level saliency
+ // map
+ sb_saliency_map->buf[index] = total_weight / total_pixel;
+
+ // Further lower the superblock saliency score for boundary superblocks.
+ if (row < 1 || row > num_sb_rows - 2 || col < 1 ||
+ col > num_sb_cols - 2) {
+ sb_saliency_map->buf[index] /= 5;
+ }
+ }
+ }
+
+ // superblock level saliency map finalization
+ minmax_normalize(sb_saliency_map);
+
+ double log_sum = 0.0;
+ double sum = 0.0;
+ int block_count = 0;
+
+ // Calculate the average superblock sm_scaling_factor for a frame, to be used
+ // for clamping later.
+ for (int row = 0; row < num_sb_rows; ++row) {
+ for (int col = 0; col < num_sb_cols; ++col) {
+ const int index = row * num_sb_cols + col;
+ const double saliency = sb_saliency_map->buf[index];
+
+ cpi->sm_scaling_factor[index] = 1 - saliency;
+ sum += cpi->sm_scaling_factor[index];
+ block_count++;
+ }
+ }
+ assert(block_count > 0);
+ sum /= block_count;
+
+ // Calculate the geometric mean of superblock sm_scaling_factor for a frame,
+ // to be used for normalization.
+ for (int row = 0; row < num_sb_rows; ++row) {
+ for (int col = 0; col < num_sb_cols; ++col) {
+ const int index = row * num_sb_cols + col;
+ log_sum += log(fmax(cpi->sm_scaling_factor[index], 0.001));
+ cpi->sm_scaling_factor[index] =
+ fmax(cpi->sm_scaling_factor[index], 0.8 * sum);
+ }
+ }
+
+ log_sum = exp(log_sum / block_count);
+
+ // Normalize the sm_scaling_factor by geometric mean.
+ for (int row = 0; row < num_sb_rows; ++row) {
+ for (int col = 0; col < num_sb_cols; ++col) {
+ const int index = row * num_sb_cols + col;
+ assert(log_sum > 0);
+ cpi->sm_scaling_factor[index] /= log_sum;
+
+ // Modulate the sm_scaling_factor by frame basis motion factor
+ cpi->sm_scaling_factor[index] =
+ cpi->sm_scaling_factor[index] * motion_ratio;
+ }
+ }
+
+ aom_free(sb_saliency_map->buf);
+ aom_free(sb_saliency_map);
+ return 1;
+}
+
+// av1_setup_motion_ratio() is only enabled when CONFIG_REALTIME_ONLY is 0,
+// because the computations need to access the first-pass stats, which are
+// only available when CONFIG_REALTIME_ONLY is 0.
+#if !CONFIG_REALTIME_ONLY
+// Compute a motion_ratio that reflects the amount of motion between two
+// consecutive frames. motion_ratio is used to set up the saliency-map-based
+// rdmult scaling factor: the less motion there is, the more bits are spent on
+// this frame, and vice versa.
+double av1_setup_motion_ratio(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ int frames_since_key =
+ cm->current_frame.display_order_hint - cpi->rc.frames_since_key;
+ const FIRSTPASS_STATS *cur_stats = av1_firstpass_info_peek(
+ &cpi->ppi->twopass.firstpass_info, frames_since_key);
+ assert(cur_stats != NULL);
+ assert(cpi->ppi->twopass.firstpass_info.total_stats.count > 0);
+
+ const double avg_intra_error =
+ exp(cpi->ppi->twopass.firstpass_info.total_stats.log_intra_error /
+ cpi->ppi->twopass.firstpass_info.total_stats.count);
+ const double avg_inter_error =
+ exp(cpi->ppi->twopass.firstpass_info.total_stats.log_coded_error /
+ cpi->ppi->twopass.firstpass_info.total_stats.count);
+
+ double inter_error = cur_stats->coded_error;
+ double error_stdev = 0;
+ const double avg_error =
+ cpi->ppi->twopass.firstpass_info.total_stats.intra_error /
+ cpi->ppi->twopass.firstpass_info.total_stats.count;
+ for (int i = 0; i < cpi->ppi->twopass.firstpass_info.total_stats.count; i++) {
+ const FIRSTPASS_STATS *stats =
+ &cpi->ppi->twopass.firstpass_info.stats_buf[i];
+ error_stdev +=
+ (stats->intra_error - avg_error) * (stats->intra_error - avg_error);
+ }
+ error_stdev =
+ sqrt(error_stdev / cpi->ppi->twopass.firstpass_info.total_stats.count);
+
+ double motion_ratio = 1;
+ if (error_stdev / fmax(avg_intra_error, 1) > 0.1) {
+ motion_ratio = inter_error / fmax(1, avg_inter_error);
+ motion_ratio = AOMMIN(motion_ratio, 1.5);
+ motion_ratio = AOMMAX(motion_ratio, 0.8);
+ }
+
+ return motion_ratio;
+}
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/av1/encoder/saliency_map.h b/third_party/aom/av1/encoder/saliency_map.h
new file mode 100644
index 0000000000..0d27f83633
--- /dev/null
+++ b/third_party/aom/av1/encoder/saliency_map.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SALIENCY_MAP_H_
+#define AOM_AV1_ENCODER_SALIENCY_MAP_H_
+#include "av1/encoder/encoder.h"
+
+typedef struct saliency_feature_map {
+ double *buf; // stores values of the map in 1D array
+ int height;
+ int width;
+} saliency_feature_map;
+
+int av1_set_saliency_map(AV1_COMP *cpi);
+#if !CONFIG_REALTIME_ONLY
+double av1_setup_motion_ratio(AV1_COMP *cpi);
+#endif
+int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio);
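+
+// A hypothetical usage sketch (not taken from the encoder itself), showing
+// the intended call order: compute the motion ratio first (two-pass builds
+// only), then the pixel-level saliency map, then the superblock-level rdmult
+// scaling factors.
+//
+//   double motion_ratio = 1.0;
+// #if !CONFIG_REALTIME_ONLY
+//   motion_ratio = av1_setup_motion_ratio(cpi);
+// #endif
+//   if (av1_set_saliency_map(cpi) &&
+//       av1_setup_sm_rdmult_scaling_factor(cpi, motion_ratio)) {
+//     // cpi->sm_scaling_factor[] now holds the per-superblock factors.
+//   }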
+
+#endif // AOM_AV1_ENCODER_SALIENCY_MAP_H_
diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c
new file mode 100644
index 0000000000..4b4e78779c
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/pred_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/segmentation.h"
+
+void av1_enable_segmentation(struct segmentation *seg) {
+ seg->enabled = 1;
+ seg->update_map = 1;
+ seg->update_data = 1;
+ seg->temporal_update = 0;
+}
+
+void av1_disable_segmentation(struct segmentation *seg) {
+ seg->enabled = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
+ seg->temporal_update = 0;
+}
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_mask[segment_id] &= ~(1u << feature_id);
+}
+
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_data[segment_id][feature_id] = 0;
+}
+
+void av1_reset_segment_features(AV1_COMMON *cm) {
+ struct segmentation *seg = &cm->seg;
+
+ // Set up default state for MB feature flags
+ seg->enabled = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
+ av1_clearall_segfeatures(seg);
+}
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
new file mode 100644
index 0000000000..1ad13d66a9
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_
+#define AOM_AV1_ENCODER_SEGMENTATION_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_enable_segmentation(struct segmentation *seg);
+void av1_disable_segmentation(struct segmentation *seg);
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd);
+
+void av1_reset_segment_features(AV1_COMMON *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/sorting_network.h b/third_party/aom/av1/encoder/sorting_network.h
new file mode 100644
index 0000000000..54f4c19dcd
--- /dev/null
+++ b/third_party/aom/av1/encoder/sorting_network.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * This file contains several utility functions used to sort small arrays with
+ * sorting networks.
+ *
+ * A sorting network is a (potentially branch-less) way to quickly sort small
+ * arrays of known size. For more details, see
+ * https://en.wikipedia.org/wiki/Sorting_network.
+ */
+#ifndef AOM_AV1_ENCODER_SORTING_NETWORK_H_
+#define AOM_AV1_ENCODER_SORTING_NETWORK_H_
+
+#include "aom/aom_integer.h"
+
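+// Compare-and-swap: after SWAP(i, j), index i holds the key/value pair with
+// the larger key, so repeated application sorts in descending order.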
+#define SWAP(i, j) \
+ do { \
+ const float maxf = (k[i] >= k[j]) ? k[i] : k[j]; \
+ const float minf = (k[i] >= k[j]) ? k[j] : k[i]; \
+ const int maxi = (k[i] >= k[j]) ? v[i] : v[j]; \
+ const int mini = (k[i] >= k[j]) ? v[j] : v[i]; \
+ k[i] = maxf; \
+ k[j] = minf; \
+ v[i] = maxi; \
+ v[j] = mini; \
+ } while (0)
+
+/*!\brief Sorts a pair of length-16 key/value arrays in descending order of
+ * the keys.
+ *
+ * \param[in,out]   k   A length-16 array of floats that serves as the keys.
+ * \param[in,out]   v   A length-16 array of int32s that serves as the values.
+ */
+static AOM_INLINE void av1_sort_fi32_16(float k[], int32_t v[]) {
+ SWAP(0, 1);
+ SWAP(2, 3);
+ SWAP(4, 5);
+ SWAP(6, 7);
+ SWAP(8, 9);
+ SWAP(10, 11);
+ SWAP(12, 13);
+ SWAP(14, 15);
+ SWAP(0, 2);
+ SWAP(1, 3);
+ SWAP(4, 6);
+ SWAP(5, 7);
+ SWAP(8, 10);
+ SWAP(9, 11);
+ SWAP(12, 14);
+ SWAP(13, 15);
+ SWAP(1, 2);
+ SWAP(5, 6);
+ SWAP(0, 4);
+ SWAP(3, 7);
+ SWAP(9, 10);
+ SWAP(13, 14);
+ SWAP(8, 12);
+ SWAP(11, 15);
+ SWAP(1, 5);
+ SWAP(2, 6);
+ SWAP(9, 13);
+ SWAP(10, 14);
+ SWAP(0, 8);
+ SWAP(7, 15);
+ SWAP(1, 4);
+ SWAP(3, 6);
+ SWAP(9, 12);
+ SWAP(11, 14);
+ SWAP(2, 4);
+ SWAP(3, 5);
+ SWAP(10, 12);
+ SWAP(11, 13);
+ SWAP(1, 9);
+ SWAP(6, 14);
+ SWAP(3, 4);
+ SWAP(11, 12);
+ SWAP(1, 8);
+ SWAP(2, 10);
+ SWAP(5, 13);
+ SWAP(7, 14);
+ SWAP(3, 11);
+ SWAP(2, 8);
+ SWAP(4, 12);
+ SWAP(7, 13);
+ SWAP(3, 10);
+ SWAP(5, 12);
+ SWAP(3, 9);
+ SWAP(6, 12);
+ SWAP(3, 8);
+ SWAP(7, 12);
+ SWAP(5, 9);
+ SWAP(6, 10);
+ SWAP(4, 8);
+ SWAP(7, 11);
+ SWAP(5, 8);
+ SWAP(7, 10);
+ SWAP(6, 8);
+ SWAP(7, 9);
+ SWAP(7, 8);
+}
+
+/*!\brief Sorts a pair of length-8 key/value arrays in descending order of
+ * the keys.
+ *
+ * \param[in,out]   k   A length-8 array of floats that serves as the keys.
+ * \param[in,out]   v   A length-8 array of int32s that serves as the values.
+ */
+static AOM_INLINE void av1_sort_fi32_8(float k[], int32_t v[]) {
+ SWAP(0, 1);
+ SWAP(2, 3);
+ SWAP(4, 5);
+ SWAP(6, 7);
+ SWAP(0, 2);
+ SWAP(1, 3);
+ SWAP(4, 6);
+ SWAP(5, 7);
+ SWAP(1, 2);
+ SWAP(5, 6);
+ SWAP(0, 4);
+ SWAP(3, 7);
+ SWAP(1, 5);
+ SWAP(2, 6);
+ SWAP(1, 4);
+ SWAP(3, 6);
+ SWAP(2, 4);
+ SWAP(3, 5);
+ SWAP(3, 4);
+}
+#undef SWAP
+#endif // AOM_AV1_ENCODER_SORTING_NETWORK_H_
diff --git a/third_party/aom/av1/encoder/sparse_linear_solver.c b/third_party/aom/av1/encoder/sparse_linear_solver.c
new file mode 100644
index 0000000000..e47c78e148
--- /dev/null
+++ b/third_party/aom/av1/encoder/sparse_linear_solver.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/sparse_linear_solver.h"
+#include "config/aom_config.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/alloccommon.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+/*
+ * Input:
+ * rows: array of row positions
+ * cols: array of column positions
+ * values: array of element values
+ * num_elem: total number of elements in the matrix
+ * num_rows: number of rows in the matrix
+ * num_cols: number of columns in the matrix
+ *
+ * Output:
+ * sm: pointer to the sparse matrix to be initialized
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+ int num_elem, int num_rows, int num_cols,
+ SPARSE_MTX *sm) {
+ sm->n_elem = num_elem;
+ sm->n_rows = num_rows;
+ sm->n_cols = num_cols;
+ if (num_elem == 0) {
+ sm->row_pos = NULL;
+ sm->col_pos = NULL;
+ sm->value = NULL;
+ return 0;
+ }
+ sm->row_pos = aom_calloc(num_elem, sizeof(*sm->row_pos));
+ sm->col_pos = aom_calloc(num_elem, sizeof(*sm->col_pos));
+ sm->value = aom_calloc(num_elem, sizeof(*sm->value));
+
+ if (!sm->row_pos || !sm->col_pos || !sm->value) {
+ av1_free_sparse_mtx_elems(sm);
+ return -1;
+ }
+
+ memcpy(sm->row_pos, rows, num_elem * sizeof(*sm->row_pos));
+ memcpy(sm->col_pos, cols, num_elem * sizeof(*sm->col_pos));
+ memcpy(sm->value, values, num_elem * sizeof(*sm->value));
+
+ return 0;
+}
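+
+/* Example with hypothetical values: the 2x2 matrix
+ *     | 4 1 |
+ *     | 0 3 |
+ * has three non-zero elements and could be initialized as:
+ *
+ *   int rows[] = { 0, 0, 1 };
+ *   int cols[] = { 0, 1, 1 };
+ *   double vals[] = { 4.0, 1.0, 3.0 };
+ *   SPARSE_MTX sm;
+ *   if (av1_init_sparse_mtx(rows, cols, vals, 3, 2, 2, &sm) == 0) {
+ *     ... use sm ...
+ *     av1_free_sparse_mtx_elems(&sm);
+ *   }
+ */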
+
+/*
+ * Combines two sparse matrices (allocating new space).
+ *
+ * Input:
+ * sm1, sm2: matrices to be combined
+ * row_offset1, row_offset2: row offset of each matrix in the new matrix
+ * col_offset1, col_offset2: column offset of each matrix in the new matrix
+ * new_n_rows, new_n_cols: number of rows and columns in the new matrix
+ *
+ * Output:
+ * sm: the combined matrix
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+ SPARSE_MTX *sm, int row_offset1,
+ int col_offset1, int row_offset2,
+ int col_offset2, int new_n_rows,
+ int new_n_cols) {
+ sm->n_elem = sm1->n_elem + sm2->n_elem;
+ sm->n_cols = new_n_cols;
+ sm->n_rows = new_n_rows;
+
+ if (sm->n_elem == 0) {
+ sm->row_pos = NULL;
+ sm->col_pos = NULL;
+ sm->value = NULL;
+ return 0;
+ }
+
+ sm->row_pos = aom_calloc(sm->n_elem, sizeof(*sm->row_pos));
+ sm->col_pos = aom_calloc(sm->n_elem, sizeof(*sm->col_pos));
+ sm->value = aom_calloc(sm->n_elem, sizeof(*sm->value));
+
+ if (!sm->row_pos || !sm->col_pos || !sm->value) {
+ av1_free_sparse_mtx_elems(sm);
+ return -1;
+ }
+
+ for (int i = 0; i < sm1->n_elem; i++) {
+ sm->row_pos[i] = sm1->row_pos[i] + row_offset1;
+ sm->col_pos[i] = sm1->col_pos[i] + col_offset1;
+ }
+ memcpy(sm->value, sm1->value, sm1->n_elem * sizeof(*sm1->value));
+ int n_elem1 = sm1->n_elem;
+ for (int i = 0; i < sm2->n_elem; i++) {
+ sm->row_pos[n_elem1 + i] = sm2->row_pos[i] + row_offset2;
+ sm->col_pos[n_elem1 + i] = sm2->col_pos[i] + col_offset2;
+ }
+ memcpy(sm->value + n_elem1, sm2->value, sm2->n_elem * sizeof(*sm2->value));
+ return 0;
+}
+
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm) {
+ sm->n_cols = 0;
+ sm->n_rows = 0;
+ if (sm->n_elem != 0) {
+ aom_free(sm->row_pos);
+ aom_free(sm->col_pos);
+ aom_free(sm->value);
+ }
+ sm->n_elem = 0;
+}
+
+/*
+ * Calculate the matrix-vector product A*b
+ *
+ * Input:
+ *    sm: matrix A
+ *    srcv: the vector b to multiply by
+ *    dstl: the length of the result vector dstv
+ *
+ * Output:
+ *    dstv: pointer to the resulting vector
+ */
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+ double *dstv, int dstl) {
+ memset(dstv, 0, sizeof(*dstv) * dstl);
+ for (int i = 0; i < sm->n_elem; i++) {
+ dstv[sm->row_pos[i]] += srcv[sm->col_pos[i]] * sm->value[i];
+ }
+}
+/*
+ * Calculate the vector-matrix product b*A
+ *
+ * Input:
+ *    sm: matrix A
+ *    srcv: the vector b to multiply by
+ *    dstl: the length of the result vector dstv
+ *
+ * Output:
+ *    dstv: pointer to the resulting vector
+ */
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+ double *dstv, int dstl) {
+ memset(dstv, 0, sizeof(*dstv) * dstl);
+ for (int i = 0; i < sm->n_elem; i++) {
+ dstv[sm->col_pos[i]] += srcv[sm->row_pos[i]] * sm->value[i];
+ }
+}
+
+/*
+ * Calculate inner product of two vectors
+ *
+ * Input:
+ *    src1, src2: the vectors to be multiplied
+ * src1l: length of the vectors
+ *
+ * Output:
+ * the inner product
+ */
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2) {
+ double result = 0;
+ for (int i = 0; i < src1l; i++) {
+ result += src1[i] * src2[i];
+ }
+ return result;
+}
+
+/*
+ * Multiply each element in the matrix sm with a constant c
+ */
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c) {
+ for (int i = 0; i < sm->n_elem; i++) {
+ sm->value[i] *= c;
+ }
+}
+
+static INLINE void free_solver_local_buf(double *buf1, double *buf2,
+ double *buf3, double *buf4,
+ double *buf5, double *buf6,
+ double *buf7) {
+ aom_free(buf1);
+ aom_free(buf2);
+ aom_free(buf3);
+ aom_free(buf4);
+ aom_free(buf5);
+ aom_free(buf6);
+ aom_free(buf7);
+}
+
+/*
+ * Solve for Ax = b using the bi-conjugate gradient method;
+ * no special requirement on A
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+ int bl, double *x) {
+ double *r = NULL, *r_hat = NULL, *p = NULL, *p_hat = NULL, *Ap = NULL,
+ *p_hatA = NULL, *x_hat = NULL;
+ double alpha, beta, rtr, r_norm_2;
+ double denormtemp;
+
+ // initialize
+ r = aom_calloc(bl, sizeof(*r));
+ r_hat = aom_calloc(bl, sizeof(*r_hat));
+ p = aom_calloc(bl, sizeof(*p));
+ p_hat = aom_calloc(bl, sizeof(*p_hat));
+ Ap = aom_calloc(bl, sizeof(*Ap));
+ p_hatA = aom_calloc(bl, sizeof(*p_hatA));
+ x_hat = aom_calloc(bl, sizeof(*x_hat));
+ if (!r || !r_hat || !p || !p_hat || !Ap || !p_hatA || !x_hat) {
+ free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+ return -1;
+ }
+
+ int i;
+ for (i = 0; i < bl; i++) {
+ r[i] = b[i];
+ r_hat[i] = b[i];
+ p[i] = r[i];
+ p_hat[i] = r_hat[i];
+ x[i] = 0;
+ x_hat[i] = 0;
+ }
+ r_norm_2 = av1_vect_vect_multi(r_hat, bl, r);
+ for (int k = 0; k < MAX_CG_SP_ITER; k++) {
+ rtr = r_norm_2;
+ av1_mtx_vect_multi_right(A, p, Ap, bl);
+ av1_mtx_vect_multi_left(A, p_hat, p_hatA, bl);
+
+ denormtemp = av1_vect_vect_multi(p_hat, bl, Ap);
+ if (denormtemp < 1e-10) break;
+ alpha = rtr / denormtemp;
+ r_norm_2 = 0;
+ for (i = 0; i < bl; i++) {
+ x[i] += alpha * p[i];
+ x_hat[i] += alpha * p_hat[i];
+ r[i] -= alpha * Ap[i];
+ r_hat[i] -= alpha * p_hatA[i];
+ r_norm_2 += r_hat[i] * r[i];
+ }
+ if (sqrt(r_norm_2) < 1e-2) {
+ break;
+ }
+ if (rtr < 1e-10) break;
+ beta = r_norm_2 / rtr;
+ for (i = 0; i < bl; i++) {
+ p[i] = r[i] + beta * p[i];
+ p_hat[i] = r_hat[i] + beta * p_hat[i];
+ }
+ }
+ // free
+ free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+ return 0;
+}
+
+/*
+ * Solve for Ax = b using the conjugate gradient method, when A is symmetric
+ * and positive definite
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+ double *x) {
+ double *r = NULL, *p = NULL, *Ap = NULL;
+ double alpha, beta, rtr, r_norm_2;
+ double denormtemp;
+
+ // initialize
+ r = aom_calloc(bl, sizeof(*r));
+ p = aom_calloc(bl, sizeof(*p));
+ Ap = aom_calloc(bl, sizeof(*Ap));
+ if (!r || !p || !Ap) {
+ free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
+ return -1;
+ }
+
+ int i;
+ for (i = 0; i < bl; i++) {
+ r[i] = b[i];
+ p[i] = r[i];
+ x[i] = 0;
+ }
+ r_norm_2 = av1_vect_vect_multi(r, bl, r);
+ int k;
+ for (k = 0; k < MAX_CG_SP_ITER; k++) {
+ rtr = r_norm_2;
+ av1_mtx_vect_multi_right(A, p, Ap, bl);
+ denormtemp = av1_vect_vect_multi(p, bl, Ap);
+ if (denormtemp < 1e-10) break;
+ alpha = rtr / denormtemp;
+ r_norm_2 = 0;
+ for (i = 0; i < bl; i++) {
+ x[i] += alpha * p[i];
+ r[i] -= alpha * Ap[i];
+ r_norm_2 += r[i] * r[i];
+ }
+ if (r_norm_2 < 1e-8 * bl) break;
+ if (rtr < 1e-10) break;
+ beta = r_norm_2 / rtr;
+ for (i = 0; i < bl; i++) {
+ p[i] = r[i] + beta * p[i];
+ }
+ }
+ // free
+ free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
+
+/*
+ * Solve for Ax = b using the Jacobi method
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) {
+ double *diags = NULL, *Rx = NULL, *x_last = NULL, *x_cur = NULL,
+ *tempx = NULL;
+ double resi2;
+
+ diags = aom_calloc(bl, sizeof(*diags));
+ Rx = aom_calloc(bl, sizeof(*Rx));
+ x_last = aom_calloc(bl, sizeof(*x_last));
+ x_cur = aom_calloc(bl, sizeof(*x_cur));
+
+ if (!diags || !Rx || !x_last || !x_cur) {
+ free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+ return -1;
+ }
+
+ int i;
+ memset(x_last, 0, sizeof(*x_last) * bl);
+ // get the diagonals of A
+ memset(diags, 0, sizeof(*diags) * bl);
+ for (int c = 0; c < A->n_elem; c++) {
+ if (A->row_pos[c] != A->col_pos[c]) continue;
+ diags[A->row_pos[c]] = A->value[c];
+ }
+ int k;
+ for (k = 0; k < MAX_CG_SP_ITER; k++) {
+ // R = A - diag(diags)
+ // get R*x_last
+ memset(Rx, 0, sizeof(*Rx) * bl);
+ for (int c = 0; c < A->n_elem; c++) {
+ if (A->row_pos[c] == A->col_pos[c]) continue;
+ Rx[A->row_pos[c]] += x_last[A->col_pos[c]] * A->value[c];
+ }
+ resi2 = 0;
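+    // Jacobi update: x_{k+1} = D^{-1} * (b - R * x_k), where D = diag(A).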
+ for (i = 0; i < bl; i++) {
+ x_cur[i] = (b[i] - Rx[i]) / diags[i];
+ resi2 += (x_last[i] - x_cur[i]) * (x_last[i] - x_cur[i]);
+ }
+ if (resi2 <= 1e-10 * bl) break;
+ // swap last & cur buffer ptrs
+ tempx = x_last;
+ x_last = x_cur;
+ x_cur = tempx;
+ }
+ for (i = 0; i < bl; i++) {
+ x[i] = x_cur[i];
+ }
+ free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+ return 0;
+}
+
+/*
+ * Solve for Ax = b using the steepest descent method
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+ double *x) {
+ double *d = NULL, *Ad = NULL, *Ax = NULL;
+ double resi2, resi2_last, dAd, temp;
+
+ d = aom_calloc(bl, sizeof(*d));
+ Ax = aom_calloc(bl, sizeof(*Ax));
+ Ad = aom_calloc(bl, sizeof(*Ad));
+
+ if (!d || !Ax || !Ad) {
+ free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+ return -1;
+ }
+
+ int i;
+ // initialize with 0s
+ resi2 = 0;
+ for (i = 0; i < bl; i++) {
+ x[i] = 0;
+ d[i] = b[i];
+ resi2 += d[i] * d[i] / bl;
+ }
+ int k;
+ for (k = 0; k < MAX_CG_SP_ITER; k++) {
+    // get A*d
+ av1_mtx_vect_multi_right(A, d, Ad, bl);
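+    // Step size alpha = (d^T d) / (d^T A d); resi2 holds (d^T d) / bl, and d
+    // is the current residual b - A*x, so this is the steepest-descent step.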
+ dAd = resi2 * bl / av1_vect_vect_multi(d, bl, Ad);
+ for (i = 0; i < bl; i++) {
+ temp = dAd * d[i];
+ x[i] = x[i] + temp;
+ }
+ av1_mtx_vect_multi_right(A, x, Ax, bl);
+ resi2_last = resi2;
+ resi2 = 0;
+ for (i = 0; i < bl; i++) {
+ d[i] = b[i] - Ax[i];
+ resi2 += d[i] * d[i] / bl;
+ }
+ if (resi2 <= 1e-8) break;
+ if (resi2_last - resi2 < 1e-8) {
+ break;
+ }
+ }
+ free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
+
+#endif // CONFIG_OPTICAL_FLOW_API
diff --git a/third_party/aom/av1/encoder/sparse_linear_solver.h b/third_party/aom/av1/encoder/sparse_linear_solver.h
new file mode 100644
index 0000000000..f30fc0f5b1
--- /dev/null
+++ b/third_party/aom/av1/encoder/sparse_linear_solver.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+#define AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+// Maximum number of iterations when solving the linear system.
+#define MAX_CG_SP_ITER 100
+
+typedef struct {
+ int n_elem; // number of non-zero elements
+ int n_rows;
+ int n_cols;
+  // Non-zero elements stored in coordinate form: element i has value value[i]
+  // at row row_pos[i] and column col_pos[i]. Indices are 0-based.
+  int *col_pos;
+  int *row_pos;
+  double *value;
+} SPARSE_MTX;
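+
+// Example: the 2x3 matrix
+//   [ 0 5 0 ]
+//   [ 7 0 2 ]
+// is stored as n_elem = 3, n_rows = 2, n_cols = 3, row_pos = { 0, 1, 1 },
+// col_pos = { 1, 0, 2 }, value = { 5, 7, 2 }.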
+
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+ int num_elem, int num_rows, int num_cols,
+ SPARSE_MTX *sm);
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+ SPARSE_MTX *sm, int row_offset1,
+ int col_offset1, int row_offset2,
+ int col_offset2, int new_n_rows,
+ int new_n_cols);
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm);
+
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+ double *dstv, int dstl);
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+ double *dstv, int dstl);
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2);
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c);
+
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+ double *x);
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+ int bl, double *x);
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x);
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+ double *x);
+
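+// Illustrative usage sketch (assuming the 0-on-success / -1-on-failure
+// convention of the solvers above): solving the symmetric positive definite
+// system [2 1; 1 3] * x = [3; 5] with the conjugate gradient solver.
+//
+//   int rows[] = { 0, 0, 1, 1 };
+//   int cols[] = { 0, 1, 0, 1 };
+//   double vals[] = { 2.0, 1.0, 1.0, 3.0 };
+//   double b[2] = { 3.0, 5.0 }, x[2];
+//   SPARSE_MTX A;
+//   if (av1_init_sparse_mtx(rows, cols, vals, 4, 2, 2, &A) == 0 &&
+//       av1_conjugate_gradient_sparse(&A, b, 2, x) == 0) {
+//     // x is approximately { 0.8, 1.4 }.
+//   }
+//   av1_free_sparse_mtx_elems(&A);
+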
+#endif // CONFIG_OPTICAL_FLOW_API
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ */
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
new file mode 100644
index 0000000000..a6c0971096
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -0,0 +1,2715 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/rdopt.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method
+// Max speed setting for tx domain evaluation
+#define MAX_TX_DOMAIN_EVAL_SPEED 5
+static MESH_PATTERN
+ good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ };
+
+// TODO(huisu@google.com): These settings are pretty relaxed, tune them for
+// each speed setting
+static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+};
+
+// Threshold values used for pruning the txfm_domain_distortion based on
+// block MSE.
+// Index 0: Default mode evaluation, where winner mode processing is not
+// applicable (e.g., IntraBC). Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation. Indices 1 and 2 apply when the
+// enable_winner_mode_for_use_tx_domain_dist speed feature is ON.
+// TODO(any): Experiment with threshold logic based on a variance metric.
+static unsigned int tx_domain_dist_thresholds[4][MODE_EVAL_TYPES] = {
+ { UINT_MAX, UINT_MAX, UINT_MAX },
+ { 22026, 22026, 22026 },
+ { 1377, 1377, 1377 },
+ { 0, 0, 0 }
+};
+
+// Number of different levels of aggressiveness in using transform domain
+// distortion during the R-D evaluation based on the speed feature
+// tx_domain_dist_level.
+#define TX_DOMAIN_DIST_LEVELS 4
+
+// Transform domain distortion type to be used for default, mode and winner
+// mode evaluation.
+// Index 0: Default mode evaluation, where winner mode processing is not
+// applicable (e.g., IntraBC). Index 1: Mode evaluation. Index 2: Winner mode
+// evaluation. Indices 1 and 2 apply when the
+// enable_winner_mode_for_use_tx_domain_dist speed feature is ON.
+static unsigned int
+ tx_domain_dist_types[TX_DOMAIN_DIST_LEVELS][MODE_EVAL_TYPES] = {
+ { 0, 2, 0 }, { 1, 2, 0 }, { 2, 2, 0 }, { 2, 2, 2 }
+ };
+
+// Threshold values to be used for disabling coeff RD-optimization
+// based on block MSE / qstep^2.
+// TODO(any): Experiment with threshold logic based on a variance metric.
+// Each entry holds a dist and a satd threshold: index 0: dist, index 1: satd.
+// For each row, the column indices are as follows.
+// Index 0: Default mode evaluation, where winner mode processing is not
+// applicable (e.g., IntraBC).
+// Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation.
+// Indices 1 and 2 apply when the enable_winner_mode_for_coeff_opt speed
+// feature is ON.
+// There are 9 levels with increasing aggressiveness, mapping to the row
+// indices.
+static unsigned int coeff_opt_thresholds[9][MODE_EVAL_TYPES][2] = {
+ { { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 3200, UINT_MAX }, { 250, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 1728, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 864, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 432, UINT_MAX }, { 86, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 864, 97 }, { 142, 16 }, { UINT_MAX, UINT_MAX } },
+ { { 432, 97 }, { 86, 16 }, { UINT_MAX, UINT_MAX } },
+ { { 216, 25 }, { 86, 10 }, { UINT_MAX, UINT_MAX } },
+ { { 216, 25 }, { 0, 10 }, { UINT_MAX, UINT_MAX } }
+};
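+
+// For example, coeff_opt_thresholds[5][1] == { 142, 16 }: at level 5, mode
+// evaluation uses a dist threshold of 142 and a satd threshold of 16 when
+// deciding whether to disable coeff RD-optimization.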
+
+// Transform size search method to be used for default, mode and winner mode
+// evaluation.
+// Index 0: Default mode evaluation, where winner mode processing is not
+// applicable (e.g., IntraBC). Index 1: Mode evaluation. Index 2: Winner mode
+// evaluation. Indices 1 and 2 apply when the
+// enable_winner_mode_for_tx_size_srch speed feature is ON.
+static TX_SIZE_SEARCH_METHOD tx_size_search_methods[4][MODE_EVAL_TYPES] = {
+ { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD },
+ { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD },
+ { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD },
+ { USE_LARGESTALL, USE_LARGESTALL, USE_LARGESTALL }
+};
+
+// Predict transform skip levels to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, where winner mode processing
+// is not applicable. Index 1: Mode evaluation. Index 2: Winner mode
+// evaluation.
+// Values indicate the aggressiveness of skip flag prediction:
+// 0: no early skip prediction
+// 1: conservative early skip prediction using DCT_DCT
+// 2: early skip prediction based on SSE
+static unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 },
+ { 1, 1, 1 },
+ { 1, 2, 1 } };
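+
+// For example, predict_skip_levels[2] == { 1, 2, 1 }: at level 2, default and
+// winner mode evaluation use the conservative DCT_DCT based prediction, while
+// mode evaluation uses the SSE based prediction.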
+
+// Predict skip or DC block level used during transform type search. It is
+// indexed using the following:
+// First index: Speed feature 'dc_blk_pred_level' (0 to 3)
+// Second index: Mode evaluation type (DEFAULT_EVAL, MODE_EVAL and
+// WINNER_MODE_EVAL).
+//
+// The values of predict_dc_levels[][] indicate the aggressiveness of
+// predicting a block as transform skip or DC only.
+// Type 0: no skip block or DC only block prediction
+// Type 1: prediction of a skip block based on residual mean and variance
+// Type 2: prediction of a skip block or DC only block based on residual mean
+// and variance
+static unsigned int predict_dc_levels[4][MODE_EVAL_TYPES] = {
+ { 0, 0, 0 }, { 1, 1, 0 }, { 2, 2, 0 }, { 2, 2, 2 }
+};
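+
+// For example, predict_dc_levels[2] == { 2, 2, 0 }: at dc_blk_pred_level 2,
+// Type 2 prediction is applied during default and mode evaluation, while no
+// prediction is done during winner mode evaluation.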
+
+#if !CONFIG_FPMT_TEST
+// This table holds the maximum number of reference frames for global motion.
+// The table is indexed as per the speed feature 'gm_search_type'.
+// 0 : All reference frames are allowed.
+// 1 : All reference frames except L2 and L3 are allowed.
+// 2 : All reference frames except L2, L3 and ARF2 are allowed.
+// 3 : No reference frame is allowed.
+static int gm_available_reference_frames[GM_DISABLE_SEARCH + 1] = {
+ INTER_REFS_PER_FRAME, INTER_REFS_PER_FRAME - 2, INTER_REFS_PER_FRAME - 3, 0
+};
+#endif  // !CONFIG_FPMT_TEST
+
+// Qindex threshold levels used for selecting full-pel motion search.
+// ms_qindex_thresh[i][j][k] indicates the qindex boundary value for the k'th
+// qindex band, for resolution index 'j' and aggressiveness level 'i'.
+// Aggressiveness increases from i = 0 to 2.
+// j = 0: lower than 720p resolution, j = 1: 720p or larger resolution.
+// Currently invoked only for speed 0, 1 and 2.
+static int ms_qindex_thresh[3][2][2] = { { { 200, 70 }, { MAXQ, 200 } },
+ { { 170, 50 }, { MAXQ, 200 } },
+ { { 170, 40 }, { 200, 40 } } };
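+
+// For example, ms_qindex_thresh[1][0] == { 170, 50 }: at aggressiveness level
+// 1 for resolutions below 720p, the qindex band boundaries are 170 and 50.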
+
+// Full-pel search methods for aggressive search based on qindex.
+// Index 0 is for resolutions lower than 720p, index 1 for 720p or larger
+// resolutions. Currently invoked only for speed 1 and 2.
+static SEARCH_METHODS motion_search_method[2] = { CLAMPED_DIAMOND, DIAMOND };
+
+// Intra-only frames, golden frames (except alt-ref overlays) and alt-ref
+// frames tend to be coded at a higher-than-ambient quality.
+static int frame_is_boosted(const AV1_COMP *cpi) {
+ return frame_is_kf_gf_arf(cpi);
+}
+
+// Set transform rd gate level for all transform search cases.
+static AOM_INLINE void set_txfm_rd_gate_level(
+ int txfm_rd_gate_level[TX_SEARCH_CASES], int level) {
+ assert(level <= MAX_TX_RD_GATE_LEVEL);
+ for (int idx = 0; idx < TX_SEARCH_CASES; idx++)
+ txfm_rd_gate_level[idx] = level;
+}
+
+static void set_allintra_speed_feature_framesize_dependent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
+ const bool use_hbd = cpi->oxcf.use_highbitdepth;
+
+ if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ if (is_720p_or_larger)
+ sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED;
+ else
+ sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 1;
+ }
+
+ if (is_4k_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ // TODO(huisu@google.com): train models for 720P and above.
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ sf->part_sf.ml_early_term_after_part_split_level = 1;
+ }
+
+ if (is_720p_or_larger) {
+ // TODO(chiyotsai@google.com): make this speed feature adaptive based on
+    // the current block's vertical texture instead of hardcoding it by
+    // resolution.
+ sf->mv_sf.use_downsampled_sad = 2;
+ }
+
+ if (speed >= 1) {
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+ sf->part_sf.ml_early_term_after_part_split_level = 2;
+ }
+
+ if (speed >= 2) {
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 22);
+ sf->part_sf.partition_search_breakout_rate_thr = 100;
+ }
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 2;
+ } else {
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+ }
+ }
+
+ if (speed >= 3) {
+ sf->part_sf.ml_early_term_after_part_split_level = 0;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+ sf->part_sf.partition_search_breakout_rate_thr = 200;
+ } else {
+ sf->part_sf.max_intra_bsize = BLOCK_32X32;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ }
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+ }
+
+ if (speed >= 4) {
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ }
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2;
+ }
+ }
+
+ if (speed >= 6) {
+ if (is_720p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ }
+
+ if (is_1080p_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
+ }
+
+ if (speed >= 7) {
+ // TODO(kyslov): add more speed features to control speed/quality
+ }
+
+ if (speed >= 8) {
+ if (!is_480p_or_larger) {
+ sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+ }
+ if (is_720p_or_larger) {
+ sf->rt_sf.force_large_partition_blocks_intra = 1;
+ }
+ }
+
+ if (speed >= 9) {
+ // TODO(kyslov): add more speed features to control speed/quality
+ if (!is_4k_or_larger) {
+ // In av1_select_sb_size(), superblock size is set to 64x64 only for
+ // resolutions less than 4k in speed>=9, to improve the multithread
+ // performance. If cost update levels are set to INTERNAL_COST_UPD_OFF
+ // for resolutions >= 4k, the SB size setting can be modified for these
+ // resolutions as well.
+ sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ }
+ }
+}
+
+static void set_allintra_speed_features_framesize_independent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int allow_screen_content_tools =
+ cm->features.allow_screen_content_tools;
+ const int use_hbd = cpi->oxcf.use_highbitdepth;
+
+ sf->part_sf.less_rectangular_check_level = 1;
+ sf->part_sf.ml_prune_partition = 1;
+ sf->part_sf.prune_ext_partition_types_search_level = 1;
+ sf->part_sf.prune_part4_search = 2;
+ sf->part_sf.simple_motion_search_prune_rect = 1;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
+ sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+ sf->part_sf.use_best_rd_for_pruning = 1;
+
+ sf->intra_sf.intra_pruning_with_hog = 1;
+ sf->intra_sf.prune_luma_palette_size_search_level = 1;
+ sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ sf->intra_sf.early_term_chroma_palette_size_search = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 1;
+ sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+
+ sf->rt_sf.use_nonrd_pick_mode = 0;
+ sf->rt_sf.use_real_time_ref_set = 0;
+
+ if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
+ cpi->use_screen_content_tools) {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
+ } else {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 25);
+ }
+
+ sf->rd_sf.perform_coeff_opt = 1;
+ sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
+
+ if (speed >= 1) {
+ sf->part_sf.intra_cnn_based_part_prune_level =
+ allow_screen_content_tools ? 0 : 2;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+    // TODO(Venkat): Clean up the frame type dependency for
+    // simple_motion_search_split in the partition search function and set the
+    // speed feature accordingly.
+ sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3;
+ sf->part_sf.reuse_best_prediction_for_part_ab = 1;
+
+ sf->mv_sf.exhaustive_searches_thresh <<= 1;
+
+ sf->intra_sf.prune_palette_search_level = 1;
+ sf->intra_sf.prune_luma_palette_size_search_level = 2;
+ sf->intra_sf.top_intra_model_count_allowed = 3;
+
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+
+ sf->rd_sf.perform_coeff_opt = 2;
+ sf->rd_sf.tx_domain_dist_level = 1;
+ sf->rd_sf.tx_domain_dist_thres_level = 1;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
+ sf->lpf_sf.dual_sgr_penalty_level = 1;
+ sf->lpf_sf.enable_sgr_ep_pruning = 1;
+ }
+
+ if (speed >= 2) {
+ sf->mv_sf.auto_mv_step_size = 1;
+
+ sf->intra_sf.disable_smooth_intra = 1;
+ sf->intra_sf.intra_pruning_with_hog = 2;
+ sf->intra_sf.prune_filter_intra_level = 1;
+
+ sf->rd_sf.perform_coeff_opt = 3;
+
+ sf->lpf_sf.prune_wiener_based_on_src_var = 1;
+ sf->lpf_sf.prune_sgr_based_on_wiener = 1;
+ }
+
+ if (speed >= 3) {
+ sf->hl_sf.high_precision_mv_usage = CURRENT_Q;
+ sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+
+ sf->part_sf.less_rectangular_check_level = 2;
+ sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL1;
+ sf->part_sf.prune_ext_part_using_split_info = 1;
+
+ sf->mv_sf.full_pixel_search_level = 1;
+ sf->mv_sf.search_method = DIAMOND;
+
+ // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are
+ // inherited directly from luma hog with some minor tweaking. Eventually we
+    // should run this with a Bayesian optimizer to find the Pareto frontier.
+ sf->intra_sf.chroma_intra_pruning_with_hog = 2;
+ sf->intra_sf.intra_pruning_with_hog = 3;
+ sf->intra_sf.prune_palette_search_level = 2;
+
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+ sf->tx_sf.use_rd_based_breakout_for_intra_tx_search = true;
+
+ // TODO(any): evaluate if these lpf features can be moved to speed 2.
+    // For screen content, "prune_sgr_based_on_wiener = 2" causes a large
+    // quality loss.
+ sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
+ sf->lpf_sf.disable_loop_restoration_chroma = 0;
+ sf->lpf_sf.reduce_wiener_window_size = 1;
+ sf->lpf_sf.prune_wiener_based_on_src_var = 2;
+ }
+
+ if (speed >= 4) {
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+
+ sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL2;
+ sf->part_sf.simple_motion_search_reduce_search_steps = 4;
+ sf->part_sf.prune_ext_part_using_split_info = 2;
+ sf->part_sf.early_term_after_none_split = 1;
+ sf->part_sf.ml_predict_breakout_level = 3;
+
+ sf->intra_sf.prune_chroma_modes_using_luma_winner = 1;
+
+ sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+
+ sf->tpl_sf.prune_starting_mv = 2;
+ sf->tpl_sf.subpel_force_stop = HALF_PEL;
+ sf->tpl_sf.search_method = FAST_BIGDIA;
+
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+ sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+ sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
+
+ sf->rd_sf.perform_coeff_opt = 5;
+ sf->rd_sf.tx_domain_dist_thres_level = 3;
+
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
+
+ sf->mv_sf.reduce_search_range = 1;
+
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_DEFAULT;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+ }
+
+ if (speed >= 5) {
+ sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL3;
+ sf->part_sf.ext_partition_eval_thresh =
+ allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+ sf->part_sf.intra_cnn_based_part_prune_level =
+ allow_screen_content_tools ? 1 : 2;
+
+ sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
+ sf->lpf_sf.use_coarse_filter_level_search = 0;
+ // Disable Wiener and Self-guided Loop restoration filters.
+ sf->lpf_sf.disable_wiener_filter = true;
+ sf->lpf_sf.disable_sgr_filter = true;
+
+ sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2;
+
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_FAST;
+ }
+
+ if (speed >= 6) {
+ sf->intra_sf.prune_smooth_intra_mode_for_chroma = 1;
+ sf->intra_sf.prune_filter_intra_level = 2;
+ sf->intra_sf.chroma_intra_pruning_with_hog = 4;
+ sf->intra_sf.intra_pruning_with_hog = 4;
+ sf->intra_sf.cfl_search_range = 1;
+ sf->intra_sf.top_intra_model_count_allowed = 2;
+ sf->intra_sf.adapt_top_model_rd_count_using_neighbors = 1;
+ sf->intra_sf.prune_luma_odd_delta_angles_in_intra = 1;
+
+ sf->part_sf.prune_rectangular_split_based_on_qidx =
+ allow_screen_content_tools ? 0 : 2;
+ sf->part_sf.prune_rect_part_using_4x4_var_deviation = true;
+ sf->part_sf.prune_rect_part_using_none_pred_mode = true;
+ sf->part_sf.prune_sub_8x8_partition_level =
+ allow_screen_content_tools ? 0 : 1;
+ sf->part_sf.prune_part4_search = 3;
+ // TODO(jingning): This might not be a good trade off if the
+ // target image quality is very low.
+ sf->part_sf.default_max_partition_size = BLOCK_32X32;
+
+ sf->mv_sf.use_bsize_dependent_search_method = 1;
+
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3;
+ sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0;
+ sf->tx_sf.prune_intra_tx_depths_using_nn = true;
+
+ sf->rd_sf.perform_coeff_opt = 6;
+ sf->rd_sf.tx_domain_dist_level = 3;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
+ sf->winner_mode_sf.prune_winner_mode_eval_level = 1;
+ sf->winner_mode_sf.dc_blk_pred_level = 1;
+ }
+  // The following should make all-intra mode speed 7 approximately equal to
+  // real-time speed 6, all-intra speed 8 close to real-time speed 7, and
+  // all-intra speed 9 close to real-time speed 8.
+ if (speed >= 7) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+ sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->rt_sf.var_part_split_threshold_shift = 7;
+ }
+
+ if (speed >= 8) {
+ sf->rt_sf.hybrid_intra_pickmode = 1;
+ sf->rt_sf.use_nonrd_pick_mode = 1;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 1;
+ sf->rt_sf.var_part_split_threshold_shift = 8;
+ // Set mask for intra modes.
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ if (i >= BLOCK_32X32)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ else
+ // Use DC, H, V intra mode for block sizes < 32X32.
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+ }
+
+ if (speed >= 9) {
+ sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+
+ sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+ sf->rt_sf.hybrid_intra_pickmode = 0;
+ sf->rt_sf.var_part_split_threshold_shift = 9;
+ sf->rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var = true;
+ sf->rt_sf.prune_h_pred_using_best_mode_so_far = true;
+ sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true;
+ sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true;
+ }
+
+ // As the speed feature prune_chroma_modes_using_luma_winner already
+ // constrains the number of chroma directional mode evaluations to a maximum
+  // of 1, the HOG computation and the associated pruning logic do not seem to
+  // help speed up the chroma mode evaluations. Hence disable the speed feature
+ // chroma_intra_pruning_with_hog when prune_chroma_modes_using_luma_winner is
+ // enabled.
+ if (sf->intra_sf.prune_chroma_modes_using_luma_winner)
+ sf->intra_sf.chroma_intra_pruning_with_hog = 0;
+}
+
+static void set_good_speed_feature_framesize_dependent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
+ const bool use_hbd = cpi->oxcf.use_highbitdepth;
+ // Speed features applicable for temporal filtering and tpl modules may be
+  // changed based on frame type at places where the sf is applied (example:
+  // use_downsampled_sad). This is because temporal filtering and tpl modules
+ // are called before this function (except for the first key frame).
+ // TODO(deepa.kg@ittiam.com): For the speed features applicable to temporal
+ // filtering and tpl modules, modify the sf initialization appropriately
+ // before calling the modules.
+ const int boosted = frame_is_boosted(cpi);
+ const int is_boosted_arf2_bwd_type =
+ boosted ||
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+ const int is_lf_frame =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == LF_UPDATE;
+ const int allow_screen_content_tools =
+ cm->features.allow_screen_content_tools;
+
+ if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ if (is_720p_or_larger)
+ sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED;
+ else
+ sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 1;
+ }
+
+ if (is_4k_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ // TODO(huisu@google.com): train models for 720P and above.
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ sf->part_sf.ml_early_term_after_part_split_level = 1;
+ }
+
+ if (is_720p_or_larger) {
+ // TODO(chiyotsai@google.com): make this speed feature adaptive based on
+    // the current block's vertical texture instead of hardcoding it by
+    // resolution.
+ sf->mv_sf.use_downsampled_sad = 2;
+ }
+
+ if (!is_720p_or_larger) {
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ const int rate_tolerance =
+ AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
+ sf->hl_sf.recode_tolerance = 25 + (rate_tolerance >> 2);
+ }
+
+ if (speed >= 1) {
+ if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 1;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+ sf->part_sf.ml_early_term_after_part_split_level = 2;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
+ }
+
+ if (speed >= 2) {
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 22);
+ sf->part_sf.partition_search_breakout_rate_thr = 100;
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_obmc_prob_thresh = 16;
+ } else {
+ sf->inter_sf.prune_obmc_prob_thresh = 8;
+ }
+
+ if (is_480p_or_larger) {
+ sf->inter_sf.disable_interintra_wedge_var_thresh = 100;
+ } else {
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ }
+
+ if (is_480p_or_lesser) sf->inter_sf.skip_ext_comp_nearmv_mode = 1;
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 1 : 0;
+ } else {
+ sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 2 : 0;
+ }
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 2;
+ } else {
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = boosted ? 0 : 1;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = boosted ? 0 : 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->mv_sf.disable_second_mv = 1;
+ sf->mv_sf.auto_mv_step_size = 2;
+ } else {
+ sf->mv_sf.disable_second_mv = boosted ? 0 : 2;
+ sf->mv_sf.auto_mv_step_size = 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->hl_sf.recode_tolerance = 50;
+ sf->inter_sf.disable_interinter_wedge_newmv_search =
+ is_boosted_arf2_bwd_type ? 0 : 1;
+ sf->inter_sf.enable_fast_wedge_mask_search = 1;
+ }
+ }
+
+ if (speed >= 3) {
+ sf->inter_sf.enable_fast_wedge_mask_search = 1;
+ sf->inter_sf.skip_newmv_in_drl = 2;
+ sf->inter_sf.skip_ext_comp_nearmv_mode = 1;
+ sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 3 : 0;
+ sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch =
+ frame_is_intra_only(&cpi->common) ? 0 : 1;
+
+ sf->part_sf.ml_early_term_after_part_split_level = 0;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+ sf->part_sf.partition_search_breakout_rate_thr = 200;
+ sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 2 : 0;
+ } else {
+ sf->part_sf.max_intra_bsize = BLOCK_32X32;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 1 : 0;
+ }
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+
+ if (is_480p_or_larger) {
+ sf->part_sf.early_term_after_none_split = 1;
+ } else {
+ sf->part_sf.early_term_after_none_split = 0;
+ }
+ if (is_720p_or_larger) {
+ sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 2;
+ } else {
+ sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 3;
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+ sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 1;
+ } else {
+ sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+ sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2;
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL2;
+ }
+
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ }
+
+ if (speed >= 4) {
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ }
+ sf->part_sf.early_term_after_none_split = 1;
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2;
+ } else {
+ sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1;
+ }
+
+ sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+ sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
+ sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2;
+ if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 3;
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_comp_ref_frames = 1;
+ } else if (is_480p_or_larger) {
+ sf->inter_sf.prune_comp_ref_frames = is_boosted_arf2_bwd_type ? 0 : 1;
+ }
+
+ if (is_720p_or_larger)
+ sf->hl_sf.recode_tolerance = 32;
+ else
+ sf->hl_sf.recode_tolerance = 55;
+
+ sf->intra_sf.skip_intra_in_interframe = 4;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
+ }
+
+ if (speed >= 5) {
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_warped_prob_thresh = 16;
+ } else if (is_480p_or_larger) {
+ sf->inter_sf.prune_warped_prob_thresh = 8;
+ }
+ if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 40;
+
+ sf->inter_sf.skip_newmv_in_drl = 4;
+ sf->inter_sf.prune_comp_ref_frames = 1;
+ sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1;
+
+ if (!is_720p_or_larger) {
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET;
+ sf->inter_sf.prune_nearest_near_mv_using_refmv_weight =
+ (boosted || allow_screen_content_tools) ? 0 : 1;
+ sf->mv_sf.use_downsampled_sad = 1;
+ }
+
+ if (!is_480p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ }
+
+ if (is_480p_or_lesser) {
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL1;
+ } else {
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL2;
+ }
+
+ if (is_720p_or_larger)
+ sf->part_sf.ext_part_eval_based_on_cur_best =
+ (allow_screen_content_tools || frame_is_intra_only(cm)) ? 0 : 1;
+
+ if (is_480p_or_larger) {
+ sf->tpl_sf.reduce_num_frames = 1;
+ }
+ }
+
+ if (speed >= 6) {
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
+ sf->inter_sf.prune_comp_ref_frames = 2;
+ sf->inter_sf.prune_nearest_near_mv_using_refmv_weight =
+ (boosted || allow_screen_content_tools) ? 0 : 1;
+ sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 2;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ }
+
+ if (is_480p_or_larger) {
+ sf->hl_sf.allow_sub_blk_me_in_tf = 1;
+ }
+
+ if (is_1080p_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.disable_masked_comp = 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ }
+
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 28);
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_ref_mv_idx_search = 2;
+ } else {
+ sf->inter_sf.prune_ref_mv_idx_search = 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
+ is_boosted_arf2_bwd_type ? 450 : 150;
+ }
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+
+ sf->hl_sf.recode_tolerance = 55;
+ }
+}
+
+static void set_good_speed_features_framesize_independent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boosted = frame_is_boosted(cpi);
+ const int is_boosted_arf2_bwd_type =
+ boosted || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+ const int is_inter_frame =
+ gf_group->frame_type[cpi->gf_frame_index] == INTER_FRAME;
+ const int allow_screen_content_tools =
+ cm->features.allow_screen_content_tools;
+ const int use_hbd = cpi->oxcf.use_highbitdepth;
+ if (!cpi->oxcf.tile_cfg.enable_large_scale_tile) {
+ sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA;
+ }
+
+  // Speed 0: speed features that give a neutral coding performance change.
+ sf->gm_sf.gm_search_type = boosted ? GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2
+ : GM_SEARCH_CLOSEST_REFS_ONLY;
+ sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1;
+ sf->gm_sf.disable_gm_search_based_on_stats = 1;
+
+ sf->part_sf.less_rectangular_check_level = 1;
+ sf->part_sf.ml_prune_partition = 1;
+ sf->part_sf.prune_ext_partition_types_search_level = 1;
+ sf->part_sf.prune_part4_search = 2;
+ sf->part_sf.simple_motion_search_prune_rect = 1;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
+ sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+ sf->part_sf.use_best_rd_for_pruning = 1;
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools ? NO_PRUNING : SIMPLE_AGG_LVL0;
+
+ // TODO(debargha): Test, tweak and turn on either 1 or 2
+ sf->inter_sf.inter_mode_rd_model_estimation = 1;
+ sf->inter_sf.model_based_post_interp_filter_breakout = 1;
+ sf->inter_sf.prune_compound_using_single_ref = 1;
+ sf->inter_sf.prune_mode_search_simple_translation = 1;
+ sf->inter_sf.prune_ref_frame_for_rect_partitions =
+ (boosted || (allow_screen_content_tools))
+ ? 0
+ : (is_boosted_arf2_bwd_type ? 1 : 2);
+ sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2;
+ sf->inter_sf.selective_ref_frame = 1;
+ sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
+
+ sf->interp_sf.use_fast_interpolation_filter_search = 1;
+
+ sf->intra_sf.intra_pruning_with_hog = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 1;
+ sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+
+ sf->tpl_sf.search_method = NSTEP_8PT;
+
+ sf->rt_sf.use_nonrd_pick_mode = 0;
+ sf->rt_sf.use_real_time_ref_set = 0;
+
+ if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
+ cpi->use_screen_content_tools) {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
+ } else {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 25);
+ }
+
+ sf->rd_sf.perform_coeff_opt = 1;
+ sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
+
+ if (speed >= 1) {
+ sf->hl_sf.adjust_num_frames_for_arf_filtering =
+ allow_screen_content_tools ? 0 : 1;
+
+ sf->part_sf.intra_cnn_based_part_prune_level =
+ allow_screen_content_tools ? 0 : 2;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+    // TODO(Venkat): Clean up the frame type dependency for
+    // simple_motion_search_split in the partition search function and set the
+    // speed feature accordingly.
+ sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3;
+
+ sf->mv_sf.exhaustive_searches_thresh <<= 1;
+ sf->mv_sf.obmc_full_pixel_search_level = 1;
+ sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS;
+ sf->mv_sf.disable_extensive_joint_motion_search = 1;
+
+ sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1;
+ sf->inter_sf.prune_comp_type_by_comp_avg = 1;
+ sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1;
+ sf->inter_sf.prune_ref_frame_for_rect_partitions =
+ (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools))
+ ? 0
+ : (boosted ? 1 : 2);
+ sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
+ sf->inter_sf.reuse_inter_intra_mode = 1;
+ sf->inter_sf.selective_ref_frame = 2;
+ sf->inter_sf.skip_arf_compound = 1;
+
+ sf->interp_sf.use_interp_filter = 1;
+
+ sf->intra_sf.prune_palette_search_level = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+
+ sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3;
+ sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
+ sf->rd_sf.tx_domain_dist_thres_level = 1;
+
+ sf->lpf_sf.dual_sgr_penalty_level = 1;
+ sf->lpf_sf.enable_sgr_ep_pruning = 1;
+
+ // TODO(any, yunqing): move this feature to speed 0.
+ sf->tpl_sf.skip_alike_starting_mv = 1;
+ }
+
+ if (speed >= 2) {
+ sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+
+ sf->fp_sf.skip_motion_search_threshold = 25;
+
+ sf->gm_sf.num_refinement_steps = 2;
+
+ sf->part_sf.reuse_best_prediction_for_part_ab =
+ !frame_is_intra_only(&cpi->common);
+
+ sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL;
+ sf->mv_sf.subpel_iters_per_step = 1;
+ sf->mv_sf.reduce_search_range = 1;
+
+ // TODO(chiyotsai@google.com): We can get 10% speed up if we move
+ // adaptive_rd_thresh to speed 1. But currently it performs poorly on some
+ // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a
+ // bit more closely to figure out why.
+ sf->inter_sf.adaptive_rd_thresh = 1;
+ sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+ sf->inter_sf.fast_interintra_wedge_search = 1;
+ sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1;
+ sf->inter_sf.prune_ext_comp_using_neighbors = 1;
+ sf->inter_sf.prune_comp_using_best_single_mode_ref = 2;
+ sf->inter_sf.prune_comp_type_by_comp_avg = 2;
+ sf->inter_sf.selective_ref_frame = 3;
+ sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+ sf->inter_sf.enable_fast_compound_mode_search = 1;
+ sf->inter_sf.reuse_mask_search_results = 1;
+ set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 1);
+ sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 1;
+ sf->inter_sf.alt_ref_search_fp = 1;
+
+ sf->interp_sf.adaptive_interp_filter_search = 1;
+ sf->interp_sf.disable_dual_filter = 1;
+
+ sf->intra_sf.disable_smooth_intra =
+ !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key > 1);
+ sf->intra_sf.intra_pruning_with_hog = 2;
+ sf->intra_sf.skip_intra_in_interframe = is_inter_frame ? 2 : 1;
+ sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+
+ sf->tpl_sf.prune_starting_mv = 1;
+ sf->tpl_sf.search_method = DIAMOND;
+
+ sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 4;
+ sf->rd_sf.use_mb_rd_hash = 1;
+
+ sf->lpf_sf.prune_wiener_based_on_src_var = 1;
+ sf->lpf_sf.prune_sgr_based_on_wiener = 1;
+ sf->lpf_sf.disable_loop_restoration_chroma = boosted ? 0 : 1;
+ sf->lpf_sf.reduce_wiener_window_size = boosted ? 0 : 1;
+
+    // TODO(any): Re-evaluate setting this feature to 1 for speed 2.
+ sf->tpl_sf.allow_compound_pred = 0;
+ sf->tpl_sf.prune_ref_frames_in_tpl = 1;
+ }
+
+ if (speed >= 3) {
+ sf->hl_sf.high_precision_mv_usage = CURRENT_Q;
+
+ sf->gm_sf.prune_ref_frame_for_gm_search = 1;
+ sf->gm_sf.prune_zero_mv_with_sse = 1;
+ sf->gm_sf.num_refinement_steps = 0;
+
+ sf->part_sf.less_rectangular_check_level = 2;
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools
+ ? SIMPLE_AGG_LVL0
+ : (boosted ? SIMPLE_AGG_LVL1 : QIDX_BASED_AGG_LVL1);
+ sf->part_sf.prune_ext_part_using_split_info = 1;
+ sf->part_sf.simple_motion_search_rect_split = 1;
+
+ sf->mv_sf.full_pixel_search_level = 1;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->mv_sf.search_method = DIAMOND;
+ sf->mv_sf.disable_second_mv = 2;
+ sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_1;
+ sf->mv_sf.use_intrabc = 0;
+
+ sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.disable_onesided_comp = 1;
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2
+    // and clean up the speed feature.
+ sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1;
+ sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1;
+ sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2;
+ sf->inter_sf.selective_ref_frame = 5;
+ sf->inter_sf.reuse_compound_type_decision = 1;
+ set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level,
+ boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2));
+ sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 2;
+
+ sf->interp_sf.adaptive_interp_filter_search = 2;
+
+ // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are
+ // inherited directly from luma hog with some minor tweaking. Eventually we
+    // should run this with a Bayesian optimizer to find the Pareto frontier.
+ sf->intra_sf.chroma_intra_pruning_with_hog = 2;
+ sf->intra_sf.intra_pruning_with_hog = 3;
+ sf->intra_sf.prune_palette_search_level = 2;
+ sf->intra_sf.top_intra_model_count_allowed = 2;
+
+ sf->tpl_sf.prune_starting_mv = 2;
+ sf->tpl_sf.skip_alike_starting_mv = 2;
+ sf->tpl_sf.prune_intra_modes = 1;
+ sf->tpl_sf.reduce_first_step_size = 6;
+ sf->tpl_sf.subpel_force_stop = QUARTER_PEL;
+ sf->tpl_sf.gop_length_decision_method = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3;
+ sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+
+    // TODO(any): Refactor the code related to the following winner mode speed
+    // features.
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
+ sf->winner_mode_sf.motion_mode_for_winner_cand =
+ boosted ? 0
+ : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE ? 1
+ : 2;
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 4;
+
+    // For screen content, "prune_sgr_based_on_wiener = 2" causes a large
+    // quality loss.
+ sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
+ sf->lpf_sf.prune_wiener_based_on_src_var = 2;
+ sf->lpf_sf.use_coarse_filter_level_search =
+ frame_is_intra_only(&cpi->common) ? 0 : 1;
+ sf->lpf_sf.use_downsampled_wiener_stats = 1;
+ }
+
+ if (speed >= 4) {
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+
+ sf->gm_sf.prune_zero_mv_with_sse = 2;
+
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL2;
+ sf->part_sf.simple_motion_search_reduce_search_steps = 4;
+ sf->part_sf.prune_ext_part_using_split_info = 2;
+ sf->part_sf.ml_predict_breakout_level = 3;
+ sf->part_sf.prune_rectangular_split_based_on_qidx =
+ (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0
+ : 1;
+
+ sf->inter_sf.alt_ref_search_fp = 2;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 3;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_MOTION_MODE] = boosted ? 0 : 5;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 3;
+
+ sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2;
+ sf->inter_sf.prune_ext_comp_using_neighbors = 2;
+ sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
+ sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+
+ sf->interp_sf.cb_pred_filter_search = 1;
+ sf->interp_sf.skip_sharp_interp_filter_search = 1;
+ sf->interp_sf.use_interp_filter = 2;
+
+ sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
+ // TODO(any): "intra_y_mode_mask" doesn't help much at speed 4.
+ // sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ // sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ // sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+ sf->intra_sf.skip_intra_in_interframe = 4;
+
+ sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+ sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2;
+
+ sf->tpl_sf.subpel_force_stop = HALF_PEL;
+ sf->tpl_sf.search_method = FAST_BIGDIA;
+ sf->tpl_sf.use_sad_for_mode_decision = 1;
+
+ sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+
+ sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 5 : 7;
+
+ // TODO(any): Extend multi-winner mode processing support for inter frames
+ sf->winner_mode_sf.multi_winner_mode_type =
+ frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_DEFAULT
+ : MULTI_WINNER_MODE_OFF;
+ sf->winner_mode_sf.dc_blk_pred_level = boosted ? 0 : 2;
+
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+ }
+
+ if (speed >= 5) {
+ sf->hl_sf.weight_calc_level_in_tf = 1;
+ sf->hl_sf.adjust_num_frames_for_arf_filtering =
+ allow_screen_content_tools ? 0 : 2;
+
+ sf->fp_sf.reduce_mv_step_param = 4;
+
+ sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL3;
+ sf->part_sf.ext_partition_eval_thresh =
+ allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+ sf->part_sf.prune_sub_8x8_partition_level =
+ allow_screen_content_tools ? 1 : 2;
+
+ sf->mv_sf.warp_search_method = WARP_SEARCH_DIAMOND;
+
+ sf->inter_sf.prune_inter_modes_if_skippable = 1;
+ sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 1;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 4;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 5;
+ sf->inter_sf.enable_fast_compound_mode_search = 2;
+
+ sf->interp_sf.skip_interp_filter_search = boosted ? 0 : 1;
+
+ sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
+ // TODO(any): Extend multi-winner mode processing support for inter frames
+ sf->winner_mode_sf.multi_winner_mode_type =
+ frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_FAST
+ : MULTI_WINNER_MODE_OFF;
+
+ // Disable Self-guided Loop restoration filter.
+ sf->lpf_sf.disable_sgr_filter = true;
+ sf->lpf_sf.disable_wiener_coeff_refine_search = true;
+
+ sf->tpl_sf.prune_starting_mv = 3;
+ sf->tpl_sf.use_y_only_rate_distortion = 1;
+ sf->tpl_sf.subpel_force_stop = FULL_PEL;
+ sf->tpl_sf.gop_length_decision_method = 2;
+ sf->tpl_sf.use_sad_for_mode_decision = 2;
+
+ sf->winner_mode_sf.dc_blk_pred_level = 2;
+
+ sf->fp_sf.disable_recon = 1;
+ }
+
+ if (speed >= 6) {
+ sf->hl_sf.disable_extra_sc_testing = 1;
+ sf->hl_sf.second_alt_ref_filtering = 0;
+
+ sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
+ sf->inter_sf.selective_ref_frame = 6;
+ sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 2;
+ sf->inter_sf.prune_ext_comp_using_neighbors = 3;
+
+ sf->intra_sf.chroma_intra_pruning_with_hog = 4;
+ sf->intra_sf.intra_pruning_with_hog = 4;
+ sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC;
+ sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC;
+ sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC;
+ sf->intra_sf.early_term_chroma_palette_size_search = 1;
+
+ sf->part_sf.prune_rectangular_split_based_on_qidx =
+ boosted || allow_screen_content_tools ? 0 : 2;
+
+ sf->part_sf.prune_part4_search = 3;
+
+ sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL;
+ sf->mv_sf.use_bsize_dependent_search_method = 1;
+
+ sf->tpl_sf.gop_length_decision_method = 3;
+
+ sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 6 : 8;
+
+ sf->winner_mode_sf.dc_blk_pred_level = 3;
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
+
+ sf->fp_sf.skip_zeromv_motion_search = 1;
+ }
+}
+
+static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
+ SPEED_FEATURES *const sf,
+ int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int boosted = frame_is_boosted(cpi);
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
+
+ if (!is_360p_or_larger) {
+ sf->rt_sf.prune_intra_mode_based_on_mv_range = 1;
+ sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1;
+ if (speed >= 6)
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2;
+ if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 2;
+ if (speed >= 7) {
+ sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.use_rtc_tf = 2;
+ }
+ if (speed == 8) sf->rt_sf.prefer_large_partition_blocks = 1;
+ if (speed >= 8) {
+ sf->rt_sf.use_nonrd_filter_search = 1;
+ sf->rt_sf.tx_size_level_based_on_qstep = 1;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ sf->rt_sf.skip_intra_pred = 1;
+ // Only turn on enable_ref_short_signaling for low resolution when only
+ // LAST and GOLDEN ref frames are used.
+ sf->rt_sf.enable_ref_short_signaling =
+ (!sf->rt_sf.use_nonrd_altref_frame &&
+ (!sf->rt_sf.use_comp_ref_nonrd ||
+ (!sf->rt_sf.ref_frame_comp_nonrd[1] &&
+ !sf->rt_sf.ref_frame_comp_nonrd[2])));
+
+// TODO(kyslov): Re-enable when AV1 models are trained.
+#if 0
+#if CONFIG_RT_ML_PARTITIONING
+ if (!frame_is_intra_only(cm)) {
+ sf->part_sf.partition_search_type = ML_BASED_PARTITION;
+ sf->rt_sf.reuse_inter_pred_nonrd = 0;
+ }
+#endif
+#endif
+ sf->rt_sf.use_adaptive_subpel_search = false;
+ }
+ if (speed >= 10) {
+      // TODO(yunqingwang@google.com): To be conservative,
+      // sf->rt_sf.estimate_motion_for_var_based_partition = 3 is disabled for
+      // speed 10 / QVGA for now. It may be enabled in the future.
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ sf->rt_sf.skip_intra_pred = 2;
+ sf->rt_sf.hybrid_intra_pickmode = 3;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ sf->rt_sf.use_nonrd_filter_search = 0;
+ }
+ } else {
+ sf->rt_sf.prune_intra_mode_based_on_mv_range = 2;
+ sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+ if (speed <= 5) {
+ sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
+ boosted ? INT_MAX : 350;
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2;
+ }
+ if (speed == 6) sf->part_sf.disable_8x8_part_based_on_qidx = 1;
+ if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 2;
+ if (speed == 7) {
+ sf->rt_sf.prefer_large_partition_blocks = 1;
+ // Enable this feature for [360p, 720p] resolution range initially.
+ // Only enable for low bitdepth to mitigate issue: b/303023614.
+ if (!cpi->rc.rtc_external_ratectrl &&
+ AOMMIN(cm->width, cm->height) <= 720 && !cpi->oxcf.use_highbitdepth)
+ sf->hl_sf.accurate_bit_estimate = cpi->oxcf.q_cfg.aq_mode == NO_AQ;
+ }
+ if (speed >= 7) {
+ sf->rt_sf.use_rtc_tf = 1;
+ }
+ if (speed == 8 && !cpi->ppi->use_svc) {
+ sf->rt_sf.short_circuit_low_temp_var = 0;
+ sf->rt_sf.use_nonrd_altref_frame = 1;
+ }
+ if (speed >= 8) sf->rt_sf.tx_size_level_based_on_qstep = 2;
+ if (speed >= 9) {
+ sf->rt_sf.gf_length_lvl = 1;
+ sf->rt_sf.skip_cdef_sb = 1;
+ sf->rt_sf.sad_based_adp_altref_lag = 2;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ sf->interp_sf.cb_pred_filter_search = 1;
+ }
+ if (speed >= 10) {
+ sf->rt_sf.hybrid_intra_pickmode = 2;
+ sf->rt_sf.sad_based_adp_altref_lag = 4;
+ sf->rt_sf.tx_size_level_based_on_qstep = 0;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ sf->rt_sf.use_adaptive_subpel_search = false;
+ sf->interp_sf.cb_pred_filter_search = 2;
+ }
+ }
+ if (!is_480p_or_larger) {
+ if (speed == 7) {
+ sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+ }
+ }
+ if (!is_720p_or_larger) {
+ if (speed >= 9) {
+ sf->rt_sf.force_large_partition_blocks_intra = 1;
+ }
+ } else {
+ if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 3;
+ if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 0;
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 2;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 1;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.sad_based_adp_altref_lag = 1;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 0;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ }
+ if (speed >= 10) {
+ sf->rt_sf.sad_based_adp_altref_lag = 3;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ }
+ }
+ // TODO(Any): Check/Tune settings of other sfs for 1080p.
+ if (is_1080p_or_larger) {
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ sf->rt_sf.use_adaptive_subpel_search = 0;
+ }
+ if (speed >= 9) sf->interp_sf.cb_pred_filter_search = 0;
+ } else {
+ if (speed >= 9) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ if (speed >= 10) sf->rt_sf.nonrd_aggressive_skip = 1;
+ }
+ // TODO(marpan): Tune settings for speed 11 video mode,
+ // for resolutions below 720p.
+ if (speed >= 11 && !is_720p_or_larger &&
+ cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+ sf->rt_sf.skip_cdef_sb = 2;
+ sf->rt_sf.force_only_last_ref = 1;
+ sf->rt_sf.selective_cdf_update = 1;
+ sf->rt_sf.use_nonrd_filter_search = 0;
+ if (is_360p_or_larger) {
+ sf->part_sf.fixed_partition_size = BLOCK_32X32;
+ sf->rt_sf.use_fast_fixed_part = 1;
+ }
+ sf->rt_sf.increase_source_sad_thresh = 1;
+ sf->rt_sf.part_early_exit_zeromv = 2;
+ sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2;
+ for (int i = 0; i < BLOCK_SIZES; ++i) {
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ }
+ }
+ // Setting for SVC, or when the ref_frame_config control is
+ // used to set the reference structure.
+ if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) {
+ const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+    // For SVC: for two or more temporal layers, use better mv search on
+    // base temporal layers, and only on the base spatial layer if the
+    // highest resolution is above 640x360.
+ if (cpi->svc.number_temporal_layers >= 2 &&
+ cpi->svc.temporal_layer_id == 0 &&
+ (cpi->svc.spatial_layer_id == 0 ||
+ cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
+ 640 * 360)) {
+ sf->mv_sf.search_method = NSTEP;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->rt_sf.fullpel_search_step_param = 10;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ if (cm->width * cm->height <= 352 * 288)
+ sf->rt_sf.nonrd_prune_ref_frame_search = 2;
+ sf->rt_sf.force_large_partition_blocks_intra = 0;
+ }
+ if (speed >= 8) {
+ if (cpi->svc.number_temporal_layers > 2)
+ sf->rt_sf.disable_cdf_update_non_reference_frame = true;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ if (rtc_ref->non_reference_frame) {
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ }
+ }
+ if (speed <= 9 && cpi->svc.number_temporal_layers > 2 &&
+ cpi->svc.temporal_layer_id == 0)
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = false;
+ else
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.frame_level_mode_cost_update = false;
+
+ // Compound mode enabling.
+ if (rtc_ref->ref_frame_comp[0] || rtc_ref->ref_frame_comp[1] ||
+ rtc_ref->ref_frame_comp[2]) {
+ sf->rt_sf.use_comp_ref_nonrd = 1;
+ sf->rt_sf.ref_frame_comp_nonrd[0] =
+ rtc_ref->ref_frame_comp[0] && rtc_ref->reference[GOLDEN_FRAME - 1];
+ sf->rt_sf.ref_frame_comp_nonrd[1] =
+ rtc_ref->ref_frame_comp[1] && rtc_ref->reference[LAST2_FRAME - 1];
+ sf->rt_sf.ref_frame_comp_nonrd[2] =
+ rtc_ref->ref_frame_comp[2] && rtc_ref->reference[ALTREF_FRAME - 1];
+ } else {
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ }
+
+ if (cpi->svc.number_spatial_layers > 1 ||
+ cpi->svc.number_temporal_layers > 1)
+ sf->hl_sf.accurate_bit_estimate = 0;
+
+ sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+
+    // For single-layer RPS: bias/adjustment for the recovery frame.
+ if (cpi->ppi->rtc_ref.bias_recovery_frame) {
+ sf->mv_sf.search_method = NSTEP;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+ sf->rt_sf.fullpel_search_step_param = 8;
+ sf->rt_sf.nonrd_aggressive_skip = 0;
+ }
+ }
+ // Screen settings.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ // TODO(marpan): Check settings for speed 7 and 8.
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 1;
+ sf->mv_sf.use_bsize_dependent_search_method = 0;
+ sf->rt_sf.skip_cdef_sb = 1;
+ sf->rt_sf.increase_color_thresh_palette = 1;
+ if (!frame_is_intra_only(cm)) sf->rt_sf.dct_only_palette_nonrd = 1;
+ }
+ if (speed >= 8) {
+ sf->rt_sf.nonrd_check_partition_merge_mode = 3;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+ sf->rt_sf.use_nonrd_filter_search = 0;
+ sf->rt_sf.prune_hv_pred_modes_using_src_sad = false;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.prune_idtx_nonrd = 1;
+ sf->rt_sf.part_early_exit_zeromv = 2;
+ sf->rt_sf.skip_lf_screen = 1;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 3;
+ sf->rt_sf.var_part_split_threshold_shift = 10;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1;
+ sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+ sf->interp_sf.cb_pred_filter_search = 0;
+ }
+ if (speed >= 10) {
+ if (cm->width * cm->height > 1920 * 1080)
+ sf->part_sf.disable_8x8_part_based_on_qidx = 1;
+ sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80;
+ sf->rt_sf.part_early_exit_zeromv = 1;
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ }
+ if (speed >= 11) {
+ sf->rt_sf.skip_lf_screen = 2;
+ sf->rt_sf.skip_cdef_sb = 2;
+ sf->rt_sf.part_early_exit_zeromv = 2;
+ sf->rt_sf.prune_palette_nonrd = 1;
+ sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2;
+ sf->rt_sf.increase_color_thresh_palette = 0;
+ }
+ sf->rt_sf.use_nonrd_altref_frame = 0;
+ sf->rt_sf.use_rtc_tf = 0;
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.source_metrics_sb_nonrd = 1;
+ if (cpi->rc.high_source_sad == 1) {
+ sf->rt_sf.prefer_large_partition_blocks = 0;
+ sf->part_sf.max_intra_bsize = BLOCK_128X128;
+ for (int i = 0; i < BLOCK_SIZES; ++i) {
+ if (i > BLOCK_32X32)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ else
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+ }
+ }
+ if (cpi->rc.max_block_source_sad > 20000 &&
+ cpi->rc.frame_source_sad > 100 && speed >= 6 &&
+ (cpi->rc.percent_blocks_with_motion > 1 ||
+ cpi->svc.last_layer_dropped[0])) {
+ sf->mv_sf.search_method = NSTEP;
+ sf->rt_sf.fullpel_search_step_param = 2;
+ }
+ sf->rt_sf.partition_direct_merging = 0;
+ sf->hl_sf.accurate_bit_estimate = 0;
+ // This feature is for nonrd_pickmode.
+ if (sf->rt_sf.use_nonrd_pick_mode)
+ sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+ else
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ }
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ sf->rt_sf.use_rtc_tf = 0;
+ // TODO(aomedia:3412): The setting accurate_bit_estimate = 0
+ // can be removed once it's fixed for lossless mode.
+ sf->hl_sf.accurate_bit_estimate = 0;
+ }
+ if (cpi->oxcf.use_highbitdepth) {
+ // Disable for use_highbitdepth = 1 to mitigate issue: b/303023614.
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ }
+ if (cpi->oxcf.superres_cfg.enable_superres) {
+ sf->rt_sf.use_rtc_tf = 0;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+ }
+}
+
+// TODO(kyslov): this is currently very similar to
+// set_good_speed_features_framesize_independent, except that it sets the
+// non-rd flag at speed 8. This function will likely be modified in the
+// future with RT-specific speed features.
+static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int boosted = frame_is_boosted(cpi);
+
+  // Currently, rt speeds 0, 1, 2, 3, 4, and 5 are the same.
+  // The following set of speed features does not impact the encoder's
+  // decisions, as the relevant tools are disabled by default.
+ sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+ sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+ sf->inter_sf.reuse_inter_intra_mode = 1;
+ sf->inter_sf.prune_compound_using_single_ref = 0;
+ sf->inter_sf.prune_comp_search_by_single_result = 2;
+ sf->inter_sf.prune_comp_type_by_comp_avg = 2;
+ sf->inter_sf.fast_wedge_sign_estimate = 1;
+ sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+ sf->interp_sf.cb_pred_filter_search = 0;
+ sf->interp_sf.skip_interp_filter_search = 1;
+ sf->part_sf.ml_prune_partition = 1;
+ sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+ sf->part_sf.prune_ext_partition_types_search_level = 2;
+ sf->part_sf.less_rectangular_check_level = 2;
+ sf->mv_sf.obmc_full_pixel_search_level = 1;
+ sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ sf->lpf_sf.dual_sgr_penalty_level = 1;
+ // Disable Wiener and Self-guided Loop restoration filters.
+ sf->lpf_sf.disable_wiener_filter = true;
+ sf->lpf_sf.disable_sgr_filter = true;
+ sf->intra_sf.prune_palette_search_level = 2;
+ sf->intra_sf.prune_luma_palette_size_search_level = 2;
+ sf->intra_sf.early_term_chroma_palette_size_search = 1;
+
+ // End of set
+
+ // TODO(any, yunqing): tune these features for real-time use cases.
+ sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_SOLO;
+ sf->hl_sf.frame_parameter_update = 0;
+
+ sf->inter_sf.model_based_post_interp_filter_breakout = 1;
+  // TODO(any): As per the experiments, this speed feature does redundant
+  // computation, since the model-rd-based pruning logic is similar to
+  // model-rd-based gating when inter_mode_rd_model_estimation = 2. Enable
+  // this SF if any of the following conditions becomes true:
+  // (1) inter_mode_rd_model_estimation != 2
+  // (2) skip_interp_filter_search == 0
+  // (3) Motion mode or compound mode is enabled
+ sf->inter_sf.prune_mode_search_simple_translation = 0;
+ sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted;
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ sf->inter_sf.selective_ref_frame = 4;
+ sf->inter_sf.alt_ref_search_fp = 2;
+ set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 4);
+ sf->inter_sf.limit_txfm_eval_per_mode = 3;
+
+ sf->inter_sf.adaptive_rd_thresh = 4;
+ sf->inter_sf.inter_mode_rd_model_estimation = 2;
+ sf->inter_sf.prune_inter_modes_if_skippable = 1;
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
+ sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
+ sf->inter_sf.skip_newmv_in_drl = 4;
+
+ sf->interp_sf.use_fast_interpolation_filter_search = 1;
+ sf->interp_sf.use_interp_filter = 1;
+ sf->interp_sf.adaptive_interp_filter_search = 1;
+ sf->interp_sf.disable_dual_filter = 1;
+
+ sf->part_sf.default_max_partition_size = BLOCK_128X128;
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ sf->part_sf.use_best_rd_for_pruning = 1;
+ sf->part_sf.early_term_after_none_split = 1;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+ sf->part_sf.max_intra_bsize = BLOCK_16X16;
+ sf->part_sf.partition_search_breakout_rate_thr = 500;
+ sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+ sf->part_sf.adjust_var_based_rd_partitioning = 2;
+
+ sf->mv_sf.full_pixel_search_level = 1;
+ sf->mv_sf.exhaustive_searches_thresh = INT_MAX;
+ sf->mv_sf.auto_mv_step_size = 1;
+ sf->mv_sf.subpel_iters_per_step = 1;
+ sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
+ sf->mv_sf.search_method = FAST_DIAMOND;
+ sf->mv_sf.subpel_force_stop = EIGHTH_PEL;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+
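+  // Restrict intra prediction to DC for luma and to DC/CFL for chroma
+  // across all transform sizes.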
+ for (int i = 0; i < TX_SIZES; ++i) {
+ sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC;
+ sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
+ }
+ sf->intra_sf.skip_intra_in_interframe = 5;
+ sf->intra_sf.disable_smooth_intra = 1;
+ sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.tx_size_search_lgr_block = 1;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+ sf->tx_sf.refine_fast_tx_search_results = 0;
+ sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
+
+ sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
+ sf->rd_sf.simple_model_rd_from_var = 1;
+ sf->rd_sf.tx_domain_dist_level = 2;
+ sf->rd_sf.tx_domain_dist_thres_level = 2;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+
+ sf->winner_mode_sf.dc_blk_pred_level = frame_is_intra_only(cm) ? 0 : 3;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+ sf->winner_mode_sf.tx_size_search_level = 1;
+ sf->winner_mode_sf.winner_mode_ifs = 1;
+
+ sf->rt_sf.check_intra_pred_nonrd = 1;
+ sf->rt_sf.estimate_motion_for_var_based_partition = 2;
+ sf->rt_sf.hybrid_intra_pickmode = 1;
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.ref_frame_comp_nonrd[0] = 0;
+ sf->rt_sf.ref_frame_comp_nonrd[1] = 0;
+ sf->rt_sf.ref_frame_comp_nonrd[2] = 0;
+ sf->rt_sf.use_nonrd_filter_search = 1;
+ sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->rt_sf.num_inter_modes_for_tx_search = 5;
+ sf->rt_sf.prune_inter_modes_using_temp_var = 1;
+ sf->rt_sf.use_real_time_ref_set = 1;
+ sf->rt_sf.use_simple_rd_model = 1;
+ sf->rt_sf.prune_inter_modes_with_golden_ref = boosted ? 0 : 1;
+ // TODO(any): This sf could be removed.
+ sf->rt_sf.short_circuit_low_temp_var = 1;
+ sf->rt_sf.check_scene_detection = 1;
+ if (cpi->rc.rtc_external_ratectrl) sf->rt_sf.check_scene_detection = 0;
+ if (cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->oxcf.rc_cfg.mode == AOM_CBR)
+ sf->rt_sf.overshoot_detection_cbr = FAST_DETECTION_MAXQ;
+ // Enable noise estimation only for high resolutions for now.
+ //
+ // Since use_temporal_noise_estimate has no effect for all-intra frame
+ // encoding, it is disabled for this case.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0 && cm->width * cm->height > 640 * 480)
+ sf->rt_sf.use_temporal_noise_estimate = 1;
+ sf->rt_sf.skip_tx_no_split_var_based_partition = 1;
+ sf->rt_sf.skip_newmv_mode_based_on_sse = 1;
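+  // Key frames keep the full mode search; inter frames enable the early
+  // termination and intra/inter skip shortcuts below.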
+ sf->rt_sf.mode_search_skip_flags =
+ (cm->current_frame.frame_type == KEY_FRAME)
+ ? 0
+ : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+ FLAG_EARLY_TERMINATE;
+ sf->rt_sf.var_part_split_threshold_shift = 5;
+ if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 1;
+ sf->rt_sf.use_fast_fixed_part = 0;
+ sf->rt_sf.increase_source_sad_thresh = 0;
+
+ if (speed >= 6) {
+ sf->mv_sf.use_fullpel_costlist = 1;
+
+ sf->rd_sf.tx_domain_dist_thres_level = 3;
+
+ sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 0;
+ sf->inter_sf.limit_inter_mode_cands = 4;
+ sf->inter_sf.prune_warped_prob_thresh = 8;
+ sf->inter_sf.extra_prune_warped = 1;
+
+ sf->rt_sf.gf_refresh_based_on_qp = 1;
+ sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1;
+ sf->rt_sf.var_part_split_threshold_shift = 7;
+ if (!frame_is_intra_only(&cpi->common))
+ sf->rt_sf.var_part_based_on_qidx = 2;
+
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 3;
+ }
+
+ if (speed >= 7) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_1;
+ sf->rt_sf.use_comp_ref_nonrd = 1;
+ sf->rt_sf.ref_frame_comp_nonrd[2] = 1; // LAST_ALTREF
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2;
+ sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+ sf->part_sf.max_intra_bsize = BLOCK_32X32;
+
+ sf->mv_sf.search_method = FAST_DIAMOND;
+ sf->mv_sf.subpel_force_stop = QUARTER_PEL;
+
+ sf->inter_sf.inter_mode_rd_model_estimation = 2;
+ // This sf is not applicable in non-rd path.
+ sf->inter_sf.skip_newmv_in_drl = 0;
+
+ sf->interp_sf.skip_interp_filter_search = 0;
+
+ // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't
+ // good. May need more study.
+ for (int i = 0; i < TX_SIZES; ++i) {
+ sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL;
+ }
+
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL5;
+
+ sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+ // This is for rd path only.
+ sf->rt_sf.prune_inter_modes_using_temp_var = 0;
+ sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 0;
+ sf->rt_sf.prune_intra_mode_based_on_mv_range = 0;
+#if !CONFIG_REALTIME_ONLY
+ sf->rt_sf.reuse_inter_pred_nonrd =
+ (cpi->oxcf.motion_mode_cfg.enable_warped_motion == 0);
+#else
+ sf->rt_sf.reuse_inter_pred_nonrd = 1;
+#endif
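+    // When the temporal denoiser is compiled in, reuse of the inter
+    // predictor is instead tied to the configured noise sensitivity.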
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ sf->rt_sf.reuse_inter_pred_nonrd = (cpi->oxcf.noise_sensitivity == 0);
+#endif
+ sf->rt_sf.short_circuit_low_temp_var = 0;
+    // For spatial layers, only LAST and GOLDEN are currently used in the SVC
+    // for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in
+    // get_ref_frame_flags() for some patterns, so disable it here for
+    // spatial layers.
+ sf->rt_sf.use_nonrd_altref_frame =
+ (cpi->svc.number_spatial_layers > 1) ? 0 : 1;
+ sf->rt_sf.use_nonrd_pick_mode = 1;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 3;
+ sf->rt_sf.skip_intra_pred = 1;
+ sf->rt_sf.source_metrics_sb_nonrd = 1;
+ // Set mask for intra modes.
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ if (i >= BLOCK_32X32)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ else
+ // Use DC, H, V intra mode for block sizes < 32X32.
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+
+ sf->winner_mode_sf.dc_blk_pred_level = 0;
+ sf->rt_sf.var_part_based_on_qidx = 3;
+ sf->rt_sf.prune_compoundmode_with_singlecompound_var = true;
+ sf->rt_sf.prune_compoundmode_with_singlemode_var = true;
+ sf->rt_sf.skip_compound_based_on_var = true;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ }
+
+ if (speed >= 8) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_2;
+ sf->intra_sf.intra_pruning_with_hog = 1;
+ sf->rt_sf.short_circuit_low_temp_var = 1;
+ sf->rt_sf.use_nonrd_altref_frame = 0;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 2;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+ sf->rt_sf.var_part_split_threshold_shift = 8;
+ sf->rt_sf.var_part_based_on_qidx = 4;
+ sf->rt_sf.partition_direct_merging = 1;
+ sf->rt_sf.prune_compoundmode_with_singlemode_var = false;
+ sf->mv_sf.use_bsize_dependent_search_method = 2;
+ sf->rt_sf.prune_hv_pred_modes_using_src_sad = true;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_3;
+ sf->rt_sf.estimate_motion_for_var_based_partition = 3;
+ sf->rt_sf.prefer_large_partition_blocks = 3;
+ sf->rt_sf.skip_intra_pred = 2;
+ sf->rt_sf.var_part_split_threshold_shift = 9;
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ sf->rt_sf.var_part_based_on_qidx = 0;
+ sf->rt_sf.frame_level_mode_cost_update = true;
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ sf->mv_sf.use_bsize_dependent_search_method = 0;
+ }
+ if (speed >= 10) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 3;
+ sf->rt_sf.var_part_split_threshold_shift = 10;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ }
+ if (speed >= 11 && !frame_is_intra_only(cm) &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ sf->winner_mode_sf.dc_blk_pred_level = 3;
+ }
+}
+
+static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
+ // best quality defaults
+ hl_sf->frame_parameter_update = 1;
+ hl_sf->recode_loop = ALLOW_RECODE;
+ // Recode loop tolerance %.
+ hl_sf->recode_tolerance = 25;
+ hl_sf->high_precision_mv_usage = CURRENT_Q;
+ hl_sf->superres_auto_search_type = SUPERRES_AUTO_ALL;
+ hl_sf->disable_extra_sc_testing = 0;
+ hl_sf->second_alt_ref_filtering = 1;
+ hl_sf->adjust_num_frames_for_arf_filtering = 0;
+ hl_sf->accurate_bit_estimate = 0;
+ hl_sf->weight_calc_level_in_tf = 0;
+ hl_sf->allow_sub_blk_me_in_tf = 0;
+}
+
+static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) {
+ fp_sf->reduce_mv_step_param = 3;
+ fp_sf->skip_motion_search_threshold = 0;
+ fp_sf->disable_recon = 0;
+ fp_sf->skip_zeromv_motion_search = 0;
+}
+
+static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) {
+ tpl_sf->gop_length_decision_method = 0;
+ tpl_sf->prune_intra_modes = 0;
+ tpl_sf->prune_starting_mv = 0;
+ tpl_sf->reduce_first_step_size = 0;
+ tpl_sf->skip_alike_starting_mv = 0;
+ tpl_sf->subpel_force_stop = EIGHTH_PEL;
+ tpl_sf->search_method = NSTEP;
+ tpl_sf->prune_ref_frames_in_tpl = 0;
+ tpl_sf->allow_compound_pred = 1;
+ tpl_sf->use_y_only_rate_distortion = 0;
+ tpl_sf->use_sad_for_mode_decision = 0;
+ tpl_sf->reduce_num_frames = 0;
+}
+
+static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) {
+ gm_sf->gm_search_type = GM_FULL_SEARCH;
+ gm_sf->prune_ref_frame_for_gm_search = 0;
+ gm_sf->prune_zero_mv_with_sse = 0;
+ gm_sf->disable_gm_search_based_on_stats = 0;
+ gm_sf->num_refinement_steps = GM_MAX_REFINEMENT_STEPS;
+}
+
+static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
+ part_sf->partition_search_type = SEARCH_PARTITION;
+ part_sf->less_rectangular_check_level = 0;
+ part_sf->use_square_partition_only_threshold = BLOCK_128X128;
+ part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ part_sf->default_max_partition_size = BLOCK_LARGEST;
+ part_sf->default_min_partition_size = BLOCK_4X4;
+ part_sf->adjust_var_based_rd_partitioning = 0;
+ part_sf->max_intra_bsize = BLOCK_LARGEST;
+ // This setting only takes effect when partition_search_type is set
+ // to FIXED_PARTITION.
+ part_sf->fixed_partition_size = BLOCK_16X16;
+  // Partition search breakout thresholds (distortion and rate).
+ part_sf->partition_search_breakout_dist_thr = 0;
+ part_sf->partition_search_breakout_rate_thr = 0;
+ part_sf->prune_ext_partition_types_search_level = 0;
+ part_sf->prune_part4_search = 0;
+ part_sf->ml_prune_partition = 0;
+ part_sf->ml_early_term_after_part_split_level = 0;
+ for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
+ part_sf->ml_partition_search_breakout_thresh[i] =
+ -1; // -1 means not enabled.
+ }
+ part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0;
+ part_sf->simple_motion_search_split = 0;
+ part_sf->simple_motion_search_prune_rect = 0;
+ part_sf->simple_motion_search_early_term_none = 0;
+ part_sf->simple_motion_search_reduce_search_steps = 0;
+ part_sf->intra_cnn_based_part_prune_level = 0;
+ part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+ part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+ part_sf->ext_part_eval_based_on_cur_best = 0;
+ part_sf->prune_ext_part_using_split_info = 0;
+ part_sf->prune_rectangular_split_based_on_qidx = 0;
+ part_sf->prune_rect_part_using_4x4_var_deviation = false;
+ part_sf->prune_rect_part_using_none_pred_mode = false;
+ part_sf->early_term_after_none_split = 0;
+ part_sf->ml_predict_breakout_level = 0;
+ part_sf->prune_sub_8x8_partition_level = 0;
+ part_sf->simple_motion_search_rect_split = 0;
+ part_sf->reuse_prev_rd_results_for_part_ab = 0;
+ part_sf->reuse_best_prediction_for_part_ab = 0;
+ part_sf->use_best_rd_for_pruning = 0;
+ part_sf->skip_non_sq_part_based_on_none = 0;
+ part_sf->disable_8x8_part_based_on_qidx = 0;
+}
+
+static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
+ mv_sf->full_pixel_search_level = 0;
+ mv_sf->auto_mv_step_size = 0;
+ mv_sf->exhaustive_searches_thresh = 0;
+ mv_sf->obmc_full_pixel_search_level = 0;
+ mv_sf->prune_mesh_search = PRUNE_MESH_SEARCH_DISABLED;
+ mv_sf->reduce_search_range = 0;
+ mv_sf->search_method = NSTEP;
+ mv_sf->simple_motion_subpel_force_stop = EIGHTH_PEL;
+ mv_sf->subpel_force_stop = EIGHTH_PEL;
+ mv_sf->subpel_iters_per_step = 2;
+ mv_sf->subpel_search_method = SUBPEL_TREE;
+ mv_sf->use_accurate_subpel_search = USE_8_TAPS;
+ mv_sf->use_bsize_dependent_search_method = 0;
+ mv_sf->use_fullpel_costlist = 0;
+ mv_sf->use_downsampled_sad = 0;
+ mv_sf->disable_extensive_joint_motion_search = 0;
+ mv_sf->disable_second_mv = 0;
+ mv_sf->skip_fullpel_search_using_startmv = 0;
+ mv_sf->warp_search_method = WARP_SEARCH_SQUARE;
+ mv_sf->warp_search_iters = 8;
+ mv_sf->use_intrabc = 1;
+}
+
+static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
+ inter_sf->adaptive_rd_thresh = 0;
+ inter_sf->model_based_post_interp_filter_breakout = 0;
+ inter_sf->reduce_inter_modes = 0;
+ inter_sf->alt_ref_search_fp = 0;
+ inter_sf->prune_single_ref = 0;
+ inter_sf->prune_comp_ref_frames = 0;
+ inter_sf->selective_ref_frame = 0;
+ inter_sf->prune_ref_frame_for_rect_partitions = 0;
+ inter_sf->fast_wedge_sign_estimate = 0;
+ inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED;
+ inter_sf->reuse_inter_intra_mode = 0;
+ inter_sf->mv_cost_upd_level = INTERNAL_COST_UPD_SB;
+ inter_sf->coeff_cost_upd_level = INTERNAL_COST_UPD_SB;
+ inter_sf->mode_cost_upd_level = INTERNAL_COST_UPD_SB;
+ inter_sf->prune_inter_modes_based_on_tpl = 0;
+ inter_sf->prune_nearmv_using_neighbors = PRUNE_NEARMV_OFF;
+ inter_sf->prune_comp_search_by_single_result = 0;
+ inter_sf->skip_repeated_ref_mv = 0;
+ inter_sf->skip_newmv_in_drl = 0;
+ inter_sf->inter_mode_rd_model_estimation = 0;
+ inter_sf->prune_compound_using_single_ref = 0;
+ inter_sf->prune_ext_comp_using_neighbors = 0;
+ inter_sf->skip_ext_comp_nearmv_mode = 0;
+ inter_sf->prune_comp_using_best_single_mode_ref = 0;
+ inter_sf->prune_nearest_near_mv_using_refmv_weight = 0;
+ inter_sf->disable_onesided_comp = 0;
+ inter_sf->prune_mode_search_simple_translation = 0;
+ inter_sf->prune_comp_type_by_comp_avg = 0;
+ inter_sf->disable_interinter_wedge_newmv_search = 0;
+ inter_sf->fast_interintra_wedge_search = 0;
+ inter_sf->prune_comp_type_by_model_rd = 0;
+ inter_sf->perform_best_rd_based_gating_for_chroma = 0;
+ inter_sf->prune_obmc_prob_thresh = 0;
+ inter_sf->disable_interinter_wedge_var_thresh = 0;
+ inter_sf->disable_interintra_wedge_var_thresh = 0;
+ inter_sf->prune_ref_mv_idx_search = 0;
+ inter_sf->prune_warped_prob_thresh = 0;
+ inter_sf->reuse_compound_type_decision = 0;
+ inter_sf->prune_inter_modes_if_skippable = 0;
+ inter_sf->disable_masked_comp = 0;
+ inter_sf->enable_fast_compound_mode_search = 0;
+ inter_sf->reuse_mask_search_results = 0;
+ inter_sf->enable_fast_wedge_mask_search = 0;
+ inter_sf->inter_mode_txfm_breakout = 0;
+ inter_sf->limit_inter_mode_cands = 0;
+ inter_sf->limit_txfm_eval_per_mode = 0;
+ inter_sf->skip_arf_compound = 0;
+ set_txfm_rd_gate_level(inter_sf->txfm_rd_gate_level, 0);
+}
+
+static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) {
+ interp_sf->adaptive_interp_filter_search = 0;
+ interp_sf->cb_pred_filter_search = 0;
+ interp_sf->disable_dual_filter = 0;
+ interp_sf->skip_sharp_interp_filter_search = 0;
+ interp_sf->use_fast_interpolation_filter_search = 0;
+ interp_sf->use_interp_filter = 0;
+ interp_sf->skip_interp_filter_search = 0;
+}
+
+static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) {
+ intra_sf->dv_cost_upd_level = INTERNAL_COST_UPD_SB;
+ intra_sf->skip_intra_in_interframe = 1;
+ intra_sf->intra_pruning_with_hog = 0;
+ intra_sf->chroma_intra_pruning_with_hog = 0;
+ intra_sf->prune_palette_search_level = 0;
+ intra_sf->prune_luma_palette_size_search_level = 0;
+
+ for (int i = 0; i < TX_SIZES; i++) {
+ intra_sf->intra_y_mode_mask[i] = INTRA_ALL;
+ intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
+ }
+ intra_sf->disable_smooth_intra = 0;
+ intra_sf->prune_smooth_intra_mode_for_chroma = 0;
+ intra_sf->prune_filter_intra_level = 0;
+ intra_sf->prune_chroma_modes_using_luma_winner = 0;
+ intra_sf->cfl_search_range = 3;
+ intra_sf->top_intra_model_count_allowed = TOP_INTRA_MODEL_COUNT;
+ intra_sf->adapt_top_model_rd_count_using_neighbors = 0;
+ intra_sf->early_term_chroma_palette_size_search = 0;
+ intra_sf->skip_filter_intra_in_inter_frames = 0;
+ intra_sf->prune_luma_odd_delta_angles_in_intra = 0;
+}
+
+static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) {
+ tx_sf->inter_tx_size_search_init_depth_sqr = 0;
+ tx_sf->inter_tx_size_search_init_depth_rect = 0;
+ tx_sf->intra_tx_size_search_init_depth_rect = 0;
+ tx_sf->intra_tx_size_search_init_depth_sqr = 0;
+ tx_sf->tx_size_search_lgr_block = 0;
+ tx_sf->model_based_prune_tx_search_level = 0;
+ tx_sf->tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_1;
+ tx_sf->tx_type_search.ml_tx_split_thresh = 8500;
+ tx_sf->tx_type_search.use_skip_flag_prediction = 1;
+ tx_sf->tx_type_search.use_reduced_intra_txset = 0;
+ tx_sf->tx_type_search.fast_intra_tx_type_search = 0;
+ tx_sf->tx_type_search.fast_inter_tx_type_prob_thresh = INT_MAX;
+ tx_sf->tx_type_search.skip_tx_search = 0;
+ tx_sf->tx_type_search.prune_tx_type_using_stats = 0;
+ tx_sf->tx_type_search.prune_tx_type_est_rd = 0;
+ tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0;
+ tx_sf->txb_split_cap = 1;
+ tx_sf->adaptive_txb_search_level = 0;
+ tx_sf->refine_fast_tx_search_results = 1;
+ tx_sf->prune_tx_size_level = 0;
+ tx_sf->prune_intra_tx_depths_using_nn = false;
+ tx_sf->use_rd_based_breakout_for_intra_tx_search = false;
+}
+
+static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf,
+ const AV1EncoderConfig *oxcf) {
+ const int disable_trellis_quant = oxcf->algo_cfg.disable_trellis_quant;
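+  // Map the disable_trellis_quant config to a trellis optimization level:
+  // 0 keeps full trellis, 1 disables it, 2 restricts it to the final encode
+  // pass, and 3 (per the enum name) skips trellis only for the Y-rd
+  // estimation stage. Lossless coding always falls back to NO_TRELLIS_OPT.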
+ if (disable_trellis_quant == 3) {
+ rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg)
+ ? NO_ESTIMATE_YRD_TRELLIS_OPT
+ : NO_TRELLIS_OPT;
+ } else if (disable_trellis_quant == 2) {
+ rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg)
+ ? FINAL_PASS_TRELLIS_OPT
+ : NO_TRELLIS_OPT;
+ } else if (disable_trellis_quant == 0) {
+ if (is_lossless_requested(&oxcf->rc_cfg)) {
+ rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
+ } else {
+ rd_sf->optimize_coefficients = FULL_TRELLIS_OPT;
+ }
+ } else if (disable_trellis_quant == 1) {
+ rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
+ } else {
+ assert(0 && "Invalid disable_trellis_quant value");
+ }
+ rd_sf->use_mb_rd_hash = 0;
+ rd_sf->simple_model_rd_from_var = 0;
+ rd_sf->tx_domain_dist_level = 0;
+ rd_sf->tx_domain_dist_thres_level = 0;
+ rd_sf->perform_coeff_opt = 0;
+}
+
+static AOM_INLINE void init_winner_mode_sf(
+ WINNER_MODE_SPEED_FEATURES *winner_mode_sf) {
+ winner_mode_sf->motion_mode_for_winner_cand = 0;
+ // Set this at the appropriate speed levels
+ winner_mode_sf->tx_size_search_level = 0;
+ winner_mode_sf->enable_winner_mode_for_coeff_opt = 0;
+ winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0;
+ winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0;
+ winner_mode_sf->multi_winner_mode_type = 0;
+ winner_mode_sf->dc_blk_pred_level = 0;
+ winner_mode_sf->winner_mode_ifs = 0;
+ winner_mode_sf->prune_winner_mode_eval_level = 0;
+}
+
+static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
+ lpf_sf->disable_loop_restoration_chroma = 0;
+ lpf_sf->disable_loop_restoration_luma = 0;
+ lpf_sf->min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE;
+ lpf_sf->max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ lpf_sf->prune_wiener_based_on_src_var = 0;
+ lpf_sf->prune_sgr_based_on_wiener = 0;
+ lpf_sf->enable_sgr_ep_pruning = 0;
+ lpf_sf->reduce_wiener_window_size = 0;
+ lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+ lpf_sf->use_coarse_filter_level_search = 0;
+ lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH;
+  // Set the decoder-side speed feature to use fewer dual sgr modes.
+ lpf_sf->dual_sgr_penalty_level = 0;
+ // Enable Wiener and Self-guided Loop restoration filters by default.
+ lpf_sf->disable_wiener_filter = false;
+ lpf_sf->disable_sgr_filter = false;
+ lpf_sf->disable_wiener_coeff_refine_search = false;
+ lpf_sf->use_downsampled_wiener_stats = 0;
+}
+
+static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
+ rt_sf->check_intra_pred_nonrd = 0;
+ rt_sf->skip_intra_pred = 0;
+ rt_sf->estimate_motion_for_var_based_partition = 0;
+ rt_sf->nonrd_check_partition_merge_mode = 0;
+ rt_sf->nonrd_check_partition_split = 0;
+ rt_sf->mode_search_skip_flags = 0;
+ rt_sf->nonrd_prune_ref_frame_search = 0;
+ rt_sf->use_nonrd_pick_mode = 0;
+ rt_sf->use_nonrd_altref_frame = 0;
+ rt_sf->use_comp_ref_nonrd = 0;
+ rt_sf->use_real_time_ref_set = 0;
+ rt_sf->short_circuit_low_temp_var = 0;
+ rt_sf->reuse_inter_pred_nonrd = 0;
+ rt_sf->num_inter_modes_for_tx_search = INT_MAX;
+ rt_sf->use_nonrd_filter_search = 0;
+ rt_sf->use_simple_rd_model = 0;
+ rt_sf->hybrid_intra_pickmode = 0;
+ rt_sf->source_metrics_sb_nonrd = 0;
+ rt_sf->overshoot_detection_cbr = NO_DETECTION;
+ rt_sf->check_scene_detection = 0;
+ rt_sf->prefer_large_partition_blocks = 0;
+ rt_sf->use_temporal_noise_estimate = 0;
+ rt_sf->fullpel_search_step_param = 0;
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ rt_sf->intra_y_mode_bsize_mask_nrd[i] = INTRA_ALL;
+ rt_sf->prune_hv_pred_modes_using_src_sad = false;
+ rt_sf->nonrd_aggressive_skip = 0;
+ rt_sf->skip_cdef_sb = 0;
+ rt_sf->force_large_partition_blocks_intra = 0;
+ rt_sf->skip_tx_no_split_var_based_partition = 0;
+ rt_sf->skip_newmv_mode_based_on_sse = 0;
+ rt_sf->gf_length_lvl = 0;
+ rt_sf->prune_inter_modes_with_golden_ref = 0;
+ rt_sf->prune_inter_modes_wrt_gf_arf_based_on_sad = 0;
+ rt_sf->prune_inter_modes_using_temp_var = 0;
+ rt_sf->reduce_mv_pel_precision_highmotion = 0;
+ rt_sf->reduce_mv_pel_precision_lowcomplex = 0;
+ rt_sf->prune_intra_mode_based_on_mv_range = 0;
+ rt_sf->var_part_split_threshold_shift = 7;
+ rt_sf->gf_refresh_based_on_qp = 0;
+ rt_sf->use_rtc_tf = 0;
+ rt_sf->prune_idtx_nonrd = 0;
+ rt_sf->prune_palette_nonrd = 0;
+ rt_sf->dct_only_palette_nonrd = 0;
+ rt_sf->part_early_exit_zeromv = 0;
+ rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED;
+ rt_sf->skip_lf_screen = 0;
+ rt_sf->sad_based_adp_altref_lag = 0;
+ rt_sf->partition_direct_merging = 0;
+ rt_sf->var_part_based_on_qidx = 0;
+ rt_sf->tx_size_level_based_on_qstep = 0;
+ rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false;
+ rt_sf->prune_compoundmode_with_singlecompound_var = false;
+ rt_sf->frame_level_mode_cost_update = false;
+ rt_sf->prune_h_pred_using_best_mode_so_far = false;
+ rt_sf->enable_intra_mode_pruning_using_neighbors = false;
+ rt_sf->prune_intra_mode_using_best_sad_so_far = false;
+ rt_sf->check_only_zero_zeromv_on_large_blocks = false;
+ rt_sf->disable_cdf_update_non_reference_frame = false;
+ rt_sf->prune_compoundmode_with_singlemode_var = false;
+ rt_sf->skip_compound_based_on_var = false;
+ rt_sf->set_zeromv_skip_based_on_source_sad = 1;
+ rt_sf->use_adaptive_subpel_search = false;
+ rt_sf->screen_content_cdef_filter_qindex_thresh = 0;
+ rt_sf->enable_ref_short_signaling = false;
+ rt_sf->check_globalmv_on_single_ref = true;
+ rt_sf->increase_color_thresh_palette = false;
+ rt_sf->selective_cdf_update = 0;
+ rt_sf->force_only_last_ref = 0;
+}
+
+static fractional_mv_step_fp
+ *const fractional_mv_search[SUBPEL_SEARCH_METHODS] = {
+ av1_find_best_sub_pixel_tree, // SUBPEL_TREE = 0
+ av1_find_best_sub_pixel_tree_pruned, // SUBPEL_TREE_PRUNED = 1
+ av1_find_best_sub_pixel_tree_pruned_more // SUBPEL_TREE_PRUNED_MORE = 2
+ };
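+// Note: this table is indexed directly by the SUBPEL_SEARCH_METHOD enum in
+// set_subpel_search_method(), so its entries must stay in enum order.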
+
+// Populate the appropriate sub-pel search method based on speed features and
+// user-specified settings.
+static void set_subpel_search_method(
+ MotionVectorSearchParams *mv_search_params,
+ unsigned int motion_vector_unit_test,
+ SUBPEL_SEARCH_METHOD subpel_search_method) {
+ assert(subpel_search_method <= SUBPEL_TREE_PRUNED_MORE);
+ mv_search_params->find_fractional_mv_step =
+ fractional_mv_search[subpel_search_method];
+
+  // This is only used in the motion vector unit test.
+ if (motion_vector_unit_test == 1)
+ mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (motion_vector_unit_test == 2)
+ mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+}
+
+void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ switch (oxcf->mode) {
+ case GOOD:
+ set_good_speed_feature_framesize_dependent(cpi, sf, speed);
+ break;
+ case ALLINTRA:
+ set_allintra_speed_feature_framesize_dependent(cpi, sf, speed);
+ break;
+ case REALTIME:
+ set_rt_speed_feature_framesize_dependent(cpi, sf, speed);
+ break;
+ }
+
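+  // Sequence-level tool flags can only be tightened while the sequence
+  // parameters have not yet been locked.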
+ if (!cpi->ppi->seq_params_locked) {
+ cpi->common.seq_params->enable_masked_compound &=
+ !sf->inter_sf.disable_masked_comp;
+ cpi->common.seq_params->enable_interintra_compound &=
+ (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
+ }
+
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
+
+  // For the multi-threaded use case with row_mt enabled, updating costs over
+  // a set of SB rows is not desirable. Hence, the sf mv_cost_upd_level is
+  // set to INTERNAL_COST_UPD_SBROW in such cases.
+ if ((cpi->oxcf.row_mt == 1) && (cpi->mt_info.num_workers > 1)) {
+ if (sf->inter_sf.mv_cost_upd_level == INTERNAL_COST_UPD_SBROW_SET) {
+ // Set mv_cost_upd_level to use row level update.
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ }
+ }
+}
+
+void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int i;
+
+ init_hl_sf(&sf->hl_sf);
+ init_fp_sf(&sf->fp_sf);
+ init_tpl_sf(&sf->tpl_sf);
+ init_gm_sf(&sf->gm_sf);
+ init_part_sf(&sf->part_sf);
+ init_mv_sf(&sf->mv_sf);
+ init_inter_sf(&sf->inter_sf);
+ init_interp_sf(&sf->interp_sf);
+ init_intra_sf(&sf->intra_sf);
+ init_tx_sf(&sf->tx_sf);
+ init_rd_sf(&sf->rd_sf, oxcf);
+ init_winner_mode_sf(&sf->winner_mode_sf);
+ init_lpf_sf(&sf->lpf_sf);
+ init_rt_sf(&sf->rt_sf);
+
+ switch (oxcf->mode) {
+ case GOOD:
+ set_good_speed_features_framesize_independent(cpi, sf, speed);
+ break;
+ case ALLINTRA:
+ set_allintra_speed_features_framesize_independent(cpi, sf, speed);
+ break;
+ case REALTIME:
+ set_rt_speed_features_framesize_independent(cpi, sf, speed);
+ break;
+ }
+
+ // Note: when use_nonrd_pick_mode is true, the transform size is the
+ // minimum of 16x16 and the largest possible size of the current block,
+ // which conflicts with the speed feature "enable_tx_size_search".
+ if (!oxcf->txfm_cfg.enable_tx_size_search &&
+ sf->rt_sf.use_nonrd_pick_mode == 0) {
+ sf->winner_mode_sf.tx_size_search_level = 3;
+ }
+
+ if (cpi->mt_info.num_workers > 1) {
+    // The loop restoration stage is conditionally disabled for speeds 5 and
+    // 6 when num_workers > 1. Since av1_pick_filter_restoration() is not
+    // multi-threaded, enabling the loop restoration stage would increase
+    // encode time by 3% to 7%, depending on frame resolution.
+    // TODO(aomedia:3446): Implement multi-threading of
+    // av1_pick_filter_restoration() and enable the Wiener filter for speeds
+    // 5 and 6, similar to the single-thread encoding path.
+ if (speed >= 5) {
+ sf->lpf_sf.disable_sgr_filter = true;
+ sf->lpf_sf.disable_wiener_filter = true;
+ }
+ }
+
+ if (!cpi->ppi->seq_params_locked) {
+ cpi->common.seq_params->order_hint_info.enable_dist_wtd_comp &=
+ (sf->inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+ cpi->common.seq_params->enable_dual_filter &=
+ !sf->interp_sf.disable_dual_filter;
+    // Set the flag 'enable_restoration' if at least one of the loop
+    // restoration filters (i.e., Wiener or Self-guided) is enabled.
+ cpi->common.seq_params->enable_restoration &=
+ (!sf->lpf_sf.disable_wiener_filter || !sf->lpf_sf.disable_sgr_filter);
+
+ cpi->common.seq_params->enable_interintra_compound &=
+ (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
+ }
+
+ const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED);
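+  // Copy the mesh search pattern (range/interval per step) selected by the
+  // clamped speed level.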
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mv_sf.mesh_patterns[i].range =
+ good_quality_mesh_patterns[mesh_speed][i].range;
+ sf->mv_sf.mesh_patterns[i].interval =
+ good_quality_mesh_patterns[mesh_speed][i].interval;
+ }
+
+  // Update the mesh pattern of exhaustive motion search for intraBC.
+  // Though the intraBC mesh pattern is populated for all frame types, it is
+  // used only for intra frames of screen content.
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mv_sf.intrabc_mesh_patterns[i].range =
+ intrabc_mesh_patterns[mesh_speed][i].range;
+ sf->mv_sf.intrabc_mesh_patterns[i].interval =
+ intrabc_mesh_patterns[mesh_speed][i].interval;
+ }
+
+  // Slow quant, dct and trellis are not worthwhile for the first pass,
+  // so make sure they are always turned off.
+ if (is_stat_generation_stage(cpi))
+ sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
+
+ // No recode for 1 pass.
+ if (oxcf->pass == AOM_RC_ONE_PASS && has_no_stats_stage(cpi))
+ sf->hl_sf.recode_loop = DISALLOW_RECODE;
+
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
+
+  // assert ensures that tx_domain_dist_thres_level is accessed correctly
+ assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 &&
+ cpi->sf.rd_sf.tx_domain_dist_thres_level < 4);
+ memcpy(winner_mode_params->tx_domain_dist_threshold,
+ tx_domain_dist_thresholds[cpi->sf.rd_sf.tx_domain_dist_thres_level],
+ sizeof(winner_mode_params->tx_domain_dist_threshold));
+
+ assert(cpi->sf.rd_sf.tx_domain_dist_level >= 0 &&
+ cpi->sf.rd_sf.tx_domain_dist_level < TX_DOMAIN_DIST_LEVELS);
+ memcpy(winner_mode_params->use_transform_domain_distortion,
+ tx_domain_dist_types[cpi->sf.rd_sf.tx_domain_dist_level],
+ sizeof(winner_mode_params->use_transform_domain_distortion));
+
+ // assert ensures that coeff_opt_thresholds is accessed correctly
+ assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 &&
+ cpi->sf.rd_sf.perform_coeff_opt < 9);
+ memcpy(winner_mode_params->coeff_opt_thresholds,
+ &coeff_opt_thresholds[cpi->sf.rd_sf.perform_coeff_opt],
+ sizeof(winner_mode_params->coeff_opt_thresholds));
+
+ // assert ensures that predict_skip_levels is accessed correctly
+ assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 &&
+ cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3);
+ memcpy(winner_mode_params->skip_txfm_level,
+ predict_skip_levels[cpi->sf.tx_sf.tx_type_search
+ .use_skip_flag_prediction],
+ sizeof(winner_mode_params->skip_txfm_level));
+
+ // assert ensures that tx_size_search_level is accessed correctly
+ assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 &&
+ cpi->sf.winner_mode_sf.tx_size_search_level <= 3);
+ memcpy(winner_mode_params->tx_size_search_methods,
+ tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level],
+ sizeof(winner_mode_params->tx_size_search_methods));
+ memcpy(winner_mode_params->predict_dc_level,
+ predict_dc_levels[cpi->sf.winner_mode_sf.dc_blk_pred_level],
+ sizeof(winner_mode_params->predict_dc_level));
+
+ if (cpi->oxcf.row_mt == 1 && (cpi->mt_info.num_workers > 1)) {
+ if (sf->inter_sf.inter_mode_rd_model_estimation == 1) {
+ // Revert to type 2
+ sf->inter_sf.inter_mode_rd_model_estimation = 2;
+ }
+
+#if !CONFIG_FPMT_TEST
+ // Disable the speed feature 'prune_ref_frame_for_gm_search' to achieve
+ // better parallelism when number of threads available are greater than or
+ // equal to maximum number of reference frames allowed for global motion.
+ if (sf->gm_sf.gm_search_type != GM_DISABLE_SEARCH &&
+ (cpi->mt_info.num_workers >=
+ gm_available_reference_frames[sf->gm_sf.gm_search_type]))
+ sf->gm_sf.prune_ref_frame_for_gm_search = 0;
+#endif
+ }
+
+  // This only applies to real-time mode. Adaptive gf refresh is disabled if
+  // the gf_cbr_boost_pct set by the user is larger than 0.
+ if (cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 0)
+ sf->rt_sf.gf_refresh_based_on_qp = 0;
+}
+
+// Override some speed features based on qindex
+void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
+ const int boosted = frame_is_boosted(cpi);
+ const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
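+  // Note: a minimum dimension of exactly 480 satisfies both
+  // is_480p_or_lesser and is_480p_or_larger.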
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_1440p_or_larger = AOMMIN(cm->width, cm->height) >= 1440;
+ const int is_arf2_bwd_type =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+
+ if (cpi->oxcf.mode == REALTIME) {
+ if (speed >= 6) {
+ const int qindex_thresh = boosted ? 190 : (is_720p_or_larger ? 120 : 150);
+ sf->part_sf.adjust_var_based_rd_partitioning =
+ frame_is_intra_only(cm)
+ ? 0
+ : cm->quant_params.base_qindex > qindex_thresh;
+ }
+ return;
+ }
+
+ if (speed == 0) {
+ // qindex_thresh for resolution < 720p
+ const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140);
+ if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) {
+ sf->part_sf.simple_motion_search_split =
+ cm->features.allow_screen_content_tools ? 1 : 2;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ }
+
+ if (is_720p_or_larger && cm->quant_params.base_qindex <= 128) {
+ sf->rd_sf.perform_coeff_opt = 2 + is_1080p_or_larger;
+ memcpy(winner_mode_params->coeff_opt_thresholds,
+ &coeff_opt_thresholds[sf->rd_sf.perform_coeff_opt],
+ sizeof(winner_mode_params->coeff_opt_thresholds));
+ sf->part_sf.simple_motion_search_split =
+ cm->features.allow_screen_content_tools ? 1 : 2;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+
+ if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) {
+ sf->inter_sf.selective_ref_frame = 2;
+ sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
+ sf->rd_sf.tx_domain_dist_thres_level = 1;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->interp_sf.cb_pred_filter_search = 0;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+ }
+ }
+ }
+
+ if (speed >= 2) {
+ // Disable extended partitions for lower quantizers
+ const int aggr = AOMMIN(4, speed - 2);
+ const int qindex_thresh1[4] = { 50, 50, 80, 100 };
+ const int qindex_thresh2[4] = { 80, 100, 120, 160 };
+ int qindex_thresh;
+ if (aggr <= 1) {
+ const int qthresh2 =
+ (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr];
+ qindex_thresh = cm->features.allow_screen_content_tools
+ ? qindex_thresh1[aggr]
+ : qthresh2;
+ if (cm->quant_params.base_qindex <= qindex_thresh && !boosted)
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (aggr <= 2) {
+ qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
+ if (cm->quant_params.base_qindex <= qindex_thresh &&
+ !frame_is_intra_only(cm))
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (aggr <= 3) {
+ if (!is_480p_or_larger) {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (!is_720p_or_larger && !frame_is_intra_only(cm) &&
+ !cm->features.allow_screen_content_tools) {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else {
+ qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
+ if (cm->quant_params.base_qindex <= qindex_thresh &&
+ !frame_is_intra_only(cm))
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ }
+ } else {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ }
+ }
+
+ if (speed >= 4) {
+ // Disable rectangular partitions for lower quantizers
+ const int aggr = AOMMIN(1, speed - 4);
+ const int qindex_thresh[2] = { 65, 80 };
+    const int disable_rect_part = !boosted;
+ if (cm->quant_params.base_qindex <= qindex_thresh[aggr] &&
+ disable_rect_part && is_480p_or_larger) {
+ sf->part_sf.rect_partition_eval_thresh = BLOCK_8X8;
+ }
+ }
+
+ if (speed <= 2) {
+ if (!is_stat_generation_stage(cpi)) {
+ // Use faster full-pel motion search for high quantizers.
+ // Also use reduced total search range for low resolutions at high
+ // quantizers.
+ const int aggr = speed;
+ const int qindex_thresh1 = ms_qindex_thresh[aggr][is_720p_or_larger][0];
+ const int qindex_thresh2 = ms_qindex_thresh[aggr][is_720p_or_larger][1];
+ const SEARCH_METHODS search_method =
+ motion_search_method[is_720p_or_larger];
+ if (cm->quant_params.base_qindex > qindex_thresh1) {
+ sf->mv_sf.search_method = search_method;
+ sf->tpl_sf.search_method = search_method;
+ } else if (cm->quant_params.base_qindex > qindex_thresh2) {
+ sf->mv_sf.search_method = NSTEP_8PT;
+ }
+ }
+ }
+
+ if (speed >= 4) {
+ // Disable LR search at low and high quantizers and enable only for
+ // mid-quantizer range.
+ if (!boosted && !is_arf2_bwd_type) {
+ const int qindex_low[2] = { 100, 60 };
+ const int qindex_high[2] = { 180, 160 };
+ if (cm->quant_params.base_qindex <= qindex_low[is_720p_or_larger] ||
+ cm->quant_params.base_qindex > qindex_high[is_720p_or_larger]) {
+ sf->lpf_sf.disable_loop_restoration_luma = 1;
+ }
+ }
+ }
+
+ if (speed == 1) {
+ // Reuse interinter wedge mask search from first search for non-boosted
+ // non-internal-arf frames, except at very high quantizers.
+ if (cm->quant_params.base_qindex <= 200) {
+ if (!boosted && !is_arf2_bwd_type)
+ sf->inter_sf.reuse_mask_search_results = 1;
+ }
+ }
+
+ if (speed == 5) {
+ if (!(frame_is_intra_only(&cpi->common) ||
+ cm->features.allow_screen_content_tools)) {
+ const int qindex[2] = { 256, 128 };
+ // Set the sf value as 3 for low resolution and
+ // for higher resolutions with low quantizers.
+ if (cm->quant_params.base_qindex < qindex[is_480p_or_larger])
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3;
+ }
+ }
+
+ if (speed >= 5) {
+    // Disable the sf for low quantizers in case of low-resolution screen
+    // content.
+ if (cm->features.allow_screen_content_tools &&
+ cm->quant_params.base_qindex < 128 && is_480p_or_lesser) {
+ sf->part_sf.prune_sub_8x8_partition_level = 0;
+ }
+ }
+
+  // Loop restoration size search.
+  // At speed 0, always search all available sizes for the maximum possible
+  // gain.
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+
+ if (speed >= 1) {
+    // For large frames, small restoration units are almost never useful,
+    // so prune them away.
+ if (is_1440p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ } else if (is_720p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ }
+ }
+
+ if (speed >= 3 || (cpi->oxcf.mode == ALLINTRA && speed >= 1)) {
+    // At this speed, a full search is too expensive. Instead, pick a single
+    // size based on the frame size and qindex. Note that higher quantizers
+    // (== lower quality) and larger frames generally want larger
+    // restoration units.
+ int qindex_thresh = 96;
+ if (cm->quant_params.base_qindex <= qindex_thresh && !is_1440p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ } else {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ }
+ }
+
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
+}
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
new file mode 100644
index 0000000000..60c000e4f4
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -0,0 +1,2025 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_
+#define AOM_AV1_ENCODER_SPEED_FEATURES_H_
+
+#include "av1/common/enums.h"
+#include "av1/encoder/enc_enums.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/encodemb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! @file */
+
+/*!\cond */
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+ int range;
+ int interval;
+} MESH_PATTERN;
+
+enum {
+ GM_FULL_SEARCH,
+ GM_REDUCED_REF_SEARCH_SKIP_L2_L3,
+ GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2,
+
+  // Same as GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2, but with extra filtering
+  // to keep at most two ref frames.
+ GM_SEARCH_CLOSEST_REFS_ONLY,
+
+ GM_DISABLE_SEARCH
+} UENUM1BYTE(GM_SEARCH_TYPE);
+
+enum {
+ DIST_WTD_COMP_ENABLED,
+ DIST_WTD_COMP_SKIP_MV_SEARCH,
+ DIST_WTD_COMP_DISABLED,
+} UENUM1BYTE(DIST_WTD_COMP_FLAG);
+
+enum {
+ INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
+ (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) |
+ (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) |
+ (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED),
+ UV_INTRA_ALL =
+ (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+ (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) |
+ (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) |
+ (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) |
+ (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC = (1 << UV_DC_PRED),
+ UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED),
+ UV_INTRA_DC_PAETH_CFL =
+ (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED),
+ UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) |
+ (1 << UV_H_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+ (1 << UV_V_PRED) | (1 << UV_H_PRED),
+ UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+ (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+ (1 << UV_CFL_PRED),
+ INTRA_DC = (1 << DC_PRED),
+ INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED),
+ INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+ INTRA_DC_H_V_SMOOTH =
+ (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << SMOOTH_PRED),
+ INTRA_DC_PAETH_H_V =
+ (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED)
+};
+
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+ (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
+ (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV),
+ INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
+ (1 << NEAR_NEARMV),
+ INTER_SINGLE_ALL =
+ (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEWMV),
+};
+
+enum {
+ DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST),
+
+ DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+ DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+ LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD)
+};
+
+enum {
+ TXFM_CODING_SF = 1,
+ INTER_PRED_SF = 2,
+ INTRA_PRED_SF = 4,
+ PARTITION_SF = 8,
+ LOOP_FILTER_SF = 16,
+ RD_SKIP_SF = 32,
+ RESERVE_2_SF = 64,
+ RESERVE_3_SF = 128,
+} UENUM1BYTE(DEV_SPEED_FEATURES);
+
+/* This enumeration defines when the rate control recode loop will be
+ * enabled.
+ */
+enum {
+ /*
+ * No recodes allowed
+ */
+ DISALLOW_RECODE = 0,
+ /*
+ * Allow recode only for KF/ARF/GF frames
+ */
+ ALLOW_RECODE_KFARFGF = 1,
+ /*
+ * Allow recode for all frame types based on bitrate constraints.
+ */
+ ALLOW_RECODE = 2,
+} UENUM1BYTE(RECODE_LOOP_TYPE);
+
+enum {
+ SUBPEL_TREE = 0,
+ SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches
+ SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively
+ SUBPEL_SEARCH_METHODS
+} UENUM1BYTE(SUBPEL_SEARCH_METHOD);
+
+enum {
+ // Try the full image with different values.
+ LPF_PICK_FROM_FULL_IMAGE,
+ // Try the full image filter search with non-dual filter only.
+ LPF_PICK_FROM_FULL_IMAGE_NON_DUAL,
+ // Try a small portion of the image with different values.
+ LPF_PICK_FROM_SUBIMAGE,
+ // Estimate the level based on quantizer and frame type
+ LPF_PICK_FROM_Q,
+ // Pick 0 to disable LPF if LPF was enabled last frame
+ LPF_PICK_MINIMAL_LPF
+} UENUM1BYTE(LPF_PICK_METHOD);
+/*!\endcond */
+
+/*!\enum CDEF_PICK_METHOD
+ * \brief This enumeration defines a variety of CDEF pick methods
+ */
+typedef enum {
+ CDEF_FULL_SEARCH, /**< Full search */
+ CDEF_FAST_SEARCH_LVL1, /**< Search among a subset of all possible filters. */
+ CDEF_FAST_SEARCH_LVL2, /**< Search a smaller subset of filters than Level 1. */
+ CDEF_FAST_SEARCH_LVL3, /**< Search a smaller subset of secondary filters
+ than Level 2. */
+ CDEF_FAST_SEARCH_LVL4, /**< Search a smaller subset of filters than Level 3. */
+ CDEF_FAST_SEARCH_LVL5, /**< Search a smaller subset of filters than Level 4. */
+ CDEF_PICK_FROM_Q, /**< Estimate filter strength based on quantizer. */
+ CDEF_PICK_METHODS
+} CDEF_PICK_METHOD;
+
+/*!\cond */
+enum {
+ // Terminate search early based on distortion so far compared to
+ // qp step, distortion in the neighborhood of the frame, etc.
+ FLAG_EARLY_TERMINATE = 1 << 0,
+
+ // Skips comp inter modes if the best so far is an intra mode.
+ FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+ // Skips oblique intra modes if the best so far is an inter mode.
+ FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+ // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+ // intra so far is not one of the neighboring directions.
+ FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+ // Skips intra modes other than DC_PRED if the source variance is small
+ FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC);
+
+enum {
+ // No tx type pruning
+ TX_TYPE_PRUNE_0 = 0,
+ // Adaptively prunes the least promising tx types out of all 16
+ // (tuned to provide negligible quality loss)
+ TX_TYPE_PRUNE_1 = 1,
+ // Similar, but applies much more aggressive pruning to get a better speed-up
+ TX_TYPE_PRUNE_2 = 2,
+ TX_TYPE_PRUNE_3 = 3,
+ // More aggressive pruning based on tx type score and allowed tx count
+ TX_TYPE_PRUNE_4 = 4,
+ TX_TYPE_PRUNE_5 = 5,
+} UENUM1BYTE(TX_TYPE_PRUNE_MODE);
+
+enum {
+ // No reaction to rate control on a detected slide/scene change.
+ NO_DETECTION = 0,
+
+ // Set to larger Q based only on the detected slide/scene change and
+ // current/past Q.
+ FAST_DETECTION_MAXQ = 1,
+} UENUM1BYTE(OVERSHOOT_DETECTION_CBR);
+
+enum {
+ // Turns off multi-winner mode: txfm search is then done either on all modes
+ // (if winner mode processing is off) or only on the single winner mode.
+ MULTI_WINNER_MODE_OFF = 0,
+
+ // Limits the number of winner modes to at most 2
+ MULTI_WINNER_MODE_FAST = 1,
+
+ // Uses the default number of winner modes, which is 3 for intra mode, and 1
+ // for inter mode.
+ MULTI_WINNER_MODE_DEFAULT = 2,
+
+ // Total number of multi-winner mode levels.
+ MULTI_WINNER_MODE_LEVELS,
+} UENUM1BYTE(MULTI_WINNER_MODE_TYPE);
+
+enum {
+ PRUNE_NEARMV_OFF = 0, // Turn off nearmv pruning
+ PRUNE_NEARMV_LEVEL1 = 1, // Prune nearmv for qindex (0-85)
+ PRUNE_NEARMV_LEVEL2 = 2, // Prune nearmv for qindex (0-170)
+ PRUNE_NEARMV_LEVEL3 = 3, // Prune nearmv more aggressively for qindex (0-170)
+ PRUNE_NEARMV_MAX = PRUNE_NEARMV_LEVEL3,
+} UENUM1BYTE(PRUNE_NEARMV_LEVEL);
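+
+ // One way to read the levels above is as a qindex ceiling below which nearmv
+ // pruning applies, with level 3 reusing the level-2 range but pruning harder.
+ // An illustrative table (not actual encoder data), indexed by level:
+ //
+ //   static const int kPruneNearmvMaxQindex[] = { -1, 85, 170, 170 };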
+
+enum {
+ // Default transform search used in evaluation of best inter candidates
+ // (MODE_EVAL stage) and motion mode winner processing (WINNER_MODE_EVAL
+ // stage).
+ TX_SEARCH_DEFAULT = 0,
+ // Transform search in motion mode rd during MODE_EVAL stage.
+ TX_SEARCH_MOTION_MODE,
+ // Transform search in compound type mode rd during MODE_EVAL stage.
+ TX_SEARCH_COMP_TYPE_MODE,
+ // All transform search cases
+ TX_SEARCH_CASES
+} UENUM1BYTE(TX_SEARCH_CASE);
+
+typedef struct {
+ TX_TYPE_PRUNE_MODE prune_2d_txfm_mode;
+ int fast_intra_tx_type_search;
+
+ // INT_MAX: Disable fast search.
+ // 1 - 1024: Probability threshold used for conditionally forcing tx type,
+ // during mode search.
+ // 0: Force tx type to be DCT_DCT unconditionally, during
+ // mode search.
+ int fast_inter_tx_type_prob_thresh;
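+
+ // The three regimes above, sketched with hypothetical names (prob is the
+ // estimated tx-type probability on the same 1 - 1024 scale as the threshold):
+ //
+ //   if (thresh == INT_MAX) { /* full tx type search */ }
+ //   else if (thresh == 0) tx_type = DCT_DCT;         // unconditional force
+ //   else if (prob >= thresh) tx_type = top_tx_type;  // conditional force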
+
+ // Prune less-likely-chosen transforms for each intra mode. The speed
+ // feature ranges from 0 to 2, for different speed / compression trade-offs.
+ int use_reduced_intra_txset;
+
+ // Use a skip flag prediction model to detect blocks with skip = 1 early
+ // and avoid doing full TX type search for such blocks.
+ int use_skip_flag_prediction;
+
+ // Threshold used by the ML based method to predict TX block split decisions.
+ int ml_tx_split_thresh;
+
+ // Skip the remaining transform type search when the rdcost of skipping the
+ // transform is found to be better than applying it.
+ int skip_tx_search;
+
+ // Prune tx type search using previous frame stats.
+ int prune_tx_type_using_stats;
+ // Prune tx type search using estimated RDcost
+ int prune_tx_type_est_rd;
+
+ // Flag used to control the winner mode processing for tx type pruning for
+ // inter blocks. It enables further tx type mode pruning based on ML model for
+ // mode evaluation and disables tx type mode pruning for winner mode
+ // processing.
+ int winner_mode_tx_type_pruning;
+} TX_TYPE_SEARCH;
+
+enum {
+ // Search partitions using RD criterion
+ SEARCH_PARTITION,
+
+ // Always use a fixed size partition
+ FIXED_PARTITION,
+
+ // Partition using source variance
+ VAR_BASED_PARTITION,
+
+#if CONFIG_RT_ML_PARTITIONING
+ // Partition using ML model
+ ML_BASED_PARTITION
+#endif
+} UENUM1BYTE(PARTITION_SEARCH_TYPE);
+
+enum {
+ NOT_IN_USE,
+ DIRECT_PRED,
+ RELAXED_PRED,
+ ADAPT_PRED
+} UENUM1BYTE(MAX_PART_PRED_MODE);
+
+enum {
+ LAST_MV_DATA,
+ CURRENT_Q,
+ QTR_ONLY,
+} UENUM1BYTE(MV_PREC_LOGIC);
+
+enum {
+ SUPERRES_AUTO_ALL, // Tries all possible superres ratios
+ SUPERRES_AUTO_DUAL, // Tries no superres and q-based superres ratios
+ SUPERRES_AUTO_SOLO, // Only apply the q-based superres ratio
+} UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE);
+/*!\endcond */
+
+/*!\enum INTERNAL_COST_UPDATE_TYPE
+ * \brief This enum decides internally how often to update the entropy costs
+ *
+ * INTERNAL_COST_UPDATE_TYPE is similar to \ref COST_UPDATE_TYPE but has
+ * slightly more flexibility in update frequency. This enum is separate from
+ * \ref COST_UPDATE_TYPE because although \ref COST_UPDATE_TYPE is not exposed,
+ * its values are public, so it cannot be modified without breaking the public
+ * API. Due to the use of AOMMIN() in populate_unified_cost_update_freq() to
+ * compute the unified cost update frequencies (out of COST_UPDATE_TYPE and
+ * INTERNAL_COST_UPDATE_TYPE), the values of this enum type must be listed in
+ * order of increasing update frequency.
+ *
+ * \warning In case of any updates/modifications to the enum COST_UPDATE_TYPE,
+ * update the enum INTERNAL_COST_UPDATE_TYPE as well.
+ */
+typedef enum {
+ INTERNAL_COST_UPD_OFF, /*!< Turn off cost updates. */
+ INTERNAL_COST_UPD_TILE, /*!< Update every tile. */
+ INTERNAL_COST_UPD_SBROW_SET, /*!< Update every row set of 256-pixel height. */
+ INTERNAL_COST_UPD_SBROW, /*!< Update every SB row inside a tile. */
+ INTERNAL_COST_UPD_SB, /*!< Update every SB. */
+} INTERNAL_COST_UPDATE_TYPE;
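+
+ // Because the enumerators are listed in order of increasing update frequency,
+ // the unified frequency is simply the less frequent (smaller) of the two
+ // settings. A rough sketch with hypothetical variable names:
+ //
+ //   const INTERNAL_COST_UPDATE_TYPE unified =
+ //       AOMMIN(internal_upd_level,
+ //              (INTERNAL_COST_UPDATE_TYPE)external_upd_level);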
+
+/*!\enum SIMPLE_MOTION_SEARCH_PRUNE_LEVEL
+ * \brief This enumeration defines a variety of simple motion search based
+ * partition prune levels
+ */
+typedef enum {
+ NO_PRUNING = -1,
+ SIMPLE_AGG_LVL0, /*!< Simple prune aggressiveness level 0. */
+ SIMPLE_AGG_LVL1, /*!< Simple prune aggressiveness level 1. */
+ SIMPLE_AGG_LVL2, /*!< Simple prune aggressiveness level 2. */
+ SIMPLE_AGG_LVL3, /*!< Simple prune aggressiveness level 3. */
+ QIDX_BASED_AGG_LVL1, /*!< Qindex-based prune aggressiveness level; maps to
+ simple agg level 1 or 2 based on qindex. */
+ TOTAL_SIMPLE_AGG_LVLS = QIDX_BASED_AGG_LVL1, /*!< Total number of simple prune
+ aggressiveness levels. */
+ TOTAL_QINDEX_BASED_AGG_LVLS =
+ QIDX_BASED_AGG_LVL1 -
+ SIMPLE_AGG_LVL3, /*!< Total number of qindex based simple prune
+ aggressiveness levels. */
+ TOTAL_AGG_LVLS = TOTAL_SIMPLE_AGG_LVLS +
+ TOTAL_QINDEX_BASED_AGG_LVLS, /*!< Total number of levels. */
+} SIMPLE_MOTION_SEARCH_PRUNE_LEVEL;
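+
+ // With NO_PRUNING = -1, the simple levels occupy values 0..3 and
+ // QIDX_BASED_AGG_LVL1 == 4, so the derived counts evaluate to
+ // TOTAL_SIMPLE_AGG_LVLS == 4, TOTAL_QINDEX_BASED_AGG_LVLS == 1 and
+ // TOTAL_AGG_LVLS == 5. A compile-time check of that arithmetic could read
+ // (C11, illustrative only):
+ //
+ //   _Static_assert(TOTAL_AGG_LVLS == 5, "4 simple + 1 qindex-based level");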
+
+/*!\enum PRUNE_MESH_SEARCH_LEVEL
+ * \brief This enumeration defines a variety of mesh search prune levels.
+ */
+typedef enum {
+ PRUNE_MESH_SEARCH_DISABLED = 0, /*!< Prune mesh search level 0. */
+ PRUNE_MESH_SEARCH_LVL_1 = 1, /*!< Prune mesh search level 1. */
+ PRUNE_MESH_SEARCH_LVL_2 = 2, /*!< Prune mesh search level 2. */
+} PRUNE_MESH_SEARCH_LEVEL;
+
+/*!\enum INTER_SEARCH_EARLY_TERM_IDX
+ * \brief This enumeration defines inter search early termination index in
+ * non-rd path based on sse value.
+ */
+typedef enum {
+ EARLY_TERM_DISABLED =
+ 0, /*!< Early terminate inter mode search based on sse disabled. */
+ EARLY_TERM_IDX_1 =
+ 1, /*!< Early terminate inter mode search based on sse, index 1. */
+ EARLY_TERM_IDX_2 =
+ 2, /*!< Early terminate inter mode search based on sse, index 2. */
+ EARLY_TERM_IDX_3 =
+ 3, /*!< Early terminate inter mode search based on sse, index 3. */
+ EARLY_TERM_IDX_4 =
+ 4, /*!< Early terminate inter mode search based on sse, index 4. */
+ EARLY_TERM_INDICES, /*!< Total number of early terminate indices */
+} INTER_SEARCH_EARLY_TERM_IDX;
+
+/*!
+ * \brief Sequence/frame level speed vs quality features
+ */
+typedef struct HIGH_LEVEL_SPEED_FEATURES {
+ /*! Frame level coding parameter update. */
+ int frame_parameter_update;
+
+ /*!
+ * Cases and frame types for which the recode loop is enabled.
+ */
+ RECODE_LOOP_TYPE recode_loop;
+
+ /*!
+ * Controls the tolerance vs target rate used in deciding whether to
+ * recode a frame. It has no meaning if recode is disabled.
+ */
+ int recode_tolerance;
+
+ /*!
+ * Determine how motion vector precision is chosen. The possibilities are:
+ * LAST_MV_DATA: use the mv data from the last coded frame
+ * CURRENT_Q: use the current q as a threshold
+ * QTR_ONLY: use quarter pel precision only.
+ */
+ MV_PREC_LOGIC high_precision_mv_usage;
+
+ /*!
+ * Always set to 0. If on, it enables zero-cost background transmission
+ * (except for the initial transmission of the segmentation). The feature is
+ * disabled because the addition of very large block sizes makes the
+ * backgrounds very cheap to encode, and the segmentation we have adds
+ * overhead.
+ */
+ int static_segmentation;
+
+ /*!
+ * Superres-auto mode search type.
+ */
+ SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type;
+
+ /*!
+ * Enable/disable extra screen content test by encoding key frame twice.
+ */
+ int disable_extra_sc_testing;
+
+ /*!
+ * Enable/disable second_alt_ref temporal filtering.
+ */
+ int second_alt_ref_filtering;
+
+ /*!
+ * The number of frames to be used during temporal filtering of an ARF frame
+ * is adjusted based on noise level of the current frame. The sf has three
+ * levels to decide number of frames to be considered for filtering:
+ * 0 : Use default number of frames
+ * 1 and 2 : Reduce the number of frames based on noise level with varied
+ * aggressiveness
+ */
+ int adjust_num_frames_for_arf_filtering;
+
+ /*!
+ * Decide the bit estimation approach used in qindex decision.
+ * 0: estimate bits based on a constant value;
+ * 1: estimate bits more accurately based on the frame complexity.
+ */
+ int accurate_bit_estimate;
+
+ /*!
+ * Decide the approach for weight calculation during temporal filtering.
+ * 0: Calculate weight using exp()
+ * 1: Calculate weight using a lookup table that approximates exp().
+ */
+ int weight_calc_level_in_tf;
+
+ /*!
+ * Decide whether to perform motion estimation at split block (i.e. 16x16)
+ * level or not.
+ * 0: Always allow motion estimation.
+ * 1: Conditionally allow motion estimation based on 4x4 sub-blocks variance.
+ */
+ int allow_sub_blk_me_in_tf;
+} HIGH_LEVEL_SPEED_FEATURES;
+
+/*!
+ * Speed features for the first pass.
+ */
+typedef struct FIRST_PASS_SPEED_FEATURES {
+ /*!
+ * \brief Reduces the mv search window.
+ * By default, the initial search window is around
+ * MIN(MIN(dims), MAX_FULL_PEL_VAL) = MIN(MIN(dims), 1023).
+ * Each step of reduction decreases the window size by about a factor of 2.
+ */
+ int reduce_mv_step_param;
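+
+ // A rough sketch of the effect (width and height stand in for the frame
+ // dimensions; hypothetical, not actual encoder code):
+ //
+ //   const int base = AOMMIN(AOMMIN(width, height), MAX_FULL_PEL_VAL);
+ //   const int window = base >> reduce_mv_step_param;  // ~halved per step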
+
+ /*!
+ * \brief Skips the motion search when the zero mv has small sse.
+ */
+ int skip_motion_search_threshold;
+
+ /*!
+ * \brief Skips reconstruction by using source buffers for prediction
+ */
+ int disable_recon;
+
+ /*!
+ * \brief Skips the motion search centered on 0,0 mv.
+ */
+ int skip_zeromv_motion_search;
+} FIRST_PASS_SPEED_FEATURES;
+
+/*!\cond */
+typedef struct TPL_SPEED_FEATURES {
+ // GOP length adaptive decision.
+ // If set to 0, tpl model decides whether a shorter gf interval is better.
+ // If set to 1, tpl stats of ARFs from base layer, (base+1) layer and
+ // (base+2) layer decide whether a shorter gf interval is better.
+ // If set to 2, tpl stats of ARFs from base layer, (base+1) layer and GF boost
+ // decide whether a shorter gf interval is better.
+ // If set to 3, gop length adaptive decision is disabled.
+ int gop_length_decision_method;
+ // Prune the intra modes search by tpl.
+ // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
+ // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
+ int prune_intra_modes;
+ // This parameter controls which step in the n-step process we start at.
+ int reduce_first_step_size;
+ // Skip motion estimation based on the precision of center MVs and the
+ // difference between center MVs.
+ // If set to 0, motion estimation is skipped for duplicate center MVs
+ // (default). If set to 1, motion estimation is skipped for duplicate
+ // full-pixel center MVs. If set to 2, motion estimation is skipped if the
+ // difference between center MVs is less than the threshold.
+ int skip_alike_starting_mv;
+
+ // When to stop subpel search.
+ SUBPEL_FORCE_STOP subpel_force_stop;
+
+ // Which search method to use.
+ SEARCH_METHODS search_method;
+
+ // Prune starting mvs in TPL based on sad scores.
+ int prune_starting_mv;
+
+ // Prune reference frames in TPL.
+ int prune_ref_frames_in_tpl;
+
+ // Support compound predictions.
+ int allow_compound_pred;
+
+ // Calculate rate and distortion based on Y plane only.
+ int use_y_only_rate_distortion;
+
+ // Use SAD instead of SATD during intra/inter mode search.
+ // If set to 0, use SATD always.
+ // If set to 1, use SAD during intra/inter mode search for frames in the
+ // higher temporal layers of the hierarchical prediction structure.
+ // If set to 2, use SAD during intra/inter mode search for all frames.
+ // This sf is disabled for the first GF group of the key-frame interval,
+ // i.e., SATD is used during intra/inter mode search of the first GF group.
+ int use_sad_for_mode_decision;
+
+ // Skip tpl processing for frames of type LF_UPDATE.
+ // This sf is disabled for the first GF group of the key-frame interval.
+ int reduce_num_frames;
+} TPL_SPEED_FEATURES;
+
+typedef struct GLOBAL_MOTION_SPEED_FEATURES {
+ GM_SEARCH_TYPE gm_search_type;
+
+ // During global motion estimation, prune remaining reference frames in a
+ // given direction (past/future) if the evaluated ref_frame in that direction
+ // yields gm_type as INVALID/TRANSLATION/IDENTITY.
+ int prune_ref_frame_for_gm_search;
+
+ // When the current GM type is set to ZEROMV, prune ZEROMV if its performance
+ // is worse than NEWMV under SSE metric.
+ // 0 : no pruning
+ // 1 : conservative pruning
+ // 2 : aggressive pruning
+ int prune_zero_mv_with_sse;
+
+ // Disable global motion estimation based on stats of previous frames in the
+ // GF group
+ int disable_gm_search_based_on_stats;
+
+ // Number of refinement steps to apply after initial model generation
+ int num_refinement_steps;
+} GLOBAL_MOTION_SPEED_FEATURES;
+
+typedef struct PARTITION_SPEED_FEATURES {
+ PARTITION_SEARCH_TYPE partition_search_type;
+
+ // Used if partition_search_type = FIXED_PARTITION
+ BLOCK_SIZE fixed_partition_size;
+
+ // Prune extended partition types search based on the current best partition
+ // and the combined rdcost of the subblocks estimated from previous
+ // partitions. Can take values 0 - 2, 0 referring to no pruning, and 1 - 2
+ // increasing aggressiveness of pruning in order.
+ int prune_ext_partition_types_search_level;
+
+ // Prune part4 based on block size
+ int prune_part4_search;
+
+ // Use a ML model to prune rectangular, ab and 4-way horz
+ // and vert partitions
+ int ml_prune_partition;
+
+ // Use a ML model to adaptively terminate partition search after trying
+ // PARTITION_SPLIT. Can take values 0 - 2, 0 meaning not being enabled, and
+ // 1 - 2 increasing aggressiveness in order.
+ int ml_early_term_after_part_split_level;
+
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split. Can take values 0 - 2, 0 referring to no
+ // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+ int less_rectangular_check_level;
+
+ // Use square partition only beyond this block size.
+ BLOCK_SIZE use_square_partition_only_threshold;
+
+ // Sets max square partition levels for this superblock based on
+ // motion vector and prediction error distribution produced from 16x16
+ // simple motion search
+ MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
+
+ // Min and max square partition size we enable (block_size) as per auto
+ // min max, but also used by adjust partitioning, and pick_partitioning.
+ BLOCK_SIZE default_min_partition_size;
+ BLOCK_SIZE default_max_partition_size;
+
+ // Sets level of adjustment of variance-based partitioning during
+ // rd_use_partition: 0 - no partition adjustment, 1 - try to merge partitions
+ // for small blocks and high QP, 2 - try to merge partitions, 3 - try to
+ // merge and split leaf partitions, with 0 - 3 in decreasing order of
+ // aggressiveness.
+ int adjust_var_based_rd_partitioning;
+
+ // Partition search early breakout thresholds.
+ int64_t partition_search_breakout_dist_thr;
+ int partition_search_breakout_rate_thr;
+
+ // Thresholds for ML based partition search breakout.
+ int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
+ // Aggressiveness levels for pruning split and rectangular partitions based on
+ // simple_motion_search. SIMPLE_AGG_LVL0 to SIMPLE_AGG_LVL3 correspond to
+ // simple motion search based pruning. QIDX_BASED_AGG_LVL1 corresponds to
+ // qindex based and simple motion search based pruning.
+ int simple_motion_search_prune_agg;
+
+ // Perform simple_motion_search on each possible subblock and use it to prune
+ // PARTITION_HORZ and PARTITION_VERT.
+ int simple_motion_search_prune_rect;
+
+ // Perform simple motion search before none_partition to decide if we
+ // want to remove all partitions other than PARTITION_SPLIT. If set to 0, this
+ // model is disabled. If set to 1, the model attempts to perform
+ // PARTITION_SPLIT only. If set to 2, the model also attempts to prune
+ // PARTITION_SPLIT.
+ int simple_motion_search_split;
+
+ // Use features from simple_motion_search to terminate prediction block
+ // partition after PARTITION_NONE
+ int simple_motion_search_early_term_none;
+
+ // Controls whether to reduce the number of motion search steps. If this is 0,
+ // then simple_motion_search has the same number of steps as
+ // single_motion_search (assuming no other speed features). Otherwise, reduce
+ // the number of steps by the value contained in this variable.
+ int simple_motion_search_reduce_search_steps;
+
+ // This variable controls the maximum block size where intra blocks can be
+ // used in inter frames.
+ // TODO(aconverse): Fold this into one of the other many mode skips
+ BLOCK_SIZE max_intra_bsize;
+
+ // Use a CNN on the luma pixels of the source frame for each 64x64 subblock
+ // to perform partition pruning in intra frames.
+ // 0: No Pruning
+ // 1: Prune split and rectangular partitions only
+ // 2: Prune none, split and rectangular partitions
+ int intra_cnn_based_part_prune_level;
+
+ // Disable extended partition search if the current bsize is greater than the
+ // threshold. Must be a square block size BLOCK_8X8 or higher.
+ BLOCK_SIZE ext_partition_eval_thresh;
+
+ // Use best partition decision so far to tune 'ext_partition_eval_thresh'
+ int ext_part_eval_based_on_cur_best;
+
+ // Disable rectangular partitions for larger block sizes.
+ int rect_partition_eval_thresh;
+
+ // Prune extended partition search based on whether the split/rect partitions
+ // provided an improvement in the previous search.
+ // 0 : no pruning
+ // 1 : prune 1:4 partition search using winner info from split partitions
+ // 2 : prune 1:4 and AB partition search using split and HORZ/VERT info
+ int prune_ext_part_using_split_info;
+
+ // Prune rectangular, AB and 4-way partitions based on qindex and block size
+ // 0 : no pruning
+ // 1 : prune sub_8x8 at very low quantizers
+ // 2 : prune all block size based on qindex
+ int prune_rectangular_split_based_on_qidx;
+
+ // Prune rectangular partitions based on 4x4 sub-block variance
+ // false : no pruning
+ // true : prune rectangular partitions based on 4x4 sub-block variance
+ // deviation
+ //
+ // For allintra encode, this speed feature reduces instruction count by 6.4%
+ // for speed=6 with coding performance change less than 0.24%. For AVIF image
+ // encode, this speed feature reduces encode time by 8.14% for speed 6 on a
+ // typical image dataset with coding performance change less than 0.16%. This
+ // speed feature is not applicable to speed >= 7.
+ bool prune_rect_part_using_4x4_var_deviation;
+
+ // Prune rectangular partitions based on prediction mode chosen by NONE
+ // partition.
+ // false : no pruning
+ // true : prunes rectangular partition as described below
+ // If prediction mode chosen by NONE partition is
+ // DC_PRED or SMOOTH_PRED: Prunes both horizontal and vertical partitions if
+ // at least one of the left and top neighbor blocks is larger than the
+ // current block.
+ // Directional Mode: Prunes either the horizontal or the vertical partition
+ // based on center angle of the prediction mode chosen by NONE partition. For
+ // example, vertical partition is pruned if center angle of the prediction
+ // mode chosen by NONE partition is close to 180 degrees (i.e. horizontal
+ // direction) and vice versa.
+ // For allintra encode, this speed feature reduces instruction count by 5.1%
+ // for speed=6 with coding performance change less than 0.22%. For AVIF image
+ // encode, this speed feature reduces encode time by 4.44% for speed 6 on a
+ // typical image dataset with coding performance change less than 0.15%.
+ // For speed >= 7, variance-based logic is used to determine the partition
+ // structure instead of recursive partition search. Therefore, this speed
+ // feature is not applicable in such cases.
+ bool prune_rect_part_using_none_pred_mode;
+
+ // Terminate partition search for the child partition when the NONE and
+ // SPLIT partition rd_costs are INT64_MAX.
+ int early_term_after_none_split;
+
+ // Level used to adjust the threshold for av1_ml_predict_breakout(). At lower
+ // levels, a more conservative threshold is used, and a value of 0 indicates
+ // that av1_ml_predict_breakout() is disabled. A value of 3 corresponds to the
+ // default case with no adjustment to lbd thresholds.
+ int ml_predict_breakout_level;
+
+ // Prune sub_8x8 (BLOCK_4X4, BLOCK_4X8 and BLOCK_8X4) partitions.
+ // 0 : no pruning
+ // 1 : pruning based on neighbour block information
+ // 2 : prune always
+ int prune_sub_8x8_partition_level;
+
+ // Prune rectangular split based on simple motion search split/no_split score.
+ // 0: disable pruning, 1: enable pruning
+ int simple_motion_search_rect_split;
+
+ // The current encoder adopts a DFS search for block partitions.
+ // Therefore the mode selection and associated rdcost is ready for smaller
+ // blocks before the mode selection for some partition types.
+ // AB partition could use previous rd information and skip mode search.
+ // An example is:
+ //
+ //  current block
+ //  +---+---+
+ //  |       |
+ //  +       +
+ //  |       |
+ //  +-------+
+ //
+ //  SPLIT partition has been searched first before trying HORZ_A
+ //  +---+---+
+ //  | R | R |
+ //  +---+---+
+ //  | R | R |
+ //  +---+---+
+ //
+ //  HORZ_A
+ //  +---+---+
+ //  |   |   |
+ //  +---+---+
+ //  |       |
+ //  +-------+
+ //
+ // With this speed feature, the top two sub blocks can directly use rdcost
+ // searched in split partition, and the mode info is also copied from
+ // saved info. Similarly, the bottom rectangular block can also use
+ // the available information from previous rectangular search.
+ int reuse_prev_rd_results_for_part_ab;
+
+ // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT
+ // when encoding PARTITION_AB.
+ int reuse_best_prediction_for_part_ab;
+
+ // The current partition search records the best rdcost so far and uses it
+ // in mode search and transform search to skip early when certain criteria
+ // are met. For example, when the current rdcost is larger than the best
+ // rdcost, or the model rdcost is larger than the best rdcost times some
+ // threshold. By default, this feature is turned on to speed up the encoder
+ // partition search.
+ // If it is disabled, at speed 0 over 30 frames we could get about a 0.25%
+ // quality gain (psnr, ssim, vmaf), with about a 13% slowdown.
+ int use_best_rd_for_pruning;
+
+ // Skip evaluation of non-square partitions based on the corresponding NONE
+ // partition.
+ // 0: no pruning
+ // 1: prune extended partitions if NONE is skippable
+ // 2: on top of 1, prune rectangular partitions if NONE is inter, not a newmv
+ // mode and skippable
+ int skip_non_sq_part_based_on_none;
+
+ // Disables 8x8 and below partitions for low quantizers.
+ int disable_8x8_part_based_on_qidx;
+} PARTITION_SPEED_FEATURES;
+
+typedef struct MV_SPEED_FEATURES {
+ // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+ SEARCH_METHODS search_method;
+
+ // Enable the use of faster, less accurate mv search method
+ // 0: disable, 1: if bsize >= BLOCK_32X32, 2: based on bsize, SAD and qp
+ // TODO(chiyotsai@google.com): Take the clip's resolution and mv activity into
+ // account.
+ int use_bsize_dependent_search_method;
+
+ // If this is set to 1, we limit the motion search range to 2 times the
+ // largest motion vector found in the last frame.
+ int auto_mv_step_size;
+
+ // subpel_search_method can only be subpel_tree, which does a subpixel
+ // logarithmic search that keeps stepping at 1/2-pixel units until
+ // it stops getting a gain, then moves on to 1/4-pixel units and repeats
+ // the same process. Along the way it skips many diagonals.
+ SUBPEL_SEARCH_METHOD subpel_search_method;
+
+ // Maximum number of steps in logarithmic subpel search before giving up.
+ int subpel_iters_per_step;
+
+ // When to stop subpel search.
+ SUBPEL_FORCE_STOP subpel_force_stop;
+
+ // When to stop subpel search in simple motion search.
+ SUBPEL_FORCE_STOP simple_motion_subpel_force_stop;
+
+ // If true, sub-pixel search uses the exact convolve function used for final
+ // encoding and decoding; otherwise, it uses bilinear interpolation.
+ SUBPEL_SEARCH_TYPE use_accurate_subpel_search;
+
+ // Threshold for allowing exhaustive motion search.
+ int exhaustive_searches_thresh;
+
+ // Pattern to be used for any exhaustive mesh searches (except intraBC ME).
+ MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+ // Pattern to be used for exhaustive mesh searches of intraBC ME.
+ MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_STEP];
+
+ // Reduce single motion search range based on MV result of prior ref_mv_idx.
+ int reduce_search_range;
+
+ // Prune mesh search.
+ PRUNE_MESH_SEARCH_LEVEL prune_mesh_search;
+
+ // Use the rd cost around the best FULLPEL_MV to speed up subpel search
+ int use_fullpel_costlist;
+
+ // Set the full pixel search level of obmc
+ // 0: obmc_full_pixel_diamond
+ // 1: obmc_refining_search_sad (faster)
+ int obmc_full_pixel_search_level;
+
+ // Accurate full pixel motion search based on TPL stats.
+ int full_pixel_search_level;
+
+ // Allow intrabc motion search
+ int use_intrabc;
+
+ // Whether to downsample the rows in sad calculation during motion search.
+ // This is only active when there are at least 16 rows. When this sf is
+ // active, if there is a large discrepancy in the SAD values for the final
+ // motion vector between skipping vs not skipping, motion search is redone
+ // with skip row features off.
+ // 0: Disabled (do not downsample rows)
+ // 1: Skip SAD calculation of odd rows if the SAD deviation of the even and
+ // odd rows for the starting MV is small. Redo motion search with sf off
+ // when SAD deviation is high for the final motion vector.
+ // 2: Skip SAD calculation of odd rows. SAD deviation is not tested for the
+ // start MV and tested only for the final MV.
+ int use_downsampled_sad;
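+
+ // Conceptually, the downsampled SAD visits only the even rows and scales the
+ // result to stay comparable with the full SAD. A sketch (not the actual
+ // kernels; strides and scaling are illustrative):
+ //
+ //   int sad = 0;
+ //   for (int r = 0; r < rows; r += 2)
+ //     for (int c = 0; c < cols; ++c)
+ //       sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
+ //   sad *= 2;  // compensate for the skipped odd rows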
+
+ // Enable/disable extensive joint motion search.
+ int disable_extensive_joint_motion_search;
+
+ // Enable second best mv check in joint mv search.
+ // 0: allow second MV (use rd cost as the metric)
+ // 1: use var as the metric
+ // 2: disable second MV
+ int disable_second_mv;
+
+ // Skips full pixel search based on start mv of prior ref_mv_idx.
+ // 0: Disabled
+ // 1: Skips the full pixel search up to 4 neighboring full-pel MV positions.
+ // 2: Skips the full pixel search up to 8 neighboring full-pel MV positions.
+ int skip_fullpel_search_using_startmv;
+
+ // Method to use for refining WARPED_CAUSAL motion vectors
+ // TODO(rachelbarker): Can this be unified with OBMC in some way?
+ WARP_SEARCH_METHOD warp_search_method;
+
+ // Maximum number of iterations in WARPED_CAUSAL refinement search
+ int warp_search_iters;
+} MV_SPEED_FEATURES;
+
+typedef struct INTER_MODE_SPEED_FEATURES {
+ // 2-pass inter mode model estimation where the preliminary pass skips
+ // transform search and uses a model to estimate rd, while the final pass
+ // computes the full transform search. Two types of models are supported:
+ // 0: not used
+ // 1: used with online dynamic rd model
+ // 2: used with static rd model
+ int inter_mode_rd_model_estimation;
+
+ // Bypass transform search based on skip rd at following stages
+ // i. Compound type mode search
+ // ii. Motion mode search (mode evaluation and winner motion mode stage)
+ // iii. Transform search for best inter candidates
+ int txfm_rd_gate_level[TX_SEARCH_CASES];
+
+ // Limit the inter mode tested in the RD loop
+ int reduce_inter_modes;
+
+ // This variable is used to cap the maximum number of times we skip testing a
+ // mode to be evaluated. A high value means we will be faster.
+ int adaptive_rd_thresh;
+
+ // Aggressively prune inter modes when best mode is skippable.
+ int prune_inter_modes_if_skippable;
+
+ // Drop reference frames that are less likely to be picked in the RD search.
+ // Has seven levels for now: 0, 1, 2, 3, 4, 5 and 6, where higher levels prune
+ // more aggressively than lower ones. (0 means no pruning.)
+ int selective_ref_frame;
+
+ // Prune reference frames for rectangular partitions.
+ // 0 implies no pruning
+ // 1 implies prune for extended partition
+ // 2 implies prune horiz, vert and extended partition
+ int prune_ref_frame_for_rect_partitions;
+
+ // Prune inter modes w.r.t past reference frames
+ // 0 no pruning
+ // 1 prune inter modes w.r.t ALTREF2 and ALTREF reference frames
+ // 2 prune inter modes w.r.t BWDREF, ALTREF2 and ALTREF reference frames
+ int alt_ref_search_fp;
+
+ // Prune reference frames for single prediction modes based on temporal
+ // distance and pred MV SAD. Feasible values are 0, 1, 2. The feature is
+ // disabled for 0. An increasing value indicates more aggressive pruning
+ // threshold.
+ int prune_single_ref;
+
+ // Prune compound reference frames
+ // 0 no pruning
+ // 1 prune compound references which do not satisfy the two conditions:
+ // a) The references are at a nearest distance from the current frame in
+ // both past and future direction.
+ // b) The references have minimum pred_mv_sad in both past and future
+ // direction.
+ // 2 prune compound references except the one with nearest distance from the
+ // current frame in both past and future direction.
+ int prune_comp_ref_frames;
+
+ // Skip the current ref_mv in NEW_MV mode based on mv, rate cost, etc.
+ // This speed feature equaling 0 means no skipping.
+ // If the speed feature equals 1 or 2, skip the current ref_mv in NEW_MV mode
+ // if we have already encountered another ref_mv in the drl such that:
+ // 1. The other drl entry yields the same mv as the current mv during the
+ // SIMPLE_TRANSLATION search process.
+ // 2. The rate needed to encode the current mv is larger than that for the
+ // other ref_mv.
+ // The speed feature equaling 1 means using subpel mv in the comparison.
+ // The speed feature equaling 2 means using fullpel mv in the comparison.
+ // If the speed feature >= 3, skip the current ref_mv in NEW_MV mode based on
+ // known full_mv bestsme and drl cost.
+ int skip_newmv_in_drl;
+
+ // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV,
+ // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found
+ // TODO(any): Instead of skipping repeated ref mv, use the recalculated
+ // rd-cost based on mode rate and skip the mode evaluation
+ int skip_repeated_ref_mv;
+
+ // Flag used to control the ref_best_rd based gating for chroma
+ int perform_best_rd_based_gating_for_chroma;
+
+ // Reuse the inter_intra_mode search result from NEARESTMV mode to other
+ // single ref modes
+ int reuse_inter_intra_mode;
+
+ // prune wedge and compound segment approximate rd evaluation based on
+ // compound average modeled rd
+ int prune_comp_type_by_model_rd;
+
+ // prune wedge and compound segment approximate rd evaluation based on
+ // compound average rd/ref_best_rd
+ int prune_comp_type_by_comp_avg;
+
+ // Skip some ref frames in compound motion search using the single motion
+ // search result. Takes values 0 - 3: 0 refers to no skipping, and 1 - 3
+ // increase the aggressiveness of skipping in order.
+ // Note: The search order might affect the result. It assumes that the single
+ // reference modes are searched before compound modes. It is better to search
+ // same single inter mode as a group.
+ int prune_comp_search_by_single_result;
+
+ // Instead of performing a full MV search, do a simple translation first
+ // and only perform a full MV search on the motion vectors that performed
+ // well.
+ int prune_mode_search_simple_translation;
+
+ // Only search compound modes with at least one "good" reference frame.
+ // A reference frame is good if, after looking at its performance among
+ // the single reference modes, it is one of the two best performers.
+ int prune_compound_using_single_ref;
+
+ // Skip extended compound mode (NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV,
+ // NEW_NEARMV) using ref frames of above and left neighbor
+ // blocks.
+ // 0 : no pruning
+ // 1 : prune ext compound modes using neighbor blocks (less aggressiveness)
+ // 2 : prune ext compound modes using neighbor blocks (high aggressiveness)
+ // 3 : prune ext compound modes unconditionally (highest aggressiveness)
+ int prune_ext_comp_using_neighbors;
+
+ // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes
+ int skip_ext_comp_nearmv_mode;
+
+ // Skip extended compound mode when ref frame corresponding to NEWMV does not
+ // have NEWMV as single mode winner.
+ // 0 : no pruning
+ // 1 : prune extended compound mode (less aggressiveness)
+ // 2 : prune extended compound mode (high aggressiveness)
+ int prune_comp_using_best_single_mode_ref;
+
+ // Skip NEARESTMV and NEARMV using weight computed in ref mv list population
+ //
+ // Pruning is enabled only when both the top and left neighbor blocks are
+ // available and when the current block already has a valid inter prediction.
+ int prune_nearest_near_mv_using_refmv_weight;
+
+ // Based on previous ref_mv_idx search result, prune the following search.
+ int prune_ref_mv_idx_search;
+
+ // Disable one sided compound modes.
+ int disable_onesided_comp;
+
+ // Prune obmc search using previous frame stats.
+ // INT_MAX : disable obmc search
+ int prune_obmc_prob_thresh;
+
+ // Prune warped motion search using previous frame stats.
+ int prune_warped_prob_thresh;
+
+ // Variance threshold to enable/disable Interintra wedge search
+ unsigned int disable_interintra_wedge_var_thresh;
+
+ // Variance threshold to enable/disable Interinter wedge search
+ unsigned int disable_interinter_wedge_var_thresh;
+
+ // Decouple wedge and mode search during interintra RDO.
+ int fast_interintra_wedge_search;
+
+ // Whether fast wedge sign estimate is used
+ int fast_wedge_sign_estimate;
+
+ // Enable/disable ME for interinter wedge search.
+ int disable_interinter_wedge_newmv_search;
+
+ // Decide when and how to use joint_comp.
+ DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag;
+
+ // Clip the frequency of updating the mv cost.
+ INTERNAL_COST_UPDATE_TYPE mv_cost_upd_level;
+
+ // Clip the frequency of updating the coeff cost.
+ INTERNAL_COST_UPDATE_TYPE coeff_cost_upd_level;
+
+ // Clip the frequency of updating the mode cost.
+ INTERNAL_COST_UPDATE_TYPE mode_cost_upd_level;
+
+ // Prune inter modes based on tpl stats
+ // 0 : no pruning
+ // 1 - 3 indicate increasing aggressiveness in order.
+ int prune_inter_modes_based_on_tpl;
+
+ // Skip NEARMV and NEAR_NEARMV modes using ref frames of above and left
+ // neighbor blocks and qindex.
+ PRUNE_NEARMV_LEVEL prune_nearmv_using_neighbors;
+
+ // Model based breakout after interpolation filter search
+ // 0: no breakout
+ // 1: use model based rd breakout
+ int model_based_post_interp_filter_breakout;
+
+ // Reuse compound type rd decision when exact match is found
+ // 0: No reuse
+ // 1: Reuse the compound type decision
+ int reuse_compound_type_decision;
+
+ // Enable/disable masked compound.
+ int disable_masked_comp;
+
+ // Enable/disable MV refinement for compound modes corresponding to the
+ // compound types COMPOUND_AVERAGE, COMPOUND_DISTWTD (currently, this compound
+ // type is disabled for speeds >= 2 via the sf 'use_dist_wtd_comp_flag') and
+ // COMPOUND_DIFFWTD, based on availability. Levels 0 to 3 indicate
+ // increasing order of aggressiveness in disabling MV refinement.
+ // 0: MV Refinement is enabled and for NEW_NEWMV mode used two iterations of
+ // refinement in av1_joint_motion_search().
+ // 1: MV Refinement is disabled for COMPOUND_DIFFWTD and enabled for
+ // COMPOUND_AVERAGE & COMPOUND_DISTWTD.
+ // 2: MV Refinement is enabled for COMPOUND_AVERAGE & COMPOUND_DISTWTD for
+ // NEW_NEWMV mode with one iteration of refinement in
+ // av1_joint_motion_search() and MV Refinement is disabled for other compound
+ // type modes.
+ // 3: MV Refinement is disabled.
+ int enable_fast_compound_mode_search;
+
+ // Reuse masked compound type search results
+ int reuse_mask_search_results;
+
+ // Enable/disable fast search for wedge masks
+ int enable_fast_wedge_mask_search;
+
+ // Early breakout from transform search of inter modes
+ int inter_mode_txfm_breakout;
+
+ // Limit number of inter modes for txfm search if a newmv mode gets
+ // evaluated among the top modes.
+ // 0: no pruning
+ // 1 to 3 indicate increasing order of aggressiveness
+ int limit_inter_mode_cands;
+
+ // Cap the no. of txfm searches for a given prediction mode.
+ // 0: no cap, 1: cap beyond first 4 searches, 2: cap beyond first 3 searches.
+ int limit_txfm_eval_per_mode;
+
+ // Prune warped motion search based on block size.
+ int extra_prune_warped;
+
+ // Do not search compound modes for ARF.
+ // The intuition is that ARF is predicted by frames far away from it,
+ // whose temporal correlations with the ARF are likely low.
+ // It is therefore likely that compound modes do not work as well for ARF
+ // as other inter frames.
+ // Speed/quality impact:
+ // Speed 1: 12% faster, 0.1% psnr loss.
+ // Speed 2: 2% faster, 0.05% psnr loss.
+ // No change for speed 3 and up, because |disable_onesided_comp| is true.
+ int skip_arf_compound;
+} INTER_MODE_SPEED_FEATURES;
+
+typedef struct INTERP_FILTER_SPEED_FEATURES {
+ // Do limited interpolation filter search for dual filters, since the best
+ // choice usually includes EIGHTTAP_REGULAR.
+ int use_fast_interpolation_filter_search;
+
+ // Disable dual filter
+ int disable_dual_filter;
+
+ // Save results of av1_interpolation_filter_search for a block.
+ // Check mv and ref_frames before the search; if they are very close to
+ // previously saved results, the filter search can be skipped.
+ int use_interp_filter;
+
+ // skip sharp_filter evaluation based on regular and smooth filter rd for
+ // dual_filter=0 case
+ int skip_sharp_interp_filter_search;
+
+ // skip interpolation filter search for a block in chessboard pattern
+ int cb_pred_filter_search;
+
+ // adaptive interp_filter search to allow skip of certain filter types.
+ int adaptive_interp_filter_search;
+
+ // Forces interpolation filter to EIGHTTAP_REGULAR and skips interpolation
+ // filter search.
+ int skip_interp_filter_search;
+} INTERP_FILTER_SPEED_FEATURES;
+
+typedef struct INTRA_MODE_SPEED_FEATURES {
+ // These bit masks allow you to enable or disable intra modes for each
+ // transform size separately.
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
+
+ // flag to allow skipping intra mode for inter frame prediction
+ int skip_intra_in_interframe;
+
+ // Prune intra mode candidates based on source block histogram of gradient.
+ // Applies to luma plane only.
+ // Feasible values are 0..4. The feature is disabled for 0. An increasing
+ // value indicates more aggressive pruning threshold.
+ int intra_pruning_with_hog;
+
+ // Prune intra mode candidates based on source block histogram of gradient.
+ // Applies to chroma plane only.
+ // Feasible values are 0..4. The feature is disabled for 0. An increasing
+ // value indicates more aggressive pruning threshold.
+ int chroma_intra_pruning_with_hog;
+
+ // Enable/disable smooth intra modes.
+ int disable_smooth_intra;
+
+ // Prune UV_SMOOTH_PRED mode for chroma based on chroma source variance.
+ // false : No pruning
+ // true : Prune UV_SMOOTH_PRED mode based on chroma source variance
+ //
+ // For allintra encode, this speed feature reduces instruction count
+ // by 1.90%, 2.21% and 1.97% for speed 6, 7 and 8 with coding performance
+ // change less than 0.04%. For AVIF image encode, this speed feature reduces
+ // encode time by 1.56%, 2.14% and 0.90% for speed 6, 7 and 8 on a typical
+ // image dataset with coding performance change less than 0.05%.
+ bool prune_smooth_intra_mode_for_chroma;
+
+ // Prune filter intra modes in intra frames.
+ // 0 : No pruning
+ // 1 : Evaluate applicable filter intra modes based on best intra mode so far
+ // 2 : Do not evaluate filter intra modes
+ int prune_filter_intra_level;
+
+ // prune palette search
+ // 0: No pruning
+ // 1: Perform coarse search to prune the palette colors. For winner colors,
+ // neighbors are also evaluated using a finer search.
+ // 2: Perform 2 way palette search from max colors to min colors (and min
+ // colors to remaining colors) and terminate the search if current number of
+ // palette colors is not the winner.
+ int prune_palette_search_level;
+
+ // Terminate early in luma palette_size search. Speed feature values indicate
+ // increasing level of pruning.
+ // 0: No early termination
+ // 1: Terminate early for higher luma palette_size, if header rd cost of lower
+ // palette_size is more than 2 * best_rd. This level of pruning is more
+ // conservative when compared to sf level 2 as the cases which will get pruned
+ // with sf level 1 is a subset of the cases which will get pruned with sf
+ // level 2.
+ // 2: Terminate early for higher luma palette_size, if header rd cost of lower
+ // palette_size is more than best_rd.
+ // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%,
+ // 2.76%, 2.30%, 1.84%, 2.69%, 2.04%, 2.05% and 1.44% for speed 0, 1, 2, 3, 4,
+ // 5, 6, 7 and 8 on screen content set with coding performance change less
+ // than 0.01% for speed <= 2 and less than 0.03% for speed >= 3. For AVIF
+ // image encode, this sf reduces instruction count by 1.94%, 1.13%, 1.29%,
+ // 0.93%, 0.89%, 1.03%, 1.07%, 1.20% and 0.18% for speed 0, 1, 2, 3, 4, 5, 6,
+ // 7 and 8 on a typical image dataset with coding performance change less than
+ // 0.01%.
+ int prune_luma_palette_size_search_level;
+
+ // Prune chroma intra modes based on luma intra mode winner.
+ // 0: No pruning
+ // 1: Prune chroma intra modes other than UV_DC_PRED, UV_SMOOTH_PRED,
+ // UV_CFL_PRED and the mode that corresponds to luma intra mode winner.
+ int prune_chroma_modes_using_luma_winner;
+
+ // Clip the frequency of updating the mv cost for intrabc.
+ INTERNAL_COST_UPDATE_TYPE dv_cost_upd_level;
+
+ // We use the DCT_DCT transform followed by computing SATD (Sum of Absolute
+ // Transformed Differences) as an estimate of the RD score to quickly find the
+ // best possible Chroma from Luma (CFL) parameter. Then we do a full RD search
+ // near the best possible parameter. The search range is set here.
+ // The range of cfl_search_range should be [1, 33], and the following are the
+ // recommended values.
+ // 1: Fastest mode.
+ // 3: Default mode that provides good speedup without losing compression
+ // performance at speed 0.
+ // 33: Exhaustive rd search (33 == CFL_MAGS_SIZE). This mode should only
+ // be used for debugging purposes.
+ int cfl_search_range;
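+
+ // One plausible reading of the range (index names are hypothetical): the
+ // full RD search visits a window of about cfl_search_range candidates
+ // centered on the SATD-best index, clamped to the valid magnitudes:
+ //
+ //   const int lo = AOMMAX(0, best_idx - cfl_search_range / 2);
+ //   const int hi = AOMMIN(CFL_MAGS_SIZE - 1, best_idx + cfl_search_range / 2);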
+
+ // TOP_INTRA_MODEL_COUNT (4) is the number of top model-rd candidates stored
+ // in the intra mode decision. This speed feature reduces that number for
+ // higher speeds.
+ int top_intra_model_count_allowed;
+
+ // Adapt top_intra_model_count_allowed locally to prune luma intra modes using
+ // neighbor block and quantizer information.
+ int adapt_top_model_rd_count_using_neighbors;
+
+ // Prune the evaluation of odd delta angles of directional luma intra modes by
+ // using the rdcosts of neighbouring delta angles.
+ // For allintra encode, this speed feature reduces instruction count
+ // by 4.461%, 3.699% and 3.536% for speed 6, 7 and 8 on a typical video
+ // dataset with coding performance change less than 0.26%. For AVIF image
+ // encode, this speed feature reduces encode time by 2.849%, 2.471%,
+ // and 2.051% for speed 6, 7 and 8 on a typical image dataset with coding
+ // performance change less than 0.27%.
+ int prune_luma_odd_delta_angles_in_intra;
+
+ // Terminate early in chroma palette_size search.
+ // 0: No early termination
+ // 1: Terminate early for higher palette_size, if header rd cost of lower
+ // palette_size is more than best_rd.
+ // For allintra encode, this sf reduces instruction count by 0.45%,
+ // 0.62%, 1.73%, 2.50%, 2.89%, 3.09% and 3.86% for speed 0 to 6 on screen
+ // content set with coding performance change less than 0.01%.
+ // For AVIF image encode, this sf reduces instruction count by 0.45%, 0.81%,
+ // 0.85%, 1.05%, 1.45%, 1.66% and 1.95% for speed 0 to 6 on a typical image
+ // dataset with no quality drop.
+ int early_term_chroma_palette_size_search;
+
+ // Skips the evaluation of filter intra modes in inter frames if rd evaluation
+ // of luma intra dc mode results in invalid rd stats.
+ int skip_filter_intra_in_inter_frames;
+} INTRA_MODE_SPEED_FEATURES;
+
+typedef struct TX_SPEED_FEATURES {
+ // Init search depth for square and rectangular transform partitions.
+ // Values:
+ // 0 - search full tree, 1 - search 1 level, 2 - search the highest level only
+ int inter_tx_size_search_init_depth_sqr;
+ int inter_tx_size_search_init_depth_rect;
+ int intra_tx_size_search_init_depth_sqr;
+ int intra_tx_size_search_init_depth_rect;
+
+ // If any dimension of a coding block exceeds 64, always search the
+ // largest transform only, since the largest transform block size is 64x64.
+ int tx_size_search_lgr_block;
+
+ TX_TYPE_SEARCH tx_type_search;
+
+ // Skip split transform block partition when the collocated bigger block
+ // is selected as all zero coefficients.
+ int txb_split_cap;
+
+ // Shortcut the transform block partition and type search when the target
+ // rdcost is relatively low.
+ // Values are 0 (not used), or 1 - 2 with progressively increasing
+ // aggressiveness.
+ int adaptive_txb_search_level;
+
+ // Prune level for tx_size_type search for inter based on rd model
+ // 0: no pruning
+ // 1-2: progressively increasing aggressiveness of pruning
+ int model_based_prune_tx_search_level;
+
+ // Refine TX type after fast TX search.
+ int refine_fast_tx_search_results;
+
+ // Prune transform split/no_split eval based on residual properties. A value
+ // of 0 indicates no pruning, and the aggressiveness of pruning progressively
+ // increases from levels 1 to 3.
+ int prune_tx_size_level;
+
+ // Prune the evaluation of transform depths as decided by the NN model.
+ // false: No pruning.
+ // true : Avoid the evaluation of specific transform depths using NN model.
+ //
+ // For allintra encode, this speed feature reduces instruction count
+ // by 4.76%, 8.92% and 11.28% for speed 6, 7 and 8 with coding performance
+ // change less than 0.32%. For AVIF image encode, this speed feature reduces
+ // encode time by 4.65%, 9.16% and 10.45% for speed 6, 7 and 8 on a typical
+ // image dataset with coding performance change less than 0.19%.
+ bool prune_intra_tx_depths_using_nn;
+
+ // Enable/disable early breakout during transform search of intra modes, by
+ // using the minimum rd cost possible. By using this approach, the rd
+ // evaluation of applicable transform blocks (in the current block) can be
+ // avoided as
+ // 1) best_rd evolves during the search in choose_tx_size_type_from_rd()
+ // 2) appropriate ref_best_rd is passed in intra_block_yrd()
+ //
+ // For allintra encode, this speed feature reduces instruction count
+ // by 1.11%, 1.08%, 1.02% and 0.93% for speed 3, 6, 7 and 8 with coding
+ // performance change less than 0.02%. For AVIF image encode, this speed
+ // feature reduces encode time by 0.93%, 1.46%, 1.07%, 0.84%, 0.99% and 0.73%
+ // for speed 3, 4, 5, 6, 7 and 8 on a typical image dataset with coding
+ // performance change less than 0.004%.
+ bool use_rd_based_breakout_for_intra_tx_search;
+} TX_SPEED_FEATURES;
+
+typedef struct RD_CALC_SPEED_FEATURES {
+ // Fast approximation of av1_model_rd_from_var_lapndz
+ int simple_model_rd_from_var;
+
+ // Perform faster distortion computation during the R-D evaluation by trying
+ // to approximate the prediction error with transform coefficients (faster but
+ // less accurate) rather than computing distortion in the pixel domain (slower
+ // but more accurate). The following methods are used for distortion
+ // computation:
+ // Method 0: Always compute distortion in the pixel domain
+ // Method 1: Based on block error, try using transform domain distortion for
+ // tx_type search and compute distortion in pixel domain for final RD_STATS
+ // Method 2: Based on block error, try to compute distortion in transform
+ // domain
+ // Methods 1 and 2 may fall back to computing distortion in the pixel domain
+ // in case the block error is less than the threshold, which is controlled by
+ // the speed feature tx_domain_dist_thres_level.
+ //
+ // The speed feature tx_domain_dist_level decides which of the above methods
+ // needs to be used across different mode evaluation stages as described
+ // below:
+ // Eval type: Default     Mode        Winner
+ // Level 0  : Method 0    Method 2    Method 0
+ // Level 1  : Method 1    Method 2    Method 0
+ // Level 2  : Method 2    Method 2    Method 0
+ // Level 3  : Method 2    Method 2    Method 2
+ int tx_domain_dist_level;
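+
+ // The level/stage mapping above can be expressed as a lookup indexed by
+ // [level][eval stage] (an illustrative constant, not from the encoder):
+ //
+ //   static const int kTxDomainDistMethod[4][3] = {
+ //     // Default, Mode, Winner
+ //     { 0, 2, 0 }, { 1, 2, 0 }, { 2, 2, 0 }, { 2, 2, 2 },
+ //   };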
+
+ // Transform domain distortion threshold level
+ int tx_domain_dist_thres_level;
+
+ // Trellis (dynamic programming) optimization of quantized values
+ TRELLIS_OPT_TYPE optimize_coefficients;
+
+ // Use hash table to store macroblock RD search results
+ // to avoid repeated search on the same residue signal.
+ int use_mb_rd_hash;
+
+ // Flag used to control the extent of coeff R-D optimization
+ int perform_coeff_opt;
+} RD_CALC_SPEED_FEATURES;
+
+typedef struct WINNER_MODE_SPEED_FEATURES {
+ // Flag used to control the winner mode processing for better R-D optimization
+ // of quantized coeffs
+ int enable_winner_mode_for_coeff_opt;
+
+ // Flag used to control the winner mode processing for transform size
+ // search method
+ int enable_winner_mode_for_tx_size_srch;
+
+ // Control transform size search level
+ // Eval type: Default      Mode         Winner
+ // Level 0  : FULL RD      LARGEST ALL  FULL RD
+ // Level 1  : FAST RD      LARGEST ALL  FULL RD
+ // Level 2  : LARGEST ALL  LARGEST ALL  FULL RD
+ // Level 3  : LARGEST ALL  LARGEST ALL  LARGEST ALL
+ int tx_size_search_level;
+
+ // Flag used to control the winner mode processing for use transform
+ // domain distortion
+ int enable_winner_mode_for_use_tx_domain_dist;
+
+ // Flag used to enable processing of multiple winner modes
+ MULTI_WINNER_MODE_TYPE multi_winner_mode_type;
+
+ // Motion mode for winner candidates:
+ // 0: speed feature OFF
+ // 1 / 2 : Use configured number of winner candidates
+ int motion_mode_for_winner_cand;
+
+ // Controls the prediction of transform skip block or DC only block.
+ //
+ // Different speed feature values (0 to 3) decide the aggressiveness of
+ // prediction (refer to predict_dc_levels[][] in speed_features.c) to be used
+ // during different mode evaluation stages.
+ int dc_blk_pred_level;
+
+ // If on, disables interpolation filter search in handle_inter_mode loop, and
+ // performs it during winner mode processing by \ref
+ // tx_search_best_inter_candidates.
+ int winner_mode_ifs;
+
+ // Controls the disabling of winner mode processing. Speed feature levels
+ // are ordered in increasing aggressiveness of pruning. The method considered
+ // for disabling depends on the sf level value, as described below.
+ // 0: Do not disable
+ // 1: Disable for blocks with low source variance.
+ // 2: Disable for blocks which turn out to be transform skip (skipped based on
+ // eob) during MODE_EVAL stage except NEWMV mode.
+ // 3: Disable for blocks which turn out to be transform skip during MODE_EVAL
+ // stage except NEWMV mode. For high quantizers, prune conservatively based on
+ // transform skip (skipped based on eob) except for NEWMV mode.
+ // 4: Disable for blocks which turn out to be transform skip during MODE_EVAL
+ // stage.
+ int prune_winner_mode_eval_level;
+} WINNER_MODE_SPEED_FEATURES;
+
+typedef struct LOOP_FILTER_SPEED_FEATURES {
+ // This feature controls how the loop filter level is determined.
+ LPF_PICK_METHOD lpf_pick;
+
+ // Skip some final iterations in the determination of the best loop filter
+ // level.
+ int use_coarse_filter_level_search;
+
+ // Control how the CDEF strength is determined.
+ CDEF_PICK_METHOD cdef_pick_method;
+
+ // Decoder-side speed feature to add a penalty for the use of dual-sgr
+ // filters. Takes values 0 - 10, 0 indicating no penalty, with each additional
+ // level adding a penalty of 1%.
+ int dual_sgr_penalty_level;
+
+ // Prune sgr ep using a binary-search-like mechanism
+ int enable_sgr_ep_pruning;
+
+ // Disable loop restoration for Chroma plane
+ int disable_loop_restoration_chroma;
+
+ // Disable loop restoration for luma plane
+ int disable_loop_restoration_luma;
+
+ // Range of loop restoration unit sizes to search
+ // The minimum size is clamped against the superblock size in
+ // av1_pick_filter_restoration, so that the code which sets this value does
+ // not need to know the superblock size ahead of time.
+ int min_lr_unit_size;
+ int max_lr_unit_size;
+
+ // Prune RESTORE_WIENER evaluation based on source variance
+ // 0 : no pruning
+ // 1 : conservative pruning
+ // 2 : aggressive pruning
+ int prune_wiener_based_on_src_var;
+
+ // Prune self-guided loop restoration based on wiener search results
+ // 0 : no pruning
+ // 1 : pruning based on rdcost ratio of RESTORE_WIENER and RESTORE_NONE
+ // 2 : pruning based on winner restoration type among RESTORE_WIENER and
+ // RESTORE_NONE
+ int prune_sgr_based_on_wiener;
+
+ // Reduce the wiener filter window size for luma
+ int reduce_wiener_window_size;
+
+ // Flag to disable Wiener Loop restoration filter.
+ bool disable_wiener_filter;
+
+ // Flag to disable Self-guided Loop restoration filter.
+ bool disable_sgr_filter;
+
+ // Disable the refinement search around the wiener filter coefficients.
+ bool disable_wiener_coeff_refine_search;
+
+ // Whether to downsample the rows in computation of wiener stats.
+ int use_downsampled_wiener_stats;
+} LOOP_FILTER_SPEED_FEATURES;
+
+typedef struct REAL_TIME_SPEED_FEATURES {
+ // check intra prediction for non-RD mode.
+ int check_intra_pred_nonrd;
+
+ // Skip checking intra prediction.
+ // 0 - don't skip
+ // 1 - skip if TX is skipped and best mode is not NEWMV
+ // 2 - skip if TX is skipped
+ // Skipping aggressiveness increases from level 1 to 2.
+ int skip_intra_pred;
+
+ // Estimate motion before calculating variance in variance-based partition
+ // 0 - Only use zero MV
+ // 1 - perform coarse ME
+ // 2 - perform coarse ME, and also use neighbours' MVs
+ // 3 - use neighbours' MVs without performing coarse ME
+ int estimate_motion_for_var_based_partition;
+
+ // For nonrd_use_partition: mode of extra check of leaf partition
+ // 0 - don't check merge
+ // 1 - always check merge
+ // 2 - check merge and prune checking final split
+ // 3 - check merge and prune checking final split based on bsize and qindex
+ int nonrd_check_partition_merge_mode;
+
+ // For nonrd_use_partition: check of leaf partition extra split
+ int nonrd_check_partition_split;
+
+ // Implements various heuristics to skip searching modes
+ // The heuristics selected are based on flags
+ // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
+ unsigned int mode_search_skip_flags;
+
+ // For nonrd: Reduces ref frame search.
+  // 0 - low level of pruning of the search of non-LAST reference frames
+  // 1 - pruned search of non-LAST reference frames
+  // 2 - more pruned search of non-LAST reference frames
+ int nonrd_prune_ref_frame_search;
+
+ // This flag controls the use of non-RD mode decision.
+ int use_nonrd_pick_mode;
+
+ // Use ALTREF frame in non-RD mode decision.
+ int use_nonrd_altref_frame;
+
+ // Use compound reference for non-RD mode.
+ int use_comp_ref_nonrd;
+
+ // Reference frames for compound prediction for nonrd pickmode:
+ // LAST_GOLDEN (0), LAST_LAST2 (1), or LAST_ALTREF (2).
+ int ref_frame_comp_nonrd[3];
+
+  // Use reduced ref set for real-time mode
+ int use_real_time_ref_set;
+
+ // Skip a number of expensive mode evaluations for blocks with very low
+ // temporal variance.
+ int short_circuit_low_temp_var;
+
+ // Reuse inter prediction in fast non-rd mode.
+ int reuse_inter_pred_nonrd;
+
+ // Number of best inter modes to search transform. INT_MAX - search all.
+ int num_inter_modes_for_tx_search;
+
+ // Use interpolation filter search in non-RD mode decision.
+ int use_nonrd_filter_search;
+
+ // Use simplified RD model for interpolation search and Intra
+ int use_simple_rd_model;
+
+ // For nonrd mode: use hybrid intra mode search for intra only frames based on
+ // block properties.
+ // 0 : use nonrd pick intra for all blocks
+ // 1 : use rd for bsize < 16x16, nonrd otherwise
+ // 2 : use rd for bsize < 16x16 and src var >= 101, nonrd otherwise
+ int hybrid_intra_pickmode;
+
+ // Compute variance/sse on source difference, prior to encoding superblock.
+ int source_metrics_sb_nonrd;
+
+ // Flag to indicate process for handling overshoot on slide/scene change,
+ // for real-time CBR mode.
+ OVERSHOOT_DETECTION_CBR overshoot_detection_cbr;
+
+ // Check for scene/content change detection on every frame before encoding.
+ int check_scene_detection;
+
+  // For nonrd mode: Prefer larger partition blocks in variance based
+  // partitioning. 0: disabled, 1-3: increasing aggressiveness
+ int prefer_large_partition_blocks;
+
+  // Use results of temporal noise estimate
+ int use_temporal_noise_estimate;
+
+ // Parameter indicating initial search window to be used in full-pixel search
+ // for nonrd_pickmode. Range [0, MAX_MVSEARCH_STEPS - 1]. Lower value
+ // indicates larger window. If set to 0, step_param is set based on internal
+ // logic in set_mv_search_params().
+ int fullpel_search_step_param;
+
+ // Bit mask to enable or disable intra modes for each prediction block size
+ // separately, for nonrd_pickmode. Currently, the sf is not respected when
+  // 'force_intra_check' is true in the 'av1_estimate_intra_mode()' function.
+  // Also, H and V pred modes allowed through this sf can be further pruned
+  // when 'prune_hv_pred_modes_using_src_sad' sf is true.
+ int intra_y_mode_bsize_mask_nrd[BLOCK_SIZES];
+
+  // Prune H and V intra prediction mode evaluation in inter frames.
+  // The sf does not have any impact in the following cases:
+ // i. when frame_source_sad is 1.1 times greater than avg_source_sad
+ // ii. when cyclic_refresh_segment_id_boosted is enabled
+ // iii. when SB level source sad is greater than kMedSad
+ // iv. when color sensitivity is non zero for both the chroma channels
+ bool prune_hv_pred_modes_using_src_sad;
+
+ // Skips mode checks more aggressively in nonRD mode
+ int nonrd_aggressive_skip;
+
+  // Skip cdef on 64x64 blocks.
+ // 0: disabled
+ // 1: skip when NEWMV or INTRA is not picked or color sensitivity is off.
+ // When color sensitivity is on for a superblock, all 64x64 blocks within
+ // will not skip.
+  // 2: more aggressive mode where skip is done for all frames where
+  // rc->high_source_sad = 0 (no slide changes), and color sensitivity is off.
+ int skip_cdef_sb;
+
+ // Force selective cdf update.
+ int selective_cdf_update;
+
+ // Force only single reference (LAST) for prediction.
+ int force_only_last_ref;
+
+ // Forces larger partition blocks in variance based partitioning for intra
+ // frames
+ int force_large_partition_blocks_intra;
+
+ // Use fixed partition for superblocks based on source_sad.
+ // 0: disabled
+ // 1: enabled
+ int use_fast_fixed_part;
+
+ // Increase source_sad thresholds in nonrd pickmode.
+ int increase_source_sad_thresh;
+
+ // Skip evaluation of no split in tx size selection for merge partition
+ int skip_tx_no_split_var_based_partition;
+
+  // Intermediate termination of NEWMV mode evaluation based on the so-far
+  // best mode sse
+ int skip_newmv_mode_based_on_sse;
+
+ // Define gf length multiplier.
+ // Level 0: use large multiplier, level 1: use medium multiplier.
+ int gf_length_lvl;
+
+ // Prune inter modes with golden frame as reference for NEARMV and NEWMV modes
+ int prune_inter_modes_with_golden_ref;
+
+ // Prune inter modes w.r.t golden or alt-ref frame based on sad
+ int prune_inter_modes_wrt_gf_arf_based_on_sad;
+
+ // Prune inter mode search in rd path based on current block's temporal
+ // variance wrt LAST reference.
+ int prune_inter_modes_using_temp_var;
+
+  // Reduce MV precision to halfpel for high integer MV values & frame-level
+  // motion
+ // 0: disabled
+ // 1-2: Reduce precision to halfpel, fullpel based on conservative
+ // thresholds, aggressiveness increases with increase in level
+ // 3: Reduce precision to halfpel using more aggressive thresholds
+ int reduce_mv_pel_precision_highmotion;
+
+ // Reduce MV precision for low complexity blocks
+ // 0: disabled
+ // 1: Reduce the mv resolution for zero mv if the variance is low
+ // 2: Switch to halfpel, fullpel based on low block spatial-temporal
+ // complexity.
+ int reduce_mv_pel_precision_lowcomplex;
+
+ // Prune intra mode evaluation in inter frames based on mv range.
+ BLOCK_SIZE prune_intra_mode_based_on_mv_range;
+  // The number of times to left shift the splitting thresholds in variance
+  // based partitioning. The minimum value should be 7 to avoid left shifting
+  // by a negative number.
+ int var_part_split_threshold_shift;
+
+ // Qindex based variance partition threshold index, which determines
+ // the aggressiveness of partition pruning
+ // 0: disabled for speeds 9,10
+ // 1,2: (rd-path) lowers qindex thresholds conditionally (for low SAD sb)
+ // 3,4: (non-rd path) uses pre-tuned qindex thresholds
+ int var_part_based_on_qidx;
+
+ // Enable GF refresh based on Q value.
+ int gf_refresh_based_on_qp;
+
+ // Temporal filtering
+ // The value can be 1 or 2, which indicates the threshold to use.
+ // Must be off for lossless mode.
+ int use_rtc_tf;
+
+ // Prune the use of the identity transform in nonrd_pickmode,
+ // used for screen content mode: only for smaller blocks
+ // and higher spatial variance, and when skip_txfm is not
+ // already set.
+ int prune_idtx_nonrd;
+
+  // Prune the use of palette mode in nonrd pickmode.
+ int prune_palette_nonrd;
+
+ // Force to only use dct for palette search in nonrd pickmode.
+ int dct_only_palette_nonrd;
+
+  // Skip loopfilter for static content, after a slide change
+  // or key frame, once quality has ramped up.
+  // 0: disabled
+  // 1: skip only after quality is ramped up.
+  // 2: aggressive mode, where skip is done for all frames where
+  // rc->high_source_sad = 0 (no slide changes).
+ int skip_lf_screen;
+
+ // For nonrd: early exit out of variance partition that sets the
+ // block size to superblock size, and sets mode to zeromv-last skip.
+ // 0: disabled
+ // 1: zeromv-skip is enabled at SB level only
+ // 2: zeromv-skip is enabled at SB level and coding block level
+ int part_early_exit_zeromv;
+
+ // Early terminate inter mode search based on sse in non-rd path.
+ INTER_SEARCH_EARLY_TERM_IDX sse_early_term_inter_search;
+
+ // SAD based adaptive altref selection
+ int sad_based_adp_altref_lag;
+
+ // Enable/disable partition direct merging.
+ int partition_direct_merging;
+
+ // Level of aggressiveness for obtaining tx size based on qstep
+ int tx_size_level_based_on_qstep;
+
+ // Avoid the partitioning of a 16x16 block in variance based partitioning
+ // (VBP) by making use of minimum and maximum sub-block variances.
+ // For allintra encode, this speed feature reduces instruction count by 5.39%
+ // for speed 9 on a typical video dataset with coding performance gain
+ // of 1.44%.
+ // For AVIF image encode, this speed feature reduces encode time
+ // by 8.44% for speed 9 on a typical image dataset with coding performance
+ // gain of 0.78%.
+ bool vbp_prune_16x16_split_using_min_max_sub_blk_var;
+
+ // A qindex threshold that determines whether to use qindex based CDEF filter
+ // strength estimation for screen content types. The strength estimation model
+ // used for screen contents prefers to allow cdef filtering for more frames.
+  // This sf is used to limit the frames which go through cdef filtering; the
+  // following explains its settings.
+  // MAXQ (255): Disables the usage of this sf. Frames do not use the screen
+  // content model, which reduces the number of frames that go through cdef
+  // filtering.
+  // MINQ (0): Frames always use the screen content model, which increases the
+  // number of frames that go through cdef filtering.
+  // This speed feature gives a substantial gain on coding metrics, with a
+  // moderate increase in encoding time. Select the threshold based on the
+  // speed vs quality trade-off.
+ int screen_content_cdef_filter_qindex_thresh;
+
+ // Prune compound mode if its variance is higher than the variance of single
+ // modes.
+ bool prune_compoundmode_with_singlecompound_var;
+
+  // Allow mode cost update at frame level every couple of frames. This
+ // overrides the command line setting --mode-cost-upd-freq=3 (never update
+ // except on key frame and first delta).
+ bool frame_level_mode_cost_update;
+
+ // Prune H_PRED during intra mode evaluation in the nonrd path based on best
+ // mode so far.
+ //
+ // For allintra encode, this speed feature reduces instruction count by 1.10%
+ // for speed 9 with coding performance change less than 0.04%.
+ // For AVIF image encode, this speed feature reduces encode time by 1.03% for
+ // speed 9 on a typical image dataset with coding performance change less than
+ // 0.08%.
+ bool prune_h_pred_using_best_mode_so_far;
+
+ // Enable pruning of intra mode evaluations in nonrd path based on source
+ // variance and best mode so far. The pruning logic is enabled only if the
+ // mode is not a winner mode of both the neighboring blocks (left/top).
+ //
+ // For allintra encode, this speed feature reduces instruction count by 3.96%
+ // for speed 9 with coding performance change less than 0.38%.
+ // For AVIF image encode, this speed feature reduces encode time by 3.46% for
+ // speed 9 on a typical image dataset with coding performance change less than
+ // -0.06%.
+ bool enable_intra_mode_pruning_using_neighbors;
+
+ // Prune intra mode evaluations in nonrd path based on best sad so far.
+ //
+ // For allintra encode, this speed feature reduces instruction count by 3.05%
+ // for speed 9 with coding performance change less than 0.24%.
+ // For AVIF image encode, this speed feature reduces encode time by 1.87% for
+ // speed 9 on a typical image dataset with coding performance change less than
+ // 0.16%.
+ bool prune_intra_mode_using_best_sad_so_far;
+
+  // If compound is enabled, and the current block size is >= BLOCK_16X16,
+ // limit the compound modes to GLOBAL_GLOBALMV. This does not apply to the
+ // base layer of svc.
+ bool check_only_zero_zeromv_on_large_blocks;
+
+  // Allow for disabling cdf update for non-reference frames in svc mode.
+ bool disable_cdf_update_non_reference_frame;
+
+  // Prune compound modes if the single mode variances do not perform well.
+ bool prune_compoundmode_with_singlemode_var;
+
+ // Skip searching all compound mode if the variance of single_mode residue is
+ // sufficiently low.
+ bool skip_compound_based_on_var;
+
+  // Sets force_zeromv_skip based on the available source sad. Aggressiveness
+  // increases with the level set for this speed feature.
+ // 0: No setting
+ // 1: If source sad is kZeroSad
+ // 2: If source sad <= kVeryLowSad
+ int set_zeromv_skip_based_on_source_sad;
+
+  // Downgrades the block-level subpel motion search to
+  // av1_find_best_sub_pixel_tree_pruned_more for higher QP and when fullpel
+  // search performed well, zeromv has low sad, or source_var is low
+ bool use_adaptive_subpel_search;
+
+ // A flag used in RTC case to control frame_refs_short_signaling. Note that
+ // the final decision is made in check_frame_refs_short_signaling(). The flag
+ // can only be turned on when res < 360p and speed >= 9, in which case only
+ // LAST and GOLDEN ref frames are used now.
+ bool enable_ref_short_signaling;
+
+ // A flag that controls if we check or bypass GLOBALMV in rtc single ref frame
+ // case.
+ bool check_globalmv_on_single_ref;
+
+ // Allows for increasing the color_threshold for palette prediction.
+ // This generally leads to better coding efficiency but with some speed loss.
+ // Only used for screen content and for nonrd_pickmode.
+ bool increase_color_thresh_palette;
+} REAL_TIME_SPEED_FEATURES;
+
+/*!\endcond */
+
+/*!
+ * \brief Top level speed vs quality trade off data structure.
+ */
+typedef struct SPEED_FEATURES {
+ /*!
+ * Sequence/frame level speed features:
+ */
+ HIGH_LEVEL_SPEED_FEATURES hl_sf;
+
+ /*!
+ * Speed features for the first pass.
+ */
+ FIRST_PASS_SPEED_FEATURES fp_sf;
+
+ /*!
+ * Speed features related to how tpl's searches are done.
+ */
+ TPL_SPEED_FEATURES tpl_sf;
+
+ /*!
+ * Global motion speed features:
+ */
+ GLOBAL_MOTION_SPEED_FEATURES gm_sf;
+
+ /*!
+ * Partition search speed features:
+ */
+ PARTITION_SPEED_FEATURES part_sf;
+
+ /*!
+ * Motion search speed features:
+ */
+ MV_SPEED_FEATURES mv_sf;
+
+ /*!
+ * Inter mode search speed features:
+ */
+ INTER_MODE_SPEED_FEATURES inter_sf;
+
+ /*!
+ * Interpolation filter search speed features:
+ */
+ INTERP_FILTER_SPEED_FEATURES interp_sf;
+
+ /*!
+ * Intra mode search speed features:
+ */
+ INTRA_MODE_SPEED_FEATURES intra_sf;
+
+ /*!
+ * Transform size/type search speed features:
+ */
+ TX_SPEED_FEATURES tx_sf;
+
+ /*!
+ * RD calculation speed features:
+ */
+ RD_CALC_SPEED_FEATURES rd_sf;
+
+ /*!
+ * Two-pass mode evaluation features:
+ */
+ WINNER_MODE_SPEED_FEATURES winner_mode_sf;
+
+ /*!
+ * In-loop filter speed features:
+ */
+ LOOP_FILTER_SPEED_FEATURES lpf_sf;
+
+ /*!
+ * Real-time mode speed features:
+ */
+ REAL_TIME_SPEED_FEATURES rt_sf;
+} SPEED_FEATURES;
+/*!\cond */
+
+struct AV1_COMP;
+
+/*!\endcond */
+/*!\brief Frame size independent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] speed Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ * based on the passed in speed setting. (Higher speed gives lower
+ * quality)
+ */
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi,
+ int speed);
+
+/*!\brief Frame size dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] speed Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ * based on the passed in speed setting and frame size. (Higher speed
+ * corresponds to lower quality)
+ */
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
+ int speed);
+/*!\brief Q index dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] speed Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ * based on the passed in speed setting and current frame's Q index.
+ * (Higher speed corresponds to lower quality)
+ */
+void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed);
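+
+// A typical call order, sketched for illustration only (the exact call sites
+// are in the encoder's configuration and per-frame encode paths):
+//   av1_set_speed_features_framesize_independent(cpi, speed);
+//   av1_set_speed_features_framesize_dependent(cpi, speed);
+//   // ... after the frame's q index has been chosen ...
+//   av1_set_speed_features_qindex_dependent(cpi, speed);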
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_
diff --git a/third_party/aom/av1/encoder/superres_scale.c b/third_party/aom/av1/encoder/superres_scale.c
new file mode 100644
index 0000000000..3b47909b15
--- /dev/null
+++ b/third_party/aom/av1/encoder/superres_scale.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/random.h"
+
+// Compute the horizontal frequency components' energy in a frame
+// by calculating the 16x4 horizontal DCT. This is used to
+// decide the superresolution parameters.
+static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
+ uint64_t freq_energy[16] = { 0 };
+ const YV12_BUFFER_CONFIG *buf = cpi->source;
+ const int bd = cpi->td.mb.e_mbd.bd;
+ const int width = buf->y_crop_width;
+ const int height = buf->y_crop_height;
+ DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]);
+ int n = 0;
+ if (buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer);
+ for (int i = 0; i < height - 4; i += 4) {
+ for (int j = 0; j < width - 16; j += 16) {
+ av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
+ H_DCT, bd);
+ for (int k = 1; k < 16; ++k) {
+ const uint64_t this_energy =
+ ((int64_t)coeff[k] * coeff[k]) +
+ ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+ ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+ ((int64_t)coeff[k + 48] * coeff[k + 48]);
+ freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
+ }
+ n++;
+ }
+ }
+ } else {
+ assert(bd == 8);
+ DECLARE_ALIGNED(16, int16_t, src16[16 * 4]);
+ for (int i = 0; i < height - 4; i += 4) {
+ for (int j = 0; j < width - 16; j += 16) {
+ for (int ii = 0; ii < 4; ++ii)
+ for (int jj = 0; jj < 16; ++jj)
+ src16[ii * 16 + jj] =
+ buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
+ av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
+ for (int k = 1; k < 16; ++k) {
+ const uint64_t this_energy =
+ ((int64_t)coeff[k] * coeff[k]) +
+ ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+ ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+ ((int64_t)coeff[k + 48] * coeff[k + 48]);
+ freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
+ }
+ n++;
+ }
+ }
+ }
+ if (n) {
+ for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n;
+ // Convert to cumulative energy
+ for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
+ } else {
+ for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
+ }
+}
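+
+// Note: after the cumulative conversion above, energy[k] holds the average
+// energy summed over horizontal frequency bins k..15, so energy[1] is the
+// total AC energy of the analyzed 16x4 blocks and energy[15] is the energy
+// in the highest-frequency bin alone.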
+
+static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
+  // Arbitrary fixed seed for the pseudo-random scale selection.
+ static unsigned int seed = 56789;
+ const ResizeCfg *resize_cfg = &cpi->oxcf.resize_cfg;
+ if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+ uint8_t new_denom = SCALE_NUMERATOR;
+
+ if (cpi->common.seq_params->reduced_still_picture_hdr) return SCALE_NUMERATOR;
+ switch (resize_cfg->resize_mode) {
+ case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
+ case RESIZE_FIXED:
+ if (cpi->common.current_frame.frame_type == KEY_FRAME)
+ new_denom = resize_cfg->resize_kf_scale_denominator;
+ else
+ new_denom = resize_cfg->resize_scale_denominator;
+ break;
+ case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+ default: assert(0);
+ }
+ return new_denom;
+}
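+
+// For RESIZE_RANDOM above, lcg_rand16(&seed) % 9 + 8 yields a denominator in
+// [8, 16]; with SCALE_NUMERATOR == 8 that corresponds to scale factors from
+// 8/8 (no resize) down to 8/16 (half width and height).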
+
+int av1_superres_in_recode_allowed(const AV1_COMP *const cpi) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ // Empirically found to not be beneficial for image coding.
+ return oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO &&
+ cpi->sf.hl_sf.superres_auto_search_type != SUPERRES_AUTO_SOLO &&
+ cpi->rc.frames_to_key > 1;
+}
+
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008
+#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008
+#define SUPERRES_ENERGY_BY_AC_THRESH 0.2
+
+static double get_energy_by_q2_thresh(const GF_GROUP *gf_group,
+ const RATE_CONTROL *rc,
+ int gf_frame_index) {
+ // TODO(now): Return keyframe thresh * factor based on frame type / pyramid
+ // level.
+ if (gf_group->update_type[gf_frame_index] == ARF_UPDATE) {
+ return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME;
+ } else if (gf_group->update_type[gf_frame_index] == KF_UPDATE) {
+ if (rc->frames_to_key <= 1)
+ return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO;
+ else
+ return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME;
+ } else {
+ assert(0);
+ }
+ return 0;
+}
+
+static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy,
+ double threshq,
+ double threshp) {
+ const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8);
+ const double tq = threshq * q * q;
+ const double tp = threshp * energy[1];
+ const double thresh = AOMMIN(tq, tp);
+ int k;
+ for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) {
+ if (energy[k - 1] > thresh) break;
+ }
+ return 3 * SCALE_NUMERATOR - k;
+}
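+
+// Worked example: with SCALE_NUMERATOR == 8 the loop above scans k = 16 down
+// to 9 and the function returns 3 * 8 - k, i.e. a denominator in [8, 16]. If
+// the cumulative energy already exceeds the threshold at k == 16 the result
+// is 8 (no superres); if it never does, the loop exits at k == 8 and the
+// result is 16 (strongest scaling).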
+
+static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex,
+ int sr_kf, int sr_arf) {
+ // Use superres for Key-frames and Alt-ref frames only.
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+ gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE) {
+ return SCALE_NUMERATOR;
+ }
+ if (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE && !sr_kf) {
+ return SCALE_NUMERATOR;
+ }
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !sr_arf) {
+ return SCALE_NUMERATOR;
+ }
+
+ double energy[16];
+ analyze_hor_freq(cpi, energy);
+
+ const double energy_by_q2_thresh =
+ get_energy_by_q2_thresh(gf_group, &cpi->rc, cpi->gf_frame_index);
+ int denom = get_superres_denom_from_qindex_energy(
+ qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH);
+ /*
+ printf("\nenergy = [");
+ for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]);
+ printf("]\n");
+ printf("boost = %d\n",
+ (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE)
+ ? cpi->ppi->p_rc.kf_boost
+ : cpi->rc.gfu_boost);
+ printf("denom = %d\n", denom);
+ */
+ if (av1_superres_in_recode_allowed(cpi)) {
+ assert(cpi->superres_mode != AOM_SUPERRES_NONE);
+ // Force superres to be tried in the recode loop, as full-res is also going
+ // to be tried anyway.
+ denom = AOMMAX(denom, SCALE_NUMERATOR + 1);
+ }
+ return denom;
+}
+
+static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
+ // Choose an arbitrary random number
+ static unsigned int seed = 34567;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const SuperResCfg *const superres_cfg = &oxcf->superres_cfg;
+ const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+ uint8_t new_denom = SCALE_NUMERATOR;
+
+ // Make sure that superres mode of the frame is consistent with the
+ // sequence-level flag.
+ assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE,
+ cpi->common.seq_params->enable_superres));
+ assert(IMPLIES(!cpi->common.seq_params->enable_superres,
+ superres_cfg->superres_mode == AOM_SUPERRES_NONE));
+ // Make sure that superres mode for current encoding is consistent with user
+ // provided superres mode.
+ assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_AUTO,
+ cpi->superres_mode == superres_cfg->superres_mode));
+
+ // Note: we must look at the current superres_mode to be tried in 'cpi' here,
+ // not the user given mode in 'oxcf'.
+ switch (cpi->superres_mode) {
+ case AOM_SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
+ case AOM_SUPERRES_FIXED:
+ if (cpi->common.current_frame.frame_type == KEY_FRAME)
+ new_denom = superres_cfg->superres_kf_scale_denominator;
+ else
+ new_denom = superres_cfg->superres_scale_denominator;
+ break;
+ case AOM_SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+ case AOM_SUPERRES_QTHRESH: {
+ // Do not use superres when screen content tools are used.
+ if (cpi->common.features.allow_screen_content_tools) break;
+ if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ)
+ av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height);
+
+ // Now decide the use of superres based on 'q'.
+ int bottom_index, top_index;
+ const int q = av1_rc_pick_q_and_bounds(
+ cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index,
+ &bottom_index, &top_index);
+
+ const int qthresh = (frame_is_intra_only(&cpi->common))
+ ? superres_cfg->superres_kf_qthresh
+ : superres_cfg->superres_qthresh;
+ if (q <= qthresh) {
+ new_denom = SCALE_NUMERATOR;
+ } else {
+ new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
+ }
+ break;
+ }
+ case AOM_SUPERRES_AUTO: {
+ if (cpi->common.features.allow_screen_content_tools) break;
+ if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ)
+ av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height);
+
+ // Now decide the use of superres based on 'q'.
+ int bottom_index, top_index;
+ const int q = av1_rc_pick_q_and_bounds(
+ cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index,
+ &bottom_index, &top_index);
+
+ const SUPERRES_AUTO_SEARCH_TYPE sr_search_type =
+ cpi->sf.hl_sf.superres_auto_search_type;
+ const int qthresh = (sr_search_type == SUPERRES_AUTO_SOLO) ? 128 : 0;
+ if (q <= qthresh) {
+ new_denom = SCALE_NUMERATOR; // Don't use superres.
+ } else {
+ if (sr_search_type == SUPERRES_AUTO_ALL) {
+ if (cpi->common.current_frame.frame_type == KEY_FRAME)
+ new_denom = superres_cfg->superres_kf_scale_denominator;
+ else
+ new_denom = superres_cfg->superres_scale_denominator;
+ } else {
+ new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
+ }
+ }
+ break;
+ }
+ default: assert(0);
+ }
+ return new_denom;
+}
+
+static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
+ return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2);
+}
+
+static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
+ // Only need to check the width, as scaling is horizontal only.
+ (void)oheight;
+ return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom);
+}
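+
+// The checks above enforce that the total downscaling from resize plus
+// superres never exceeds a factor of two: with SCALE_NUMERATOR == 8, a
+// superres denom of 8 (no superres) allows a resized width as small as half
+// the original, while the maximum denom of 16 leaves no room for resizing.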
+
+static int validate_size_scales(RESIZE_MODE resize_mode,
+ aom_superres_mode superres_mode, int owidth,
+ int oheight, size_params_type *rsz) {
+ if (dimensions_are_ok(owidth, oheight, rsz)) { // Nothing to do.
+ return 1;
+ }
+
+ // Calculate current resize scale.
+ int resize_denom =
+ AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width),
+ DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height));
+
+ if (resize_mode != RESIZE_RANDOM && superres_mode == AOM_SUPERRES_RANDOM) {
+ // Alter superres scale as needed to enforce conformity.
+ rsz->superres_denom =
+ (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom;
+ if (!dimensions_are_ok(owidth, oheight, rsz)) {
+ if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom;
+ }
+ } else if (resize_mode == RESIZE_RANDOM &&
+ superres_mode != AOM_SUPERRES_RANDOM) {
+ // Alter resize scale as needed to enforce conformity.
+ resize_denom =
+ (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ if (!dimensions_are_ok(owidth, oheight, rsz)) {
+ if (resize_denom > SCALE_NUMERATOR) {
+ --resize_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ }
+ }
+ } else if (resize_mode == RESIZE_RANDOM &&
+ superres_mode == AOM_SUPERRES_RANDOM) {
+ // Alter both resize and superres scales as needed to enforce conformity.
+ do {
+ if (resize_denom > rsz->superres_denom)
+ --resize_denom;
+ else
+ --rsz->superres_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ } while (!dimensions_are_ok(owidth, oheight, rsz) &&
+ (resize_denom > SCALE_NUMERATOR ||
+ rsz->superres_denom > SCALE_NUMERATOR));
+ } else { // We are allowed to alter neither resize scale nor superres
+ // scale.
+ return 0;
+ }
+ return dimensions_are_ok(owidth, oheight, rsz);
+}
+
+// Calculates resize and superres params for next frame
+static size_params_type calculate_next_size_params(AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+ const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+ size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height,
+ SCALE_NUMERATOR };
+ int resize_denom = SCALE_NUMERATOR;
+ if (has_no_stats_stage(cpi) && cpi->ppi->use_svc &&
+ (cpi->common.width != cpi->oxcf.frm_dim_cfg.width ||
+ cpi->common.height != cpi->oxcf.frm_dim_cfg.height)) {
+ rsz.resize_width = cpi->common.width;
+ rsz.resize_height = cpi->common.height;
+ return rsz;
+ }
+ if (is_stat_generation_stage(cpi)) return rsz;
+ if (resize_pending_params->width && resize_pending_params->height) {
+ rsz.resize_width = resize_pending_params->width;
+ rsz.resize_height = resize_pending_params->height;
+ resize_pending_params->width = resize_pending_params->height = 0;
+ if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE) return rsz;
+ } else {
+ resize_denom = calculate_next_resize_scale(cpi);
+ rsz.resize_width = frm_dim_cfg->width;
+ rsz.resize_height = frm_dim_cfg->height;
+ av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
+ resize_denom);
+ }
+ rsz.superres_denom = calculate_next_superres_scale(cpi);
+ if (!validate_size_scales(oxcf->resize_cfg.resize_mode, cpi->superres_mode,
+ frm_dim_cfg->width, frm_dim_cfg->height, &rsz))
+ assert(0 && "Invalid scale parameters");
+ return rsz;
+}
+
+static void setup_frame_size_from_params(AV1_COMP *cpi,
+ const size_params_type *rsz) {
+ int encode_width = rsz->resize_width;
+ int encode_height = rsz->resize_height;
+
+ AV1_COMMON *cm = &cpi->common;
+ cm->superres_upscaled_width = encode_width;
+ cm->superres_upscaled_height = encode_height;
+ cm->superres_scale_denominator = rsz->superres_denom;
+ av1_calculate_scaled_superres_size(&encode_width, &encode_height,
+ rsz->superres_denom);
+ av1_set_frame_size(cpi, encode_width, encode_height);
+}
+
+void av1_setup_frame_size(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ // Reset superres params from previous frame.
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ const size_params_type rsz = calculate_next_size_params(cpi);
+ setup_frame_size_from_params(cpi, &rsz);
+
+ assert(av1_is_min_tile_width_satisfied(cm));
+}
+
+void av1_superres_post_encode(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+
+ assert(cpi->oxcf.superres_cfg.enable_superres);
+ assert(!is_lossless_requested(&cpi->oxcf.rc_cfg));
+ assert(!cm->features.all_lossless);
+
+ av1_superres_upscale(cm, NULL, cpi->image_pyramid_levels);
+
+  // If regular resizing is occurring, the source will need to be downscaled
+  // to match the upscaled superres resolution. Otherwise, the original source
+  // is used.
+ if (!av1_resize_scaled(cm)) {
+ cpi->source = cpi->unscaled_source;
+ if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source;
+ } else {
+ assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width);
+ assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height);
+ // Do downscale. cm->(width|height) has been updated by
+ // av1_superres_upscale
+ cpi->source = realloc_and_scale_source(cpi, cm->superres_upscaled_width,
+ cm->superres_upscaled_height);
+ }
+}
diff --git a/third_party/aom/av1/encoder/superres_scale.h b/third_party/aom/av1/encoder/superres_scale.h
new file mode 100644
index 0000000000..450a4ed902
--- /dev/null
+++ b/third_party/aom/av1/encoder/superres_scale.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SUPERRES_SCALE_H_
+#define AOM_AV1_ENCODER_SUPERRES_SCALE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int av1_superres_in_recode_allowed(const AV1_COMP *const cpi);
+void av1_superres_post_encode(AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SUPERRES_SCALE_H_
diff --git a/third_party/aom/av1/encoder/svc_layercontext.c b/third_party/aom/av1/encoder/svc_layercontext.c
new file mode 100644
index 0000000000..2c99cb89b8
--- /dev/null
+++ b/third_party/aom/av1/encoder/svc_layercontext.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+
+static void swap_ptr(void *a, void *b) {
+ void **a_p = (void **)a;
+ void **b_p = (void **)b;
+ void *c = *a_p;
+ *a_p = *b_p;
+ *b_p = c;
+}
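+
+// Layer contexts are stored in a flat array indexed as
+// sl * number_temporal_layers + tl (see LAYER_IDS_TO_IDX). For example, with
+// 3 spatial and 3 temporal layers there are 9 contexts, and (sl = 1, tl = 2)
+// maps to index 5.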
+
+void av1_init_layer_context(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ SVC *const svc = &cpi->svc;
+ int mi_rows = cpi->common.mi_params.mi_rows;
+ int mi_cols = cpi->common.mi_params.mi_cols;
+ svc->base_framerate = 30.0;
+ svc->current_superframe = 0;
+ svc->force_zero_mode_spatial_ref = 1;
+ svc->num_encoded_top_layer = 0;
+ svc->use_flexible_mode = 0;
+ svc->has_lower_quality_layer = 0;
+
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+ PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+ lrc->ni_av_qi = oxcf->rc_cfg.worst_allowed_q;
+ lp_rc->total_actual_bits = 0;
+ lrc->ni_tot_qi = 0;
+ lp_rc->tot_q = 0.0;
+ lp_rc->avg_q = 0.0;
+ lp_rc->ni_frames = 0;
+ lrc->decimation_count = 0;
+ lrc->decimation_factor = 0;
+ lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+ lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+ lrc->rtc_external_ratectrl = 0;
+ for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+ lp_rc->rate_correction_factors[i] = 1.0;
+ }
+ lc->target_bandwidth = lc->layer_target_bitrate;
+ lp_rc->last_q[INTER_FRAME] = lrc->worst_quality;
+ lp_rc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
+ lp_rc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
+ lp_rc->buffer_level =
+ oxcf->rc_cfg.starting_buffer_level_ms * lc->target_bandwidth / 1000;
+ lp_rc->bits_off_target = lp_rc->buffer_level;
+      // Initialize the cyclic refresh parameters. If spatial layers are used
+      // (i.e., number_spatial_layers > 1), these need to be updated per
+      // spatial layer. Cyclic refresh is only applied on base temporal layer.
+ if (svc->number_spatial_layers > 1 && tl == 0) {
+ lc->sb_index = 0;
+ lc->actual_num_seg1_blocks = 0;
+ lc->actual_num_seg2_blocks = 0;
+ lc->counter_encode_maxq_scene_change = 0;
+ aom_free(lc->map);
+ CHECK_MEM_ERROR(cm, lc->map,
+ aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
+ }
+ }
+ svc->downsample_filter_type[sl] = BILINEAR;
+ svc->downsample_filter_phase[sl] = 8;
+ svc->last_layer_dropped[sl] = false;
+ svc->drop_spatial_layer[sl] = false;
+ }
+ if (svc->number_spatial_layers == 3) {
+ svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH;
+ }
+}
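+
+// For illustration: the buffer_level initialization above converts the
+// configured starting level from milliseconds to bits, so a 600 ms starting
+// level at a 1 Mbps layer target gives 600 * 1000000 / 1000 = 600000 bits.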
+
+bool av1_alloc_layer_context(AV1_COMP *cpi, int num_layers) {
+ SVC *const svc = &cpi->svc;
+ if (svc->layer_context == NULL || svc->num_allocated_layers < num_layers) {
+ assert(num_layers > 1);
+ aom_free(svc->layer_context);
+ svc->num_allocated_layers = 0;
+ svc->layer_context =
+ (LAYER_CONTEXT *)aom_calloc(num_layers, sizeof(*svc->layer_context));
+ if (svc->layer_context == NULL) return false;
+ svc->num_allocated_layers = num_layers;
+ }
+ return true;
+}
+
+// Update the layer context from a change_config() call.
+void av1_update_layer_context_change_config(AV1_COMP *const cpi,
+ const int64_t target_bandwidth) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ AV1_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ int layer = 0;
+ int64_t spatial_layer_target = 0;
+ float bitrate_alloc = 1.0;
+ const int mi_rows = cm->mi_params.mi_rows;
+ const int mi_cols = cm->mi_params.mi_cols;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ svc->layer_context[layer].target_bandwidth = lc->layer_target_bitrate;
+ }
+ spatial_layer_target = svc->layer_context[layer].target_bandwidth;
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ LAYER_CONTEXT *const lc =
+ &svc->layer_context[sl * svc->number_temporal_layers + tl];
+ RATE_CONTROL *const lrc = &lc->rc;
+ PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+ lc->spatial_layer_target_bandwidth = spatial_layer_target;
+ if (target_bandwidth != 0) {
+ bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+ }
+ lp_rc->starting_buffer_level =
+ (int64_t)(p_rc->starting_buffer_level * bitrate_alloc);
+ lp_rc->optimal_buffer_level =
+ (int64_t)(p_rc->optimal_buffer_level * bitrate_alloc);
+ lp_rc->maximum_buffer_size =
+ (int64_t)(p_rc->maximum_buffer_size * bitrate_alloc);
+ lp_rc->bits_off_target =
+ AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+ lp_rc->buffer_level =
+ AOMMIN(lp_rc->buffer_level, lp_rc->maximum_buffer_size);
+ lc->framerate = cpi->framerate / lc->framerate_factor;
+ lrc->avg_frame_bandwidth =
+ (int)round(lc->target_bandwidth / lc->framerate);
+ lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+ lrc->rtc_external_ratectrl = rc->rtc_external_ratectrl;
+ lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+ lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+ if (rc->use_external_qp_one_pass) {
+ lrc->worst_quality = rc->worst_quality;
+ lrc->best_quality = rc->best_quality;
+ }
+      // Reset the cyclic refresh parameters if needed, i.e., if the map is
+      // NULL or the number of spatial layers has changed.
+      // Cyclic refresh is only applied on base temporal layer.
+ if (svc->number_spatial_layers > 1 && tl == 0 &&
+ (lc->map == NULL ||
+ svc->prev_number_spatial_layers != svc->number_spatial_layers)) {
+ lc->sb_index = 0;
+ lc->actual_num_seg1_blocks = 0;
+ lc->actual_num_seg2_blocks = 0;
+ lc->counter_encode_maxq_scene_change = 0;
+ aom_free(lc->map);
+ CHECK_MEM_ERROR(cm, lc->map,
+ aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
+ }
+ }
+ }
+}
+
+/*!\brief Return layer context for current layer.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return LAYER_CONTEXT for current layer.
+ */
+static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) {
+ return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+ cpi->svc.number_temporal_layers +
+ cpi->svc.temporal_layer_id];
+}
+
+void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
+ RATE_CONTROL *const lrc = &lc->rc;
+ const int tl = svc->temporal_layer_id;
+ lc->framerate = cpi->framerate / lc->framerate_factor;
+ lrc->avg_frame_bandwidth = (int)round(lc->target_bandwidth / lc->framerate);
+ lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
+ // Update the average layer frame size (non-cumulative per-frame-bw).
+ if (tl == 0) {
+ lc->avg_frame_size = lrc->avg_frame_bandwidth;
+ } else {
+ int prev_layer = svc->spatial_layer_id * svc->number_temporal_layers +
+ svc->temporal_layer_id - 1;
+ LAYER_CONTEXT *const lcprev = &svc->layer_context[prev_layer];
+ const double prev_layer_framerate =
+ cpi->framerate / lcprev->framerate_factor;
+ const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate;
+ lc->avg_frame_size =
+ (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) /
+ (lc->framerate - prev_layer_framerate));
+ }
+}
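+
+// Illustrative numbers for the enhancement-layer branch above: if TL0 targets
+// 200 kbps at 15 fps and TL0+TL1 together target 300 kbps at 30 fps, the TL1
+// per-frame budget is (300000 - 200000) / (30 - 15) ~= 6667 bits per frame.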
+
+static AOM_INLINE bool check_ref_is_low_spatial_res_super_frame(
+ int ref_frame, const SVC *svc, const RTC_REF *rtc_ref) {
+ int ref_frame_idx = rtc_ref->ref_idx[ref_frame - 1];
+ return rtc_ref->buffer_time_index[ref_frame_idx] == svc->current_superframe &&
+ rtc_ref->buffer_spatial_layer[ref_frame_idx] <=
+ svc->spatial_layer_id - 1;
+}
+
+void av1_restore_layer_context(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ const AV1_COMMON *const cm = &cpi->common;
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
+ const int old_frame_since_key = cpi->rc.frames_since_key;
+ const int old_frame_to_key = cpi->rc.frames_to_key;
+ const int max_consec_drop = cpi->rc.max_consec_drop;
+ // Restore layer rate control.
+ cpi->rc = lc->rc;
+ cpi->ppi->p_rc = lc->p_rc;
+ cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth;
+ cpi->gf_frame_index = 0;
+ cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude;
+ if (cpi->mv_search_params.max_mv_magnitude == 0)
+ cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height);
+ // Reset the frames_since_key and frames_to_key counters to their values
+ // before the layer restore. Keep these defined for the stream (not layer).
+ cpi->rc.frames_since_key = old_frame_since_key;
+ cpi->rc.frames_to_key = old_frame_to_key;
+ // Reset to value before the layer restore.
+ cpi->rc.max_consec_drop = max_consec_drop;
+ // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+ // for the base temporal layer.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ swap_ptr(&cr->map, &lc->map);
+ cr->sb_index = lc->sb_index;
+ cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
+ cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
+ cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change;
+ }
+ svc->skip_mvsearch_last = 0;
+ svc->skip_mvsearch_gf = 0;
+ svc->skip_mvsearch_altref = 0;
+  // For each reference (LAST/GOLDEN/ALTREF) set the corresponding
+  // skip_mvsearch flag.
+ // This is to skip searching mv for that reference if it was last
+ // refreshed (i.e., buffer slot holding that reference was refreshed) on the
+ // previous spatial layer(s) at the same time (current_superframe).
+ if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref &&
+ cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc, rtc_ref)) {
+ svc->skip_mvsearch_last = 1;
+ }
+ if (check_ref_is_low_spatial_res_super_frame(GOLDEN_FRAME, svc, rtc_ref)) {
+ svc->skip_mvsearch_gf = 1;
+ }
+ if (check_ref_is_low_spatial_res_super_frame(ALTREF_FRAME, svc, rtc_ref)) {
+ svc->skip_mvsearch_altref = 1;
+ }
+ }
+}
+
+void av1_svc_update_buffer_slot_refreshed(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ const unsigned int current_frame =
+ cpi->ppi->use_svc ? svc->current_superframe
+ : cpi->common.current_frame.frame_number;
+ // For any buffer slot that is refreshed, update it with
+ // the spatial_layer_id and the current_superframe.
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+ // All slots are refreshed on KEY.
+ for (unsigned int i = 0; i < REF_FRAMES; i++) {
+ rtc_ref->buffer_time_index[i] = current_frame;
+ rtc_ref->buffer_spatial_layer[i] = svc->spatial_layer_id;
+ }
+ } else if (rtc_ref->set_ref_frame_config) {
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ const int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ if (cpi->ppi->rtc_ref.refresh[ref_frame_map_idx]) {
+ rtc_ref->buffer_time_index[ref_frame_map_idx] = current_frame;
+ rtc_ref->buffer_spatial_layer[ref_frame_map_idx] =
+ svc->spatial_layer_id;
+ }
+ }
+ }
+}
+
+void av1_save_layer_context(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ const AV1_COMMON *const cm = &cpi->common;
+ LAYER_CONTEXT *lc = get_layer_context(cpi);
+ lc->rc = cpi->rc;
+ lc->p_rc = cpi->ppi->p_rc;
+ lc->target_bandwidth = (int)cpi->oxcf.rc_cfg.target_bandwidth;
+ lc->group_index = cpi->gf_frame_index;
+ lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude;
+ if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate;
+ // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+ // for the base temporal layer.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ signed char *temp = lc->map;
+ lc->map = cr->map;
+ cr->map = temp;
+ lc->sb_index = cr->sb_index;
+ lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
+ lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
+ lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change;
+ }
+ av1_svc_update_buffer_slot_refreshed(cpi);
+ for (unsigned int i = 0; i < REF_FRAMES; i++) {
+ if (frame_is_intra_only(cm) ||
+ cm->current_frame.refresh_frame_flags & (1 << i)) {
+ svc->spatial_layer_fb[i] = svc->spatial_layer_id;
+ svc->temporal_layer_fb[i] = svc->temporal_layer_id;
+ }
+ }
+ if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ svc->current_superframe++;
+ // Reset drop flag to false for next superframe.
+ for (int sl = 0; sl < svc->number_spatial_layers; sl++)
+ svc->drop_spatial_layer[sl] = false;
+ }
+}
+
+int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) {
+ const SVC *const svc = &cpi->svc;
+ const AV1_COMMON *const cm = &cpi->common;
+ int fb_idx = -1;
+ int primary_ref_frame = PRIMARY_REF_NONE;
+ if (cpi->svc.number_spatial_layers > 1 ||
+ cpi->svc.number_temporal_layers > 1) {
+ // Set the primary_ref_frame to LAST_FRAME if that buffer slot for LAST
+ // was last updated on a lower temporal layer (or base TL0) and for the
+ // same spatial layer. For RTC patterns this allows for continued decoding
+ // when set of enhancement layers are dropped (continued decoding starting
+ // at next base TL0), so error_resilience can be off/0 for all layers.
+ fb_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+ if (svc->spatial_layer_fb[fb_idx] == svc->spatial_layer_id &&
+ (svc->temporal_layer_fb[fb_idx] < svc->temporal_layer_id ||
+ svc->temporal_layer_fb[fb_idx] == 0)) {
+ primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME
+ }
+ } else if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+ const ExternalFlags *const ext_flags = &cpi->ext_flags;
+ int flags = ext_flags->ref_frame_flags;
+ if (flags & AOM_LAST_FLAG) {
+ primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME
+ } else if (flags & AOM_GOLD_FLAG) {
+ primary_ref_frame = GOLDEN_FRAME - LAST_FRAME;
+ } else if (flags & AOM_ALT_FLAG) {
+ primary_ref_frame = ALTREF_FRAME - LAST_FRAME;
+ }
+ }
+ return primary_ref_frame;
+}
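+
+// Note: primary_ref_frame is encoded as an offset from LAST_FRAME, so under
+// AV1's reference enum LAST_FRAME maps to 0, GOLDEN_FRAME to 3 and
+// ALTREF_FRAME to 6, matching the branches above.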
+
+void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ aom_free(lc->map);
+ lc->map = NULL;
+ }
+ }
+}
+
+void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) {
+ SVC *const svc = &cpi->svc;
+ LAYER_CONTEXT *lc = NULL;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl];
+ if (is_key) lc->frames_from_key_frame = 0;
+ }
+ }
+ av1_update_temporal_layer_framerate(cpi);
+ av1_restore_layer_context(cpi);
+}
+
+void av1_get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out) {
+ int w, h;
+ if (width_out == NULL || height_out == NULL || den == 0) return;
+ if (den == 1 && num == 1) {
+ *width_out = width_org;
+ *height_out = height_org;
+ return;
+ }
+ w = width_org * num / den;
+ h = height_org * num / den;
+ // Make height and width even.
+ w += w % 2;
+ h += h % 2;
+ *width_out = w;
+ *height_out = h;
+}
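+
+// Example (illustrative dimensions): a 1280x720 source with num/den = 1/2
+// gives 640x360; an odd intermediate result such as 427 (1281 * 1 / 3) would
+// be rounded up to 428 by the make-even adjustment above.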
+
+void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ AV1_COMMON *const cm = &cpi->common;
+ LAYER_CONTEXT *lc = NULL;
+ int width = 0, height = 0;
+ lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ // Set the lower quality layer flag.
+ svc->has_lower_quality_layer = 0;
+ if (cpi->svc.spatial_layer_id > 0) {
+ const LAYER_CONTEXT *lc_prev =
+ &svc->layer_context[(svc->spatial_layer_id - 1) *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ if (lc_prev->scaling_factor_den == 1 && lc_prev->scaling_factor_num == 1)
+ svc->has_lower_quality_layer = 1;
+ }
+ av1_get_layer_resolution(cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, lc->scaling_factor_num,
+ lc->scaling_factor_den, &width, &height);
+  // Use EIGHTTAP_SMOOTH for low resolutions.
+ if (width * height <= 320 * 240)
+ svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH;
+
+ cm->width = width;
+ cm->height = height;
+ alloc_mb_mode_info_buffers(cpi);
+ av1_update_frame_size(cpi);
+ if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ svc->mi_cols_full_resoln = cm->mi_params.mi_cols;
+ svc->mi_rows_full_resoln = cm->mi_params.mi_rows;
+ }
+}
+
+enum {
+ SVC_LAST_FRAME = 0,
+ SVC_LAST2_FRAME,
+ SVC_LAST3_FRAME,
+ SVC_GOLDEN_FRAME,
+ SVC_BWDREF_FRAME,
+ SVC_ALTREF2_FRAME,
+ SVC_ALTREF_FRAME
+};
+
+// For fixed svc mode: fixed pattern is set based on the number of
+// spatial and temporal layers, and the ksvc_fixed_mode.
+void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ int i;
+ assert(svc->use_flexible_mode == 0);
+ // Fixed SVC mode only supports at most 3 spatial or temporal layers.
+ assert(svc->number_spatial_layers >= 1 && svc->number_spatial_layers <= 3 &&
+ svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3);
+ rtc_ref->set_ref_frame_config = 1;
+ int superframe_cnt = svc->current_superframe;
+ // Set the reference map buffer idx for the 7 references:
+ // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = i;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->reference[i] = 0;
+ for (i = 0; i < REF_FRAMES; i++) rtc_ref->refresh[i] = 0;
+ // Always reference LAST, and reference GOLDEN on SL > 0.
+ // For KSVC: GOLDEN reference will be removed on INTER_FRAMES later
+ // when frame_type is set.
+ rtc_ref->reference[SVC_LAST_FRAME] = 1;
+ if (svc->spatial_layer_id > 0) rtc_ref->reference[SVC_GOLDEN_FRAME] = 1;
+ if (svc->temporal_layer_id == 0) {
+ // Base temporal layer.
+ if (svc->spatial_layer_id == 0) {
+ // Set all buffer_idx to 0. Update slot 0 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->refresh[0] = 1;
+ } else if (svc->spatial_layer_id == 1) {
+ // Set buffer_idx for LAST to slot 1, GOLDEN (and all other refs) to
+ // slot 0. Update slot 1 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ rtc_ref->refresh[1] = 1;
+ } else if (svc->spatial_layer_id == 2) {
+ // Set buffer_idx for LAST to slot 2, GOLDEN (and all other refs) to
+ // slot 1. Update slot 2 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 1;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ rtc_ref->refresh[2] = 1;
+ }
+ } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer.
+ if (svc->spatial_layer_id == 0) {
+ // Reference LAST (slot 0).
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to slot 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ rtc_ref->refresh[3] = 1;
+ }
+ } else if (svc->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 3.
+ // Set LAST2 to slot 4 and Update slot 4.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 3;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4;
+ rtc_ref->refresh[4] = 1;
+ }
+ } else if (svc->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 4.
+ // No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 4;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ }
+ } else if (svc->temporal_layer_id == 1) {
+ // Middle temporal enhancement layer.
+ if (svc->spatial_layer_id == 0) {
+ // Reference LAST.
+ // Set all buffer_idx to 0.
+ // Set GOLDEN to slot 5 and update slot 5.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+ svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 5;
+ rtc_ref->refresh[5] = 1;
+ }
+ } else if (svc->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 5.
+ // Set LAST3 to slot 6 and update slot 6.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 5;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+ svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST3_FRAME] = 6;
+ rtc_ref->refresh[6] = 1;
+ }
+ } else if (svc->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 6.
+ // Set LAST3 to slot 7 and update slot 7.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 6;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST3_FRAME] = 7;
+ rtc_ref->refresh[7] = 1;
+ }
+ }
+ } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 3) % 4 == 0) {
+ // Second top temporal enhancement layer.
+ if (svc->spatial_layer_id == 0) {
+ // Set LAST to slot 5 and reference LAST.
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 5;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ rtc_ref->refresh[3] = 1;
+ }
+ } else if (svc->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+ // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 6;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4;
+ rtc_ref->refresh[4] = 1;
+ }
+ } else if (svc->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
+ // GOLDEN to slot 4. No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 7;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 4;
+ }
+ }
+}
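+// Recap of the fixed-pattern slot usage above (3 spatial x 3 temporal
+// layers): slots 0-2 hold the most recent TL0 frame of spatial layers 0-2,
+// slots 3-4 are refreshed by the first TL2 frames, and slots 5-7 are
+// refreshed by the TL1 frames (and referenced again by the second TL2
+// frames).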
+
+void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    // Check for reset based on avg_frame_bandwidth for spatial layer sl.
+    // If avg_frame_bandwidth for the top temporal layer is not set
+    // (because the enhancement layer was inactive), use the base TL0 layer.
+ int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ int avg_frame_bandwidth = lrc->avg_frame_bandwidth;
+ int prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth;
+ if (avg_frame_bandwidth == 0 || prev_avg_frame_bandwidth == 0) {
+ // Use base TL0.
+ layer = LAYER_IDS_TO_IDX(sl, 0, svc->number_temporal_layers);
+ lc = &svc->layer_context[layer];
+ lrc = &lc->rc;
+ avg_frame_bandwidth = lrc->avg_frame_bandwidth;
+ prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth;
+ }
+ if (avg_frame_bandwidth > (3 * prev_avg_frame_bandwidth >> 1) ||
+ avg_frame_bandwidth < (prev_avg_frame_bandwidth >> 1)) {
+ // Reset for all temporal layers with spatial layer sl.
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int layer2 = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc2 = &svc->layer_context[layer2];
+ RATE_CONTROL *lrc2 = &lc2->rc;
+        PRIMARY_RATE_CONTROL *lp_rc2 = &lc2->p_rc;
+        lrc2->rc_1_frame = 0;
+        lrc2->rc_2_frame = 0;
+        lp_rc2->bits_off_target = lp_rc2->optimal_buffer_level;
+        lp_rc2->buffer_level = lp_rc2->optimal_buffer_level;
+ }
+ }
+ }
+}
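+// Illustrative example of the reset threshold above (hypothetical numbers):
+// with prev_avg_frame_bandwidth = 1000, rate control is reset when
+// avg_frame_bandwidth rises above 1500 (= 3 * 1000 >> 1) or falls below
+// 500 (= 1000 >> 1), i.e. on a change of more than +/-50%.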
+
+void av1_svc_set_last_source(AV1_COMP *const cpi, EncodeFrameInput *frame_input,
+ YV12_BUFFER_CONFIG *prev_source) {
+  frame_input->last_source = prev_source;
+ if (!cpi->ppi->use_svc && cpi->rc.prev_frame_is_dropped &&
+ cpi->rc.frame_number_encoded > 0) {
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ } else {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ if (cpi->svc.spatial_layer_id == 0) {
+      // For the base spatial layer: if the LAST reference (index 0) is not
+      // the previous (super)frame, set last_source to the source
+      // corresponding to the last TL0; otherwise keep it at prev_source.
+      // Always use source_last_TL0 if the previous base TL0 frame was
+      // dropped.
+ if (cpi->svc.current_superframe > 0) {
+ const int buffslot_last = rtc_ref->ref_idx[0];
+ // Check if previous frame was dropped on base TL0 layer.
+ const int layer =
+ LAYER_IDS_TO_IDX(0, 0, cpi->svc.number_temporal_layers);
+ LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ if (lrc->prev_frame_is_dropped ||
+ rtc_ref->buffer_time_index[buffslot_last] <
+ cpi->svc.current_superframe - 1) {
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ }
+ }
+ } else if (cpi->svc.spatial_layer_id > 0) {
+      // For spatial enhancement layers: the previous source (prev_source)
+      // corresponds to the lower spatial layer of the same superframe (i.e.,
+      // the same source), so it cannot serve as a temporal reference; always
+      // set last_source to the source of the last TL0.
+ if (cpi->svc.current_superframe > 0)
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ else
+ frame_input->last_source = NULL;
+ }
+ }
+}
+
+int av1_svc_get_min_ref_dist(const AV1_COMP *cpi) {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ int min_dist = INT_MAX;
+ const unsigned int current_frame_num =
+ cpi->ppi->use_svc ? cpi->svc.current_superframe
+ : cpi->common.current_frame.frame_number;
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (cpi->ppi->rtc_ref.reference[i]) {
+ const int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ const int dist =
+ current_frame_num - rtc_ref->buffer_time_index[ref_frame_map_idx];
+ if (dist < min_dist) min_dist = dist;
+ }
+ }
+ return min_dist;
+}
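+// Illustrative example: if the current superframe is 10 and the enabled
+// references map to buffer slots last refreshed at superframes 9 and 6,
+// the distances are 1 and 4, so av1_svc_get_min_ref_dist() returns 1.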
+
+void av1_svc_set_reference_was_previous(AV1_COMP *cpi) {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ // Check if the encoded frame had some reference that was the
+ // previous frame.
+ const unsigned int current_frame =
+ cpi->ppi->use_svc ? cpi->svc.current_superframe
+ : cpi->common.current_frame.frame_number;
+ rtc_ref->reference_was_previous_frame = true;
+ if (current_frame > 0) {
+ rtc_ref->reference_was_previous_frame = false;
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (rtc_ref->reference[i]) {
+ const int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ if (rtc_ref->buffer_time_index[ref_frame_map_idx] == current_frame - 1)
+ rtc_ref->reference_was_previous_frame = true;
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/svc_layercontext.h b/third_party/aom/av1/encoder/svc_layercontext.h
new file mode 100644
index 0000000000..93118be2d4
--- /dev/null
+++ b/third_party/aom/av1/encoder/svc_layercontext.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+#define AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * \brief The structure of quantities related to each spatial and temporal layer.
+ * \ingroup SVC
+ */
+typedef struct {
+ /*!\cond */
+ RATE_CONTROL rc;
+ PRIMARY_RATE_CONTROL p_rc;
+ int framerate_factor;
+ int64_t layer_target_bitrate; // In bits per second.
+ int scaling_factor_num;
+ int scaling_factor_den;
+ int64_t target_bandwidth;
+ int64_t spatial_layer_target_bandwidth;
+ double framerate;
+ int avg_frame_size;
+ int max_q;
+ int min_q;
+ int frames_from_key_frame;
+ /*!\endcond */
+
+ /*!
+   * Cyclic refresh parameters (aq-mode=3) that need to be updated per frame.
+ */
+ int sb_index;
+ /*!
+ * Segmentation map
+ */
+ int8_t *map;
+ /*!
+ * Number of blocks on segment 1
+ */
+ int actual_num_seg1_blocks;
+
+ /*!
+ * Number of blocks on segment 2
+ */
+ int actual_num_seg2_blocks;
+ /*!
+ * Counter used to detect scene change.
+ */
+ int counter_encode_maxq_scene_change;
+
+ /*!
+ * Speed settings for each layer.
+ */
+ uint8_t speed;
+ /*!
+ * GF group index.
+ */
+ unsigned char group_index;
+ /*!
+   * Whether the current layer is a key frame.
+ */
+ int is_key_frame;
+ /*!
+ * Maximum motion magnitude of previous encoded layer.
+ */
+ int max_mv_magnitude;
+} LAYER_CONTEXT;
+
+/*!
+ * \brief The structure of SVC.
+ * \ingroup SVC
+ */
+typedef struct SVC {
+ /*!\cond */
+ int spatial_layer_id;
+ int temporal_layer_id;
+ int number_spatial_layers;
+ int number_temporal_layers;
+ int prev_number_spatial_layers;
+ int use_flexible_mode;
+ int ksvc_fixed_mode;
+ /*!\endcond */
+
+ /*!\cond */
+ double base_framerate;
+ unsigned int current_superframe;
+ int skip_mvsearch_last;
+ int skip_mvsearch_gf;
+ int skip_mvsearch_altref;
+ int spatial_layer_fb[REF_FRAMES];
+ int temporal_layer_fb[REF_FRAMES];
+ int num_encoded_top_layer;
+ int first_layer_denoise;
+ YV12_BUFFER_CONFIG source_last_TL0;
+ int mi_cols_full_resoln;
+ int mi_rows_full_resoln;
+ /*!\endcond */
+
+ /*!
+ * Layer context used for rate control in CBR mode.
+ * An array. The index for spatial layer `sl` and temporal layer `tl` is
+ * sl * number_temporal_layers + tl.
+ */
+ LAYER_CONTEXT *layer_context;
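+  /*
+   * Usage sketch (illustrative): with number_spatial_layers = 2 and
+   * number_temporal_layers = 3, the context for sl = 1, tl = 2 is
+   *   LAYER_CONTEXT *lc = &svc->layer_context[1 * 3 + 2];  // index 5
+   * which matches LAYER_IDS_TO_IDX(1, 2, 3).
+   */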
+
+ /*!
+ * Number of layers allocated for layer_context. If nonzero, must be greater
+ * than or equal to number_spatial_layers * number_temporal_layers.
+ */
+ int num_allocated_layers;
+
+ /*!
+ * EIGHTTAP_SMOOTH or BILINEAR
+ */
+ InterpFilter downsample_filter_type[AOM_MAX_SS_LAYERS];
+
+ /*!
+   * Downsample filter phase: 0 performs plain sub-sampling (no weighted
+   * average); 8 centers the target pixel and yields a symmetric averaging
+   * filter.
+ int downsample_filter_phase[AOM_MAX_SS_LAYERS];
+
+ /*!
+ * Force zero-mv in mode search for the spatial/inter-layer reference.
+ */
+ int force_zero_mode_spatial_ref;
+
+ /*!
+ * Flag to indicate that current spatial layer has a lower quality layer
+ * (at the same timestamp) that can be used as a reference.
+ * Lower quality layer refers to the same resolution but encoded at
+ * different/lower bitrate.
+ */
+ int has_lower_quality_layer;
+
+ /*!
+   * Frame drop mode for SVC: either AOM_LAYER_DROP (default) or
+   * AOM_FULL_SUPERFRAME_DROP.
+ */
+ AOM_SVC_FRAME_DROP_MODE framedrop_mode;
+
+ /*!
+ * Flag to indicate if frame was dropped for a given spatial_layer_id on
+ * previous superframe.
+ */
+ bool last_layer_dropped[AOM_MAX_SS_LAYERS];
+
+ /*!
+   * Flag to indicate if a previous spatial layer was dropped for the same
+   * superframe.
+ */
+ bool drop_spatial_layer[AOM_MAX_SS_LAYERS];
+} SVC;
+
+struct AV1_COMP;
+struct EncodeFrameInput;
+
+/*!\brief Initialize layer context data from init_config().
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Nothing returned. Set cpi->svc.
+ */
+void av1_init_layer_context(struct AV1_COMP *const cpi);
+
+/*!\brief Allocate layer context data.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] num_layers Number of layers to be allocated
+ *
+ * \remark Allocates memory for cpi->svc.layer_context.
+ * \return True on success, false on allocation failure.
+ */
+bool av1_alloc_layer_context(struct AV1_COMP *cpi, int num_layers);
+
+/*!\brief Update the layer context from a change_config() call.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] target_bandwidth Total target bandwidth
+ *
+ * \remark Nothing returned. Buffer level for each layer is set.
+ */
+void av1_update_layer_context_change_config(struct AV1_COMP *const cpi,
+ const int64_t target_bandwidth);
+
+/*!\brief Prior to encoding the frame, update framerate-related quantities
+ for the current temporal layer.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Nothing returned. Frame related quantities for current temporal
+ layer are updated.
+ */
+void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi);
+
+/*!\brief Prior to encoding the frame, set the layer context, for the current
+ layer to be encoded, to the cpi struct.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Nothing returned. Layer context for current layer is set.
+ */
+void av1_restore_layer_context(struct AV1_COMP *const cpi);
+
+/*!\brief Save the layer context after encoding the frame.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ */
+void av1_save_layer_context(struct AV1_COMP *const cpi);
+
+/*!\brief Free the memory used for cyclic refresh in layer context.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ */
+void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi);
+
+/*!\brief Reset on key frame: reset counters, references and buffer updates.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] is_key Whether current layer is key frame
+ */
+void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key);
+
+/*!\brief Before encoding, set resolutions and allocate compressor data.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ */
+void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi);
+
+/*!\brief Get primary reference frame for current layer
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return The primary reference frame for current layer.
+ */
+int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi);
+
+/*!\brief Get resolution for current layer.
+ *
+ * \ingroup SVC
+ * \param[in] width_org Original width, unscaled
+ * \param[in] height_org Original height, unscaled
+ * \param[in] num Numerator for the scale ratio
+ * \param[in] den Denominator for the scale ratio
+ * \param[out] width_out Output width, scaled for current layer
+ * \param[out] height_out Output height, scaled for current layer
+ *
+ * \remark Nothing is returned. Instead the scaled width and height are set.
+ */
+void av1_get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out);
+
+void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi);
+
+void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi);
+
+void av1_svc_set_last_source(struct AV1_COMP *const cpi,
+ struct EncodeFrameInput *frame_input,
+ YV12_BUFFER_CONFIG *prev_source);
+
+void av1_svc_update_buffer_slot_refreshed(struct AV1_COMP *const cpi);
+
+int av1_svc_get_min_ref_dist(const struct AV1_COMP *cpi);
+
+void av1_svc_set_reference_was_previous(struct AV1_COMP *cpi);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
new file mode 100644
index 0000000000..7d4d25de6a
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -0,0 +1,1520 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/odintrin.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/temporal_filter.h"
+
+/*!\cond */
+
+// NOTE: All `tf` in this file means `temporal filtering`.
+
+// Forward Declaration.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+ MV *subblock_mvs, int *subblock_mses);
+
+// This function returns the minimum and maximum log variances for 4x4 sub
+// blocks in the current block.
+static INLINE void get_log_var_4x4sub_blk(
+ AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const frame_to_filter, int mb_row,
+ int mb_col, BLOCK_SIZE block_size, double *blk_4x4_var_min,
+ double *blk_4x4_var_max, int is_hbd) {
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ int var_min = INT_MAX;
+ int var_max = 0;
+
+ // Derive the source buffer.
+ const int src_stride = frame_to_filter->y_stride;
+ const int y_offset = mb_row * mb_height * src_stride + mb_col * mb_width;
+ const uint8_t *src_buf = frame_to_filter->y_buffer + y_offset;
+
+ for (int i = 0; i < mb_height; i += MI_SIZE) {
+ for (int j = 0; j < mb_width; j += MI_SIZE) {
+ // Calculate the 4x4 sub-block variance.
+ const int var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf, src_buf + (i * src_stride) + j,
+ src_stride, is_hbd);
+
+ // Record min and max for over-arching block
+ var_min = AOMMIN(var_min, var);
+ var_max = AOMMAX(var_max, var);
+ }
+ }
+
+ *blk_4x4_var_min = log1p(var_min / 16.0);
+ *blk_4x4_var_max = log1p(var_max / 16.0);
+}
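+// Note: each 4x4 variance is divided by the 16 pixels of the sub-block
+// before the log1p() mapping, so, illustratively, var = 160 maps to
+// log1p(10.0) ~= 2.4, and a max/min spread of 4.0 in this log domain
+// corresponds to roughly a 55x ratio in raw variance.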
+
+/*!\endcond */
+/*!\brief Does motion search for blocks in temporal filtering. This is
+ * the first step for temporal filtering. More specifically, given a frame to
+ * be filtered and another frame as reference, this function searches the
+ * reference frame for the block most similar to the one in the frame to be
+ * filtered. The found block will be further used for weighted averaging.
+ *
+ * NOTE: Besides doing motion search for the entire block, this function will
+ * also do motion search for each 1/4 sub-block to get more precise
+ * predictions. Then, this function determines whether to use 4
+ * sub-blocks to replace the entire block. If we do need to split the
+ * entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to
+ * the searched motion vector and search error (MSE) w.r.t. each sub-block
+ * respectively. Otherwise, the 4 elements will be the same, all of which
+ * are assigned as the searched motion vector and search error (MSE) for
+ * the entire block.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] mb Pointer to macroblock
+ * \param[in] frame_to_filter Pointer to the frame to be filtered
+ * \param[in] ref_frame Pointer to the reference frame
+ * \param[in] block_size Block size used for motion search
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] ref_mv Reference motion vector, which is commonly
+ * inherited from the motion search result of
+ * previous frame.
+ * \param[in] allow_me_for_sub_blks Flag to indicate whether motion search at
+ * 16x16 sub-block level is needed or not.
+ * \param[out] subblock_mvs Pointer to the motion vectors for
+ * 4 sub-blocks
+ * \param[out] subblock_mses Pointer to the search errors (MSE) for
+ * 4 sub-blocks
+ *
+ * \remark Nothing will be returned. Results are saved in subblock_mvs and
+ * subblock_mses
+ */
+static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
+ const YV12_BUFFER_CONFIG *frame_to_filter,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, MV *ref_mv,
+ bool allow_me_for_sub_blks, MV *subblock_mvs,
+ int *subblock_mses) {
+ // Frame information
+ const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
+
+ // Block information (ONLY Y-plane is used for motion search).
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mb_pels = mb_height * mb_width;
+ const int y_stride = frame_to_filter->y_stride;
+ const int src_width = frame_to_filter->y_width;
+ const int ref_width = ref_frame->y_width;
+ assert(y_stride == ref_frame->y_stride);
+ assert(src_width == ref_width);
+ const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+ // Save input state.
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ const struct buf_2d ori_src_buf = mb->plane[0].src;
+ const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+
+ // Parameters used for motion search.
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ const int step_param = av1_init_search_range(
+ AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height));
+ const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS;
+ const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv;
+ const MV_COST_TYPE mv_cost_type =
+ min_frame_size >= 720
+ ? MV_COST_L1_HDRES
+ : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES);
+
+ // Starting position for motion search.
+ FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
+ // Baseline position for motion search (used for rate distortion comparison).
+ const MV baseline_mv = kZeroMv;
+
+ // Setup.
+ mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset;
+ mb->plane[0].src.stride = y_stride;
+ mb->plane[0].src.width = src_width;
+ mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset;
+ mbd->plane[0].pre[0].stride = y_stride;
+ mbd->plane[0].pre[0].width = ref_width;
+
+ const SEARCH_METHODS search_method = NSTEP;
+ const search_site_config *search_site_cfg =
+ av1_get_search_site_config(cpi, mb, search_method);
+
+ // Unused intermediate results for motion search.
+ unsigned int sse, error;
+ int distortion;
+ int cost_list[5];
+
+ // Do motion search.
+ int_mv best_mv; // Searched motion vector.
+ FULLPEL_MV_STATS best_mv_stats;
+ int block_mse = INT_MAX;
+ MV block_mv = kZeroMv;
+ const int q = av1_get_q(cpi);
+
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
+ &baseline_mv, start_mv, search_site_cfg,
+ search_method,
+ /*fine_search_interval=*/0);
+ full_ms_params.run_mesh_search = 1;
+ full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
+ if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+ // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+ full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
+ full_ms_params.mesh_search_mv_diff_threshold = 2;
+ }
+
+ av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
+ &best_mv_stats, NULL);
+
+ if (force_integer_mv == 1) { // Only do full search on the entire block.
+ const int mv_row = best_mv.as_mv.row;
+ const int mv_col = best_mv.as_mv.col;
+ best_mv.as_mv.row = GET_MV_SUBPEL(mv_row);
+ best_mv.as_mv.col = GET_MV_SUBPEL(mv_col);
+ const int mv_offset = mv_row * y_stride + mv_col;
+ error = cpi->ppi->fn_ptr[block_size].vf(
+ ref_frame->y_buffer + y_offset + mv_offset, y_stride,
+ frame_to_filter->y_buffer + y_offset, y_stride, &sse);
+ block_mse = DIVIDE_AND_ROUND(error, mb_pels);
+ block_mv = best_mv.as_mv;
+ } else { // Do fractional search on the entire block and all sub-blocks.
+ av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size,
+ &baseline_mv, cost_list);
+ ms_params.forced_stop = EIGHTH_PEL;
+ ms_params.var_params.subpel_search_type = subpel_search_type;
+ // Since we are merely refining the result from full pixel search, we don't
+ // need regularization for subpel search
+ ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
+
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ error = cpi->mv_search_params.find_fractional_mv_step(
+ &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv.as_mv, &distortion, &sse, NULL);
+ block_mse = DIVIDE_AND_ROUND(error, mb_pels);
+ block_mv = best_mv.as_mv;
+ *ref_mv = best_mv.as_mv;
+
+ if (allow_me_for_sub_blks) {
+ // On 4 sub-blocks.
+ const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
+ const int subblock_height = block_size_high[subblock_size];
+ const int subblock_width = block_size_wide[subblock_size];
+ const int subblock_pels = subblock_height * subblock_width;
+ start_mv = get_fullmv_from_mv(ref_mv);
+
+ int subblock_idx = 0;
+ for (int i = 0; i < mb_height; i += subblock_height) {
+ for (int j = 0; j < mb_width; j += subblock_width) {
+ const int offset = i * y_stride + j;
+ mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
+ mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
+ av1_make_default_fullpel_ms_params(
+ &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv,
+ search_site_cfg, search_method,
+ /*fine_search_interval=*/0);
+ full_ms_params.run_mesh_search = 1;
+ full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
+ if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+ // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+ full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
+ full_ms_params.mesh_search_mv_diff_threshold = 2;
+ }
+ av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+ av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
+ &baseline_mv, cost_list);
+ ms_params.forced_stop = EIGHTH_PEL;
+ ms_params.var_params.subpel_search_type = subpel_search_type;
+ // Since we are merely refining the result from full pixel search, we
+ // don't need regularization for subpel search
+ ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
+
+ subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(
+ av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ error = cpi->mv_search_params.find_fractional_mv_step(
+ &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
+ &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL);
+ subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
+ subblock_mvs[subblock_idx] = best_mv.as_mv;
+ ++subblock_idx;
+ }
+ }
+ }
+ }
+
+ // Restore input state.
+ mb->plane[0].src = ori_src_buf;
+ mbd->plane[0].pre[0] = ori_pre_buf;
+
+ // Make partition decision.
+ if (allow_me_for_sub_blks) {
+ tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
+ subblock_mses);
+ } else {
+    // Copy the whole-block mv and mse values to all sub-blocks.
+ for (int i = 0; i < 4; ++i) {
+ subblock_mvs[i] = block_mv;
+ subblock_mses[i] = block_mse;
+ }
+ }
+ // Do not pass down the reference motion vector if error is too large.
+ const int thresh = (min_frame_size >= 720) ? 12 : 3;
+ if (block_mse > (thresh << (mbd->bd - 8))) {
+ *ref_mv = kZeroMv;
+ }
+}
+/*!\cond */
+
+// Determines whether to split the entire block into 4 sub-blocks for
+// filtering.
+// In particular, this decision is made based on the comparison between the
+// motion search error of the entire block and the errors of all sub-blocks.
+// Inputs:
+// block_mv: Motion vector for the entire block (ONLY as reference).
+// block_mse: Motion search error (MSE) for the entire block (ONLY as
+// reference).
+// subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be
+// modified based on the partition decision).
+// subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will
+// be modified based on the partition decision).
+// Returns:
+// Nothing will be returned. Results are saved in `subblock_mvs` and
+// `subblock_mses`.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+ MV *subblock_mvs, int *subblock_mses) {
+ int min_subblock_mse = INT_MAX;
+ int max_subblock_mse = INT_MIN;
+ int64_t sum_subblock_mse = 0;
+ for (int i = 0; i < 4; ++i) {
+ sum_subblock_mse += subblock_mses[i];
+ min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
+ max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
+ }
+
+// TODO(any): The following magic numbers may be tuned to improve
+// performance, or a way may be found to get rid of them.
+ if (((block_mse * 15 < sum_subblock_mse * 4) &&
+ max_subblock_mse - min_subblock_mse < 48) ||
+ ((block_mse * 14 < sum_subblock_mse * 4) &&
+ max_subblock_mse - min_subblock_mse < 24)) { // No split.
+ for (int i = 0; i < 4; ++i) {
+ subblock_mvs[i] = block_mv;
+ subblock_mses[i] = block_mse;
+ }
+ }
+}
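+// Reading the no-split conditions above (illustrative): with
+// sum_subblock_mse = 4 * avg_mse, "block_mse * 15 < sum_subblock_mse * 4"
+// means block_mse < (16 / 15) * avg_mse, i.e. the whole-block error is at
+// most ~7% above the average sub-block error; the "* 14" variant allows up
+// to ~14% but requires a tighter sub-block spread (24 instead of 48). In
+// both cases the split is skipped when sub-blocks barely reduce the MSE
+// and their errors are uniform.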
+
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+ return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
+
+/*!\endcond */
+/*!\brief Builds predictor for blocks in temporal filtering. This is the
+ * second step for temporal filtering, which is to construct predictions from
+ * all reference frames INCLUDING the frame to be filtered itself. These
+ * predictors are built based on the motion search results (motion vector is
+ * set as 0 for the frame to be filtered), and will be futher used for
+ * weighted averaging.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] ref_frame Pointer to the reference frame (or the frame
+ * to be filtered)
+ * \param[in] mbd Pointer to the block for filtering. Besides
+ * containing the subsampling information of all
+ * planes, this field also gives the searched
+ * motion vector for the entire block, i.e.,
+ * `mbd->mi[0]->mv[0]`. This vector should be 0
+ * if the `ref_frame` itself is the frame to be
+ * filtered.
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] scale Scaling factor
+ * \param[in] subblock_mvs The motion vectors for each sub-block (row-major
+ * order)
+ * \param[out] pred Pointer to the predictor to be built
+ *
+ * \remark Nothing returned, but the contents of `pred` will be modified
+ */
+static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
+ const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, const int num_planes,
+ const struct scale_factors *scale,
+ const MV *subblock_mvs, uint8_t *pred) {
+ // Information of the entire block.
+ const int mb_height = block_size_high[block_size]; // Height.
+ const int mb_width = block_size_wide[block_size]; // Width.
+ const int mb_y = mb_height * mb_row; // Y-coord (Top-left).
+ const int mb_x = mb_width * mb_col; // X-coord (Top-left).
+ const int bit_depth = mbd->bd; // Bit depth.
+ const int is_intrabc = 0; // Is intra-copied?
+ const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
+
+ // Default interpolation filters.
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(MULTITAP_SHARP2);
+
+ // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling_y = mbd->plane[plane].subsampling_y;
+ const int subsampling_x = mbd->plane[plane].subsampling_x;
+ // Information of each sub-block in current plane.
+ const int plane_h = mb_height >> subsampling_y; // Plane height.
+ const int plane_w = mb_width >> subsampling_x; // Plane width.
+ const int plane_y = mb_y >> subsampling_y; // Y-coord (Top-left).
+ const int plane_x = mb_x >> subsampling_x; // X-coord (Top-left).
+ const int h = plane_h >> 1; // Sub-block height.
+ const int w = plane_w >> 1; // Sub-block width.
+ const int is_y_plane = (plane == 0); // Is Y-plane?
+
+ const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
+ ref_frame->widths[is_y_plane ? 0 : 1],
+ ref_frame->heights[is_y_plane ? 0 : 1],
+ ref_frame->strides[is_y_plane ? 0 : 1] };
+
+ // Handle each subblock.
+ int subblock_idx = 0;
+ for (int i = 0; i < plane_h; i += h) {
+ for (int j = 0; j < plane_w; j += w) {
+ // Choose proper motion vector.
+ const MV mv = subblock_mvs[subblock_idx++];
+ assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
+ mv.col >= INT16_MIN && mv.col <= INT16_MAX);
+
+ const int y = plane_y + i;
+ const int x = plane_x + j;
+
+        // Build the predictor for each sub-block on the current plane.
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth,
+ is_intrabc, scale, &ref_buf, interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
+ plane_w, &mv, &inter_pred_params);
+ }
+ }
+ plane_offset += plane_h * plane_w;
+ }
+}
+/*!\cond */
+
+// Computes temporal filter weights and accumulators for the frame to be
+// filtered. More concretely, the filter weights for all pixels are the same.
+// Inputs:
+// mbd: Pointer to the block for filtering, which is ONLY used to get
+// subsampling information of all planes as well as the bit-depth.
+// block_size: Size of the block.
+// num_planes: Number of planes in the frame.
+// pred: Pointer to the well-built predictors.
+// accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `count`
+// point will be modified.
+void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame,
+ const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size,
+ const int mb_row, const int mb_col,
+ const int num_planes, uint32_t *accum,
+ uint16_t *count) {
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int is_high_bitdepth = is_cur_buf_hbd(mbd);
+
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling_y = mbd->plane[plane].subsampling_y;
+ const int subsampling_x = mbd->plane[plane].subsampling_x;
+ const int h = mb_height >> subsampling_y; // Plane height.
+ const int w = mb_width >> subsampling_x; // Plane width.
+
+ const int frame_stride = ref_frame->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const uint8_t *buf8 = ref_frame->buffers[plane];
+ const uint16_t *buf16 = CONVERT_TO_SHORTPTR(buf8);
+ const int frame_offset = mb_row * h * frame_stride + mb_col * w;
+
+ int pred_idx = 0;
+ int pixel_idx = 0;
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ const int idx = plane_offset + pred_idx; // Index with plane shift.
+ const int pred_value = is_high_bitdepth
+ ? buf16[frame_offset + pixel_idx]
+ : buf8[frame_offset + pixel_idx];
+ accum[idx] += TF_WEIGHT_SCALE * pred_value;
+ count[idx] += TF_WEIGHT_SCALE;
+ ++pred_idx;
+ ++pixel_idx;
+ }
+ pixel_idx += (frame_stride - w);
+ }
+ plane_offset += h * w;
+ }
+}
+
+// Function to compute pixel-wise squared difference between two buffers.
+// Inputs:
+// ref: Pointer to reference buffer.
+// ref_offset: Start position of reference buffer for computation.
+// ref_stride: Stride for reference buffer.
+// tgt: Pointer to target buffer.
+// tgt_offset: Start position of target buffer for computation.
+// tgt_stride: Stride for target buffer.
+// height: Height of block for computation.
+// width: Width of block for computation.
+// is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
+//   square_diff: Pointer to save the squared differences.
+// Returns:
+// Nothing will be returned. But the content to which `square_diff` points
+// will be modified.
+static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
+ const int ref_stride, const uint8_t *tgt,
+ const int tgt_offset,
+ const int tgt_stride, const int height,
+ const int width,
+ const int is_high_bitdepth,
+ uint32_t *square_diff) {
+ const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt);
+
+ int ref_idx = 0;
+ int tgt_idx = 0;
+ int idx = 0;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx]
+ : ref[ref_offset + ref_idx];
+ const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx]
+ : tgt[tgt_offset + tgt_idx];
+ const uint32_t diff = (ref_value > tgt_value) ? (ref_value - tgt_value)
+ : (tgt_value - ref_value);
+ square_diff[idx] = diff * diff;
+
+ ++ref_idx;
+ ++tgt_idx;
+ ++idx;
+ }
+ ref_idx += (ref_stride - width);
+ tgt_idx += (tgt_stride - width);
+ }
+}
+
+// Function to accumulate pixel-wise squared difference between two luma buffers
+// to be consumed while filtering the chroma planes.
+// Inputs:
+// square_diff: Pointer to squared differences from luma plane.
+// luma_sse_sum: Pointer to save the sum of luma squared differences.
+// block_height: Height of block for computation.
+// block_width: Width of block for computation.
+// ss_x_shift: Chroma subsampling shift in 'X' direction
+// ss_y_shift: Chroma subsampling shift in 'Y' direction
+// Returns:
+// Nothing will be returned. But the content to which `luma_sse_sum` points
+// will be modified.
+void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum,
+ int block_height, int block_width,
+ int ss_x_shift, int ss_y_shift) {
+ for (int i = 0; i < block_height; ++i) {
+ for (int j = 0; j < block_width; ++j) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ const int ww = block_width << ss_x_shift; // Width of Y-plane.
+ luma_sse_sum[i * block_width + j] += square_diff[yy * ww + xx];
+ }
+ }
+ }
+ }
+}
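+// Illustrative example: for 4:2:0 content, ss_x_shift = ss_y_shift = 1, so
+// each chroma position (i, j) accumulates the squared differences of the
+// 1 << (1 + 1) = 4 luma pixels it covers on the Y plane.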
+
+/*!\endcond */
+/*!\brief Applies temporal filtering. NOTE that various optimised versions
+ * of this function are called where the appropriate instruction set is
+ * supported.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] frame_to_filter Pointer to the frame to be filtered, which is
+ * used as reference to compute squared
+ * difference from the predictor.
+ * \param[in] mbd Pointer to the block for filtering, ONLY used
+ * to get subsampling information for the planes
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] noise_levels Estimated noise levels for each plane
+ * in the frame (Y,U,V)
+ * \param[in] subblock_mvs Pointer to the motion vectors for 4 sub-blocks
+ * \param[in] subblock_mses Pointer to the search errors (MSE) for 4
+ * sub-blocks
+ * \param[in] q_factor Quantization factor. This is actually the `q`
+ * defined in libaom, converted from `qindex`
+ * \param[in] filter_strength Filtering strength. This value lies in range
+ * [0, 6] where 6 is the maximum strength.
+ * \param[in] tf_wgt_calc_lvl Controls the weight calculation method during
+ * temporal filtering
+ * \param[in] pred Pointer to the well-built predictors
+ * \param[out] accum Pointer to the pixel-wise accumulator for
+ * filtering
+ * \param[out] count Pointer to the pixel-wise counter for
+ * filtering
+ *
+ * \remark Nothing returned, but the contents of `accum` and `count`
+ * will be modified
+ */
+void av1_apply_temporal_filter_c(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mb_pels = mb_height * mb_width;
+ const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+ const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Decay factors for non-local mean approach.
+ double decay_factor[MAX_MB_PLANE] = { 0 };
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ for (int plane = 0; plane < num_planes; plane++) {
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
+ }
+ double d_factor[4] = { 0 };
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+  // Allocate memory for pixel-wise squared differences. Regardless of the
+  // subsampling, they are allocated with memory of size `mb_pels`.
+ uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t));
+ if (!square_diff) {
+ aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ memset(square_diff, 0, mb_pels * sizeof(square_diff[0]));
+
+ // Allocate memory for accumulated luma squared error. This value will be
+ // consumed while filtering the chroma planes.
+ uint32_t *luma_sse_sum = aom_memalign(32, mb_pels * sizeof(uint32_t));
+ if (!luma_sse_sum) {
+ aom_free(square_diff);
+ aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ memset(luma_sse_sum, 0, mb_pels * sizeof(luma_sse_sum[0]));
+
+ // Get window size for pixel-wise filtering.
+ assert(TF_WINDOW_LENGTH % 2 == 1);
+ const int half_window = TF_WINDOW_LENGTH >> 1;
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ // Locate pixel on reference frame.
+ const int subsampling_y = mbd->plane[plane].subsampling_y;
+ const int subsampling_x = mbd->plane[plane].subsampling_x;
+ const int h = mb_height >> subsampling_y; // Plane height.
+ const int w = mb_width >> subsampling_x; // Plane width.
+ const int frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const int frame_offset = mb_row * h * frame_stride + mb_col * w;
+ const uint8_t *ref = frame_to_filter->buffers[plane];
+ const int ss_y_shift =
+ subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int ss_x_shift =
+ subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane will
+ // be more accurate. The luma sse sum is reused in both chroma planes.
+ if (plane == AOM_PLANE_U)
+ compute_luma_sq_error_sum(square_diff, luma_sse_sum, h, w, ss_x_shift,
+ ss_y_shift);
+ compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset, w,
+ h, w, is_high_bitdepth, square_diff);
+
+ // Perform filtering.
+ int pred_idx = 0;
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ // non-local mean approach
+ uint64_t sum_square_diff = 0;
+
+ for (int wi = -half_window; wi <= half_window; ++wi) {
+ for (int wj = -half_window; wj <= half_window; ++wj) {
+ const int y = CLIP(i + wi, 0, h - 1); // Y-coord on current plane.
+ const int x = CLIP(j + wj, 0, w - 1); // X-coord on current plane.
+ sum_square_diff += square_diff[y * w + x];
+ }
+ }
+
+ sum_square_diff += luma_sse_sum[i * w + j];
+
+ // Scale down the difference for high bit depth input.
+ if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2);
+
+ // Combine window error and block error, and normalize it.
+ const double window_error = sum_square_diff * inv_num_ref_pixels;
+ const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor[plane];
+ scaled_error = AOMMIN(scaled_error, 7);
+ int weight;
+ if (tf_wgt_calc_lvl == 0) {
+ weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ } else {
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ weight = iroundpf(fweight);
+ }
+
+ const int idx = plane_offset + pred_idx; // Index with plane shift.
+ const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+ accum[idx] += weight * pred_value;
+ count[idx] += weight;
+
+ ++pred_idx;
+ }
+ }
+ plane_offset += h * w;
+ }
+
+ aom_free(square_diff);
+ aom_free(luma_sse_sum);
+}
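+// Worked example of the weight computation above, assuming the default
+// TF_WINDOW_BLOCK_BALANCE_WEIGHT = 5, TF_SEARCH_ERROR_NORM_WEIGHT = 20 and
+// TF_WEIGHT_SCALE = 1000 (all other numbers hypothetical): with
+// window_error = 20.0 and block_error = 400.0, inv_factor = 1 / 120 and
+// weight_factor = 5 / 120, so combined_error ~= 4.17. With d_factor = 1 and
+// decay_factor = 0.25, scaled_error ~= 1.04 and the resulting weight is
+// (int)(exp(-1.04) * 1000) ~= 352.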
+#if CONFIG_AV1_HIGHBITDEPTH
+// Calls the high bit-depth temporal filter.
+void av1_highbd_apply_temporal_filter_c(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col,
+ num_planes, noise_levels, subblock_mvs,
+ subblock_mses, q_factor, filter_strength,
+ tf_wgt_calc_lvl, pred, accum, count);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+/*!\brief Normalizes the accumulated filtering result to produce the filtered
+ * frame
+ *
+ * \ingroup src_frame_proc
+ * \param[in] mbd Pointer to the block for filtering, which is
+ * ONLY used to get subsampling information for
+ * all the planes
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] accum Pointer to the pre-computed accumulator
+ * \param[in] count Pointer to the pre-computed count
+ * \param[out] result_buffer Pointer to result buffer
+ *
+ * \remark Nothing returned, but the content to which `result_buffer` points
+ * will be modified
+ */
+static void tf_normalize_filtered_frame(
+ const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, const int num_planes, const uint32_t *accum,
+ const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
+
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const int frame_stride = result_buffer->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+ uint8_t *const buf = result_buffer->buffers[plane];
+ uint16_t *const buf16 = CONVERT_TO_SHORTPTR(buf);
+
+ int plane_idx = 0; // Pixel index on current plane (block-base).
+ int frame_idx = frame_offset; // Pixel index on the entire frame.
+ for (int i = 0; i < plane_h; ++i) {
+ for (int j = 0; j < plane_w; ++j) {
+ const int idx = plane_idx + plane_offset;
+ const uint16_t rounding = count[idx] >> 1;
+ if (is_high_bitdepth) {
+ buf16[frame_idx] =
+ (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+ } else {
+ buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+ }
+ ++plane_idx;
+ ++frame_idx;
+ }
+ frame_idx += (frame_stride - plane_w);
+ }
+ plane_offset += plane_h * plane_w;
+ }
+}
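+// The per-pixel division above rounds to nearest: e.g. (illustrative)
+// accum = 2550 and count = 4 give (2550 + 2) / 4 = 638 rather than the
+// truncated 2550 / 4 = 637.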
+
+int av1_get_q(const AV1_COMP *cpi) {
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ const int q =
+ (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type],
+ cpi->common.seq_params->bit_depth);
+ return q;
+}
+
+void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+ const int num_frames = tf_ctx->num_frames;
+ const int filter_frame_idx = tf_ctx->filter_frame_idx;
+ const int compute_frame_diff = tf_ctx->compute_frame_diff;
+ const struct scale_factors *scale = &tf_ctx->sf;
+ const double *noise_levels = tf_ctx->noise_levels;
+ const int num_pels = tf_ctx->num_pels;
+ const int q_factor = tf_ctx->q_factor;
+ const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+ const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+ MACROBLOCK *const mb = &td->mb;
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ TemporalFilterData *const tf_data = &td->tf_data;
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mi_h = mi_size_high_log2[block_size];
+ const int mi_w = mi_size_wide_log2[block_size];
+ const int num_planes = av1_num_planes(&cpi->common);
+ const int weight_calc_level_in_tf = cpi->sf.hl_sf.weight_calc_level_in_tf;
+ uint32_t *accum = tf_data->accum;
+ uint16_t *count = tf_data->count;
+ uint8_t *pred = tf_data->pred;
+
+  // Factor to control the filtering strength.
+ const int filter_strength = cpi->oxcf.algo_cfg.arnr_strength;
+
+ // Do filtering.
+ FRAME_DIFF *diff = &td->tf_data.diff;
+ av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ for (int mb_col = 0; mb_col < tf_ctx->mb_cols; mb_col++) {
+ av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ memset(accum, 0, num_pels * sizeof(accum[0]));
+ memset(count, 0, num_pels * sizeof(count[0]));
+ MV ref_mv = kZeroMv; // Reference motion vector passed down along frames.
+ // Perform temporal filtering frame by frame.
+
+ // Decide whether to perform motion search at 16x16 sub-block level or not
+ // based on 4x4 sub-blocks source variance. Allow motion search for split
+ // partition only if the difference between max and min source variance of
+ // 4x4 blocks is greater than a threshold (which is derived empirically).
+ bool allow_me_for_sub_blks = true;
+ if (cpi->sf.hl_sf.allow_sub_blk_me_in_tf) {
+ const int is_hbd = is_frame_high_bitdepth(frame_to_filter);
+ // Initialize minimum variance to a large value and maximum variance to 0.
+ double blk_4x4_var_min = DBL_MAX;
+ double blk_4x4_var_max = 0;
+ get_log_var_4x4sub_blk(cpi, frame_to_filter, mb_row, mb_col,
+ TF_BLOCK_SIZE, &blk_4x4_var_min, &blk_4x4_var_max,
+ is_hbd);
+ // TODO(sanampudi.venkatarao@ittiam.com): Experiment and adjust the
+ // threshold for high bit depth.
+ if ((blk_4x4_var_max - blk_4x4_var_min) <= 4.0)
+ allow_me_for_sub_blks = false;
+ }
+
+ for (int frame = 0; frame < num_frames; frame++) {
+ if (frames[frame] == NULL) continue;
+
+ // Motion search.
+ MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+ int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+      if (frame == filter_frame_idx) {  // Frame to be filtered.
+ // Change ref_mv sign for following frames.
+ ref_mv.row *= -1;
+ ref_mv.col *= -1;
+ } else { // Other reference frames.
+ tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size,
+ mb_row, mb_col, &ref_mv, allow_me_for_sub_blks,
+ subblock_mvs, subblock_mses);
+ }
+
+ // Perform weighted averaging.
+ if (frame == filter_frame_idx) { // Frame to be filtered.
+ tf_apply_temporal_filter_self(frames[frame], mbd, block_size, mb_row,
+ mb_col, num_planes, accum, count);
+ } else { // Other reference frames.
+ tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
+ num_planes, scale, subblock_mvs, pred);
+
+        // All variants of av1_apply_temporal_filter() contain floating point
+        // operations.
+
+ // TODO(any): avx2/sse2 version should be changed to align with C
+ // function before using. In particular, current avx2/sse2 function
+ // only supports 32x32 block size and 5x5 filtering window.
+ if (is_frame_high_bitdepth(frame_to_filter)) { // for high bit-depth
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+ av1_highbd_apply_temporal_filter(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+ } else {
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ av1_apply_temporal_filter_c(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+#if CONFIG_AV1_HIGHBITDEPTH
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ } else {
+ // for 8-bit
+ if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+ av1_apply_temporal_filter(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+ } else {
+ av1_apply_temporal_filter_c(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+ }
+ }
+ }
+ }
+ tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
+ accum, count, tf_ctx->output_frame);
+
+ if (compute_frame_diff) {
+ const int y_height = mb_height >> mbd->plane[0].subsampling_y;
+ const int y_width = mb_width >> mbd->plane[0].subsampling_x;
+ const int source_y_stride = frame_to_filter->y_stride;
+ const int filter_y_stride = tf_ctx->output_frame->y_stride;
+ const int source_offset =
+ mb_row * y_height * source_y_stride + mb_col * y_width;
+ const int filter_offset =
+ mb_row * y_height * filter_y_stride + mb_col * y_width;
+ unsigned int sse = 0;
+ cpi->ppi->fn_ptr[block_size].vf(
+ frame_to_filter->y_buffer + source_offset, source_y_stride,
+ tf_ctx->output_frame->y_buffer + filter_offset, filter_y_stride,
+ &sse);
+ diff->sum += sse;
+ diff->sse += sse * (int64_t)sse;
+ }
+ }
+}
+
+/*!\brief Does temporal filtering for a given frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance structure
+ *
+ * \remark Nothing will be returned, but the contents of td->tf_data.diff will
+ be modified.
+ */
+static void tf_do_filtering(AV1_COMP *cpi) {
+ // Basic information.
+ ThreadData *td = &cpi->td;
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ const struct scale_factors *scale = &tf_ctx->sf;
+ const int num_planes = av1_num_planes(&cpi->common);
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+ MACROBLOCKD *mbd = &td->mb.e_mbd;
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ MB_MODE_INFO **input_mb_mode_info;
+ tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+ tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+ // Perform temporal filtering for each row.
+ for (int mb_row = 0; mb_row < tf_ctx->mb_rows; mb_row++)
+ av1_tf_do_filtering_row(cpi, td, mb_row);
+
+ tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+}
+
+/*!\brief Sets up the frame buffer for temporal filtering. This function
+ * determines how many frames will be used for temporal filtering and then
+ * groups them into a buffer. This function will also estimate the noise level
+ * of the to-filter frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] filter_frame_lookahead_idx The index of the to-filter frame
+ * in the lookahead buffer cpi->lookahead
+ * \param[in] gf_frame_index GOP index
+ *
+ * \remark Nothing will be returned. But the fields `frames`, `num_frames`,
+ * `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx.
+ */
+static void tf_setup_filtering_buffer(AV1_COMP *cpi,
+ int filter_frame_lookahead_idx,
+ int gf_frame_index) {
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index];
+ const FRAME_TYPE frame_type = gf_group->frame_type[gf_frame_index];
+ const int is_forward_keyframe =
+ av1_gop_check_forward_keyframe(gf_group, gf_frame_index);
+
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+ // Number of frames used for filtering. Set `arnr_max_frames` as 1 to disable
+ // temporal filtering.
+ int num_frames = AOMMAX(cpi->oxcf.algo_cfg.arnr_max_frames, 1);
+  int num_before = 0;  // Number of filtering frames before the to-filter frame.
+  int num_after = 0;   // Number of filtering frames after the to-filter frame.
+ const int lookahead_depth =
+ av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+
+ // Temporal filtering should not go beyond key frames
+ const int key_to_curframe =
+ AOMMAX(cpi->rc.frames_since_key + filter_frame_lookahead_idx, 0);
+ const int curframe_to_key =
+ AOMMAX(cpi->rc.frames_to_key - filter_frame_lookahead_idx - 1, 0);
+
+ // Number of buffered frames before the to-filter frame.
+ int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe);
+
+ // Number of buffered frames after the to-filter frame.
+ int max_after =
+ AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key);
+
+ // Estimate noises for each plane.
+ const struct lookahead_entry *to_filter_buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, filter_frame_lookahead_idx, cpi->compressor_stage);
+ assert(to_filter_buf != NULL);
+ const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img;
+ const int num_planes = av1_num_planes(&cpi->common);
+ double *noise_levels = tf_ctx->noise_levels;
+ av1_estimate_noise_level(to_filter_frame, noise_levels, AOM_PLANE_Y,
+ num_planes - 1, cpi->common.seq_params->bit_depth,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ // Get quantization factor.
+ const int q = av1_get_q(cpi);
+  // Get correlation estimates from the first pass.
+ const FIRSTPASS_STATS *stats =
+ cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0);
+ double accu_coeff0 = 1.0, accu_coeff1 = 1.0;
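+  // The loops below accumulate the product of the per-frame correlation
+  // coefficients and then take its n-th root, i.e. the geometric mean.
+  // Illustrative example: with max_after = 3 and coefficients 0.9, 0.8 and
+  // 0.7, the product is 0.504 and accu_coeff1 = 0.504^(1/3) ~= 0.80.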
+ for (int i = 1; i <= max_after; i++) {
+ if (stats + filter_frame_lookahead_idx + i >=
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_end) {
+ max_after = i - 1;
+ break;
+ }
+ accu_coeff1 *=
+ AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001);
+ }
+ if (max_after >= 1) {
+ accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after);
+ }
+ for (int i = 1; i <= max_before; i++) {
+ if (stats + filter_frame_lookahead_idx - i + 1 <=
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_start) {
+ max_before = i - 1;
+ break;
+ }
+ accu_coeff0 *=
+ AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001);
+ }
+ if (max_before >= 1) {
+ accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before);
+ }
+
+  // Adjust the number of filtering frames based on the quantization factor.
+  // When the quantization factor is small enough (lossless compression), the
+  // number of frames for key frame filtering is left unchanged to avoid a
+  // visual quality drop.
+ int adjust_num = 6;
+ const int adjust_num_frames_for_arf_filtering =
+ cpi->sf.hl_sf.adjust_num_frames_for_arf_filtering;
+ if (num_frames == 1) { // `arnr_max_frames = 1` is used to disable filtering.
+ adjust_num = 0;
+ } else if ((update_type == KF_UPDATE) && q <= 10) {
+ adjust_num = 0;
+ } else if (adjust_num_frames_for_arf_filtering > 0 &&
+ update_type != KF_UPDATE && (cpi->rc.frames_since_key > 0)) {
+ // Since screen content detection happens after temporal filtering,
+ // 'frames_since_key' check is added to ensure the sf is disabled for the
+ // first alt-ref frame.
+ // Adjust number of frames to be considered for filtering based on noise
+ // level of the current frame. For low-noise frame, use more frames to
+ // filter such that the filtered frame can provide better predictions for
+ // subsequent frames and vice versa.
+ const uint8_t av1_adjust_num_using_noise_lvl[2][3] = { { 6, 4, 2 },
+ { 4, 2, 0 } };
+ const uint8_t *adjust_num_frames =
+ av1_adjust_num_using_noise_lvl[adjust_num_frames_for_arf_filtering - 1];
+
+ if (noise_levels[AOM_PLANE_Y] < 0.5)
+ adjust_num = adjust_num_frames[0];
+ else if (noise_levels[AOM_PLANE_Y] < 1.0)
+ adjust_num = adjust_num_frames[1];
+ else
+ adjust_num = adjust_num_frames[2];
+ }
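+  // Illustrative example: with adjust_num_frames_for_arf_filtering = 1, the
+  // row { 6, 4, 2 } is selected above, so a Y-plane noise level of 0.7
+  // (falling in [0.5, 1.0)) yields adjust_num = 4.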
+ num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth);
+
+ if (frame_type == KEY_FRAME) {
+ num_before = AOMMIN(is_forward_keyframe ? num_frames / 2 : 0, max_before);
+ num_after = AOMMIN(num_frames - 1, max_after);
+ } else {
+ int gfu_boost = av1_calc_arf_boost(&cpi->ppi->twopass, &cpi->twopass_frame,
+ &cpi->ppi->p_rc, &cpi->frame_info,
+ filter_frame_lookahead_idx, max_before,
+ max_after, NULL, NULL, 0);
+
+ num_frames = AOMMIN(num_frames, gfu_boost / 150);
+ num_frames += !(num_frames & 1); // Make the number odd.
+
+ // Only use 2 neighbours for the second ARF.
+ if (update_type == INTNL_ARF_UPDATE) num_frames = AOMMIN(num_frames, 3);
+ if (AOMMIN(max_after, max_before) >= num_frames / 2) {
+      // Split the frames evenly before and after.
+ num_before = num_frames / 2;
+ num_after = num_frames / 2;
+ } else {
+ if (max_after < num_frames / 2) {
+ num_after = max_after;
+ num_before = AOMMIN(num_frames - 1 - num_after, max_before);
+ } else {
+ num_before = max_before;
+ num_after = AOMMIN(num_frames - 1 - num_before, max_after);
+ }
+      // Adjust asymmetry based on frame-level correlation.
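+      // Illustrative example: if the frames on the longer side are highly
+      // correlated, e.g. accu_coeff1 = 0.9, then insym = (int)(0.4 / 0.1) = 4
+      // below, so num_before may exceed num_after by at most 4 frames; weaker
+      // correlation shrinks the allowed asymmetry.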
+ if (max_after > 0 && max_before > 0) {
+ if (num_after < num_before) {
+ const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff1, 0.01));
+ num_before = AOMMIN(num_before, num_after + insym);
+ } else {
+ const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff0, 0.01));
+ num_after = AOMMIN(num_after, num_before + insym);
+ }
+ }
+ }
+ }
+ num_frames = num_before + 1 + num_after;
+
+ // Setup the frame buffer.
+ for (int frame = 0; frame < num_frames; ++frame) {
+ const int lookahead_idx = frame - num_before + filter_frame_lookahead_idx;
+ struct lookahead_entry *buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, lookahead_idx, cpi->compressor_stage);
+ assert(buf != NULL);
+ frames[frame] = &buf->img;
+ }
+ tf_ctx->num_frames = num_frames;
+ tf_ctx->filter_frame_idx = num_before;
+ assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame);
+
+ av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes,
+ cpi->common.seq_params->sb_size);
+ av1_setup_block_planes(&cpi->td.mb.e_mbd,
+ cpi->common.seq_params->subsampling_x,
+ cpi->common.seq_params->subsampling_y, num_planes);
+}
+
+/*!\cond */
+
+double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height,
+ int width, int stride,
+ int edge_thresh) {
+ int64_t accum = 0;
+ int count = 0;
+
+ for (int i = 1; i < height - 1; ++i) {
+ for (int j = 1; j < width - 1; ++j) {
+ // Setup a small 3x3 matrix.
+ const int center_idx = i * stride + j;
+ int mat[3][3];
+ for (int ii = -1; ii <= 1; ++ii) {
+ for (int jj = -1; jj <= 1; ++jj) {
+ const int idx = center_idx + ii * stride + jj;
+ mat[ii + 1][jj + 1] = src[idx];
+ }
+ }
+      // Compute Sobel gradients.
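+      // Equivalently, Gx and Gy apply the 3x3 Sobel masks
+      //   Gx: [ 1 0 -1 ; 2 0 -2 ; 1 0 -1 ]
+      //   Gy: [ 1 2 1 ; 0 0 0 ; -1 -2 -1 ]
+      // to the neighborhood in `mat`.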
+ const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), 0);
+ // Accumulate Laplacian.
+ if (Ga < edge_thresh) { // Only count smooth pixels.
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ accum += ROUND_POWER_OF_TWO(abs(v), 0);
+ ++count;
+ }
+ }
+ }
+
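+  // A sketch of where the constants below come from, assuming zero-mean
+  // i.i.d. Gaussian noise with standard deviation sigma: the Laplacian mask
+  // used above has squared coefficients summing to 4 * 1 + 4 * 4 + 16 = 36,
+  // so its response to pure noise has standard deviation 6 * sigma. Since
+  // E|X| = sigma_X * sqrt(2 / pi) for a Gaussian, sigma is recovered as
+  // sqrt(pi / 2) * (accum / count) / 6, matching the return expression.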
+ // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+ return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src16,
+ int height, int width,
+ const int stride,
+ int bit_depth,
+ int edge_thresh) {
+ int64_t accum = 0;
+ int count = 0;
+ for (int i = 1; i < height - 1; ++i) {
+ for (int j = 1; j < width - 1; ++j) {
+ // Setup a small 3x3 matrix.
+ const int center_idx = i * stride + j;
+ int mat[3][3];
+ for (int ii = -1; ii <= 1; ++ii) {
+ for (int jj = -1; jj <= 1; ++jj) {
+ const int idx = center_idx + ii * stride + jj;
+ mat[ii + 1][jj + 1] = src16[idx];
+ }
+ }
+      // Compute Sobel gradients.
+ const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
+ // Accumulate Laplacian.
+ if (Ga < edge_thresh) { // Only count smooth pixels.
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8);
+ ++count;
+ }
+ }
+ }
+
+ // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+ return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+#endif
+
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+ double *noise_level, int plane_from, int plane_to,
+ int bit_depth, int edge_thresh) {
+ for (int plane = plane_from; plane <= plane_to; plane++) {
+ const bool is_uv_plane = (plane != AOM_PLANE_Y);
+ const int height = frame->crop_heights[is_uv_plane];
+ const int width = frame->crop_widths[is_uv_plane];
+ const int stride = frame->strides[is_uv_plane];
+ const uint8_t *src = frame->buffers[plane];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const int is_high_bitdepth = is_frame_high_bitdepth(frame);
+ if (is_high_bitdepth) {
+ noise_level[plane] = av1_highbd_estimate_noise_from_single_plane(
+ src16, height, width, stride, bit_depth, edge_thresh);
+ } else {
+ noise_level[plane] = av1_estimate_noise_from_single_plane(
+ src, height, width, stride, edge_thresh);
+ }
+#else
+ (void)bit_depth;
+ noise_level[plane] = av1_estimate_noise_from_single_plane(
+ src, height, width, stride, edge_thresh);
+#endif
+ }
+}
+
+// Initializes the members of TemporalFilterCtx.
+// Inputs:
+//   cpi: Top level encoder instance structure.
+//   filter_frame_lookahead_idx: The index of the frame to be filtered in the
+//                               lookahead buffer cpi->lookahead.
+//   gf_frame_index: Index of the frame in the GOP.
+//   compute_frame_diff: If 1, accumulate the sum and sse of the difference
+//                       between the original and filtered frames.
+//   output_frame: Buffer that receives the filtered frame.
+// Returns:
+//   Nothing will be returned. But the contents of cpi->tf_ctx will be modified.
+static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx,
+ int gf_frame_index, int compute_frame_diff,
+ YV12_BUFFER_CONFIG *output_frame) {
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ // Setup frame buffer for filtering.
+ YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+ tf_ctx->num_frames = 0;
+ tf_ctx->filter_frame_idx = -1;
+ tf_ctx->output_frame = output_frame;
+ tf_ctx->compute_frame_diff = compute_frame_diff;
+ tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, gf_frame_index);
+ assert(tf_ctx->num_frames > 0);
+ assert(tf_ctx->filter_frame_idx < tf_ctx->num_frames);
+
+ // Setup scaling factors. Scaling on each of the arnr frames is not
+ // supported.
+ // ARF is produced at the native frame size and resized when coded.
+ struct scale_factors *sf = &tf_ctx->sf;
+ av1_setup_scale_factors_for_frame(
+ sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+ frames[0]->y_crop_width, frames[0]->y_crop_height);
+
+ // Initialize temporal filter parameters.
+ MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+ const int filter_frame_idx = tf_ctx->filter_frame_idx;
+ const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+ const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int mb_width = block_size_wide[block_size];
+ const int mb_height = block_size_high[block_size];
+ const int mb_rows = get_num_blocks(frame_height, mb_height);
+ const int mb_cols = get_num_blocks(frame_width, mb_width);
+ const int mb_pels = mb_width * mb_height;
+ const int is_highbitdepth = is_frame_high_bitdepth(frame_to_filter);
+ const int num_planes = av1_num_planes(&cpi->common);
+ int num_pels = 0;
+ for (int i = 0; i < num_planes; i++) {
+ const int subsampling_x = mbd->plane[i].subsampling_x;
+ const int subsampling_y = mbd->plane[i].subsampling_y;
+ num_pels += mb_pels >> (subsampling_x + subsampling_y);
+ }
+ tf_ctx->num_pels = num_pels;
+ tf_ctx->mb_rows = mb_rows;
+ tf_ctx->mb_cols = mb_cols;
+ tf_ctx->is_highbitdepth = is_highbitdepth;
+ tf_ctx->q_factor = av1_get_q(cpi);
+}
+
+int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
+ const FRAME_DIFF *frame_diff, int q_index,
+ aom_bit_depth_t bit_depth) {
+ const int frame_height = frame->y_crop_height;
+ const int frame_width = frame->y_crop_width;
+ const int block_height = block_size_high[TF_BLOCK_SIZE];
+ const int block_width = block_size_wide[TF_BLOCK_SIZE];
+ const int mb_rows = get_num_blocks(frame_height, block_height);
+ const int mb_cols = get_num_blocks(frame_width, block_width);
+ const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
+ const float mean = (float)frame_diff->sum / num_mbs;
+ const float std = (float)sqrt((float)frame_diff->sse / num_mbs - mean * mean);
+
+ const int ac_q_step = av1_ac_quant_QTX(q_index, 0, bit_depth);
+ const float threshold = 0.7f * ac_q_step * ac_q_step;
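+  // Illustrative example (hypothetical numbers): if ac_q_step were 32, the
+  // threshold would be 0.7 * 32 * 32 ~= 717. The frame qualifies when the
+  // mean per-block SSE is below this and its standard deviation is below
+  // 1.2 times the mean.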
+
+ if (mean < threshold && std < mean * 1.2) {
+ return 1;
+ }
+ return 0;
+}
+
+void av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+ int gf_frame_index, FRAME_DIFF *frame_diff,
+ YV12_BUFFER_CONFIG *output_frame) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+  // Basic information of the current frame.
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ TemporalFilterData *tf_data = &cpi->td.tf_data;
+ const int compute_frame_diff = frame_diff != NULL;
+ // TODO(anyone): Currently, we enforce the filtering strength on internal
+ // ARFs except the second ARF to be zero. We should investigate in which case
+ // it is more beneficial to use non-zero strength filtering.
+ // Only parallel level 0 frames go through temporal filtering.
+ assert(cpi->ppi->gf_group.frame_parallel_level[gf_frame_index] == 0);
+
+ // Initialize temporal filter context structure.
+ init_tf_ctx(cpi, filter_frame_lookahead_idx, gf_frame_index,
+ compute_frame_diff, output_frame);
+
+ // Allocate and reset temporal filter buffers.
+ const int is_highbitdepth = tf_ctx->is_highbitdepth;
+ if (!tf_alloc_and_reset_data(tf_data, tf_ctx->num_pels, is_highbitdepth)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+
+ // Perform temporal filtering process.
+ if (mt_info->num_workers > 1)
+ av1_tf_do_filtering_mt(cpi);
+ else
+ tf_do_filtering(cpi);
+
+ if (compute_frame_diff) {
+ *frame_diff = tf_data->diff;
+ }
+ // Deallocate temporal filter buffers.
+ tf_dealloc_data(tf_data, is_highbitdepth);
+}
+
+int av1_is_temporal_filter_on(const AV1EncoderConfig *oxcf) {
+ return oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1;
+}
+
+bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ tf_info->is_temporal_filter_on = av1_is_temporal_filter_on(oxcf);
+ if (tf_info->is_temporal_filter_on == 0) return true;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) {
+ if (aom_realloc_frame_buffer(
+ &tf_info->tf_buf[i], oxcf->frm_dim_cfg.width,
+ oxcf->frm_dim_cfg.height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) {
+ if (tf_info->is_temporal_filter_on == 0) return;
+ for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) {
+ aom_free_frame_buffer(&tf_info->tf_buf[i]);
+ }
+ aom_free_frame_buffer(&tf_info->tf_buf_second_arf);
+}
+
+void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info) {
+ av1_zero(tf_info->tf_buf_valid);
+ av1_zero(tf_info->tf_buf_gf_index);
+ av1_zero(tf_info->tf_buf_display_index_offset);
+}
+
+void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, AV1_COMP *cpi,
+ const GF_GROUP *gf_group) {
+ if (tf_info->is_temporal_filter_on == 0) return;
+ const AV1_COMMON *const cm = &cpi->common;
+ for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+ int update_type = gf_group->update_type[gf_index];
+ if (update_type == KF_UPDATE || update_type == ARF_UPDATE) {
+ int buf_idx = gf_group->frame_type[gf_index] == INTER_FRAME;
+ int lookahead_idx = gf_group->arf_src_offset[gf_index] +
+ gf_group->cur_frame_idx[gf_index];
+ // This function is designed to be called multiple times after
+      // av1_tf_info_reset(). It only generates filtered frames that do not
+      // already exist.
+ if (tf_info->tf_buf_valid[buf_idx] == 0 ||
+ tf_info->tf_buf_display_index_offset[buf_idx] != lookahead_idx) {
+ YV12_BUFFER_CONFIG *out_buf = &tf_info->tf_buf[buf_idx];
+ av1_temporal_filter(cpi, lookahead_idx, gf_index,
+ &tf_info->frame_diff[buf_idx], out_buf);
+ aom_extend_frame_borders(out_buf, av1_num_planes(cm));
+ tf_info->tf_buf_gf_index[buf_idx] = gf_index;
+ tf_info->tf_buf_display_index_offset[buf_idx] = lookahead_idx;
+ tf_info->tf_buf_valid[buf_idx] = 1;
+ }
+ }
+ }
+}
+
+YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info,
+ int gf_index,
+ FRAME_DIFF *frame_diff) {
+ if (tf_info->is_temporal_filter_on == 0) return NULL;
+ YV12_BUFFER_CONFIG *out_buf = NULL;
+ for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) {
+ if (tf_info->tf_buf_valid[i] && tf_info->tf_buf_gf_index[i] == gf_index) {
+ out_buf = &tf_info->tf_buf[i];
+ *frame_diff = tf_info->frame_diff[i];
+ }
+ }
+ return out_buf;
+}
+/*!\endcond */
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
new file mode 100644
index 0000000000..6504b91b66
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+struct AV1_COMP;
+struct AV1EncoderConfig;
+struct ThreadData;
+// TODO(wtc): These two macros are only used in the avx2, sse2 and neon
+// implementations, where the block size is still hard coded to TF_BLOCK_SIZE.
+// This should be fixed to align with the C implementation.
+#define BH 32
+#define BW 32
+
+// Block size used in temporal filtering.
+#define TF_BLOCK_SIZE BLOCK_32X32
+
+// Window size for temporal filtering.
+#define TF_WINDOW_LENGTH 5
+
+// A constant number, sqrt(pi / 2), used for noise estimation.
+static const double SQRT_PI_BY_2 = 1.25331413732;
+
+// Hyper-parameters used to compute filtering weight. These hyper-parameters
+// can be tuned for better performance.
+// 0. A scale factor used in temporal filtering to raise the filter weight from
+// `double` with range [0, 1] to `int` with range [0, 1000].
+#define TF_WEIGHT_SCALE 1000
+// 1. Weight factor used to balance the weighted-average between window error
+// and block error. The weight is for window error while the weight for block
+// error is always set as 1.
+#define TF_WINDOW_BLOCK_BALANCE_WEIGHT 5
+// 2. Threshold for using q to adjust the filtering weight. Concretely, when
+// using a small q (high bitrate), we would like to reduce the filtering
+// strength such that more detailed information can be preserved. Hence, when
+// q is smaller than this threshold, we will adjust the filtering weight
+// based on the q-value.
+#define TF_Q_DECAY_THRESHOLD 20
+// 3. Normalization factor used to normalize the motion search error. Since the
+// motion search error can be large and uncontrollable, we will simply
+// normalize it before using it to compute the filtering weight.
+#define TF_SEARCH_ERROR_NORM_WEIGHT 20
+// 4. Threshold for using `arnr_strength` to adjust the filtering strength.
+// Concretely, users can use `arnr_strength` arguments to control the
+// strength of temporal filtering. When `arnr_strength` is small enough (
+// i.e., smaller than this threshold), we will adjust the filtering weight
+// based on the strength value.
+#define TF_STRENGTH_THRESHOLD 4
+// 5. Threshold for using motion search distance to adjust the filtering weight.
+// Concretely, larger motion search vector leads to a higher probability of
+// unreliable search. Hence, we would like to reduce the filtering strength
+// when the distance is large enough. Considering that the distance actually
+// relies on the frame size, this threshold is also a resolution-based
+// threshold. Taking 720p videos as an instance, if this field equals to 0.1,
+// then the actual threshold will be 720 * 0.1 = 72. Similarly, the threshold
+// for 360p videos will be 360 * 0.1 = 36.
+#define TF_SEARCH_DISTANCE_THRESHOLD 0.1
+// 6. Threshold to identify if the q is in a relative high range.
+// Above this cutoff q, a stronger filtering is applied.
+// For a high q, the quantization throws away more information, and thus a
+// stronger filtering is less likely to distort the encoded quality, while a
+// stronger filtering could reduce bit rates.
+//    For a low q, more details are expected to be retained. Filtering is thus
+// more conservative.
+#define TF_QINDEX_CUTOFF 128
+
+#define NOISE_ESTIMATION_EDGE_THRESHOLD 50
+
+// Sum and SSE source vs filtered frame difference returned by
+// temporal filter.
+typedef struct {
+ int64_t sum;
+ int64_t sse;
+} FRAME_DIFF;
+
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to temporal filtering.
+ */
+typedef struct {
+ /*!
+ * Frame buffers used for temporal filtering.
+ */
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+ /*!
+ * Number of frames in the frame buffer.
+ */
+ int num_frames;
+
+ /*!
+ * Output filtered frame
+ */
+ YV12_BUFFER_CONFIG *output_frame;
+
+ /*!
+ * Index of the frame to be filtered.
+ */
+ int filter_frame_idx;
+ /*!
+ * Whether to accumulate diff for show existing condition check.
+ */
+ int compute_frame_diff;
+ /*!
+ * Frame scaling factor.
+ */
+ struct scale_factors sf;
+ /*!
+ * Estimated noise levels for each plane in the frame.
+ */
+ double noise_levels[MAX_MB_PLANE];
+ /*!
+ * Number of pixels in the temporal filtering block across all planes.
+ */
+ int num_pels;
+ /*!
+ * Number of temporal filtering block rows.
+ */
+ int mb_rows;
+ /*!
+ * Number of temporal filtering block columns.
+ */
+ int mb_cols;
+ /*!
+ * Whether the frame is high-bitdepth or not.
+ */
+ int is_highbitdepth;
+ /*!
+ * Quantization factor used in temporal filtering.
+ */
+ int q_factor;
+} TemporalFilterCtx;
+
+/*!
+ * buffer count in TEMPORAL_FILTER_INFO
+ * Currently we only apply filtering on KEY and ARF after
+ * define_gf_group(). Hence, the count is two.
+ */
+#define TF_INFO_BUF_COUNT 2
+
+/*!
+ * \brief Temporal filter info for a gop
+ */
+typedef struct TEMPORAL_FILTER_INFO {
+ /*!
+   * A flag indicating whether the temporal filter should be applied.
+   * This flag stores the result of av1_is_temporal_filter_on().
+ */
+ int is_temporal_filter_on;
+ /*!
+ * buffers used for temporal filtering in a GOP
+ * index 0 for key frame and index 1 for ARF
+ */
+ YV12_BUFFER_CONFIG tf_buf[TF_INFO_BUF_COUNT];
+
+ /*!
+   * Buffer used for temporal filtering of the second ARF
+   * (INTNL_ARF_UPDATE). See av1_gop_is_second_arf() for the
+   * detailed definition of second_arf.
+ */
+ YV12_BUFFER_CONFIG tf_buf_second_arf;
+ /*!
+   * Sum and SSE of the source vs filtered frame difference for each buffer,
+   * used to decide whether the buffer can be shown directly.
+ */
+ FRAME_DIFF frame_diff[TF_INFO_BUF_COUNT];
+ /*!
+ * the corresponding gf_index for the buffer.
+ */
+ int tf_buf_gf_index[TF_INFO_BUF_COUNT];
+ /*!
+   * The display_index offset between the next show frame and the frames in
+   * the GOP.
+ */
+ int tf_buf_display_index_offset[TF_INFO_BUF_COUNT];
+ /*!
+ * whether the buf is valid or not.
+ */
+ int tf_buf_valid[TF_INFO_BUF_COUNT];
+} TEMPORAL_FILTER_INFO;
+
+/*!\brief Check whether we should apply temporal filter at all.
+ * \param[in] oxcf AV1 encoder config
+ *
+ * \return 1: temporal filter is on; 0: temporal filter is off
+ */
+int av1_is_temporal_filter_on(const struct AV1EncoderConfig *oxcf);
+
+/*!\brief Allocate buffers for TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ * \param[in,out] cpi Top level encoder instance structure
+ *
+ * \return True on success, false on memory allocation failure.
+ */
+bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info,
+ const struct AV1_COMP *cpi);
+
+/*!\brief Free buffers for TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ */
+void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info);
+
+/*!\brief Reset validity of tf_buf in TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ */
+void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info);
+
+/*!\brief Apply temporal filter for key frame and ARF in a gop
+ * \param[in,out] tf_info Temporal filter info for a gop
+ * \param[in,out] cpi Top level encoder instance structure
+ * \param[in] gf_group GF/ARF group data structure
+ */
+void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, struct AV1_COMP *cpi,
+ const GF_GROUP *gf_group);
+
+/*!\brief Get a filtered buffer from TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ * \param[in] gf_index gf_index for the target buffer
+ * \param[out]      frame_diff  Receives the sum and SSE of the source vs
+ *                              filtered frame difference for the target buffer
+ *
+ * \return The filtered buffer for gf_index, or NULL if it is unavailable
+ */
+YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info,
+ int gf_index,
+ FRAME_DIFF *frame_diff);
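+
+/* A minimal illustrative call sequence for the functions above (a sketch,
+ * assuming `cpi`, `gf_group`, `gf_index`, `q_index` and `bit_depth` come from
+ * the encoder; the real encoder's call order differs in details):
+ *
+ *   TEMPORAL_FILTER_INFO tf_info;
+ *   if (!av1_tf_info_alloc(&tf_info, cpi)) abort();  // once per resolution
+ *   av1_tf_info_reset(&tf_info);                     // at each GOP start
+ *   av1_tf_info_filtering(&tf_info, cpi, gf_group);  // filter KEY/ARF
+ *   FRAME_DIFF diff;
+ *   YV12_BUFFER_CONFIG *buf =
+ *       av1_tf_info_get_filtered_buf(&tf_info, gf_index, &diff);
+ *   if (buf != NULL &&
+ *       av1_check_show_filtered_frame(buf, &diff, q_index, bit_depth)) {
+ *     // The filtered frame is close enough to be shown directly.
+ *   }
+ *   av1_tf_info_free(&tf_info);                      // on teardown
+ */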
+
+/*!\cond */
+
+// Data related to temporal filtering.
+typedef struct {
+ // Source vs filtered frame error.
+ FRAME_DIFF diff;
+ // Pointer to temporary block info used to store state in temporal filtering
+ // process.
+ MB_MODE_INFO *tmp_mbmi;
+ // Pointer to accumulator buffer used in temporal filtering process.
+ uint32_t *accum;
+ // Pointer to count buffer used in temporal filtering process.
+ uint16_t *count;
+ // Pointer to predictor used in temporal filtering process.
+ uint8_t *pred;
+} TemporalFilterData;
+
+// Data related to temporal filter multi-thread synchronization.
+typedef struct {
+#if CONFIG_MULTITHREAD
+ // Mutex lock used for dispatching jobs.
+ pthread_mutex_t *mutex_;
+#endif // CONFIG_MULTITHREAD
+ // Next temporal filter block row to be filtered.
+ int next_tf_row;
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool tf_mt_exit;
+} AV1TemporalFilterSync;
+
+// Estimates noise level from a given frame using a single plane (Y, U, or V).
+// This is an adaptation of the method in the following paper:
+// Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
+// estimation using Laplacian operator and adaptive edge detection",
+// Proc. 3rd International Symposium on Communications, Control and
+// Signal Processing, 2008, St Julians, Malta.
+// Inputs:
+// frame: Pointer to the frame to estimate noise level from.
+// noise_level: Pointer to store the estimated noise.
+// plane_from: Index of the starting plane used for noise estimation.
+// Commonly, 0 for Y-plane, 1 for U-plane, and 2 for V-plane.
+// plane_to: Index of the end plane used for noise estimation.
+// bit_depth: Actual bit-depth instead of the encoding bit-depth of the frame.
+// edge_thresh: Edge threshold.
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+ double *noise_level, int plane_from, int plane_to,
+ int bit_depth, int edge_thresh);
+/*!\endcond */
+
+/*!\brief Performs temporal filtering for a given macroblock row.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi     Top level encoder instance structure
+ * \param[in] td      Pointer to thread data
+ * \param[in] mb_row  Macroblock row to be filtered
+ *
+ * \remark Nothing will be returned, but the contents of td->diff will be
+ * modified.
+ */
+void av1_tf_do_filtering_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ int mb_row);
+
+/*!\brief Performs temporal filtering if needed on a source frame.
+ * For example to create a filtered alternate reference frame (ARF)
+ *
+ * In this function, the lookahead index is different from the 0-based
+ * real index. For example, if we want to filter the first frame in the
+ * pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
+ * of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
+ * second frame in the pre-fetched buffer. Another example: if we want to filter
+ * the 17-th frame, which is an ARF, the lookahead index is 15 instead of 16.
+ * Furthermore, a negative number is used for a key frame in one-pass mode,
+ * where the key frame is filtered with the frames before it instead of after
+ * it. For example,
+ * -15 means to filter the 17-th frame, which is a key frame in one-pass mode.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance
+ * structure
+ * \param[in] filter_frame_lookahead_idx The index of the
+ * to-filter frame in the lookahead
+ * buffer cpi->lookahead.
+ * \param[in] gf_frame_index Index of GOP
+ * \param[in,out]  frame_diff              Receives the sum and SSE of the
+ *                                         source vs filtered frame difference.
+ * \param[out]     output_frame            Output filtered frame.
+ */
+void av1_temporal_filter(struct AV1_COMP *cpi,
+ const int filter_frame_lookahead_idx,
+ int gf_frame_index, FRAME_DIFF *frame_diff,
+ YV12_BUFFER_CONFIG *output_frame);
+
+/*!\brief Check whether a filtered frame can be shown directly
+ *
+ * This function uses the filtered frame's sse and the current q index
+ * to make the decision.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] frame filtered frame's buffer
+ * \param[in] frame_diff structure of sse and sum of the
+ * filtered frame.
+ * \param[in] q_index q_index used for this frame
+ * \param[in] bit_depth bit depth
+ * \return return 1 if this frame can be shown directly, otherwise
+ * return 0
+ */
+int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
+ const FRAME_DIFF *frame_diff, int q_index,
+ aom_bit_depth_t bit_depth);
+
+/*!\cond */
+// Helper function to get `q` used for encoding.
+int av1_get_q(const struct AV1_COMP *cpi);
+
+// Allocates memory for members of TemporalFilterData.
+// Inputs:
+// tf_data: Pointer to the structure containing temporal filter related data.
+// num_pels: Number of pixels in the block across all planes.
+// is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+// True if allocation is successful and false otherwise.
+static AOM_INLINE bool tf_alloc_and_reset_data(TemporalFilterData *tf_data,
+ int num_pels,
+ int is_high_bitdepth) {
+ tf_data->tmp_mbmi = (MB_MODE_INFO *)aom_calloc(1, sizeof(*tf_data->tmp_mbmi));
+ tf_data->accum =
+ (uint32_t *)aom_memalign(16, num_pels * sizeof(*tf_data->accum));
+ tf_data->count =
+ (uint16_t *)aom_memalign(16, num_pels * sizeof(*tf_data->count));
+ if (is_high_bitdepth)
+ tf_data->pred = CONVERT_TO_BYTEPTR(
+ aom_memalign(32, num_pels * 2 * sizeof(*tf_data->pred)));
+ else
+ tf_data->pred =
+ (uint8_t *)aom_memalign(32, num_pels * sizeof(*tf_data->pred));
+ // In case of an allocation failure, other successfully allocated buffers will
+ // be freed by the tf_dealloc_data() call in encoder_destroy().
+ if (!(tf_data->tmp_mbmi && tf_data->accum && tf_data->count && tf_data->pred))
+ return false;
+ memset(&tf_data->diff, 0, sizeof(tf_data->diff));
+ return true;
+}
+
+// Setup macroblockd params for temporal filtering process.
+// Inputs:
+// mbd: Pointer to the block for filtering.
+// tf_data: Pointer to the structure containing temporal filter related data.
+// scale: Scaling factor.
+// Returns:
+// Nothing will be returned. Contents of mbd will be modified.
+static AOM_INLINE void tf_setup_macroblockd(MACROBLOCKD *mbd,
+ TemporalFilterData *tf_data,
+ const struct scale_factors *scale) {
+ mbd->block_ref_scale_factors[0] = scale;
+ mbd->block_ref_scale_factors[1] = scale;
+ mbd->mi = &tf_data->tmp_mbmi;
+ mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+}
+
+// Deallocates the memory allocated for members of TemporalFilterData.
+// Inputs:
+// tf_data: Pointer to the structure containing temporal filter related data.
+// is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+// Nothing will be returned.
+static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data,
+ int is_high_bitdepth) {
+ if (is_high_bitdepth)
+ tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred);
+ aom_free(tf_data->tmp_mbmi);
+ tf_data->tmp_mbmi = NULL;
+ aom_free(tf_data->accum);
+ tf_data->accum = NULL;
+ aom_free(tf_data->count);
+ tf_data->count = NULL;
+ aom_free(tf_data->pred);
+ tf_data->pred = NULL;
+}
+
+// Saves the state prior to temporal filter process.
+// Inputs:
+// mbd: Pointer to the block for filtering.
+// input_mbmi: Backup block info to save input state.
+// input_buffer: Backup buffer pointer to save input state.
+// num_planes: Number of planes.
+// Returns:
+// Nothing will be returned. Contents of input_mbmi and input_buffer will be
+// modified.
+static INLINE void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi,
+ uint8_t **input_buffer, int num_planes) {
+ for (int i = 0; i < num_planes; i++) {
+ input_buffer[i] = mbd->plane[i].pre[0].buf;
+ }
+ *input_mbmi = mbd->mi;
+}
+
+// Restores the initial state after temporal filter process.
+// Inputs:
+// mbd: Pointer to the block for filtering.
+// input_mbmi: Backup block info from where input state is restored.
+// input_buffer: Backup buffer pointer from where input state is restored.
+// num_planes: Number of planes.
+// Returns:
+// Nothing will be returned. Contents of mbd will be modified.
+static INLINE void tf_restore_state(MACROBLOCKD *mbd, MB_MODE_INFO **input_mbmi,
+ uint8_t **input_buffer, int num_planes) {
+ for (int i = 0; i < num_planes; i++) {
+ mbd->plane[i].pre[0].buf = input_buffer[i];
+ }
+ mbd->mi = input_mbmi;
+}
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/third_party/aom/av1/encoder/thirdpass.c b/third_party/aom/av1/encoder/thirdpass.c
new file mode 100644
index 0000000000..a25522fbc5
--- /dev/null
+++ b/third_party/aom/av1/encoder/thirdpass.c
@@ -0,0 +1,877 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/encoder/thirdpass.h"
+
+#if CONFIG_THREE_PASS && CONFIG_AV1_DECODER
+#include "aom/aom_codec.h"
+#include "aom/aomdx.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/av1_iface_common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/common/blockd.h"
+#include "common/ivfdec.h"
+
+static void setup_two_pass_stream_input(
+ struct AvxInputContext **input_ctx_ptr, const char *input_file_name,
+ struct aom_internal_error_info *err_info) {
+ FILE *infile;
+ infile = fopen(input_file_name, "rb");
+ if (!infile) {
+ aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM,
+ "Failed to open input file '%s'.", input_file_name);
+ }
+ struct AvxInputContext *aom_input_ctx = aom_malloc(sizeof(*aom_input_ctx));
+ if (!aom_input_ctx) {
+ fclose(infile);
+ aom_internal_error(err_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate memory for third-pass context.");
+ }
+ memset(aom_input_ctx, 0, sizeof(*aom_input_ctx));
+ aom_input_ctx->filename = input_file_name;
+ aom_input_ctx->file = infile;
+
+ if (file_is_ivf(aom_input_ctx)) {
+ aom_input_ctx->file_type = FILE_TYPE_IVF;
+ } else {
+ fclose(infile);
+ aom_free(aom_input_ctx);
+ aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM,
+ "Unrecognized input file type.");
+ }
+ *input_ctx_ptr = aom_input_ctx;
+}
+
+static void init_third_pass(THIRD_PASS_DEC_CTX *ctx) {
+ if (!ctx->input_ctx) {
+ if (ctx->input_file_name == NULL) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_INVALID_PARAM,
+ "No third pass input specified.");
+ }
+ setup_two_pass_stream_input(&ctx->input_ctx, ctx->input_file_name,
+ ctx->err_info);
+ }
+
+ if (!ctx->decoder.iface) {
+ aom_codec_iface_t *decoder_iface = &aom_codec_av1_inspect_algo;
+ if (aom_codec_dec_init(&ctx->decoder, decoder_iface, NULL, 0)) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to initialize decoder.");
+ }
+ }
+}
+
+// Return 0: success
+// 1: cannot read because this is end of file
+// -1: failure to read the frame
+static int read_frame(THIRD_PASS_DEC_CTX *ctx) {
+ if (!ctx->input_ctx || !ctx->decoder.iface) {
+ init_third_pass(ctx);
+ }
+ if (!ctx->have_frame) {
+ if (ivf_read_frame(ctx->input_ctx, &ctx->buf, &ctx->bytes_in_buffer,
+ &ctx->buffer_size, NULL) != 0) {
+ if (feof(ctx->input_ctx->file)) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ ctx->frame = ctx->buf;
+ ctx->end_frame = ctx->frame + ctx->bytes_in_buffer;
+ ctx->have_frame = 1;
+ }
+
+ Av1DecodeReturn adr;
+ if (aom_codec_decode(&ctx->decoder, ctx->frame,
+ (unsigned int)ctx->bytes_in_buffer,
+ &adr) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to decode frame for third pass.");
+ }
+ ctx->this_frame_bits = (int)(adr.buf - ctx->frame) << 3;
+ ctx->frame = adr.buf;
+ ctx->bytes_in_buffer = ctx->end_frame - ctx->frame;
+ if (ctx->frame == ctx->end_frame) ctx->have_frame = 0;
+ return 0;
+}
+
+static void free_frame_info(THIRD_PASS_FRAME_INFO *frame_info) {
+ if (!frame_info) return;
+ aom_free(frame_info->mi_info);
+ frame_info->mi_info = NULL;
+}
+
+// This function gets the information needed from the recently decoded frame,
+// via various decoder APIs, and saves the info into ctx->frame_info.
+// Return 0: success
+// 1: cannot read because this is end of file
+// -1: failure to read the frame
+static int get_frame_info(THIRD_PASS_DEC_CTX *ctx) {
+ int ret = read_frame(ctx);
+ if (ret != 0) return ret;
+ int cur = ctx->frame_info_count;
+
+ ctx->frame_info[cur].actual_bits = ctx->this_frame_bits;
+
+ if (cur >= MAX_THIRD_PASS_BUF) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Third pass frame info ran out of available slots.");
+ }
+ aom_codec_frame_flags_t frame_type_flags = 0;
+ if (aom_codec_control(&ctx->decoder, AOMD_GET_FRAME_FLAGS,
+ &frame_type_flags) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read frame flags.");
+ }
+ if (frame_type_flags & AOM_FRAME_IS_KEY) {
+ ctx->frame_info[cur].frame_type = KEY_FRAME;
+ } else if (frame_type_flags & AOM_FRAME_IS_INTRAONLY) {
+ ctx->frame_info[cur].frame_type = INTRA_ONLY_FRAME;
+ } else if (frame_type_flags & AOM_FRAME_IS_SWITCH) {
+ ctx->frame_info[cur].frame_type = S_FRAME;
+ } else {
+ ctx->frame_info[cur].frame_type = INTER_FRAME;
+ }
+
+ // Get frame width and height
+ int frame_size[2];
+ if (aom_codec_control(&ctx->decoder, AV1D_GET_FRAME_SIZE, frame_size) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read frame size.");
+ }
+
+ // Check if we need to re-alloc the mi fields.
+ const int mi_cols = (frame_size[0] + 3) >> 2;
+ const int mi_rows = (frame_size[1] + 3) >> 2;
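+  // For example, a 1920x1080 frame maps to (1920 + 3) >> 2 = 480 columns and
+  // (1080 + 3) >> 2 = 270 rows of 4x4 MI units.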
+ ctx->frame_info[cur].mi_stride = mi_cols;
+ ctx->frame_info[cur].mi_rows = mi_rows;
+ ctx->frame_info[cur].mi_cols = mi_cols;
+
+ if (ctx->frame_info[cur].width != frame_size[0] ||
+ ctx->frame_info[cur].height != frame_size[1] ||
+ !ctx->frame_info[cur].mi_info) {
+ free_frame_info(&ctx->frame_info[cur]);
+
+ ctx->frame_info[cur].mi_info =
+ aom_malloc(mi_cols * mi_rows * sizeof(*ctx->frame_info[cur].mi_info));
+
+ if (!ctx->frame_info[cur].mi_info) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate mi buffer for the third pass.");
+ }
+ }
+
+ ctx->frame_info[cur].width = frame_size[0];
+ ctx->frame_info[cur].height = frame_size[1];
+
+ // Get frame base q idx
+ if (aom_codec_control(&ctx->decoder, AOMD_GET_BASE_Q_IDX,
+ &ctx->frame_info[cur].base_q_idx) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read base q index.");
+ }
+
+ // Get show existing frame flag
+ if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+ &ctx->frame_info[cur].is_show_existing_frame) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read show existing frame flag.");
+ }
+
+ // Get show frame flag
+ if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_FRAME_FLAG,
+ &ctx->frame_info[cur].is_show_frame) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read show frame flag.");
+ }
+
+ // Get order hint
+ if (aom_codec_control(&ctx->decoder, AOMD_GET_ORDER_HINT,
+ &ctx->frame_info[cur].order_hint) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read order hint.");
+ }
+
+ // Clear MI info
+ for (int mi_row = 0; mi_row < mi_rows; mi_row++) {
+ for (int mi_col = 0; mi_col < mi_cols; mi_col++) {
+ ctx->frame_info[cur].mi_info[mi_row * mi_cols + mi_col].bsize =
+ BLOCK_INVALID;
+ }
+ }
+
+ // Get relevant information regarding each 4x4 MI
+ MB_MODE_INFO cur_mi_info;
+ THIRD_PASS_MI_INFO *const this_mi = ctx->frame_info[cur].mi_info;
+ for (int mi_row = 0; mi_row < mi_rows; mi_row++) {
+ for (int mi_col = 0; mi_col < mi_cols; mi_col++) {
+ const int offset = mi_row * mi_cols + mi_col;
+ if (this_mi[offset].bsize != BLOCK_INVALID) {
+ continue;
+ }
+ // Get info of this MI
+ if (aom_codec_control(&ctx->decoder, AV1D_GET_MI_INFO, mi_row, mi_col,
+ &cur_mi_info) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read mi info.");
+ }
+ const int blk_mi_rows = mi_size_high[cur_mi_info.bsize];
+ const int blk_mi_cols = mi_size_wide[cur_mi_info.bsize];
+
+ for (int h = 0; h < blk_mi_rows; h++) {
+ for (int w = 0; w < blk_mi_cols; w++) {
+ if (h + mi_row >= mi_rows || w + mi_col >= mi_cols) {
+ continue;
+ }
+ const int this_offset = offset + h * mi_cols + w;
+ this_mi[this_offset].bsize = cur_mi_info.bsize;
+ this_mi[this_offset].partition = cur_mi_info.partition;
+ this_mi[this_offset].mi_row_start = mi_row;
+ this_mi[this_offset].mi_col_start = mi_col;
+ this_mi[this_offset].mv[0] = cur_mi_info.mv[0];
+ this_mi[this_offset].mv[1] = cur_mi_info.mv[1];
+ this_mi[this_offset].ref_frame[0] = cur_mi_info.ref_frame[0];
+ this_mi[this_offset].ref_frame[1] = cur_mi_info.ref_frame[1];
+ this_mi[this_offset].pred_mode = cur_mi_info.mode;
+ }
+ }
+ }
+ }
+
+ ctx->frame_info_count++;
+
+ return 0;
+}
+
+#define USE_SECOND_PASS_FILE 1
+
+#if !USE_SECOND_PASS_FILE
+// Parse the frames in the GOP and determine the last frame of the current GOP.
+// Decode more frames if necessary. The variable max_num is the maximum static
+// GOP length if we detect an IPPP structure, and it is expected that
+// max_num >= MAX_GF_INTERVAL.
+static void get_current_gop_end(THIRD_PASS_DEC_CTX *ctx, int max_num,
+ int *last_idx) {
+ assert(max_num >= MAX_GF_INTERVAL);
+ *last_idx = 0;
+ int cur_idx = 0;
+ int arf_order_hint = -1;
+ int num_show_frames = 0;
+ while (num_show_frames < max_num) {
+ assert(cur_idx < MAX_THIRD_PASS_BUF);
+ // Read in from bitstream if needed.
+ if (cur_idx >= ctx->frame_info_count) {
+ int ret = get_frame_info(ctx);
+ if (ret == 1) {
+ // At the end of the file, GOP ends in the prev frame.
+ if (arf_order_hint >= 0) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to derive GOP length.");
+ }
+ *last_idx = cur_idx - 1;
+ return;
+ }
+ if (ret < 0) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read frame for third pass.");
+ }
+ }
+
+ // TODO(bohanli): verify that fwd_kf works here.
+ if (ctx->frame_info[cur_idx].frame_type == KEY_FRAME &&
+ ctx->frame_info[cur_idx].is_show_frame) {
+ if (cur_idx != 0) {
+ // If this is a key frame and is not the first kf in this kf group, we
+ // have reached the next key frame. Stop here.
+ *last_idx = cur_idx - 1;
+ return;
+ }
+ } else if (!ctx->frame_info[cur_idx].is_show_frame &&
+ arf_order_hint == -1) {
+ // If this is an arf (the first no show)
+ if (num_show_frames <= 1) {
+ // This is an arf and we should end the GOP with its overlay.
+ arf_order_hint = ctx->frame_info[cur_idx].order_hint;
+ } else {
+        // There are multiple show frames before this arf, so we treat the
+ // frames previous to this arf as a GOP.
+ *last_idx = cur_idx - 1;
+ return;
+ }
+ } else if (arf_order_hint >= 0 && ctx->frame_info[cur_idx].order_hint ==
+ (unsigned int)arf_order_hint) {
+ // If this is the overlay/show existing of the arf
+ assert(ctx->frame_info[cur_idx].is_show_frame);
+ *last_idx = cur_idx;
+ return;
+ } else {
+ // This frame is part of the GOP.
+ if (ctx->frame_info[cur_idx].is_show_frame) num_show_frames++;
+ }
+ cur_idx++;
+ }
+ // This is a long IPPP GOP and we will use a length of max_num here.
+ assert(arf_order_hint < 0);
+ *last_idx = max_num - 1;
+ return;
+}
+#endif
+
+static AOM_INLINE void read_gop_frames(THIRD_PASS_DEC_CTX *ctx) {
+ int cur_idx = 0;
+ while (cur_idx < ctx->gop_info.num_frames) {
+ assert(cur_idx < MAX_THIRD_PASS_BUF);
+ // Read in from bitstream if needed.
+ if (cur_idx >= ctx->frame_info_count) {
+ int ret = get_frame_info(ctx);
+ if (ret != 0) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read frame for third pass.");
+ }
+ }
+ cur_idx++;
+ }
+ return;
+}
+
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) {
+ // Read in future frames in the current GOP.
+ read_gop_frames(ctx);
+
+ int gf_len = 0;
+ // Check the GOP length against the value read from second_pass_file
+ for (int i = 0; i < ctx->gop_info.num_frames; i++) {
+ if (ctx->frame_info[i].is_show_frame) gf_len++;
+ }
+
+ if (gf_len != ctx->gop_info.gf_length) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Mismatch in third pass GOP length!");
+ }
+}
+
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) {
+ if (ctx->frame_info_count == 0) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "No available frame info for third pass.");
+ }
+ ctx->frame_info_count--;
+ free_frame_info(&ctx->frame_info[0]);
+ for (int i = 0; i < ctx->frame_info_count; i++) {
+ ctx->frame_info[i] = ctx->frame_info[i + 1];
+ }
+ ctx->frame_info[ctx->frame_info_count].mi_info = NULL;
+}
+
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+ const char *file) {
+ av1_free_thirdpass_ctx(*ctx);
+ CHECK_MEM_ERROR(cm, *ctx, aom_calloc(1, sizeof(**ctx)));
+ THIRD_PASS_DEC_CTX *ctx_ptr = *ctx;
+ ctx_ptr->input_file_name = file;
+ ctx_ptr->prev_gop_end = -1;
+ ctx_ptr->err_info = cm->error;
+}
+
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) {
+ if (ctx == NULL) return;
+ if (ctx->decoder.iface) {
+ aom_codec_destroy(&ctx->decoder);
+ }
+ if (ctx->input_ctx && ctx->input_ctx->file) fclose(ctx->input_ctx->file);
+ aom_free(ctx->input_ctx);
+ if (ctx->buf) free(ctx->buf);
+ for (int i = 0; i < MAX_THIRD_PASS_BUF; i++) {
+ free_frame_info(&ctx->frame_info[i]);
+ }
+ aom_free(ctx);
+}
+
+void av1_write_second_pass_gop_info(AV1_COMP *cpi) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+ if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) {
+ // Write the GOP length to a log file.
+ av1_open_second_pass_log(cpi, 0);
+
+ THIRD_PASS_GOP_INFO gop_info;
+
+ gop_info.num_frames = gf_group->size;
+ gop_info.use_arf = (gf_group->arf_index >= 0);
+ gop_info.gf_length = p_rc->baseline_gf_interval;
+
+ size_t count =
+ fwrite(&gop_info, sizeof(gop_info), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+ }
+}
+
+void av1_write_second_pass_per_frame_info(AV1_COMP *cpi, int gf_index) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+ if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) {
+ // write target bitrate
+ int bits = gf_group->bit_allocation[gf_index];
+ size_t count = fwrite(&bits, sizeof(bits), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+
+ // write sse
+ uint64_t sse = 0;
+ int pkt_idx = cpi->ppi->output_pkt_list->cnt - 1;
+ if (pkt_idx >= 0 &&
+ cpi->ppi->output_pkt_list->pkts[pkt_idx].kind == AOM_CODEC_PSNR_PKT) {
+ sse = cpi->ppi->output_pkt_list->pkts[pkt_idx].data.psnr.sse[0];
+#if CONFIG_INTERNAL_STATS
+ } else if (cpi->ppi->b_calculate_psnr) {
+ sse = cpi->ppi->total_sq_error[0];
+#endif
+ } else {
+ const YV12_BUFFER_CONFIG *orig = cpi->source;
+ const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+ PSNR_STATS psnr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(orig, recon, &psnr);
+#endif
+ sse = psnr.sse[0];
+ }
+
+ count = fwrite(&sse, sizeof(sse), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+
+ // write bpm_factor
+ double factor = cpi->ppi->twopass.bpm_factor;
+ count = fwrite(&factor, sizeof(factor), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+ }
+}
+
+void av1_open_second_pass_log(AV1_COMP *cpi, int is_read) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ if (oxcf->second_pass_log == NULL) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_INVALID_PARAM,
+ "No second pass log file specified for the third pass!");
+ }
+  // Open the second pass log file for reading or writing, as requested.
+ if (!cpi->second_pass_log_stream) {
+ if (is_read) {
+ cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "rb");
+ } else {
+ cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "wb");
+ }
+ if (!cpi->second_pass_log_stream) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not open second pass log file!");
+ }
+ }
+}
+
+void av1_close_second_pass_log(AV1_COMP *cpi) {
+ if (cpi->second_pass_log_stream) {
+ int ret = fclose(cpi->second_pass_log_stream);
+ if (ret != 0) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not close second pass log file!");
+ }
+ cpi->second_pass_log_stream = 0;
+ }
+}
+
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+ THIRD_PASS_GOP_INFO *gop_info,
+ struct aom_internal_error_info *error) {
+ size_t count = fread(gop_info, sizeof(*gop_info), 1, second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(error, AOM_CODEC_ERROR,
+ "Could not read from second pass log file!");
+ }
+}
+
+void av1_read_second_pass_per_frame_info(
+ FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr,
+ int frame_info_count, struct aom_internal_error_info *error) {
+ for (int i = 0; i < frame_info_count; i++) {
+ // read target bits
+ int bits = 0;
+ size_t count = fread(&bits, sizeof(bits), 1, second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(error, AOM_CODEC_ERROR,
+ "Could not read from second pass log file!");
+ }
+ frame_info_arr[i].bits_allocated = bits;
+
+ // read distortion
+ uint64_t sse;
+ count = fread(&sse, sizeof(sse), 1, second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(error, AOM_CODEC_ERROR,
+ "Could not read from second pass log file!");
+ }
+ frame_info_arr[i].sse = sse;
+
+ // read bpm factor
+ double factor;
+ count = fread(&factor, sizeof(factor), 1, second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(error, AOM_CODEC_ERROR,
+ "Could not read from second pass log file!");
+ }
+ frame_info_arr[i].bpm_factor = factor;
+ }
+}
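+
+// Note on the log layout implied by the raw fread()/fwrite() calls above:
+// each GOP record in the second pass log is one THIRD_PASS_GOP_INFO struct
+// followed by, per frame, an int (target bits), a uint64_t (sse) and a
+// double (bpm_factor), all written in native byte order and struct layout,
+// so the log is only portable between builds with an identical ABI.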
+
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) {
+ if (ctx == NULL) return -1;
+ int use_arf = 0;
+ for (int i = 0; i < ctx->gop_info.gf_length; i++) {
+ if (ctx->frame_info[i].order_hint != 0 &&
+ ctx->frame_info[i].is_show_frame == 0) {
+ use_arf = 1;
+ }
+ }
+ if (use_arf != ctx->gop_info.use_arf) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Mismatch in third pass ARF usage!");
+ }
+ return use_arf;
+}
+
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+ int fwidth, double *ratio_h, double *ratio_w) {
+ assert(ctx);
+ assert(fidx < ctx->frame_info_count);
+ const int fheight_second_pass = ctx->frame_info[fidx].height;
+ const int fwidth_second_pass = ctx->frame_info[fidx].width;
+ assert(fheight_second_pass <= fheight && fwidth_second_pass <= fwidth);
+
+ *ratio_h = (double)fheight / fheight_second_pass;
+ *ratio_w = (double)fwidth / fwidth_second_pass;
+}
+
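+// Illustrative example (hypothetical sizes): if the second pass ran at
+// 960x540 and the third pass runs at 1920x1080, then ratio_h and ratio_w are
+// both 2.0 above, and av1_get_third_pass_mi() below maps third-pass MI
+// position (20, 40) to second-pass position (10, 20) before clamping.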
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+ int mi_row, int mi_col,
+ double ratio_h, double ratio_w) {
+ assert(ctx);
+ assert(fidx < ctx->frame_info_count);
+
+ const int mi_rows_second_pass = ctx->frame_info[fidx].mi_rows;
+ const int mi_cols_second_pass = ctx->frame_info[fidx].mi_cols;
+
+ const int mi_row_second_pass =
+ clamp((int)round(mi_row / ratio_h), 0, mi_rows_second_pass - 1);
+ const int mi_col_second_pass =
+ clamp((int)round(mi_col / ratio_w), 0, mi_cols_second_pass - 1);
+
+ const int mi_stride_second_pass = ctx->frame_info[fidx].mi_stride;
+ THIRD_PASS_MI_INFO *this_mi = ctx->frame_info[fidx].mi_info +
+ mi_row_second_pass * mi_stride_second_pass +
+ mi_col_second_pass;
+ return this_mi;
+}
+
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+ double ratio_h, double ratio_w, int *mi_row,
+ int *mi_col) {
+ *mi_row = (int)round(third_pass_mi->mi_row_start * ratio_h);
+ *mi_col = (int)round(third_pass_mi->mi_col_start * ratio_w);
+}
+
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w,
+ MV_REFERENCE_FRAME frame) {
+ assert(this_mi != NULL);
+ int_mv cur_mv;
+ cur_mv.as_int = INVALID_MV;
+
+ if (frame < LAST_FRAME || frame > ALTREF_FRAME) return cur_mv;
+
+ for (int r = 0; r < 2; r++) {
+ if (this_mi->ref_frame[r] == frame) {
+ cur_mv.as_mv.row = (int16_t)round(this_mi->mv[r].as_mv.row * ratio_h);
+ cur_mv.as_mv.col = (int16_t)round(this_mi->mv[r].as_mv.col * ratio_w);
+ }
+ }
+
+ return cur_mv;
+}
+
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h,
+ double ratio_w) {
+ assert(this_mi != NULL);
+ BLOCK_SIZE bsize = BLOCK_INVALID;
+
+ const BLOCK_SIZE bsize_second_pass = this_mi->bsize;
+ assert(bsize_second_pass != BLOCK_INVALID);
+
+ const int w_second_pass = block_size_wide[bsize_second_pass];
+ const int h_second_pass = block_size_high[bsize_second_pass];
+
+ int part_type;
+
+ if (w_second_pass == h_second_pass) {
+ part_type = PARTITION_NONE;
+ } else if (w_second_pass / h_second_pass == 2) {
+ part_type = PARTITION_HORZ;
+ } else if (w_second_pass / h_second_pass == 4) {
+ part_type = PARTITION_HORZ_4;
+ } else if (h_second_pass / w_second_pass == 2) {
+ part_type = PARTITION_VERT;
+ } else if (h_second_pass / w_second_pass == 4) {
+ part_type = PARTITION_VERT_4;
+ } else {
+ part_type = PARTITION_INVALID;
+ }
+ assert(part_type != PARTITION_INVALID);
+
+ const int w = (int)(round(w_second_pass * ratio_w));
+ const int h = (int)(round(h_second_pass * ratio_h));
+
+ for (int i = 0; i < SQR_BLOCK_SIZES; i++) {
+ const BLOCK_SIZE this_bsize = subsize_lookup[part_type][i];
+ if (this_bsize == BLOCK_INVALID) continue;
+
+ const int this_w = block_size_wide[this_bsize];
+ const int this_h = block_size_high[this_bsize];
+
+ if (this_w >= w && this_h >= h) {
+      // Find the smallest block size that contains the mapped block.
+ bsize = this_bsize;
+ break;
+ }
+ }
+ if (bsize == BLOCK_INVALID) {
+    // No suitable size was found; fall back to the largest block size.
+ bsize = BLOCK_128X128;
+ }
+
+ return bsize;
+}
+
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+ THIRD_PASS_MI_INFO *this_mi) {
+ int mi_stride = ctx->frame_info[0].mi_stride;
+
+ int mi_row = this_mi->mi_row_start;
+ int mi_col = this_mi->mi_col_start;
+
+ THIRD_PASS_MI_INFO *corner_mi =
+ &ctx->frame_info[0].mi_info[mi_row * mi_stride + mi_col];
+
+ return corner_mi->partition;
+}
+
+#else // !(CONFIG_THREE_PASS && CONFIG_AV1_DECODER)
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+ const char *file) {
+ (void)ctx;
+ (void)file;
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "To utilize three-pass encoding, libaom must be built "
+ "with CONFIG_THREE_PASS=1 & CONFIG_AV1_DECODER=1.");
+}
+
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read) {
+ (void)cpi;
+ (void)is_read;
+}
+
+void av1_close_second_pass_log(struct AV1_COMP *cpi) { (void)cpi; }
+
+void av1_write_second_pass_gop_info(struct AV1_COMP *cpi) { (void)cpi; }
+
+void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index) {
+ (void)cpi;
+ (void)gf_index;
+}
+
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+ THIRD_PASS_GOP_INFO *gop_info,
+ struct aom_internal_error_info *error) {
+ (void)second_pass_log_stream;
+ (void)gop_info;
+ (void)error;
+}
+
+void av1_read_second_pass_per_frame_info(
+ FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr,
+ int frame_info_count, struct aom_internal_error_info *error) {
+ (void)second_pass_log_stream;
+ (void)frame_info_arr;
+ (void)frame_info_count;
+ (void)error;
+}
+
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) {
+ (void)ctx;
+ return 1;
+}
+
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+ int fwidth, double *ratio_h, double *ratio_w) {
+ (void)ctx;
+ (void)fidx;
+ (void)fheight;
+ (void)fwidth;
+ (void)ratio_h;
+ (void)ratio_w;
+}
+
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+ int mi_row, int mi_col,
+ double ratio_h, double ratio_w) {
+ (void)ctx;
+ (void)fidx;
+ (void)mi_row;
+ (void)mi_col;
+ (void)ratio_h;
+ (void)ratio_w;
+ return NULL;
+}
+
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w,
+ MV_REFERENCE_FRAME frame) {
+ (void)this_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ (void)frame;
+ int_mv mv;
+ mv.as_int = INVALID_MV;
+ return mv;
+}
+
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h,
+ double ratio_w) {
+ (void)this_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ return BLOCK_INVALID;
+}
+
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+ double ratio_h, double ratio_w, int *mi_row,
+ int *mi_col) {
+ (void)third_pass_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ (void)mi_row;
+ (void)mi_col;
+}
+
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+ THIRD_PASS_MI_INFO *this_mi) {
+ (void)ctx;
+ (void)this_mi;
+ return PARTITION_INVALID;
+}
+#endif // CONFIG_THREE_PASS && CONFIG_AV1_DECODER
+
+#if CONFIG_BITRATE_ACCURACY
+static void fwrite_and_check(const void *ptr, size_t size, size_t nmemb,
+ FILE *stream,
+ struct aom_internal_error_info *error) {
+ size_t count = fwrite(ptr, size, nmemb, stream);
+ if (count < nmemb) {
+ aom_internal_error(error, AOM_CODEC_ERROR, "fwrite_and_check failed\n");
+ }
+}
+
+static void fread_and_check(void *ptr, size_t size, size_t nmemb, FILE *stream,
+ struct aom_internal_error_info *error) {
+ size_t count = fread(ptr, size, nmemb, stream);
+ if (count < nmemb) {
+ aom_internal_error(error, AOM_CODEC_ERROR, "fread_and_check failed\n");
+ }
+}
+
+void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group,
+ const TplParams *tpl_data) {
+ tpl_info->tpl_ready = tpl_data->ready;
+ if (tpl_info->tpl_ready) {
+ tpl_info->gf_length = gf_group->size;
+ for (int i = 0; i < tpl_info->gf_length; ++i) {
+ tpl_info->txfm_stats_list[i] = tpl_data->txfm_stats_list[i];
+ tpl_info->qstep_ratio_ls[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+ tpl_info->update_type_list[i] = gf_group->update_type[i];
+ }
+ }
+}
+
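+// On-disk record layout shared by av1_write_tpl_info()/av1_read_tpl_info():
+// [tpl_ready] and, when tpl_ready is nonzero, [gf_length],
+// [txfm_stats_list x gf_length], [qstep_ratio_ls x gf_length] and
+// [update_type_list x gf_length], all written with native endianness.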
+void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream,
+ struct aom_internal_error_info *error) {
+ fwrite_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1,
+ log_stream, error);
+ if (tpl_info->tpl_ready) {
+ fwrite_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1,
+ log_stream, error);
+ assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS);
+ fwrite_and_check(&tpl_info->txfm_stats_list,
+ sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length,
+ log_stream, error);
+ fwrite_and_check(&tpl_info->qstep_ratio_ls,
+ sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length,
+ log_stream, error);
+ fwrite_and_check(&tpl_info->update_type_list,
+ sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length,
+ log_stream, error);
+ }
+}
+
+void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream,
+ struct aom_internal_error_info *error) {
+ av1_zero(*tpl_info);
+ fread_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1,
+ log_stream, error);
+ if (tpl_info->tpl_ready) {
+ fread_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1,
+ log_stream, error);
+ assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS);
+ fread_and_check(&tpl_info->txfm_stats_list,
+ sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length,
+ log_stream, error);
+ fread_and_check(&tpl_info->qstep_ratio_ls,
+ sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length,
+ log_stream, error);
+ fread_and_check(&tpl_info->update_type_list,
+ sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length,
+ log_stream, error);
+ }
+}
+#endif // CONFIG_BITRATE_ACCURACY
diff --git a/third_party/aom/av1/encoder/thirdpass.h b/third_party/aom/av1/encoder/thirdpass.h
new file mode 100644
index 0000000000..8080c06cb6
--- /dev/null
+++ b/third_party/aom/av1/encoder/thirdpass.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_THIRDPASS_H_
+#define AOM_AV1_ENCODER_THIRDPASS_H_
+
+#include "av1/common/enums.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/tpl_model.h"
+
+struct AV1_COMP;
+
+// TODO(bohanli): optimize this number
+#define MAX_THIRD_PASS_BUF \
+ (AOMMAX((2 * MAX_GF_INTERVAL + 1), MAX_STATIC_GF_GROUP_LENGTH))
+
+// Struct to store useful information related to a GOP, in addition to what is
+// available in the bitstream.
+typedef struct {
+ int gf_length;
+ int num_frames;
+ int use_arf;
+} THIRD_PASS_GOP_INFO;
+
+#if CONFIG_BITRATE_ACCURACY
+typedef struct TPL_INFO {
+ int gf_length;
+ int tpl_ready;
+ TplTxfmStats txfm_stats_list[MAX_LENGTH_TPL_FRAME_STATS];
+ double qstep_ratio_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ FRAME_UPDATE_TYPE update_type_list[MAX_LENGTH_TPL_FRAME_STATS];
+} TPL_INFO;
+#endif // CONFIG_BITRATE_ACCURACY
+
+typedef struct {
+ BLOCK_SIZE bsize;
+ PARTITION_TYPE partition;
+ int mi_row_start;
+ int mi_col_start;
+ int_mv mv[2];
+ MV_REFERENCE_FRAME ref_frame[2];
+ PREDICTION_MODE pred_mode;
+} THIRD_PASS_MI_INFO;
+
+// Struct to store useful information about a frame for the third pass.
+// The members are extracted from the decoder by function get_frame_info.
+typedef struct {
+ int width;
+ int height;
+ int mi_stride;
+ int mi_rows;
+ int mi_cols;
+ int base_q_idx;
+ int is_show_existing_frame;
+ int is_show_frame;
+ int bits_allocated;
+ int actual_bits;
+ uint64_t sse;
+ double bpm_factor;
+ FRAME_TYPE frame_type;
+ unsigned int order_hint;
+ THIRD_PASS_MI_INFO *mi_info;
+} THIRD_PASS_FRAME_INFO;
+
+typedef struct {
+ /* --- Input and decoding related members --- */
+ // the input file
+ const char *input_file_name;
+#if CONFIG_THREE_PASS
+ // input context
+ struct AvxInputContext *input_ctx;
+#endif
+ // decoder codec context
+ aom_codec_ctx_t decoder;
+ // start of the frame in buf
+ const unsigned char *frame;
+ // end of the frame(s) in buf
+ const unsigned char *end_frame;
+ // whether we still have following frames in buf
+ int have_frame;
+ // pointer to buffer for the read frames
+ uint8_t *buf;
+ // size of data in buffer
+ size_t bytes_in_buffer;
+ // current buffer size
+ size_t buffer_size;
+ // error info pointer
+ struct aom_internal_error_info *err_info;
+
+ int this_frame_bits;
+
+ /* --- Members for third pass encoding --- */
+ // Array to store info about each frame.
+ // frame_info[0] should point to the current frame.
+ THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF];
+ // number of frames available in frame_info
+ int frame_info_count;
+ // the end of the previous GOP (order hint)
+ int prev_gop_end;
+ THIRD_PASS_GOP_INFO gop_info;
+} THIRD_PASS_DEC_CTX;
+
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+ const char *file);
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx);
+
+// Set the GOP structure from the twopass bitstream.
+// TODO(bohanli): This is currently a skeleton; we only return the GOP length.
+// This function also saves all frame information in the array ctx->frame_info
+// for this GOP.
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx);
+
+// Pop one frame out of the array ctx->frame_info. This function is used to make
+// sure that frame_info[0] always corresponds to the current frame.
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx);
+
+void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read);
+void av1_close_second_pass_log(struct AV1_COMP *cpi);
+
+// Write the current GOP information into the second pass log file.
+void av1_write_second_pass_gop_info(struct AV1_COMP *cpi);
+// Write the information of the frames in this GOP into the second pass log
+// file.
+void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index);
+
+// Read the next GOP information from the second pass log file.
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+ THIRD_PASS_GOP_INFO *gop_info,
+ struct aom_internal_error_info *error);
+// Read the information of the frames in the next GOP from the second pass
+// log file.
+void av1_read_second_pass_per_frame_info(FILE *second_pass_log_stream,
+ THIRD_PASS_FRAME_INFO *frame_info_arr,
+ int frame_info_count,
+ struct aom_internal_error_info *error);
+
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx);
+
+// Calculate the ratio of third pass frame dimensions over second pass frame
+// dimensions. Return them in ratio_h and ratio_w.
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+ int fwidth, double *ratio_h, double *ratio_w);
+
+// Get the pointer to a second pass mi info, where mi_row and mi_col are the mi
+// location in the thirdpass frame.
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+ int mi_row, int mi_col,
+ double ratio_h, double ratio_w);
+
+// Get the adjusted MV of this_mi associated with the given reference frame.
+// If no MV is associated with that reference frame, INVALID_MV is returned.
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w,
+ MV_REFERENCE_FRAME frame);
+
+// Get the adjusted block size of this_mi.
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w);
+
+// Get the adjusted mi position of a given third_pass_mi in the third pass
+// frame. The location is returned in mi_row and mi_col.
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+ double ratio_h, double ratio_w, int *mi_row,
+ int *mi_col);
+
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+ THIRD_PASS_MI_INFO *this_mi);
+
+#if CONFIG_BITRATE_ACCURACY
+
+void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group,
+ const TplParams *tpl_data);
+
+void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream,
+ struct aom_internal_error_info *error);
+
+void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream,
+ struct aom_internal_error_info *error);
+
+#endif // CONFIG_BITRATE_ACCURACY
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_THIRDPASS_H_
diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c
new file mode 100644
index 0000000000..ffac886e32
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/entropy.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+static AOM_INLINE int av1_fast_palette_color_index_context_on_edge(
+ const uint8_t *color_map, int stride, int r, int c, int *color_idx) {
+ const bool has_left = (c - 1 >= 0);
+ const bool has_above = (r - 1 >= 0);
+ assert(r > 0 || c > 0);
+ assert(has_above ^ has_left);
+ assert(color_idx);
+ (void)has_left;
+
+ const uint8_t color_neighbor = has_above
+ ? color_map[(r - 1) * stride + (c - 0)]
+ : color_map[(r - 0) * stride + (c - 1)];
+  // If the neighbor color has a higher index than the current color index,
+  // then we move up by 1.
+ const uint8_t current_color = *color_idx = color_map[r * stride + c];
+ if (color_neighbor > current_color) {
+ (*color_idx)++;
+ } else if (color_neighbor == current_color) {
+ *color_idx = 0;
+ }
+
+ // Get hash value of context.
+ // The non-diagonal neighbors get a weight of 2.
+ const uint8_t color_score = 2;
+ const uint8_t hash_multiplier = 1;
+ const uint8_t color_index_ctx_hash = color_score * hash_multiplier;
+
+ // Lookup context from hash.
+ const int color_index_ctx =
+ av1_palette_color_index_context_lookup[color_index_ctx_hash];
+ assert(color_index_ctx == 0);
+ (void)color_index_ctx;
+ return 0;
+}
+
+#define SWAP(i, j) \
+ do { \
+ const uint8_t tmp_score = score_rank[i]; \
+ const uint8_t tmp_color = color_rank[i]; \
+ score_rank[i] = score_rank[j]; \
+ color_rank[i] = color_rank[j]; \
+ score_rank[j] = tmp_score; \
+ color_rank[j] = tmp_color; \
+ } while (0)
+#define INVALID_COLOR_IDX (UINT8_MAX)
+
+// A faster version of av1_get_palette_color_index_context used by the
+// encoder, exploiting the fact that the encoder does not need to maintain a
+// color order.
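+// Worked example: when all three neighbors share one color, the scores merge
+// to {5}, so the context hash is 5 * 1 = 5 and the resulting context is
+// 9 - 5 = 4. With three distinct neighbors the sorted scores stay {2, 2, 1},
+// giving a hash of 2*1 + 2*2 + 1*2 = 8 and context 9 - 8 = 1.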
+static AOM_INLINE int av1_fast_palette_color_index_context(
+ const uint8_t *color_map, int stride, int r, int c, int *color_idx) {
+ assert(r > 0 || c > 0);
+
+ const bool has_above = (r - 1 >= 0);
+ const bool has_left = (c - 1 >= 0);
+ assert(has_above || has_left);
+ if (has_above ^ has_left) {
+ return av1_fast_palette_color_index_context_on_edge(color_map, stride, r, c,
+ color_idx);
+ }
+
+  // This goes in the order of left, top, and top-left. This ordering has the
+  // advantage that, unless some entries are duplicated or invalid, the array
+  // will already be in sorted order. Furthermore, if either of the first two
+  // entries is invalid, we know the last one is also invalid.
+ uint8_t color_neighbors[NUM_PALETTE_NEIGHBORS];
+ color_neighbors[0] = color_map[(r - 0) * stride + (c - 1)];
+ color_neighbors[1] = color_map[(r - 1) * stride + (c - 0)];
+ color_neighbors[2] = color_map[(r - 1) * stride + (c - 1)];
+
+ // Aggregate duplicated values.
+  // Since our array is so small, using a couple of if statements is faster.
+ uint8_t scores[NUM_PALETTE_NEIGHBORS] = { 2, 2, 1 };
+ uint8_t num_invalid_colors = 0;
+ if (color_neighbors[0] == color_neighbors[1]) {
+ scores[0] += scores[1];
+ color_neighbors[1] = INVALID_COLOR_IDX;
+ num_invalid_colors += 1;
+
+ if (color_neighbors[0] == color_neighbors[2]) {
+ scores[0] += scores[2];
+ num_invalid_colors += 1;
+ }
+ } else if (color_neighbors[0] == color_neighbors[2]) {
+ scores[0] += scores[2];
+ num_invalid_colors += 1;
+ } else if (color_neighbors[1] == color_neighbors[2]) {
+ scores[1] += scores[2];
+ num_invalid_colors += 1;
+ }
+
+ const uint8_t num_valid_colors = NUM_PALETTE_NEIGHBORS - num_invalid_colors;
+
+ uint8_t *color_rank = color_neighbors;
+ uint8_t *score_rank = scores;
+
+ // Sort everything
+ if (num_valid_colors > 1) {
+ if (color_neighbors[1] == INVALID_COLOR_IDX) {
+ scores[1] = scores[2];
+ color_neighbors[1] = color_neighbors[2];
+ }
+
+    // Swap the first two elements if the scores are out of order, or if they
+    // have the same score but the color indices are not in the right order.
+ if (score_rank[0] < score_rank[1] ||
+ (score_rank[0] == score_rank[1] && color_rank[0] > color_rank[1])) {
+ SWAP(0, 1);
+ }
+ if (num_valid_colors > 2) {
+ if (score_rank[0] < score_rank[2]) {
+ SWAP(0, 2);
+ }
+ if (score_rank[1] < score_rank[2]) {
+ SWAP(1, 2);
+ }
+ }
+ }
+
+  // If any of the neighbor colors has a higher index than the current color
+  // index, then we move up by 1 unless the current color is the same as one
+  // of the neighbors.
+ const uint8_t current_color = *color_idx = color_map[r * stride + c];
+ for (int idx = 0; idx < num_valid_colors; idx++) {
+ if (color_rank[idx] > current_color) {
+ (*color_idx)++;
+ } else if (color_rank[idx] == current_color) {
+ *color_idx = idx;
+ break;
+ }
+ }
+
+ // Get hash value of context.
+ uint8_t color_index_ctx_hash = 0;
+ static const uint8_t hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
+ for (int idx = 0; idx < num_valid_colors; ++idx) {
+ color_index_ctx_hash += score_rank[idx] * hash_multipliers[idx];
+ }
+ assert(color_index_ctx_hash > 0);
+ assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
+
+ // Lookup context from hash.
+ const int color_index_ctx = 9 - color_index_ctx_hash;
+ assert(color_index_ctx ==
+ av1_palette_color_index_context_lookup[color_index_ctx_hash]);
+ assert(color_index_ctx >= 0);
+ assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
+ return color_index_ctx;
+}
+#undef INVALID_COLOR_IDX
+#undef SWAP
+
+static int cost_and_tokenize_map(Av1ColorMapParam *param, TokenExtra **t,
+ int plane, int calc_rate, int allow_update_cdf,
+ FRAME_COUNTS *counts) {
+ const uint8_t *const color_map = param->color_map;
+ MapCdf map_cdf = param->map_cdf;
+ ColorCost color_cost = param->color_cost;
+ const int plane_block_width = param->plane_width;
+ const int rows = param->rows;
+ const int cols = param->cols;
+ const int n = param->n_colors;
+ const int palette_size_idx = n - PALETTE_MIN_SIZE;
+ int this_rate = 0;
+
+ (void)plane;
+ (void)counts;
+
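+  // Traverse the color map in anti-diagonal (wavefront) order so that the
+  // left, above, and above-left neighbors of each pixel have already been
+  // visited when its context is computed.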
+ for (int k = 1; k < rows + cols - 1; ++k) {
+ for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
+ int i = k - j;
+ int color_new_idx;
+ const int color_ctx = av1_fast_palette_color_index_context(
+ color_map, plane_block_width, i, j, &color_new_idx);
+ assert(color_new_idx >= 0 && color_new_idx < n);
+ if (calc_rate) {
+ this_rate += color_cost[palette_size_idx][color_ctx][color_new_idx];
+ } else {
+ (*t)->token = color_new_idx;
+ (*t)->color_ctx = color_ctx;
+ ++(*t);
+ if (allow_update_cdf)
+ update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n);
+#if CONFIG_ENTROPY_STATS
+ if (plane) {
+ ++counts->palette_uv_color_index[palette_size_idx][color_ctx]
+ [color_new_idx];
+ } else {
+ ++counts->palette_y_color_index[palette_size_idx][color_ctx]
+ [color_new_idx];
+ }
+#endif
+ }
+ }
+ }
+ if (calc_rate) return this_rate;
+ return 0;
+}
+
+static void get_palette_params(const MACROBLOCK *const x, int plane,
+ BLOCK_SIZE bsize, Av1ColorMapParam *params) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ params->color_map = xd->plane[plane].color_index_map;
+ params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
+ : xd->tile_ctx->palette_y_color_index_cdf;
+ params->color_cost = plane ? x->mode_costs.palette_uv_color_cost
+ : x->mode_costs.palette_y_color_cost;
+ params->n_colors = pmi->palette_size[plane];
+ av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL,
+ &params->rows, &params->cols);
+}
+
+// TODO(any): Remove this function
+static void get_color_map_params(const MACROBLOCK *const x, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type,
+ Av1ColorMapParam *params) {
+ (void)tx_size;
+ memset(params, 0, sizeof(*params));
+ switch (type) {
+ case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break;
+ default: assert(0 && "Invalid color map type"); return;
+ }
+}
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, COLOR_MAP_TYPE type) {
+ assert(plane == 0 || plane == 1);
+ Av1ColorMapParam color_map_params;
+ get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+ return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL);
+}
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+ TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type, int allow_update_cdf,
+ FRAME_COUNTS *counts) {
+ assert(plane == 0 || plane == 1);
+ Av1ColorMapParam color_map_params;
+ get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+  // The first color index does not use context or entropy coding.
+ (*t)->token = color_map_params.color_map[0];
+ (*t)->color_ctx = -1;
+ ++(*t);
+ cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf,
+ counts);
+}
+
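+// Recursively walk the variable transform-size tree: when the current tx size
+// matches the coded tx size (always the case for chroma planes), tokenize the
+// transform block; otherwise split into sub-transforms and recurse.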
+static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
+ int block, int plane, void *arg) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+
+ if (tx_size == plane_tx_size || plane) {
+ plane_bsize =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+ struct tokenize_b_args *args = arg;
+ if (args->allow_update_cdf)
+ av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
+ plane_bsize, tx_size, arg);
+ else
+ av1_record_txb_context(plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, arg);
+
+ } else {
+    // Half the block size in transform block units.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsw * bsh;
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < row_end; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += bsw) {
+ const int offsetc = blk_col + col;
+
+ tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane,
+ arg);
+ block += step;
+ }
+ }
+ }
+}
+
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ uint8_t allow_update_cdf) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+ return;
+
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
+
+ if (mbmi->skip_txfm) {
+ av1_reset_entropy_context(xd, bsize, num_planes);
+ return;
+ }
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ const int bw = mi_size_wide[txb_size];
+ const int bh = mi_size_high[txb_size];
+ int block = 0;
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+
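+    // Tokenize in units of at most 64x64 luma pixels so transform blocks are
+    // visited in the same order in which the bitstream writer emits them.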
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+
+ mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
+
+ for (int idy = 0; idy < mi_height; idy += mu_blocks_high) {
+ for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+ const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
+ const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
+ for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
+ for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
+ tokenize_vartx(td, max_tx_size, plane_bsize, blk_row, blk_col,
+ block, plane, &arg);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+ if (rate) *rate += arg.this_rate;
+}
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
new file mode 100644
index 0000000000..f675c489ae
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TOKENIZE_H_
+#define AOM_AV1_ENCODER_TOKENIZE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/encoder/block.h"
+#include "aom_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The token and color_ctx members of the TokenExtra structure are used
+// to store the indices of color and color context of each pixel in
+// case of palette mode.
+// 1) token can take values in the range of [0, 7], as the maximum number of
+// possible colors is 8 (PALETTE_COLORS). Hence token requires 3 bits
+// (unsigned).
+// 2) The reserved field (1-bit) is positioned such that color_ctx occupies the
+// most significant bits and token occupies the least significant bits of the
+// byte. Thus accesses to token and color_ctx are optimal. If TokenExtra is
+// defined as:
+// typedef struct {
+// int8_t color_ctx : 4;
+// uint8_t token : 3;
+// } TokenExtra;
+// then reading color_ctx requires an extra left shift to facilitate sign
+// extension, and writing token requires an extra masking operation.
+// 3) color_ctx can take 5 (PALETTE_COLOR_INDEX_CONTEXTS) valid values, i.e.,
+// from 0 to 4. As per the current implementation it can take values in the
+// range of [-1, 4]. Here -1 corresponds to invalid color index context and is
+// used for default initialization. Hence color_ctx requires 4 bits (signed).
+typedef struct {
+ uint8_t token : 3;
+ uint8_t reserved : 1;
+ int8_t color_ctx : 4;
+} TokenExtra;
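+// A minimal sanity sketch (an assumption about bit-field packing, which the
+// layout notes above rely on but do not enforce):
+//   static_assert(sizeof(TokenExtra) == 1, "TokenExtra should be one byte");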
+
+typedef struct {
+ TokenExtra *start;
+ unsigned int count;
+} TokenList;
+
+typedef struct {
+ // Number of tile tokens for which memory is allocated.
+ unsigned int tokens_allocated;
+ // tile_tok[i][j] is a pointer to the buffer storing palette tokens of the ith
+ // tile row, jth tile column.
+ TokenExtra *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+ // tplist[i][j][k] holds the start pointer of tile_tok[i][j] and the count of
+ // palette tokens for the kth superblock row of the ith tile row, jth tile
+ // column.
+ TokenList *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+} TokenInfo;
+
+struct AV1_COMP;
+struct ThreadData;
+struct FRAME_COUNTS;
+
+enum {
+ OUTPUT_ENABLED = 0,
+ DRY_RUN_NORMAL,
+ DRY_RUN_COSTCOEFFS,
+} UENUM1BYTE(RUN_TYPE);
+
+struct tokenize_b_args {
+ const struct AV1_COMP *cpi;
+ struct ThreadData *td;
+ int this_rate;
+ uint8_t allow_update_cdf;
+ RUN_TYPE dry_run;
+};
+
+// Note: in all the tokenize functions, if rate is non-NULL it is incremented
+// by the coefficient token cost only when dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
+void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ uint8_t allow_update_cdf);
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, COLOR_MAP_TYPE type);
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+ TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type, int allow_update_cdf,
+ struct FRAME_COUNTS *counts);
+
+static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
+ TX_SIZE tx_size) {
+ const int eob_max = av1_get_max_eob(tx_size);
+ return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}
+
+// Token buffer is only used for palette tokens.
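+// Illustrative sizing (hypothetical numbers): a 1280x720 frame has
+// mb_rows = 45 and mb_cols = 80; with 128x128 superblocks (sb_size_log2 = 7)
+// and num_planes = 3 this gives sb_rows = 6, sb_cols = 10 and
+// sb_palette_toks = 2 * 128 * 128 = 32768, i.e. 6 * 10 * 32768 = 1966080
+// tokens in total.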
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
+ int sb_size_log2,
+ const int num_planes) {
+  // Calculate the maximum number of superblocks in the image.
+ const int shift = sb_size_log2 - 4;
+ const int sb_size = 1 << sb_size_log2;
+ const int sb_size_square = sb_size * sb_size;
+ const int sb_rows = CEIL_POWER_OF_TWO(mb_rows, shift);
+ const int sb_cols = CEIL_POWER_OF_TWO(mb_cols, shift);
+
+ // One palette token for each pixel. There can be palettes on two planes.
+ const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square;
+
+ return sb_rows * sb_cols * sb_palette_toks;
+}
+
+// Allocate memory for token related info.
+static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info,
+ unsigned int tokens_required) {
+ int sb_rows =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+ token_info->tokens_allocated = tokens_required;
+
+ CHECK_MEM_ERROR(cm, token_info->tile_tok[0][0],
+ (TokenExtra *)aom_calloc(
+ tokens_required, sizeof(*token_info->tile_tok[0][0])));
+
+ CHECK_MEM_ERROR(
+ cm, token_info->tplist[0][0],
+ (TokenList *)aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+ sizeof(*token_info->tplist[0][0])));
+}
+
+// Check if memory allocation has been done for token related info.
+static AOM_INLINE bool is_token_info_allocated(const TokenInfo *token_info) {
+ return ((token_info->tile_tok[0][0] != NULL) &&
+ (token_info->tplist[0][0] != NULL));
+}
+
+// Free memory from token related variables.
+static AOM_INLINE void free_token_info(TokenInfo *token_info) {
+ aom_free(token_info->tile_tok[0][0]);
+ token_info->tile_tok[0][0] = NULL;
+
+ aom_free(token_info->tplist[0][0]);
+ token_info->tplist[0][0] = NULL;
+
+ token_info->tokens_allocated = 0;
+}
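+// Typical lifecycle of the helpers above (a hypothetical caller sketch, not
+// code from this library):
+//   TokenInfo info = { 0 };
+//   const unsigned int need =
+//       get_token_alloc(mb_rows, mb_cols, sb_size_log2, num_planes);
+//   if (!is_token_info_allocated(&info)) alloc_token_info(cm, &info, need);
+//   ... encode, writing palette tokens into info.tile_tok ...
+//   free_token_info(&info);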
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TOKENIZE_H_
diff --git a/third_party/aom/av1/encoder/tpl_model.c b/third_party/aom/av1/encoder/tpl_model.c
new file mode 100644
index 0000000000..ca60e4981e
--- /dev/null
+++ b/third_party/aom/av1/encoder/tpl_model.c
@@ -0,0 +1,2511 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <stdint.h>
+
+#include "av1/encoder/thirdpass.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
+
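+// Clamp exp() to the representable double range. For instance,
+// exp_bounded(710) returns DBL_MAX rather than letting exp() overflow to
+// +inf, and exp_bounded(-710) returns 0.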
+static INLINE double exp_bounded(double v) {
+  // When v > 700 or v < -700, exp(v) is close to overflowing (or
+  // underflowing) the range of a double.
+ // For details, see the "Notes" in the following link.
+ // https://en.cppreference.com/w/c/numeric/math/exp
+ if (v > 700) {
+ return DBL_MAX;
+ } else if (v < -700) {
+ return 0;
+ }
+ return exp(v);
+}
+
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats) {
+ tpl_txfm_stats->ready = 0;
+ tpl_txfm_stats->coeff_num = 256;
+ tpl_txfm_stats->txfm_block_count = 0;
+ memset(tpl_txfm_stats->abs_coeff_sum, 0,
+ sizeof(tpl_txfm_stats->abs_coeff_sum[0]) * tpl_txfm_stats->coeff_num);
+ memset(tpl_txfm_stats->abs_coeff_mean, 0,
+ sizeof(tpl_txfm_stats->abs_coeff_mean[0]) * tpl_txfm_stats->coeff_num);
+}
+
+#if CONFIG_BITRATE_ACCURACY
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+ TplTxfmStats *accumulated_stats) {
+ accumulated_stats->txfm_block_count += sub_stats->txfm_block_count;
+ for (int i = 0; i < accumulated_stats->coeff_num; ++i) {
+ accumulated_stats->abs_coeff_sum[i] += sub_stats->abs_coeff_sum[i];
+ }
+}
+
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+ const tran_low_t *coeff) {
+  // For transforms larger than 16x16, the scale of coeff needs to be
+  // adjusted; it is not simply LOSSLESS_Q_STEP.
+ assert(tpl_txfm_stats->coeff_num <= 256);
+ for (int i = 0; i < tpl_txfm_stats->coeff_num; ++i) {
+ tpl_txfm_stats->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP;
+ }
+ ++tpl_txfm_stats->txfm_block_count;
+}
+
+void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats) {
+ if (txfm_stats->txfm_block_count > 0) {
+ for (int j = 0; j < txfm_stats->coeff_num; j++) {
+ txfm_stats->abs_coeff_mean[j] =
+ txfm_stats->abs_coeff_sum[j] / txfm_stats->txfm_block_count;
+ }
+ txfm_stats->ready = 1;
+ } else {
+ txfm_stats->ready = 0;
+ }
+}
+
+static AOM_INLINE void av1_tpl_store_txfm_stats(
+ TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats,
+ const int frame_index) {
+ tpl_data->txfm_stats_list[frame_index] = *tpl_txfm_stats;
+}
+#endif // CONFIG_BITRATE_ACCURACY
+
+static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane,
+ const tran_low_t *coeff,
+ tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, TX_SIZE tx_size,
+ uint16_t *eob, int64_t *recon_error,
+ int64_t *sse) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+ int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+ const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+ QUANT_PARAM quant_param;
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob,
+ scan_order, &quant_param);
+ *recon_error =
+ av1_highbd_block_error(coeff, dqcoeff, pix_num, sse, xd->bd) >> shift;
+ } else {
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
+ &quant_param);
+ *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+ }
+#else
+ (void)xd;
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
+ &quant_param);
+ *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ *recon_error = AOMMAX(*recon_error, 1);
+
+ *sse = (*sse) >> shift;
+ *sse = AOMMAX(*sse, 1);
+}
+
+static AOM_INLINE void set_tpl_stats_block_size(uint8_t *block_mis_log2,
+ uint8_t *tpl_bsize_1d) {
+ // tpl stats bsize: 2 means 16x16
+ *block_mis_log2 = 2;
+ // Block size used in tpl motion estimation
+ *tpl_bsize_1d = 16;
+  // The minimum supported TPL block size is 16 (MIN_TPL_BSIZE_1D).
+ assert(*tpl_bsize_1d >= 16);
+}
+
+void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi,
+ CommonModeInfoParams *const mi_params, int width,
+ int height, int byte_alignment, int lag_in_frames) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ TplParams *const tpl_data = &ppi->tpl_data;
+ set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
+ &tpl_data->tpl_bsize_1d);
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ tpl_data->border_in_pixels =
+ ALIGN_POWER_OF_TWO(tpl_data->tpl_bsize_1d + 2 * AOM_INTERP_EXTEND, 5);
+
+ const int alloc_y_plane_only =
+ ppi->cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : 0;
+ for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
+ const int mi_cols =
+ ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2);
+ const int mi_rows =
+ ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2);
+ TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame];
+ tpl_frame->is_valid = 0;
+ tpl_frame->width = mi_cols >> block_mis_log2;
+ tpl_frame->height = mi_rows >> block_mis_log2;
+ tpl_frame->stride = tpl_data->tpl_stats_buffer[frame].width;
+ tpl_frame->mi_rows = mi_params->mi_rows;
+ tpl_frame->mi_cols = mi_params->mi_cols;
+ }
+ tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
+
+ // If lag_in_frames <= 1, TPL module is not invoked. Hence dynamic memory
+ // allocations are avoided for buffers in tpl_data.
+ if (lag_in_frames <= 1) return;
+
+ AOM_CHECK_MEM_ERROR(&ppi->error, tpl_data->txfm_stats_list,
+ aom_calloc(MAX_LENGTH_TPL_FRAME_STATS,
+ sizeof(*tpl_data->txfm_stats_list)));
+
+ for (int frame = 0; frame < lag_in_frames; ++frame) {
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, tpl_data->tpl_stats_pool[frame],
+ aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
+ tpl_data->tpl_stats_buffer[frame].height,
+ sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
+
+ if (aom_alloc_frame_buffer(
+ &tpl_data->tpl_rec_pool[frame], width, height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, tpl_data->border_in_pixels,
+ byte_alignment, 0, alloc_y_plane_only))
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+}
+
+static AOM_INLINE int32_t tpl_get_satd_cost(BitDepthInfo bd_info,
+ int16_t *src_diff, int diff_stride,
+ const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ tran_low_t *coeff, int bw, int bh,
+ TX_SIZE tx_size) {
+ const int pix_num = bw * bh;
+
+ av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+ dst, dst_stride);
+ av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);
+ return aom_satd(coeff, pix_num);
+}
+
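+// A crude rate model: each coefficient up to the eob is charged
+// get_msb(|level| + 1) + 1 bits for the magnitude plus one bit for the sign
+// when nonzero, and the total is scaled into the AV1_PROB_COST_SHIFT domain.
+// E.g. a coefficient of magnitude 3 contributes get_msb(4) + 1 + 1 = 4.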
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+
+ assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+ int rate_cost = 1;
+
+ for (int idx = 0; idx < eob; ++idx) {
+ unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+ rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0);
+ }
+
+ return (rate_cost << AV1_PROB_COST_SHIFT);
+}
+
+static AOM_INLINE void txfm_quant_rdcost(
+ const MACROBLOCK *x, int16_t *src_diff, int diff_stride, uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size,
+ int do_recon, int *rate_cost, int64_t *recon_error, int64_t *sse) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ uint16_t eob;
+ av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+ dst, dst_stride);
+ av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);
+
+ get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error,
+ sse);
+
+ *rate_cost = rate_estimator(qcoeff, eob, tx_size);
+
+ if (do_recon)
+ av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst,
+ dst_stride, eob, 0);
+}
+
+static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
+ uint8_t *cur_frame_buf,
+ uint8_t *ref_frame_buf, int stride,
+ int ref_stride, int width, int ref_width,
+ BLOCK_SIZE bsize, MV center_mv,
+ int_mv *best_mv) {
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+ int step_param;
+ uint32_t bestsme = UINT_MAX;
+ FULLPEL_MV_STATS best_mv_stats;
+ int distortion;
+ uint32_t sse;
+ int cost_list[5];
+ FULLPEL_MV start_mv = get_fullmv_from_mv(&center_mv);
+
+ // Setup frame pointers
+ x->plane[0].src.buf = cur_frame_buf;
+ x->plane[0].src.stride = stride;
+ x->plane[0].src.width = width;
+ xd->plane[0].pre[0].buf = ref_frame_buf;
+ xd->plane[0].pre[0].stride = ref_stride;
+ xd->plane[0].pre[0].width = ref_width;
+
+ step_param = tpl_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ const search_site_config *search_site_cfg =
+ cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
+ if (search_site_cfg->stride != ref_stride)
+ search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
+ assert(search_site_cfg->stride == ref_stride);
+
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
+ start_mv, search_site_cfg,
+ tpl_sf->search_method,
+ /*fine_search_interval=*/0);
+
+ bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv->as_fullmv, &best_mv_stats, NULL);
+
+ // When sub-pel motion search is skipped, populate sub-pel precision MV and
+ // return.
+ if (tpl_sf->subpel_force_stop == FULL_PEL) {
+ best_mv->as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ return bestsme;
+ }
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &center_mv,
+ cost_list);
+ ms_params.forced_stop = tpl_sf->subpel_force_stop;
+ ms_params.var_params.subpel_search_type = USE_2_TAPS;
+ ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ bestsme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv->as_mv,
+ &distortion, &sse, NULL);
+
+ return bestsme;
+}
+
+typedef struct {
+ int_mv mv;
+ int sad;
+} center_mv_t;
+
+static int compare_sad(const void *a, const void *b) {
+ const int diff = ((center_mv_t *)a)->sad - ((center_mv_t *)b)->sad;
+ if (diff < 0)
+ return -1;
+ else if (diff > 0)
+ return 1;
+ return 0;
+}
+
+static int is_alike_mv(int_mv candidate_mv, center_mv_t *center_mvs,
+ int center_mvs_count, int skip_alike_starting_mv) {
+  // The MV difference threshold is in 1/8-pel precision.
+ const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) };
+ int thr = mv_diff_thr[skip_alike_starting_mv];
+ int i;
+
+ for (i = 0; i < center_mvs_count; i++) {
+ if (abs(center_mvs[i].mv.as_mv.col - candidate_mv.as_mv.col) < thr &&
+ abs(center_mvs[i].mv.as_mv.row - candidate_mv.as_mv.row) < thr)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void get_rate_distortion(
+ int *rate_cost, int64_t *recon_error, int64_t *pred_error,
+ int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x,
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3],
+ const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode,
+ int mi_row, int mi_col, int use_y_only_rate_distortion, int do_recon,
+ TplTxfmStats *tpl_txfm_stats) {
+ const SequenceHeader *seq_params = cm->seq_params;
+ *rate_cost = 0;
+ *recon_error = 1;
+ *pred_error = 1;
+
+ (void)tpl_txfm_stats;
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ int is_compound = (best_mode == NEW_NEWMV);
+ int num_planes = use_y_only_rate_distortion ? 1 : MAX_MB_PLANE;
+
+ uint8_t *src_buffer_pool[MAX_MB_PLANE] = {
+ xd->cur_buf->y_buffer,
+ xd->cur_buf->u_buffer,
+ xd->cur_buf->v_buffer,
+ };
+ const int src_stride_pool[MAX_MB_PLANE] = {
+ xd->cur_buf->y_stride,
+ xd->cur_buf->uv_stride,
+ xd->cur_buf->uv_stride,
+ };
+
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ BLOCK_SIZE bsize_plane =
+ av1_ss_size_lookup[txsize_to_bsize[tx_size]][pd->subsampling_x]
+ [pd->subsampling_y];
+
+ int dst_buffer_stride = rec_stride_pool[plane];
+ int dst_mb_offset =
+ ((mi_row * MI_SIZE * dst_buffer_stride) >> pd->subsampling_y) +
+ ((mi_col * MI_SIZE) >> pd->subsampling_x);
+ uint8_t *dst_buffer = rec_buffer_pool[plane] + dst_mb_offset;
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ if (!is_inter_mode(best_mode)) {
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize_plane], block_size_high[bsize_plane],
+ max_txsize_rect_lookup[bsize_plane], best_mode, 0, 0,
+ FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, dst_buffer,
+ dst_buffer_stride, 0, 0, plane);
+ } else {
+ int_mv best_mv = xd->mi[0]->mv[ref];
+ uint8_t *ref_buffer_pool[MAX_MB_PLANE] = {
+ ref_frame_ptr[ref]->y_buffer,
+ ref_frame_ptr[ref]->u_buffer,
+ ref_frame_ptr[ref]->v_buffer,
+ };
+ InterPredParams inter_pred_params;
+ struct buf_2d ref_buf = {
+ NULL, ref_buffer_pool[plane],
+ plane ? ref_frame_ptr[ref]->uv_width : ref_frame_ptr[ref]->y_width,
+ plane ? ref_frame_ptr[ref]->uv_height : ref_frame_ptr[ref]->y_height,
+ plane ? ref_frame_ptr[ref]->uv_stride : ref_frame_ptr[ref]->y_stride
+ };
+ av1_init_inter_params(&inter_pred_params, block_size_wide[bsize_plane],
+ block_size_high[bsize_plane],
+ (mi_row * MI_SIZE) >> pd->subsampling_y,
+ (mi_col * MI_SIZE) >> pd->subsampling_x,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), 0,
+ xd->block_ref_scale_factors[0], &ref_buf, kernel);
+ if (is_compound) av1_init_comp_mode(&inter_pred_params);
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+
+ av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride,
+ &best_mv.as_mv, &inter_pred_params);
+ }
+ }
+
+ int src_stride = src_stride_pool[plane];
+ int src_mb_offset = ((mi_row * MI_SIZE * src_stride) >> pd->subsampling_y) +
+ ((mi_col * MI_SIZE) >> pd->subsampling_x);
+
+ int this_rate = 1;
+ int64_t this_recon_error = 1;
+ int64_t sse;
+ txfm_quant_rdcost(
+ x, src_diff, block_size_wide[bsize_plane],
+ src_buffer_pool[plane] + src_mb_offset, src_stride, dst_buffer,
+ dst_buffer_stride, coeff, qcoeff, dqcoeff, block_size_wide[bsize_plane],
+ block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane],
+ do_recon, &this_rate, &this_recon_error, &sse);
+
+#if CONFIG_BITRATE_ACCURACY
+ if (plane == 0 && tpl_txfm_stats) {
+      // We only collect the Y plane's transform coefficients.
+ av1_record_tpl_txfm_block(tpl_txfm_stats, coeff);
+ }
+#endif // CONFIG_BITRATE_ACCURACY
+
+ *recon_error += this_recon_error;
+ *pred_error += sse;
+ *rate_cost += this_rate;
+ }
+}
+
+static AOM_INLINE int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd,
+ const uint8_t *src_mb_buffer,
+ int src_stride,
+ TplBuffers *tpl_tmp_buffers,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int mi_row, int mi_col, int rf_idx,
+ MV *rfidx_mv, int use_pred_sad) {
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ TplParams *tpl_data = &cpi->ppi->tpl_data;
+ const YV12_BUFFER_CONFIG *const ref_frame_ptr =
+ tpl_data->src_ref_frame[rf_idx];
+ int16_t *src_diff = tpl_tmp_buffers->src_diff;
+ tran_low_t *coeff = tpl_tmp_buffers->coeff;
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+ int32_t inter_cost;
+
+ if (cpi->sf.tpl_sf.subpel_force_stop != FULL_PEL) {
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ uint8_t *predictor8 = tpl_tmp_buffers->predictor8;
+ uint8_t *predictor =
+ is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+ struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
+ ref_frame_ptr->y_width, ref_frame_ptr->y_height,
+ ref_frame_ptr->y_stride };
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+ mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
+ &tpl_data->sf, &ref_buf, kernel);
+ inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
+
+ av1_enc_build_one_inter_predictor(predictor, bw, rfidx_mv,
+ &inter_pred_params);
+
+ if (use_pred_sad) {
+ inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(src_mb_buffer, src_stride,
+ predictor, bw);
+ } else {
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
+ } else {
+ int ref_mb_offset =
+ mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
+ uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
+ int ref_stride = ref_frame_ptr->y_stride;
+ const FULLPEL_MV fullmv = get_fullmv_from_mv(rfidx_mv);
+    // Since sub-pel motion search is not performed, use the prediction pixels
+    // directly from the reference block ref_mb.
+ if (use_pred_sad) {
+ inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride,
+ &ref_mb[fullmv.row * ref_stride + fullmv.col], ref_stride);
+ } else {
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ &ref_mb[fullmv.row * ref_stride + fullmv.col],
+ ref_stride, coeff, bw, bh, tx_size);
+ }
+ }
+ return inter_cost;
+}
+
+static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
+ TplTxfmStats *tpl_txfm_stats,
+ TplBuffers *tpl_tmp_buffers,
+ MACROBLOCK *x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ TplDepStats *tpl_stats) {
+ AV1_COMMON *cm = &cpi->common;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+
+ (void)gf_group;
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ TplParams *tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+
+ int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index;
+
+ int32_t best_intra_cost = INT32_MAX;
+ int32_t intra_cost;
+ PREDICTION_MODE best_mode = DC_PRED;
+
+ const int mb_y_offset =
+ mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset;
+ const int src_stride = xd->cur_buf->y_stride;
+ const int src_width = xd->cur_buf->y_width;
+
+ int dst_mb_offset =
+ mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE;
+ uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset;
+ int dst_buffer_stride = tpl_frame->rec_picture->y_stride;
+ int use_y_only_rate_distortion = tpl_sf->use_y_only_rate_distortion;
+
+ uint8_t *rec_buffer_pool[3] = {
+ tpl_frame->rec_picture->y_buffer,
+ tpl_frame->rec_picture->u_buffer,
+ tpl_frame->rec_picture->v_buffer,
+ };
+
+ const int rec_stride_pool[3] = {
+ tpl_frame->rec_picture->y_stride,
+ tpl_frame->rec_picture->uv_stride,
+ tpl_frame->rec_picture->uv_stride,
+ };
+
+ for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ pd->subsampling_x = xd->cur_buf->subsampling_x;
+ pd->subsampling_y = xd->cur_buf->subsampling_y;
+ }
+
+ uint8_t *predictor8 = tpl_tmp_buffers->predictor8;
+ int16_t *src_diff = tpl_tmp_buffers->src_diff;
+ tran_low_t *coeff = tpl_tmp_buffers->coeff;
+ tran_low_t *qcoeff = tpl_tmp_buffers->qcoeff;
+ tran_low_t *dqcoeff = tpl_tmp_buffers->dqcoeff;
+ uint8_t *predictor =
+ is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+ int64_t recon_error = 1;
+ int64_t pred_error = 1;
+
+ memset(tpl_stats, 0, sizeof(*tpl_stats));
+ tpl_stats->ref_frame_index[0] = -1;
+ tpl_stats->ref_frame_index[1] = -1;
+
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+ set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+ cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+ av1_num_planes(cm));
+ xd->mi[0]->bsize = bsize;
+ xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+
+ // Intra prediction search
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+  // Pre-load the bottom-left line by replicating the last available pixel of
+  // the left neighboring column.
+ if (xd->left_available &&
+ mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) {
+ if (is_cur_buf_hbd(xd)) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
+ for (int i = 0; i < bw; ++i)
+ dst[(bw + i) * dst_buffer_stride - 1] =
+ dst[(bw - 1) * dst_buffer_stride - 1];
+ } else {
+ for (int i = 0; i < bw; ++i)
+ dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+ dst_buffer[(bw - 1) * dst_buffer_stride - 1];
+ }
+ }
+
+  // If cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED,
+  // H_PRED, and V_PRED.
+ const PREDICTION_MODE last_intra_mode =
+ tpl_sf->prune_intra_modes ? D45_PRED : INTRA_MODE_END;
+ const SequenceHeader *seq_params = cm->seq_params;
+ for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode;
+ ++mode) {
+ av1_predict_intra_block(xd, seq_params->sb_size,
+ seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize],
+ tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+ dst_buffer_stride, predictor, bw, 0, 0, 0);
+
+ if (tpl_frame->use_pred_sad) {
+ intra_cost = (int32_t)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride, predictor, bw);
+ } else {
+ intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
+
+ if (intra_cost < best_intra_cost) {
+ best_intra_cost = intra_cost;
+ best_mode = mode;
+ }
+ }
+  // Calculate the SATD of the best intra mode if SAD was used for the mode
+  // decision, as best_intra_cost is used in the ML model to skip intra mode
+  // evaluation.
+ if (tpl_frame->use_pred_sad) {
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize], tx_size, best_mode, 0,
+ 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, predictor, bw, 0,
+ 0, 0);
+ best_intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
+
+ int rate_cost = 1;
+
+ if (cpi->use_ducky_encode) {
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, NULL, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
+
+ tpl_stats->intra_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->intra_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->intra_rate = rate_cost;
+ }
+
+ if (cpi->third_pass_ctx &&
+ frame_offset < cpi->third_pass_ctx->frame_info_count &&
+ tpl_data->frame_idx < gf_group->size) {
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+ cm->width, &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+ PREDICTION_MODE third_pass_mode = this_mi->pred_mode;
+
+ if (third_pass_mode >= last_intra_mode &&
+ third_pass_mode < INTRA_MODE_END) {
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize], tx_size,
+ third_pass_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+ dst_buffer_stride, predictor, bw, 0, 0, 0);
+
+ intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+
+ if (intra_cost < best_intra_cost) {
+ best_intra_cost = intra_cost;
+ best_mode = third_pass_mode;
+ }
+ }
+ }
+
+ // Motion compensated prediction
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->ref_frame[1] = NONE_FRAME;
+ xd->mi[0]->compound_idx = 1;
+
+ int best_rf_idx = -1;
+ int_mv best_mv[2];
+ int32_t inter_cost;
+ int32_t best_inter_cost = INT32_MAX;
+ int rf_idx;
+ int_mv single_mv[INTER_REFS_PER_FRAME];
+
+ best_mv[0].as_int = INVALID_MV;
+ best_mv[1].as_int = INVALID_MV;
+
+ for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) {
+ single_mv[rf_idx].as_int = INVALID_MV;
+ if (tpl_data->ref_frame[rf_idx] == NULL ||
+ tpl_data->src_ref_frame[rf_idx] == NULL) {
+ tpl_stats->mv[rf_idx].as_int = INVALID_MV;
+ continue;
+ }
+
+ const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->src_ref_frame[rf_idx];
+ const int ref_mb_offset =
+ mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
+ uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
+ const int ref_stride = ref_frame_ptr->y_stride;
+ const int ref_width = ref_frame_ptr->y_width;
+
+ int_mv best_rfidx_mv = { 0 };
+ uint32_t bestsme = UINT32_MAX;
+
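+    // Candidate starting MVs for the full-pel search: the zero MV plus the
+    // MVs chosen by the above, left and above-right tpl blocks for this
+    // reference frame (near-duplicates are skipped via is_alike_mv()).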
+ center_mv_t center_mvs[4] = { { { 0 }, INT_MAX },
+ { { 0 }, INT_MAX },
+ { { 0 }, INT_MAX },
+ { { 0 }, INT_MAX } };
+ int refmv_count = 1;
+ int idx;
+
+ if (xd->up_available) {
+ TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)];
+ if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
+ ++refmv_count;
+ }
+ }
+
+ if (xd->left_available) {
+ TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)];
+ if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
+ ++refmv_count;
+ }
+ }
+
+ if (xd->up_available && mi_col + mi_width < xd->tile.mi_col_end) {
+ TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ mi_row - mi_height, mi_col + mi_width, tpl_frame->stride,
+ block_mis_log2)];
+ if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
+ ++refmv_count;
+ }
+ }
+
+ if (cpi->third_pass_ctx &&
+ frame_offset < cpi->third_pass_ctx->frame_info_count &&
+ tpl_data->frame_idx < gf_group->size) {
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+ cm->width, &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+ int_mv tp_mv = av1_get_third_pass_adjusted_mv(this_mi, ratio_h, ratio_w,
+ rf_idx + LAST_FRAME);
+ if (tp_mv.as_int != INVALID_MV &&
+ !is_alike_mv(tp_mv, center_mvs + 1, refmv_count - 1,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[0].mv = tp_mv;
+ }
+ }
+
+ // Prune starting mvs
+ if (tpl_sf->prune_starting_mv && refmv_count > 1) {
+ // Get each center mv's sad.
+ for (idx = 0; idx < refmv_count; ++idx) {
+ FULLPEL_MV mv = get_fullmv_from_mv(&center_mvs[idx].mv.as_mv);
+ clamp_fullmv(&mv, &x->mv_limits);
+ center_mvs[idx].sad = (int)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col],
+ ref_stride);
+ }
+
+ // Rank center_mv using sad.
+ qsort(center_mvs, refmv_count, sizeof(center_mvs[0]), compare_sad);
+
+ refmv_count = AOMMIN(4 - tpl_sf->prune_starting_mv, refmv_count);
+ // Further reduce number of refmv based on sad difference.
+ if (refmv_count > 1) {
+ int last_sad = center_mvs[refmv_count - 1].sad;
+ int second_to_last_sad = center_mvs[refmv_count - 2].sad;
+ if ((last_sad - second_to_last_sad) * 5 > second_to_last_sad)
+ refmv_count--;
+ }
+ }
+
+ for (idx = 0; idx < refmv_count; ++idx) {
+ int_mv this_mv;
+ uint32_t thissme = motion_estimation(
+ cpi, x, src_mb_buffer, ref_mb, src_stride, ref_stride, src_width,
+ ref_width, bsize, center_mvs[idx].mv.as_mv, &this_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ best_rfidx_mv = this_mv;
+ }
+ }
+
+ tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int;
+ single_mv[rf_idx] = best_rfidx_mv;
+
+ inter_cost = get_inter_cost(
+ cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size,
+ mi_row, mi_col, rf_idx, &best_rfidx_mv.as_mv, tpl_frame->use_pred_sad);
+ // Store inter cost for each ref frame. This is used to prune inter modes.
+ tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost);
+
+ if (inter_cost < best_inter_cost) {
+ best_rf_idx = rf_idx;
+
+ best_inter_cost = inter_cost;
+ best_mv[0].as_int = best_rfidx_mv.as_int;
+ }
+ }
+  // Calculate the SATD of the best inter mode if SAD was used for the mode
+  // decision, as best_inter_cost is used in the ML model to skip intra mode
+  // evaluation.
+ if (best_inter_cost < INT32_MAX && tpl_frame->use_pred_sad) {
+ assert(best_rf_idx != -1);
+ best_inter_cost = get_inter_cost(
+ cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size,
+ mi_row, mi_col, best_rf_idx, &best_mv[0].as_mv, 0 /* use_pred_sad */);
+ }
+
+ if (best_rf_idx != -1 && best_inter_cost < best_intra_cost) {
+ best_mode = NEWMV;
+ xd->mi[0]->ref_frame[0] = best_rf_idx + LAST_FRAME;
+ xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
+ }
+
+  // Start compound prediction search.
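+  // Each entry of comp_ref_frames is a pair of reference frame offsets from
+  // LAST_FRAME: { LAST, BWDREF }, { LAST, ALTREF } and { GOLDEN, ALTREF }.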
+ int comp_ref_frames[3][2] = {
+ { 0, 4 },
+ { 0, 6 },
+ { 3, 6 },
+ };
+
+ int start_rf = 0;
+ int end_rf = 3;
+ if (!tpl_sf->allow_compound_pred) end_rf = 0;
+ if (cpi->third_pass_ctx &&
+ frame_offset < cpi->third_pass_ctx->frame_info_count &&
+ tpl_data->frame_idx < gf_group->size) {
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+ cm->width, &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+ if (this_mi->ref_frame[0] >= LAST_FRAME &&
+ this_mi->ref_frame[1] >= LAST_FRAME) {
+ int found = 0;
+ for (int i = 0; i < 3; i++) {
+ if (comp_ref_frames[i][0] + LAST_FRAME == this_mi->ref_frame[0] &&
+ comp_ref_frames[i][1] + LAST_FRAME == this_mi->ref_frame[1]) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found || !tpl_sf->allow_compound_pred) {
+ comp_ref_frames[2][0] = this_mi->ref_frame[0] - LAST_FRAME;
+ comp_ref_frames[2][1] = this_mi->ref_frame[1] - LAST_FRAME;
+ if (!tpl_sf->allow_compound_pred) {
+ start_rf = 2;
+ end_rf = 3;
+ }
+ }
+ }
+ }
+
+ xd->mi_row = mi_row;
+ xd->mi_col = mi_col;
+ int best_cmp_rf_idx = -1;
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ for (int cmp_rf_idx = start_rf; cmp_rf_idx < end_rf; ++cmp_rf_idx) {
+ int rf_idx0 = comp_ref_frames[cmp_rf_idx][0];
+ int rf_idx1 = comp_ref_frames[cmp_rf_idx][1];
+
+ if (tpl_data->ref_frame[rf_idx0] == NULL ||
+ tpl_data->src_ref_frame[rf_idx0] == NULL ||
+ tpl_data->ref_frame[rf_idx1] == NULL ||
+ tpl_data->src_ref_frame[rf_idx1] == NULL) {
+ continue;
+ }
+
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
+ tpl_data->src_ref_frame[rf_idx0],
+ tpl_data->src_ref_frame[rf_idx1],
+ };
+
+ xd->mi[0]->ref_frame[0] = rf_idx0 + LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = rf_idx1 + LAST_FRAME;
+ xd->mi[0]->mode = NEW_NEWMV;
+ const int8_t ref_frame_type = av1_ref_frame_type(xd->mi[0]->ref_frame);
+ // Set up ref_mv for av1_joint_motion_search().
+ CANDIDATE_MV *this_ref_mv_stack = x->mbmi_ext.ref_mv_stack[ref_frame_type];
+ this_ref_mv_stack[xd->mi[0]->ref_mv_idx].this_mv = single_mv[rf_idx0];
+ this_ref_mv_stack[xd->mi[0]->ref_mv_idx].comp_mv = single_mv[rf_idx1];
+
+ struct buf_2d yv12_mb[2][MAX_MB_PLANE];
+ for (int i = 0; i < 2; ++i) {
+ av1_setup_pred_block(xd, yv12_mb[i], ref_frame_ptr[i],
+ xd->block_ref_scale_factors[i],
+ xd->block_ref_scale_factors[i], MAX_MB_PLANE);
+ for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ xd->plane[plane].pre[i] = yv12_mb[i][plane];
+ }
+ }
+
+ int_mv tmp_mv[2] = { single_mv[rf_idx0], single_mv[rf_idx1] };
+ int rate_mv;
+ av1_joint_motion_search(cpi, x, bsize, tmp_mv, NULL, 0, &rate_mv,
+ !cpi->sf.mv_sf.disable_second_mv,
+ NUM_JOINT_ME_REFINE_ITER);
+
+ for (int ref = 0; ref < 2; ++ref) {
+ struct buf_2d ref_buf = { NULL, ref_frame_ptr[ref]->y_buffer,
+ ref_frame_ptr[ref]->y_width,
+ ref_frame_ptr[ref]->y_height,
+ ref_frame_ptr[ref]->y_stride };
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+ mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd),
+ 0, &tpl_data->sf, &ref_buf, kernel);
+ av1_init_comp_mode(&inter_pred_params);
+
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ ref, 0, xd->tmp_conv_dst, MAX_SB_SIZE, 1, xd->bd);
+
+ av1_enc_build_one_inter_predictor(predictor, bw, &tmp_mv[ref].as_mv,
+ &inter_pred_params);
+ }
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ if (inter_cost < best_inter_cost) {
+ best_cmp_rf_idx = cmp_rf_idx;
+ best_inter_cost = inter_cost;
+ best_mv[0] = tmp_mv[0];
+ best_mv[1] = tmp_mv[1];
+ }
+ }
+
+ if (best_cmp_rf_idx != -1 && best_inter_cost < best_intra_cost) {
+ best_mode = NEW_NEWMV;
+ const int best_rf_idx0 = comp_ref_frames[best_cmp_rf_idx][0];
+ const int best_rf_idx1 = comp_ref_frames[best_cmp_rf_idx][1];
+ xd->mi[0]->ref_frame[0] = best_rf_idx0 + LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = best_rf_idx1 + LAST_FRAME;
+ }
+
+ if (best_inter_cost < INT32_MAX && is_inter_mode(best_mode)) {
+ xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
+ xd->mi[0]->mv[1].as_int = best_mv[1].as_int;
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
+ best_cmp_rf_idx >= 0
+ ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]
+ : tpl_data->src_ref_frame[best_rf_idx],
+ best_cmp_rf_idx >= 0
+ ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
+ : NULL,
+ };
+ rate_cost = 1;
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 0 /*do_recon*/, NULL);
+ tpl_stats->srcrf_rate = rate_cost;
+ }
+
+ best_intra_cost = AOMMAX(best_intra_cost, 1);
+ best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
+ tpl_stats->inter_cost = best_inter_cost;
+ tpl_stats->intra_cost = best_intra_cost;
+
+ tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+
+ // Final encode
+ rate_cost = 0;
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2];
+
+ ref_frame_ptr[0] =
+ best_mode == NEW_NEWMV
+ ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]
+ : best_rf_idx >= 0 ? tpl_data->ref_frame[best_rf_idx]
+ : NULL;
+ ref_frame_ptr[1] =
+ best_mode == NEW_NEWMV
+ ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
+ : NULL;
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/,
+ tpl_txfm_stats);
+
+ tpl_stats->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->recrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->recrf_rate = rate_cost;
+
+ if (!is_inter_mode(best_mode)) {
+ tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->srcrf_rate = rate_cost;
+ tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ }
+
+ tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist);
+ tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate);
+
+ if (best_mode == NEW_NEWMV) {
+ ref_frame_ptr[0] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]];
+ ref_frame_ptr[1] =
+ tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]];
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
+ tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->cmp_recrf_rate[0] = rate_cost;
+
+ tpl_stats->cmp_recrf_dist[0] =
+ AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[0]);
+ tpl_stats->cmp_recrf_rate[0] =
+ AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[0]);
+
+ tpl_stats->cmp_recrf_dist[0] =
+ AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[0]);
+ tpl_stats->cmp_recrf_rate[0] =
+ AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[0]);
+
+ rate_cost = 0;
+ ref_frame_ptr[0] =
+ tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]];
+ ref_frame_ptr[1] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]];
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
+ tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->cmp_recrf_rate[1] = rate_cost;
+
+ tpl_stats->cmp_recrf_dist[1] =
+ AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[1]);
+ tpl_stats->cmp_recrf_rate[1] =
+ AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[1]);
+
+ tpl_stats->cmp_recrf_dist[1] =
+ AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[1]);
+ tpl_stats->cmp_recrf_rate[1] =
+ AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[1]);
+ }
+
+ if (best_mode == NEWMV) {
+ tpl_stats->mv[best_rf_idx] = best_mv[0];
+ tpl_stats->ref_frame_index[0] = best_rf_idx;
+ tpl_stats->ref_frame_index[1] = NONE_FRAME;
+ } else if (best_mode == NEW_NEWMV) {
+ tpl_stats->ref_frame_index[0] = comp_ref_frames[best_cmp_rf_idx][0];
+ tpl_stats->ref_frame_index[1] = comp_ref_frames[best_cmp_rf_idx][1];
+ tpl_stats->mv[tpl_stats->ref_frame_index[0]] = best_mv[0];
+ tpl_stats->mv[tpl_stats->ref_frame_index[1]] = best_mv[1];
+ }
+
+ for (int idy = 0; idy < mi_height; ++idy) {
+ for (int idx = 0; idx < mi_width; ++idx) {
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > idy) {
+ xd->mi[idx + idy * cm->mi_params.mi_stride] = xd->mi[0];
+ }
+ }
+ }
+}
+
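+// Divides ref_pos by bsize_pix, rounding towards negative infinity, e.g.
+// round_floor(-1, 16) == -1 whereas plain integer division would give 0.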
+static int round_floor(int ref_pos, int bsize_pix) {
+ int round;
+ if (ref_pos < 0)
+ round = -(1 + (-ref_pos - 1) / bsize_pix);
+ else
+ round = ref_pos / bsize_pix;
+
+ return round;
+}
+
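+// Returns the intersection area of two width x height blocks whose top-left
+// corners are at (row_a, col_a) and (row_b, col_b), or 0 if they are
+// disjoint.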
+int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+ int height) {
+ int min_row = AOMMAX(row_a, row_b);
+ int max_row = AOMMIN(row_a + height, row_b + height);
+ int min_col = AOMMAX(col_a, col_b);
+ int max_col = AOMMIN(col_a + width, col_b + width);
+ if (min_row < max_row && min_col < max_col) {
+ return (max_row - min_row) * (max_col - min_col);
+ }
+ return 0;
+}
+
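+// Maps an mi unit position to its index in the tpl stats grid, where each
+// grid cell spans (1 << right_shift) mi units in each dimension.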
+int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) {
+ return (mi_row >> right_shift) * stride + (mi_col >> right_shift);
+}
+
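+// Estimates the rate cost propagated to a reference block given the rate
+// difference delta_rate and the distortion ratio beta = srcrf_dist /
+// recrf_dist. The computation works in bits per pixel in the log2 domain and
+// converts the result back to the tpl fixed-point cost scale.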
+int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+ int64_t srcrf_dist, int pix_num) {
+ double beta = (double)srcrf_dist / recrf_dist;
+ int64_t rate_cost = delta_rate;
+
+ if (srcrf_dist <= 128) return rate_cost;
+
+ double dr =
+ (double)(delta_rate >> (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT)) /
+ pix_num;
+
+ double log_den = log(beta) / log(2.0) + 2.0 * dr;
+
+ if (log_den > log(10.0) / log(2.0)) {
+ rate_cost = (int64_t)((log(1.0 / beta) * pix_num) / log(2.0) / 2.0);
+ rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT);
+ return rate_cost;
+ }
+
+ double num = pow(2.0, log_den);
+ double den = num * beta + (1 - beta) * beta;
+
+ rate_cost = (int64_t)((pix_num * log(num / den)) / log(2.0) / 2.0);
+
+ rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT);
+
+ return rate_cost;
+}
+
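+// Propagates the dependency stats of the block at (mi_row, mi_col) in frame
+// frame_idx back to the (up to four) grid-aligned blocks that its motion
+// compensated prediction overlaps in the selected reference frame, weighting
+// each contribution by the overlap area.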
+static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row,
+ int mi_col, const BLOCK_SIZE bsize,
+ int frame_idx, int ref) {
+ TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx];
+ TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr;
+ TplDepFrame *tpl_frame = tpl_data->tpl_frame;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ TplDepStats *tpl_stats_ptr = &tpl_ptr[av1_tpl_ptr_pos(
+ mi_row, mi_col, tpl_frame->stride, block_mis_log2)];
+
+ int is_compound = tpl_stats_ptr->ref_frame_index[1] >= 0;
+
+ if (tpl_stats_ptr->ref_frame_index[ref] < 0) return;
+ const int ref_frame_index = tpl_stats_ptr->ref_frame_index[ref];
+ TplDepFrame *ref_tpl_frame =
+ &tpl_frame[tpl_frame[frame_idx].ref_map_index[ref_frame_index]];
+ TplDepStats *ref_stats_ptr = ref_tpl_frame->tpl_stats_ptr;
+
+ if (tpl_frame[frame_idx].ref_map_index[ref_frame_index] < 0) return;
+
+ const FULLPEL_MV full_mv =
+ get_fullmv_from_mv(&tpl_stats_ptr->mv[ref_frame_index].as_mv);
+ const int ref_pos_row = mi_row * MI_SIZE + full_mv.row;
+ const int ref_pos_col = mi_col * MI_SIZE + full_mv.col;
+
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int mi_width = mi_size_wide[bsize];
+ const int pix_num = bw * bh;
+
+  // Top-left pixel position of the grid-aligned block containing ref_pos.
+ int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+ int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
+ int block;
+
+ int64_t srcrf_dist = is_compound ? tpl_stats_ptr->cmp_recrf_dist[!ref]
+ : tpl_stats_ptr->srcrf_dist;
+ int64_t srcrf_rate =
+ is_compound
+ ? (tpl_stats_ptr->cmp_recrf_rate[!ref] << TPL_DEP_COST_SCALE_LOG2)
+ : (tpl_stats_ptr->srcrf_rate << TPL_DEP_COST_SCALE_LOG2);
+
+ int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - srcrf_dist;
+ int64_t mc_dep_dist =
+ (int64_t)(tpl_stats_ptr->mc_dep_dist *
+ ((double)(tpl_stats_ptr->recrf_dist - srcrf_dist) /
+ tpl_stats_ptr->recrf_dist));
+ int64_t delta_rate =
+ (tpl_stats_ptr->recrf_rate << TPL_DEP_COST_SCALE_LOG2) - srcrf_rate;
+ int64_t mc_dep_rate =
+ av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
+ srcrf_dist, pix_num);
+
+ for (block = 0; block < 4; ++block) {
+ int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+ int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
+
+ if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
+ grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
+ int overlap_area = av1_get_overlap_area(grid_pos_row, grid_pos_col,
+ ref_pos_row, ref_pos_col, bw, bh);
+ int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+ int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+ assert((1 << block_mis_log2) == mi_height);
+ assert((1 << block_mis_log2) == mi_width);
+ TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos(
+ ref_mi_row, ref_mi_col, ref_tpl_frame->stride, block_mis_log2)];
+ des_stats->mc_dep_dist +=
+ ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num;
+ des_stats->mc_dep_rate +=
+ ((delta_rate + mc_dep_rate) * overlap_area) / pix_num;
+ }
+ }
+}
+
+static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row,
+ int mi_col, int frame_idx) {
+ const BLOCK_SIZE tpl_stats_block_size =
+ convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2);
+ tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx,
+ 0);
+ tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx,
+ 1);
+}
+
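+// Stores src_stats into the tpl stats grid, clamping every cost, rate and
+// distortion field to at least 1 so that later ratio and log computations
+// never see a zero.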
+static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row,
+ int mi_col, int stride,
+ const TplDepStats *src_stats,
+ uint8_t block_mis_log2) {
+ int index = av1_tpl_ptr_pos(mi_row, mi_col, stride, block_mis_log2);
+ TplDepStats *tpl_ptr = &tpl_stats_ptr[index];
+ *tpl_ptr = *src_stats;
+ tpl_ptr->intra_cost = AOMMAX(1, tpl_ptr->intra_cost);
+ tpl_ptr->inter_cost = AOMMAX(1, tpl_ptr->inter_cost);
+ tpl_ptr->srcrf_dist = AOMMAX(1, tpl_ptr->srcrf_dist);
+ tpl_ptr->srcrf_sse = AOMMAX(1, tpl_ptr->srcrf_sse);
+ tpl_ptr->recrf_dist = AOMMAX(1, tpl_ptr->recrf_dist);
+ tpl_ptr->srcrf_rate = AOMMAX(1, tpl_ptr->srcrf_rate);
+ tpl_ptr->recrf_rate = AOMMAX(1, tpl_ptr->recrf_rate);
+ tpl_ptr->cmp_recrf_dist[0] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[0]);
+ tpl_ptr->cmp_recrf_dist[1] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[1]);
+ tpl_ptr->cmp_recrf_rate[0] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[0]);
+ tpl_ptr->cmp_recrf_rate[1] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[1]);
+}
+
+// Reset the ref and source frame pointers of tpl_data.
+static AOM_INLINE void tpl_reset_src_ref_frames(TplParams *tpl_data) {
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ tpl_data->ref_frame[i] = NULL;
+ tpl_data->src_ref_frame[i] = NULL;
+ }
+}
+
+static AOM_INLINE int get_gop_length(const GF_GROUP *gf_group) {
+ int gop_length = AOMMIN(gf_group->size, MAX_TPL_FRAME_IDX - 1);
+ return gop_length;
+}
+
+// Initialize the mc_flow parameters used in computing tpl data.
+static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
+ int pframe_qindex) {
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+ const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture;
+ const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME];
+ uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME];
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+ int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
+ gf_group, cpi->sf.inter_sf.selective_ref_frame,
+ tpl_sf->prune_ref_frames_in_tpl, frame_idx);
+ int gop_length = get_gop_length(gf_group);
+ int ref_frame_flags;
+ AV1_COMMON *cm = &cpi->common;
+ int rdmult, idx;
+ ThreadData *td = &cpi->td;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
+ tpl_data->frame_idx = frame_idx;
+ tpl_reset_src_ref_frames(tpl_data);
+ av1_tile_init(&xd->tile, cm, 0, 0);
+
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ // Setup scaling factor
+ av1_setup_scale_factors_for_frame(
+ &tpl_data->sf, this_frame->y_crop_width, this_frame->y_crop_height,
+ this_frame->y_crop_width, this_frame->y_crop_height);
+
+ xd->cur_buf = this_frame;
+
+ for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+ TplDepFrame *tpl_ref_frame =
+ &tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]];
+ tpl_data->ref_frame[idx] = tpl_ref_frame->rec_picture;
+ tpl_data->src_ref_frame[idx] = tpl_ref_frame->gf_picture;
+ ref_frame_display_indices[idx] = tpl_ref_frame->frame_display_index;
+ }
+
+ // Store the reference frames based on priority order
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ ref_frames_ordered[i] =
+ tpl_data->ref_frame[ref_frame_priority_order[i] - 1];
+ }
+
+ // Work out which reference frame slots may be used.
+ ref_frame_flags =
+ get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi),
+ ref_frames_ordered, cpi->ext_flags.ref_frame_flags);
+
+ enforce_max_ref_frames(cpi, &ref_frame_flags, ref_frame_display_indices,
+ tpl_frame->frame_display_index);
+
+ // Prune reference frames
+ for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+ if ((ref_frame_flags & (1 << idx)) == 0) {
+ tpl_data->ref_frame[idx] = NULL;
+ }
+ }
+
+ // Skip motion estimation w.r.t. reference frames which are not
+ // considered in RD search, using "selective_ref_frame" speed feature.
+ // The reference frame pruning is not enabled for frames beyond the gop
+ // length, as there are fewer reference frames and the reference frames
+ // differ from the frames considered during RD search.
+ if (ref_pruning_enabled && (frame_idx < gop_length)) {
+ for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+ const MV_REFERENCE_FRAME refs[2] = { idx + 1, NONE_FRAME };
+ if (prune_ref_by_selective_ref_frame(cpi, NULL, refs,
+ ref_frame_display_indices)) {
+ tpl_data->ref_frame[idx] = NULL;
+ }
+ }
+ }
+
+ // Make a temporary mbmi for tpl model
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;
+
+ xd->block_ref_scale_factors[0] = &tpl_data->sf;
+ xd->block_ref_scale_factors[1] = &tpl_data->sf;
+
+ const int base_qindex =
+ cpi->use_ducky_encode ? gf_group->q_val[frame_idx] : pframe_qindex;
+ // Get rd multiplier set up.
+ rdmult = (int)av1_compute_rd_mult(
+ base_qindex, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ if (rdmult < 1) rdmult = 1;
+ av1_set_error_per_bit(&x->errorperbit, rdmult);
+ av1_set_sad_per_bit(cpi, &x->sadperbit, base_qindex);
+
+ tpl_frame->is_valid = 1;
+
+ cm->quant_params.base_qindex = base_qindex;
+ av1_frame_init_quantizer(cpi);
+
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ const FRAME_UPDATE_TYPE update_type =
+ gf_group->update_type[cpi->gf_frame_index];
+ tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex(
+ bd_info.bit_depth, update_type, base_qindex) /
+ 6;
+
+ if (cpi->use_ducky_encode)
+ tpl_frame->base_rdmult = gf_group->rdmult_val[frame_idx];
+
+ av1_init_tpl_txfm_stats(tpl_txfm_stats);
+
+ // Initialize x->mbmi_ext when compound predictions are enabled.
+ if (tpl_sf->allow_compound_pred) av1_zero(x->mbmi_ext);
+
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ assert(xd->mi == &mbmi_ptr);
+ xd->mi = NULL;
+
+  // The TPL module is called before frame-level speed features are set.
+  // Thus, turning off this speed feature for key frames is done here rather
+  // than in the speed feature settings themselves.
+ const int layer_depth_th = (tpl_sf->use_sad_for_mode_decision == 1) ? 5 : 0;
+ tpl_frame->use_pred_sad =
+ tpl_sf->use_sad_for_mode_decision &&
+ gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+ gf_group->layer_depth[frame_idx] >= layer_depth_th;
+}
+
+// Computes and stores the motion estimation dependencies (tpl stats) of all
+// the blocks in a row.
+void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats,
+ TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
+ int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int mi_width = mi_size_wide[bsize];
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ const int tplb_cols_in_tile =
+ ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+ const int tplb_row = ROUND_POWER_OF_TWO(mi_row, mi_size_high_log2[bsize]);
+ assert(mi_size_high[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2));
+ assert(mi_size_wide[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2));
+
+ for (int mi_col = 0, tplb_col_in_tile = 0; mi_col < mi_params->mi_cols;
+ mi_col += mi_width, tplb_col_in_tile++) {
+ (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+ tplb_col_in_tile);
+
+#if CONFIG_MULTITHREAD
+ if (mt_info->num_workers > 1) {
+ pthread_mutex_lock(tpl_row_mt->mutex_);
+ const bool tpl_mt_exit = tpl_row_mt->tpl_mt_exit;
+ pthread_mutex_unlock(tpl_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (tpl_mt_exit) return;
+ }
+#endif
+
+ TplDepStats tpl_stats;
+
+ // Motion estimation column boundary
+ av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width,
+ tpl_data->border_in_pixels);
+ xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+ xd->mb_to_right_edge =
+ GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col);
+ mode_estimation(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, mi_col,
+ bsize, tx_size, &tpl_stats);
+
+ // Motion flow dependency dispenser.
+ tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride,
+ &tpl_stats, tpl_data->tpl_stats_block_mis_log2);
+ (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+ tplb_col_in_tile, tplb_cols_in_tile);
+ }
+}
+
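+// Computes the tpl stats of every block row in the current frame, setting up
+// the motion estimation row boundaries before dispatching each row.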
+static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ ThreadData *td = &cpi->td;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE bsize =
+ convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const int mi_height = mi_size_high[bsize];
+ for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) {
+ // Motion estimation row boundary
+ av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
+ cpi->ppi->tpl_data.border_in_pixels);
+ xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+ xd->mb_to_bottom_edge =
+ GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+ av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, &td->tpl_tmp_buffers, x,
+ mi_row, bsize, tx_size);
+ }
+}
+
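+// Runs tpl_model_update() over every tpl block of frame frame_idx, i.e. one
+// full backward-propagation step of the motion flow for that frame.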
+static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows,
+ int mi_cols) {
+ if (!frame_idx) {
+ return;
+ }
+ const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
+ const int mi_height = mi_size_high[bsize];
+ const int mi_width = mi_size_wide[bsize];
+ assert(mi_height == (1 << tpl_data->tpl_stats_block_mis_log2));
+ assert(mi_width == (1 << tpl_data->tpl_stats_block_mis_log2));
+
+ for (int mi_row = 0; mi_row < mi_rows; mi_row += mi_height) {
+ for (int mi_col = 0; mi_col < mi_cols; mi_col += mi_width) {
+ tpl_model_update(tpl_data, mi_row, mi_col, frame_idx);
+ }
+ }
+}
+
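+// Sets up, for each frame in the GF group (plus lookahead frames beyond it
+// when available), the source and reconstruction buffers and the reference
+// map indices used by the tpl model, and tracks pframe_qindex from LF_UPDATE
+// frames.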
+static AOM_INLINE void init_gop_frames_for_tpl(
+ AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params,
+ GF_GROUP *gf_group, int *tpl_group_frames, int *pframe_qindex) {
+ AV1_COMMON *cm = &cpi->common;
+ assert(cpi->gf_frame_index == 0);
+ *pframe_qindex = 0;
+
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+ init_ref_map_pair(cpi, ref_frame_map_pairs);
+
+ int remapped_ref_idx[REF_FRAMES];
+
+ EncodeFrameParams frame_params = *init_frame_params;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+
+ int ref_picture_map[REF_FRAMES];
+
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (frame_params.frame_type == KEY_FRAME) {
+ tpl_data->tpl_frame[-i - 1].gf_picture = NULL;
+ tpl_data->tpl_frame[-i - 1].rec_picture = NULL;
+ tpl_data->tpl_frame[-i - 1].frame_display_index = 0;
+ } else {
+ tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf;
+ tpl_data->tpl_frame[-i - 1].rec_picture = &cm->ref_frame_map[i]->buf;
+ tpl_data->tpl_frame[-i - 1].frame_display_index =
+ cm->ref_frame_map[i]->display_order_hint;
+ }
+
+ ref_picture_map[i] = -i - 1;
+ }
+
+ *tpl_group_frames = 0;
+
+ int gf_index;
+ int process_frame_count = 0;
+ const int gop_length = get_gop_length(gf_group);
+
+ for (gf_index = 0; gf_index < gop_length; ++gf_index) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
+ FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index];
+ int lookahead_index =
+ gf_group->cur_frame_idx[gf_index] + gf_group->arf_src_offset[gf_index];
+ frame_params.show_frame = frame_update_type != ARF_UPDATE &&
+ frame_update_type != INTNL_ARF_UPDATE;
+ frame_params.show_existing_frame =
+ frame_update_type == INTNL_OVERLAY_UPDATE ||
+ frame_update_type == OVERLAY_UPDATE;
+ frame_params.frame_type = gf_group->frame_type[gf_index];
+
+ if (frame_update_type == LF_UPDATE)
+ *pframe_qindex = gf_group->q_val[gf_index];
+
+ const struct lookahead_entry *buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage);
+ if (buf == NULL) break;
+ tpl_frame->gf_picture = &buf->img;
+
+    // Use the filtered frame buffer if available. This makes the tpl stats
+    // more precise.
+ FRAME_DIFF frame_diff;
+ const YV12_BUFFER_CONFIG *tf_buf =
+ av1_tf_info_get_filtered_buf(&cpi->ppi->tf_info, gf_index, &frame_diff);
+ if (tf_buf != NULL) {
+ tpl_frame->gf_picture = tf_buf;
+ }
+
+ // 'cm->current_frame.frame_number' is the display number
+ // of the current frame.
+ // 'lookahead_index' is frame offset within the gf group.
+ // 'lookahead_index + cm->current_frame.frame_number'
+ // is the display index of the frame.
+ tpl_frame->frame_display_index =
+ lookahead_index + cm->current_frame.frame_number;
+ assert(buf->display_idx ==
+ cpi->frame_index_set.show_frame_count + lookahead_index);
+
+ if (frame_update_type != OVERLAY_UPDATE &&
+ frame_update_type != INTNL_OVERLAY_UPDATE) {
+ tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
+ tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
+ ++process_frame_count;
+ }
+ const int true_disp = (int)(tpl_frame->frame_display_index);
+
+ av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0,
+ remapped_ref_idx);
+
+ int refresh_mask =
+ av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type,
+ gf_index, true_disp, ref_frame_map_pairs);
+
+    // Treat frames marked as is_frame_non_ref as non-reference frames.
+ if (cpi->ppi->gf_group.is_frame_non_ref[gf_index]) refresh_mask = 0;
+
+ int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
+
+ if (refresh_frame_map_index < REF_FRAMES &&
+ refresh_frame_map_index != INVALID_IDX) {
+ ref_frame_map_pairs[refresh_frame_map_index].disp_order =
+ AOMMAX(0, true_disp);
+ ref_frame_map_pairs[refresh_frame_map_index].pyr_level =
+ get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp,
+ cpi->ppi->gf_group.max_layer_depth);
+ }
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ tpl_frame->ref_map_index[i - LAST_FRAME] =
+ ref_picture_map[remapped_ref_idx[i - LAST_FRAME]];
+
+ if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index;
+
+ ++*tpl_group_frames;
+ }
+
+ const int tpl_extend = cpi->oxcf.gf_cfg.lag_in_frames - MAX_GF_INTERVAL;
+ int extend_frame_count = 0;
+ int extend_frame_length = AOMMIN(
+ tpl_extend, cpi->rc.frames_to_key - cpi->ppi->p_rc.baseline_gf_interval);
+
+ int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] +
+ gf_group->arf_src_offset[gop_length - 1] + 1;
+
+ for (;
+ gf_index < MAX_TPL_FRAME_IDX && extend_frame_count < extend_frame_length;
+ ++gf_index) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
+ FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE;
+ frame_params.show_frame = frame_update_type != ARF_UPDATE &&
+ frame_update_type != INTNL_ARF_UPDATE;
+ frame_params.show_existing_frame =
+ frame_update_type == INTNL_OVERLAY_UPDATE;
+ frame_params.frame_type = INTER_FRAME;
+
+ int lookahead_index = frame_display_index;
+ struct lookahead_entry *buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage);
+
+ if (buf == NULL) break;
+
+ tpl_frame->gf_picture = &buf->img;
+ tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
+ tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
+ // 'cm->current_frame.frame_number' is the display number
+ // of the current frame.
+ // 'frame_display_index' is frame offset within the gf group.
+ // 'frame_display_index + cm->current_frame.frame_number'
+ // is the display index of the frame.
+ tpl_frame->frame_display_index =
+ frame_display_index + cm->current_frame.frame_number;
+
+ ++process_frame_count;
+
+ gf_group->update_type[gf_index] = LF_UPDATE;
+
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ if (cpi->oxcf.pass == AOM_RC_SECOND_PASS) {
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ *pframe_qindex = cpi->oxcf.rc_cfg.cq_level;
+ } else if (cpi->oxcf.rc_cfg.mode == AOM_VBR) {
+      // TODO(angiebird): Find a more adaptive method to decide
+      // pframe_qindex. Override pframe_qindex in the second pass when
+      // bitrate accuracy is on; we found that setting pframe_qindex this way
+      // makes the tpl stats more stable.
+ *pframe_qindex = 128;
+ }
+ }
+#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ gf_group->q_val[gf_index] = *pframe_qindex;
+ const int true_disp = (int)(tpl_frame->frame_display_index);
+ av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0,
+ remapped_ref_idx);
+ int refresh_mask =
+ av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type,
+ gf_index, true_disp, ref_frame_map_pairs);
+ int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
+
+ if (refresh_frame_map_index < REF_FRAMES &&
+ refresh_frame_map_index != INVALID_IDX) {
+ ref_frame_map_pairs[refresh_frame_map_index].disp_order =
+ AOMMAX(0, true_disp);
+ ref_frame_map_pairs[refresh_frame_map_index].pyr_level =
+ get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp,
+ cpi->ppi->gf_group.max_layer_depth);
+ }
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ tpl_frame->ref_map_index[i - LAST_FRAME] =
+ ref_picture_map[remapped_ref_idx[i - LAST_FRAME]];
+
+ tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1;
+ tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1;
+ tpl_frame->ref_map_index[BWDREF_FRAME - LAST_FRAME] = -1;
+ tpl_frame->ref_map_index[ALTREF2_FRAME - LAST_FRAME] = -1;
+
+ if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index;
+
+ ++*tpl_group_frames;
+ ++extend_frame_count;
+ ++frame_display_index;
+ }
+}
+
+void av1_init_tpl_stats(TplParams *const tpl_data) {
+ tpl_data->ready = 0;
+ set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
+ &tpl_data->tpl_bsize_1d);
+ for (int frame_idx = 0; frame_idx < MAX_LENGTH_TPL_FRAME_STATS; ++frame_idx) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx];
+ tpl_frame->is_valid = 0;
+ }
+ for (int frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx];
+ if (tpl_data->tpl_stats_pool[frame_idx] == NULL) continue;
+ memset(tpl_data->tpl_stats_pool[frame_idx], 0,
+ tpl_frame->height * tpl_frame->width *
+ sizeof(*tpl_frame->tpl_stats_ptr));
+ }
+}
+
+int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index) {
+ if (tpl_data->ready == 0) {
+ return 0;
+ }
+ if (gf_frame_index >= MAX_TPL_FRAME_IDX) {
+    // The sub-GOP length exceeds the TPL buffer capacity, so the TPL-related
+    // functions are disabled hereafter.
+ return 0;
+ }
+ return tpl_data->tpl_frame[gf_frame_index].is_valid;
+}
+
+static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) {
+ switch (gop_eval) {
+ case 1:
+ // Allow larger GOP size if the base layer ARF has higher dependency
+ // factor than the intermediate ARF and both ARFs have reasonably high
+ // dependency factors.
+ return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0;
+ case 2:
+ if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6)
+ return 1; // Don't shorten the gf interval
+ else if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4)
+ return 0; // Shorten the gf interval
+ else
+ return 2; // Cannot decide the gf interval, so redo the
+ // tpl stats calculation.
+ case 3: return beta[0] > 1.1;
+ default: return 2;
+ }
+}
+
+// TODO(jingning): Restructure av1_rc_pick_q_and_bounds() to narrow down
+// the scope of input arguments.
+void av1_tpl_preload_rc_estimate(AV1_COMP *cpi,
+ const EncodeFrameParams *const frame_params) {
+ AV1_COMMON *cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ int bottom_index, top_index;
+ if (cpi->use_ducky_encode) return;
+
+ cm->current_frame.frame_type = frame_params->frame_type;
+ for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size;
+ ++gf_index) {
+ cm->current_frame.frame_type = gf_group->frame_type[gf_index];
+ cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE &&
+ gf_group->update_type[gf_index] != INTNL_ARF_UPDATE;
+ gf_group->q_val[gf_index] = av1_rc_pick_q_and_bounds(
+ cpi, cm->width, cm->height, gf_index, &bottom_index, &top_index);
+ }
+}
+
+static AOM_INLINE int skip_tpl_for_frame(const GF_GROUP *gf_group,
+ int frame_idx, int gop_eval,
+ int approx_gop_eval,
+ int reduce_num_frames) {
+ // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base
+ // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3,
+ // tpl stats calculation is limited to ARFs from base layer and (base+1)
+ // layer.
+ const int num_arf_layers = (gop_eval == 2) ? 3 : 2;
+ const int gop_length = get_gop_length(gf_group);
+
+ if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
+ gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
+ return 1;
+
+ // When approx_gop_eval = 1, skip tpl stats calculation for higher layer
+ // frames and for frames beyond gop length.
+ if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
+ frame_idx >= gop_length))
+ return 1;
+
+ if (reduce_num_frames && gf_group->update_type[frame_idx] == LF_UPDATE &&
+ frame_idx < gop_length)
+ return 1;
+
+ return 0;
+}
+
+int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
+ const EncodeFrameParams *const frame_params) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_tpl_setup_stats_time);
+#endif
+ assert(cpi->gf_frame_index == 0);
+ AV1_COMMON *cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ EncodeFrameParams this_frame_params = *frame_params;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ int approx_gop_eval = (gop_eval > 1);
+
+ if (cpi->superres_mode != AOM_SUPERRES_NONE) {
+ assert(cpi->superres_mode != AOM_SUPERRES_AUTO);
+ av1_init_tpl_stats(tpl_data);
+ return 0;
+ }
+
+ cm->current_frame.frame_type = frame_params->frame_type;
+ for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size;
+ ++gf_index) {
+ cm->current_frame.frame_type = gf_group->frame_type[gf_index];
+ av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
+ gf_group->update_type[gf_index],
+ gf_group->refbuf_state[gf_index], 0);
+
+ memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame,
+ sizeof(cpi->refresh_frame));
+ }
+
+ int pframe_qindex;
+ int tpl_gf_group_frames;
+ init_gop_frames_for_tpl(cpi, frame_params, gf_group, &tpl_gf_group_frames,
+ &pframe_qindex);
+
+ cpi->ppi->p_rc.base_layer_qp = pframe_qindex;
+
+ av1_init_tpl_stats(tpl_data);
+
+ TplBuffers *tpl_tmp_buffers = &cpi->td.tpl_tmp_buffers;
+ if (!tpl_alloc_temp_buffers(tpl_tmp_buffers, tpl_data->tpl_bsize_1d)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating tpl data");
+ }
+
+ tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read_dummy;
+ tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write_dummy;
+
+ av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+ cm->width, cm->height);
+
+ if (frame_params->frame_type == KEY_FRAME) {
+ av1_init_mv_probs(cm);
+ }
+ av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
+ cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs);
+
+ const int num_planes =
+ cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : av1_num_planes(cm);
+ // As tpl module is called before the setting of speed features at frame
+ // level, turning off this speed feature for the first GF group of the
+ // key-frame interval is done here.
+ int reduce_num_frames =
+ cpi->sf.tpl_sf.reduce_num_frames &&
+ gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+ gf_group->max_layer_depth > 2;
+  // TPL processing is skipped for frames of type LF_UPDATE when
+  // 'reduce_num_frames' is 1, which affects the r0 calculation. Thus, a
+  // factor to adjust r0 is used. The value of 1.6 corresponds to using ~60%
+  // of the frames in the gf group on average.
+ tpl_data->r0_adjust_factor = reduce_num_frames ? 1.6 : 1.0;
+
+  // Forward pass: compute the tpl stats of each frame in the GF group; the
+  // backward propagation from tpl_gf_group_frames - 1 down to the current
+  // frame follows below.
+ for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames;
+ ++frame_idx) {
+ if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval,
+ reduce_num_frames))
+ continue;
+
+ init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
+ if (mt_info->num_workers > 1) {
+ tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read;
+ tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write;
+ av1_mc_flow_dispenser_mt(cpi);
+ } else {
+ mc_flow_dispenser(cpi);
+ }
+#if CONFIG_BITRATE_ACCURACY
+ av1_tpl_txfm_stats_update_abs_coeff_mean(&cpi->td.tpl_txfm_stats);
+ av1_tpl_store_txfm_stats(tpl_data, &cpi->td.tpl_txfm_stats, frame_idx);
+#endif // CONFIG_BITRATE_ACCURACY
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, frame_idx);
+ rc_log_frame_stats(&cpi->rc_log, frame_coding_idx,
+ &cpi->td.tpl_txfm_stats);
+ }
+#endif // CONFIG_RATECTRL_LOG
+
+ aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture,
+ num_planes);
+ }
+
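+  // Backward propagation: accumulate each frame's dependency stats into its
+  // reference frames, from the last tpl frame down to the current one.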
+ for (int frame_idx = tpl_gf_group_frames - 1;
+ frame_idx >= cpi->gf_frame_index; --frame_idx) {
+ if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval,
+ reduce_num_frames))
+ continue;
+
+ mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows,
+ cm->mi_params.mi_cols);
+ }
+
+ av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
+ gf_group->update_type[cpi->gf_frame_index],
+ gf_group->update_type[cpi->gf_frame_index], 0);
+ cm->current_frame.frame_type = frame_params->frame_type;
+ cm->show_frame = frame_params->show_frame;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  // Record the time if the function will return early below.
+ if (cpi->common.tiles.large_scale || gf_group->max_layer_depth_allowed == 0 ||
+ !gop_eval)
+ end_timing(cpi, av1_tpl_setup_stats_time);
+#endif
+
+ tpl_dealloc_temp_buffers(tpl_tmp_buffers);
+
+ if (!approx_gop_eval) {
+ tpl_data->ready = 1;
+ }
+ if (cpi->common.tiles.large_scale) return 0;
+ if (gf_group->max_layer_depth_allowed == 0) return 1;
+ if (!gop_eval) return 0;
+ assert(gf_group->arf_index >= 0);
+
+ double beta[2] = { 0.0 };
+ const int frame_idx_0 = gf_group->arf_index;
+ const int frame_idx_1 =
+ AOMMIN(tpl_gf_group_frames - 1, gf_group->arf_index + 1);
+ beta[0] = av1_tpl_get_frame_importance(tpl_data, frame_idx_0);
+ beta[1] = av1_tpl_get_frame_importance(tpl_data, frame_idx_1);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_tpl_setup_stats_time);
+#endif
+ return eval_gop_length(beta, gop_eval);
+}
+
+void av1_tpl_rdmult_setup(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int tpl_idx = cpi->gf_frame_index;
+
+ assert(
+ IMPLIES(cpi->ppi->gf_group.size > 0, tpl_idx < cpi->ppi->gf_group.size));
+
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+
+ if (!tpl_frame->is_valid) return;
+
+ const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr;
+ const int tpl_stride = tpl_frame->stride;
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+ const int block_size = BLOCK_16X16;
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const double c = 1.2;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+ // Loop through each 'block_size' X 'block_size' block.
+ for (int row = 0; row < num_rows; row++) {
+ for (int col = 0; col < num_cols; col++) {
+ double intra_cost = 0.0, mc_dep_cost = 0.0;
+ // Loop through each mi block.
+ for (int mi_row = row * num_mi_h; mi_row < (row + 1) * num_mi_h;
+ mi_row += step) {
+ for (int mi_col = col * num_mi_w; mi_col < (col + 1) * num_mi_w;
+ mi_col += step) {
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= mi_cols_sr) continue;
+ const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ mi_row, mi_col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ intra_cost += (double)(this_stats->recrf_dist << RDDIV_BITS);
+ mc_dep_cost +=
+ (double)(this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+ }
+ }
+ const double rk = intra_cost / mc_dep_cost;
+ const int index = row * num_cols + col;
+ cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c;
+ }
+ }
+}
+
+void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
+ BLOCK_SIZE sb_size, int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ if (tpl_idx >= MAX_TPL_FRAME_IDX) return;
+ TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx];
+ if (!tpl_frame->is_valid) return;
+ if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return;
+ if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
+
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int sb_mi_width_sr = coded_to_superres_mi(
+ mi_size_wide[sb_size], cm->superres_scale_denominator);
+
+ const int bsize_base = BLOCK_16X16;
+ const int num_mi_w = mi_size_wide[bsize_base];
+ const int num_mi_h = mi_size_high[bsize_base];
+ const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const int num_bcols = (sb_mi_width_sr + num_mi_w - 1) / num_mi_w;
+ const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h;
+ int row, col;
+
+ double base_block_count = 0.0;
+ double log_sum = 0.0;
+
+  for (row = mi_row / num_mi_h;
+       row < num_rows && row < mi_row / num_mi_h + num_brows; ++row) {
+    for (col = mi_col_sr / num_mi_w;
+         col < num_cols && col < mi_col_sr / num_mi_w + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ log_sum += log(cpi->tpl_rdmult_scaling_factors[index]);
+ base_block_count += 1.0;
+ }
+ }
+
+ const CommonQuantParams *quant_params = &cm->quant_params;
+
+ const int orig_qindex_rdmult =
+ quant_params->base_qindex + quant_params->y_dc_delta_q;
+ const int orig_rdmult = av1_compute_rd_mult(
+ orig_qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ const int new_qindex_rdmult = quant_params->base_qindex +
+ x->rdmult_delta_qindex +
+ quant_params->y_dc_delta_q;
+ const int new_rdmult = av1_compute_rd_mult(
+ new_qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ const double scaling_factor = (double)new_rdmult / (double)orig_rdmult;
+
+ double scale_adj = log(scaling_factor) - log_sum / base_block_count;
+ scale_adj = exp_bounded(scale_adj);
+
+  for (row = mi_row / num_mi_h;
+       row < num_rows && row < mi_row / num_mi_h + num_brows; ++row) {
+    for (col = mi_col_sr / num_mi_w;
+         col < num_cols && col < mi_col_sr / num_mi_w + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ cpi->ppi->tpl_sb_rdmult_scaling_factors[index] =
+ scale_adj * cpi->tpl_rdmult_scaling_factors[index];
+ }
+ }
+}
+
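+// Entropy, in bits, of a geometric distribution P(k) = (1 - z) * z^k with
+// z = exp(-q_step / b), which models the magnitude of a quantized,
+// exponentially distributed coefficient:
+//   H = -log2(1 - z) - z * log2(z) / (1 - z)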
+double av1_exponential_entropy(double q_step, double b) {
+ b = AOMMAX(b, TPL_EPSILON);
+ double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON);
+ return -log2(1 - z) - z * log2(z) / (1 - z);
+}
+
+double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) {
+  // The zero bin's size is zero_bin_ratio * q_step; each non-zero bin's size
+  // is q_step.
+ b = AOMMAX(b, TPL_EPSILON);
+ double z = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
+ double h = av1_exponential_entropy(q_step, b);
+ double r = -(1 - z) * log2(1 - z) - z * log2(z) + z * (h + 1);
+ return r;
+}
+
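+// Estimates the rate, in bits, of a frame as block_count times the sum of
+// the per-coefficient Laplace entropies (one DC and coeff_num - 1 AC
+// coefficients) of a transform block.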
+double av1_laplace_estimate_frame_rate(int q_index, int block_count,
+ const double *abs_coeff_mean,
+ int coeff_num) {
+ double zero_bin_ratio = 2;
+ double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+ double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+ double est_rate = 0;
+ // dc coeff
+ est_rate += av1_laplace_entropy(dc_q_step, abs_coeff_mean[0], zero_bin_ratio);
+ // ac coeff
+ for (int i = 1; i < coeff_num; ++i) {
+ est_rate +=
+ av1_laplace_entropy(ac_q_step, abs_coeff_mean[i], zero_bin_ratio);
+ }
+ est_rate *= block_count;
+ return est_rate;
+}
+
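+// Estimated bits to code one quantized coefficient under the Laplace model:
+// -log2 of the probability mass of its quantization bin. The leading 1 in
+// the non-zero branch accounts for the sign bit.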
+double av1_estimate_coeff_entropy(double q_step, double b,
+ double zero_bin_ratio, int qcoeff) {
+ b = AOMMAX(b, TPL_EPSILON);
+ int abs_qcoeff = abs(qcoeff);
+ double z0 = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
+ if (abs_qcoeff == 0) {
+ double r = -log2(1 - z0);
+ return r;
+ } else {
+ double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON);
+ double r = 1 - log2(z0) - log2(1 - z) - (abs_qcoeff - 1) * log2(z);
+ return r;
+ }
+}
+
+double av1_estimate_txfm_block_entropy(int q_index,
+ const double *abs_coeff_mean,
+ int *qcoeff_arr, int coeff_num) {
+ double zero_bin_ratio = 2;
+ double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+ double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+ double est_rate = 0;
+ // dc coeff
+ est_rate += av1_estimate_coeff_entropy(dc_q_step, abs_coeff_mean[0],
+ zero_bin_ratio, qcoeff_arr[0]);
+ // ac coeff
+ for (int i = 1; i < coeff_num; ++i) {
+ est_rate += av1_estimate_coeff_entropy(ac_q_step, abs_coeff_mean[i],
+ zero_bin_ratio, qcoeff_arr[i]);
+ }
+ return est_rate;
+}
+
+#if CONFIG_RD_COMMAND
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command) {
+ FILE *fptr = fopen(filepath, "r");
+ fscanf(fptr, "%d", &rd_command->frame_count);
+ rd_command->frame_index = 0;
+ for (int i = 0; i < rd_command->frame_count; ++i) {
+ int option;
+ fscanf(fptr, "%d", &option);
+ rd_command->option_ls[i] = (RD_OPTION)option;
+ if (option == RD_OPTION_SET_Q) {
+ fscanf(fptr, "%d", &rd_command->q_index_ls[i]);
+ } else if (option == RD_OPTION_SET_Q_RDMULT) {
+ fscanf(fptr, "%d", &rd_command->q_index_ls[i]);
+ fscanf(fptr, "%d", &rd_command->rdmult_ls[i]);
+ }
+ }
+ fclose(fptr);
+}
+#endif // CONFIG_RD_COMMAND
+
+double av1_tpl_get_frame_importance(const TplParams *tpl_data,
+ int gf_frame_index) {
+ const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_frame_index];
+ const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+ const int tpl_stride = tpl_frame->stride;
+ double intra_cost_base = 0;
+ double mc_dep_cost_base = 0;
+ double cbcmp_base = 1;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+ for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+ for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+ const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+ double cbcmp = (double)this_stats->srcrf_dist;
+ const int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ dist_scaled = AOMMAX(dist_scaled, 1);
+ intra_cost_base += log(dist_scaled) * cbcmp;
+ mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp;
+ cbcmp_base += cbcmp;
+ }
+ }
+ return exp((mc_dep_cost_base - intra_cost_base) / cbcmp_base);
+}
+
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index) {
+ if (!av1_tpl_stats_ready(tpl_data, gf_frame_index)) {
+ return 1;
+ }
+ const double frame_importance =
+ av1_tpl_get_frame_importance(tpl_data, gf_frame_index);
+ return sqrt(1 / frame_importance);
+}
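+
+// Consequence of the sqrt mapping above (illustrative): a frame with TPL
+// importance 4, i.e. one whose coded quality propagates strongly, gets
+// qstep_ratio = 0.5 and thus a quantizer step half that of the leaf frames;
+// a frame with importance ~= 1 keeps qstep_ratio ~= 1.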
+
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+ aom_bit_depth_t bit_depth) {
+ const double leaf_qstep = av1_dc_quant_QTX(leaf_qindex, 0, bit_depth);
+ const double target_qstep = leaf_qstep * qstep_ratio;
+ int qindex = leaf_qindex;
+ if (qstep_ratio < 1.0) {
+ for (qindex = leaf_qindex; qindex > 0; --qindex) {
+ const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ if (qstep <= target_qstep) break;
+ }
+ } else {
+ for (qindex = leaf_qindex; qindex <= MAXQ; ++qindex) {
+ const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ if (qstep >= target_qstep) break;
+ }
+ }
+ return qindex;
+}
+
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+ int leaf_qindex, aom_bit_depth_t bit_depth) {
+ const double qstep_ratio = av1_tpl_get_qstep_ratio(tpl_data, gf_frame_index);
+ return av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth);
+}
+
+#if CONFIG_BITRATE_ACCURACY
+void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget,
+ int show_frame_count) {
+ av1_zero(*vbr_rc_info);
+ vbr_rc_info->ready = 0;
+ vbr_rc_info->total_bit_budget = total_bit_budget;
+ vbr_rc_info->show_frame_count = show_frame_count;
+ const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.94559, 1,
+ 0.94559, 1, 1,
+ 0.94559 };
+
+  // TODO(angiebird): Based on the previous code, only the scale factor 0.94559
+  // will be used in most of the cases with --limit=17. Figure out if the
+  // following scale factors work better.
+ // const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.12040, 1,
+ // 1.10199, 1, 1,
+ // 0.16393 };
+
+ const double mv_scale_factors[FRAME_UPDATE_TYPES] = { 3, 3, 3, 3, 3, 3, 3 };
+ memcpy(vbr_rc_info->scale_factors, scale_factors,
+ sizeof(scale_factors[0]) * FRAME_UPDATE_TYPES);
+ memcpy(vbr_rc_info->mv_scale_factors, mv_scale_factors,
+ sizeof(mv_scale_factors[0]) * FRAME_UPDATE_TYPES);
+
+ vbr_rc_reset_gop_data(vbr_rc_info);
+#if CONFIG_THREE_PASS
+ // TODO(angiebird): Explain why we use -1 here
+ vbr_rc_info->cur_gop_idx = -1;
+ vbr_rc_info->gop_count = 0;
+ vbr_rc_info->total_frame_count = 0;
+#endif // CONFIG_THREE_PASS
+}
+
+#if CONFIG_THREE_PASS
+int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info,
+ int gf_frame_index) {
+ int gop_idx = vbr_rc_info->cur_gop_idx;
+ int gop_start_idx = vbr_rc_info->gop_start_idx_list[gop_idx];
+ return gop_start_idx + gf_frame_index;
+}
+
+void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info,
+ const TPL_INFO *tpl_info) {
+ int gop_start_idx = vbr_rc_info->total_frame_count;
+ vbr_rc_info->gop_start_idx_list[vbr_rc_info->gop_count] = gop_start_idx;
+ vbr_rc_info->gop_length_list[vbr_rc_info->gop_count] = tpl_info->gf_length;
+ assert(gop_start_idx + tpl_info->gf_length <= VBR_RC_INFO_MAX_FRAMES);
+ for (int i = 0; i < tpl_info->gf_length; ++i) {
+ vbr_rc_info->txfm_stats_list[gop_start_idx + i] =
+ tpl_info->txfm_stats_list[i];
+ vbr_rc_info->qstep_ratio_list[gop_start_idx + i] =
+ tpl_info->qstep_ratio_ls[i];
+ vbr_rc_info->update_type_list[gop_start_idx + i] =
+ tpl_info->update_type_list[i];
+ }
+ vbr_rc_info->total_frame_count += tpl_info->gf_length;
+ vbr_rc_info->gop_count++;
+}
+#endif // CONFIG_THREE_PASS
+
+void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info,
+ int gop_showframe_count) {
+ vbr_rc_info->gop_showframe_count = gop_showframe_count;
+ vbr_rc_info->gop_bit_budget = vbr_rc_info->total_bit_budget *
+ gop_showframe_count /
+ vbr_rc_info->show_frame_count;
+}
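+
+// Example of the proportional split above (hypothetical numbers): with a
+// total budget of 1,000,000 bits over 100 show frames, a GOP containing 16
+// show frames is assigned a 160,000-bit budget, from which
+// av1_vbr_rc_update_q_index_list() later deducts the estimated motion
+// vector bits.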
+
+void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count,
+ const double *qstep_ratio_list,
+ aom_bit_depth_t bit_depth,
+ int *q_index_list) {
+ for (int i = 0; i < frame_count; ++i) {
+ q_index_list[i] = av1_get_q_index_from_qstep_ratio(
+ base_q_index, qstep_ratio_list[i], bit_depth);
+ }
+}
+
+double av1_vbr_rc_info_estimate_gop_bitrate(
+ int base_q_index, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors, int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe) {
+ av1_vbr_rc_compute_q_indices(base_q_index, frame_count, qstep_ratio_list,
+ bit_depth, q_index_list);
+ double estimated_gop_bitrate = 0;
+ for (int frame_index = 0; frame_index < frame_count; frame_index++) {
+ const TplTxfmStats *frame_stats = &stats_list[frame_index];
+ double frame_bitrate = 0;
+ if (frame_stats->ready) {
+ int q_index = q_index_list[frame_index];
+
+ frame_bitrate = av1_laplace_estimate_frame_rate(
+ q_index, frame_stats->txfm_block_count, frame_stats->abs_coeff_mean,
+ frame_stats->coeff_num);
+ }
+ FRAME_UPDATE_TYPE update_type = update_type_list[frame_index];
+ estimated_gop_bitrate +=
+ frame_bitrate * update_type_scale_factors[update_type];
+ if (estimated_bitrate_byframe != NULL) {
+ estimated_bitrate_byframe[frame_index] = frame_bitrate;
+ }
+ }
+ return estimated_gop_bitrate;
+}
+
+int av1_vbr_rc_info_estimate_base_q(
+ double bit_budget, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors, int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe) {
+ int q_max = 255; // Maximum q value.
+ int q_min = 0; // Minimum q value.
+ int q = (q_max + q_min) / 2;
+
+ double q_max_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+ q_max, bit_depth, update_type_scale_factors, frame_count,
+ update_type_list, qstep_ratio_list, stats_list, q_index_list,
+ estimated_bitrate_byframe);
+
+ double q_min_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+ q_min, bit_depth, update_type_scale_factors, frame_count,
+ update_type_list, qstep_ratio_list, stats_list, q_index_list,
+ estimated_bitrate_byframe);
+ while (q_min + 1 < q_max) {
+ double estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+ q, bit_depth, update_type_scale_factors, frame_count, update_type_list,
+ qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe);
+ if (estimate > bit_budget) {
+ q_min = q;
+ q_min_estimate = estimate;
+ } else {
+ q_max = q;
+ q_max_estimate = estimate;
+ }
+ q = (q_max + q_min) / 2;
+ }
+ // Pick the estimate that lands closest to the budget.
+ if (fabs(q_max_estimate - bit_budget) < fabs(q_min_estimate - bit_budget)) {
+ q = q_max;
+ } else {
+ q = q_min;
+ }
+  // Re-run the estimate at the chosen q to refresh q_index_list and
+  // estimated_bitrate_byframe.
+ av1_vbr_rc_info_estimate_gop_bitrate(
+ q, bit_depth, update_type_scale_factors, frame_count, update_type_list,
+ qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe);
+ return q;
+}
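+
+// Note on the bisection above: the estimated GOP bitrate is monotonically
+// non-increasing in q, so once the budget lies between the endpoint
+// estimates the loop maintains estimate(q_min) > bit_budget >=
+// estimate(q_max), halves the interval until q_min + 1 == q_max, and then
+// returns the endpoint whose estimate is closest to the budget.
+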
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+ const TplParams *tpl_data,
+ const GF_GROUP *gf_group,
+ aom_bit_depth_t bit_depth) {
+ vbr_rc_info->q_index_list_ready = 1;
+ double gop_bit_budget = vbr_rc_info->gop_bit_budget;
+
+ for (int i = 0; i < gf_group->size; i++) {
+ vbr_rc_info->qstep_ratio_list[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+ }
+
+ double mv_bits = 0;
+ for (int i = 0; i < gf_group->size; i++) {
+ double frame_mv_bits = 0;
+ if (av1_tpl_stats_ready(tpl_data, i)) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[i];
+ frame_mv_bits = av1_tpl_compute_frame_mv_entropy(
+ tpl_frame, tpl_data->tpl_stats_block_mis_log2);
+      FRAME_UPDATE_TYPE update_type = gf_group->update_type[i];
+      mv_bits += frame_mv_bits * vbr_rc_info->mv_scale_factors[update_type];
+ }
+ }
+
+ mv_bits = AOMMIN(mv_bits, 0.6 * gop_bit_budget);
+ gop_bit_budget -= mv_bits;
+
+ vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q(
+ gop_bit_budget, bit_depth, vbr_rc_info->scale_factors, gf_group->size,
+ gf_group->update_type, vbr_rc_info->qstep_ratio_list,
+ tpl_data->txfm_stats_list, vbr_rc_info->q_index_list, NULL);
+}
+
+#endif // CONFIG_BITRATE_ACCURACY
+
+// Use upper and left neighbor block as the reference MVs.
+// Compute the minimum difference between current MV and reference MV.
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+ int step, int tpl_stride, int right_shift) {
+ const TplDepStats *tpl_stats =
+ &tpl_frame
+ ->tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_stride, right_shift)];
+ int_mv current_mv = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+ int current_mv_magnitude =
+ abs(current_mv.as_mv.row) + abs(current_mv.as_mv.col);
+
+ // Retrieve the up and left neighbors.
+ int up_error = INT_MAX;
+ int_mv up_mv_diff;
+ if (row - step >= 0) {
+ tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ row - step, col, tpl_stride, right_shift)];
+ up_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+ up_mv_diff.as_mv.row = current_mv.as_mv.row - up_mv_diff.as_mv.row;
+ up_mv_diff.as_mv.col = current_mv.as_mv.col - up_mv_diff.as_mv.col;
+ up_error = abs(up_mv_diff.as_mv.row) + abs(up_mv_diff.as_mv.col);
+ }
+
+ int left_error = INT_MAX;
+ int_mv left_mv_diff;
+ if (col - step >= 0) {
+ tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ row, col - step, tpl_stride, right_shift)];
+ left_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+ left_mv_diff.as_mv.row = current_mv.as_mv.row - left_mv_diff.as_mv.row;
+ left_mv_diff.as_mv.col = current_mv.as_mv.col - left_mv_diff.as_mv.col;
+ left_error = abs(left_mv_diff.as_mv.row) + abs(left_mv_diff.as_mv.col);
+ }
+
+ // Return the MV with the minimum distance from current.
+ if (up_error < left_error && up_error < current_mv_magnitude) {
+ return up_mv_diff;
+ } else if (left_error < up_error && left_error < current_mv_magnitude) {
+ return left_mv_diff;
+ }
+ return current_mv;
+}
+
+/* Compute the entropy of motion vectors for a single frame. */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+ uint8_t right_shift) {
+ if (!tpl_frame->is_valid) {
+ return 0;
+ }
+
+ int count_row[500] = { 0 };
+ int count_col[500] = { 0 };
+ int n = 0; // number of MVs to process
+
+ const int tpl_stride = tpl_frame->stride;
+ const int step = 1 << right_shift;
+
+ for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+ for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+ int_mv mv = av1_compute_mv_difference(tpl_frame, row, col, step,
+ tpl_stride, right_shift);
+      count_row[clamp(mv.as_mv.row, 0, 499)] += 1;
+      count_col[clamp(mv.as_mv.col, 0, 499)] += 1;
+ n += 1;
+ }
+ }
+
+ // Estimate the bits used using the entropy formula.
+ double rate_row = 0;
+ double rate_col = 0;
+ for (int i = 0; i < 500; i++) {
+ if (count_row[i] != 0) {
+ double p = count_row[i] / (double)n;
+ rate_row += count_row[i] * -log2(p);
+ }
+ if (count_col[i] != 0) {
+ double p = count_col[i] / (double)n;
+ rate_col += count_col[i] * -log2(p);
+ }
+ }
+
+ return rate_row + rate_col;
+}
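+
+// Sanity check on the entropy estimate above (illustrative): if all n MV
+// differences land in one bin, p = 1 and the estimated rate is 0 bits; if
+// they spread uniformly over k bins, each component costs n * log2(k) bits,
+// the maximum for k occupied bins.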
diff --git a/third_party/aom/av1/encoder/tpl_model.h b/third_party/aom/av1/encoder/tpl_model.h
new file mode 100644
index 0000000000..bcd58216c5
--- /dev/null
+++ b/third_party/aom/av1/encoder/tpl_model.h
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_
+#define AOM_AV1_ENCODER_TPL_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+
+struct AV1_PRIMARY;
+struct AV1_COMP;
+struct AV1_SEQ_CODING_TOOLS;
+struct EncodeFrameParams;
+struct EncodeFrameInput;
+struct GF_GROUP;
+struct ThreadData;
+struct TPL_INFO;
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/mv.h"
+#include "av1/common/scale.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+static INLINE BLOCK_SIZE convert_length_to_bsize(int length) {
+ switch (length) {
+ case 64: return BLOCK_64X64;
+ case 32: return BLOCK_32X32;
+ case 16: return BLOCK_16X16;
+ case 8: return BLOCK_8X8;
+ case 4: return BLOCK_4X4;
+ default:
+ assert(0 && "Invalid block size for tpl model");
+ return BLOCK_16X16;
+ }
+}
+
+typedef struct AV1TplRowMultiThreadSync {
+#if CONFIG_MULTITHREAD
+ // Synchronization objects for top-right dependency.
+ pthread_mutex_t *mutex_;
+ pthread_cond_t *cond_;
+#endif
+ // Buffer to store the macroblock whose encoding is complete.
+ // num_finished_cols[i] stores the number of macroblocks which finished
+ // encoding in the ith macroblock row.
+ int *num_finished_cols;
+ // Number of extra macroblocks of the top row to be complete for encoding
+ // of the current macroblock to start. A value of 1 indicates top-right
+ // dependency.
+ int sync_range;
+ // Number of macroblock rows.
+ int rows;
+ // Number of threads processing the current tile.
+ int num_threads_working;
+} AV1TplRowMultiThreadSync;
+
+typedef struct AV1TplRowMultiThreadInfo {
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool tpl_mt_exit;
+#if CONFIG_MULTITHREAD
+ // Mutex lock object used for error handling.
+ pthread_mutex_t *mutex_;
+#endif
+ // Row synchronization related function pointers.
+ void (*sync_read_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c);
+ void (*sync_write_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c,
+ int cols);
+} AV1TplRowMultiThreadInfo;
+
+// TODO(jingning): This needs to be cleaned up next.
+
+// TPL stats buffers are prepared for every frame in the GOP,
+// including (internal) overlays and (internal) arfs.
+// In addition, frames in the lookahead that are outside of the GOP
+// are also used.
+// Thus it should use
+// (gop_length) + (# overlays) + (MAX_LAG_BUFFERS - gop_length) =
+// MAX_LAG_BUFFERS + (# overlays)
+// 2 * MAX_LAG_BUFFERS is therefore a safe estimate.
+// TODO(bohanli): test setting it to 1.5 * MAX_LAG_BUFFERS
+#define MAX_TPL_FRAME_IDX (2 * MAX_LAG_BUFFERS)
+// The first REF_FRAMES + 1 buffers are reserved.
+// tpl_data->tpl_frame starts after REF_FRAMES + 1
+#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TPL_FRAME_IDX + REF_FRAMES + 1)
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+#define TPL_EPSILON 0.0000001
+
+typedef struct TplTxfmStats {
+ int ready; // Whether abs_coeff_mean is ready
+ double abs_coeff_sum[256]; // Assume we are using 16x16 transform block
+ double abs_coeff_mean[256];
+ int txfm_block_count;
+ int coeff_num;
+} TplTxfmStats;
+
+typedef struct {
+ uint8_t *predictor8;
+ int16_t *src_diff;
+ tran_low_t *coeff;
+ tran_low_t *qcoeff;
+ tran_low_t *dqcoeff;
+} TplBuffers;
+
+typedef struct TplDepStats {
+ int64_t srcrf_sse;
+ int64_t srcrf_dist;
+ int64_t recrf_sse;
+ int64_t recrf_dist;
+ int64_t intra_sse;
+ int64_t intra_dist;
+ int64_t cmp_recrf_dist[2];
+ int64_t mc_dep_rate;
+ int64_t mc_dep_dist;
+ int64_t pred_error[INTER_REFS_PER_FRAME];
+ int32_t intra_cost;
+ int32_t inter_cost;
+ int32_t srcrf_rate;
+ int32_t recrf_rate;
+ int32_t intra_rate;
+ int32_t cmp_recrf_rate[2];
+ int_mv mv[INTER_REFS_PER_FRAME];
+ int8_t ref_frame_index[2];
+} TplDepStats;
+
+typedef struct TplDepFrame {
+ uint8_t is_valid;
+ TplDepStats *tpl_stats_ptr;
+ const YV12_BUFFER_CONFIG *gf_picture;
+ YV12_BUFFER_CONFIG *rec_picture;
+ int ref_map_index[REF_FRAMES];
+ int stride;
+ int width;
+ int height;
+ int mi_rows;
+ int mi_cols;
+ int base_rdmult;
+ uint32_t frame_display_index;
+ // When set, SAD metric is used for intra and inter mode decision.
+ int use_pred_sad;
+} TplDepFrame;
+
+/*!\endcond */
+/*!
+ * \brief Params related to temporal dependency model.
+ */
+typedef struct TplParams {
+ /*!
+ * Whether the tpl stats is ready.
+ */
+ int ready;
+
+ /*!
+ * Block granularity of tpl score storage.
+ */
+ uint8_t tpl_stats_block_mis_log2;
+
+ /*!
+ * Tpl motion estimation block 1d size. tpl_bsize_1d >= 16.
+ */
+ uint8_t tpl_bsize_1d;
+
+ /*!
+ * Buffer to store the frame level tpl information for each frame in a gf
+ * group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
+ * group
+ */
+ TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
+
+ /*!
+ * Buffer to store tpl stats at block granularity.
+ * tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf
+ * group.
+ */
+ TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
+
+ /*!
+ * Pointer to the buffer which stores tpl transform stats per frame.
+ * txfm_stats_list[i] stores the TplTxfmStats of the ith frame in a gf group.
+ * Memory is allocated dynamically for MAX_LENGTH_TPL_FRAME_STATS frames when
+ * tpl is enabled.
+ */
+ TplTxfmStats *txfm_stats_list;
+
+ /*!
+ * Buffer to store tpl reconstructed frame.
+ * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
+ */
+ YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS];
+
+ /*!
+ * Pointer to tpl_stats_buffer.
+ */
+ TplDepFrame *tpl_frame;
+
+ /*!
+ * Scale factors for the current frame.
+ */
+ struct scale_factors sf;
+
+ /*!
+ * GF group index of the current frame.
+ */
+ int frame_idx;
+
+ /*!
+ * Array of pointers to the frame buffers holding the source frame.
+ * src_ref_frame[i] stores the pointer to the source frame of the ith
+ * reference frame type.
+ */
+ const YV12_BUFFER_CONFIG *src_ref_frame[INTER_REFS_PER_FRAME];
+
+ /*!
+ * Array of pointers to the frame buffers holding the tpl reconstructed frame.
+ * ref_frame[i] stores the pointer to the tpl reconstructed frame of the ith
+ * reference frame type.
+ */
+ const YV12_BUFFER_CONFIG *ref_frame[INTER_REFS_PER_FRAME];
+
+ /*!
+ * Parameters related to synchronization for top-right dependency in row based
+ * multi-threading of tpl
+ */
+ AV1TplRowMultiThreadSync tpl_mt_sync;
+
+ /*!
+ * Frame border for tpl frame.
+ */
+ int border_in_pixels;
+
+ /*!
+ * Factor to adjust r0 if TPL uses a subset of frames in the gf group.
+ */
+ double r0_adjust_factor;
+} TplParams;
+
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
+#define VBR_RC_INFO_MAX_FRAMES 500
+#endif // CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
+
+#if CONFIG_BITRATE_ACCURACY
+
+/*!
+ * \brief This structure stores information needed for bitrate accuracy
+ * experiment.
+ */
+typedef struct {
+ int ready;
+ double total_bit_budget; // The total bit budget of the entire video
+ int show_frame_count; // Number of show frames in the entire video
+
+ int gop_showframe_count; // The number of show frames in the current gop
+ double gop_bit_budget; // The bitbudget for the current gop
+ double scale_factors[FRAME_UPDATE_TYPES]; // Scale factors to improve the
+ // budget estimation
+ double mv_scale_factors[FRAME_UPDATE_TYPES]; // Scale factors to improve
+ // MV entropy estimation
+
+ // === Below this line are GOP related data that will be updated per GOP ===
+ int base_q_index; // Stores the base q index.
+ int q_index_list_ready;
+ int q_index_list[VBR_RC_INFO_MAX_FRAMES]; // q indices for the current
+ // GOP
+
+ // Array to store qstep_ratio for each frame in a GOP
+ double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES];
+
+#if CONFIG_THREE_PASS
+ TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES];
+ FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES];
+ int gop_start_idx_list[VBR_RC_INFO_MAX_FRAMES];
+ int gop_length_list[VBR_RC_INFO_MAX_FRAMES];
+ int cur_gop_idx;
+ int total_frame_count;
+ int gop_count;
+#endif // CONFIG_THREE_PASS
+} VBR_RATECTRL_INFO;
+
+static INLINE void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) {
+ vbr_rc_info->q_index_list_ready = 0;
+ av1_zero(vbr_rc_info->q_index_list);
+}
+
+void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget,
+ int show_frame_count);
+
+int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info,
+ int gf_frame_index);
+
+void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info,
+ const struct TPL_INFO *tpl_info);
+
+void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info,
+ int gop_showframe_count);
+
+void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count,
+ const double *qstep_ratio_list,
+ aom_bit_depth_t bit_depth, int *q_index_list);
+
+/*!\brief Update q_index_list in vbr_rc_info based on tpl stats
+ *
+ * \param[out] vbr_rc_info Rate control info for BITRATE_ACCURACY
+ * experiment
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_group GOP struct
+ * \param[in] bit_depth bit depth
+ */
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+ const TplParams *tpl_data,
+ const struct GF_GROUP *gf_group,
+ aom_bit_depth_t bit_depth);
+/*!\brief Compute the number of bits needed to encode a GOP
+ *
+ * \param[in] base_q_index base layer q_index
+ * \param[in] bit_depth bit depth
+ * \param[in] update_type_scale_factors array of scale factors for each
+ * update_type
+ * \param[in]       frame_count               size of update_type_list,
+ *                                            qstep_ratio_list, stats_list,
+ *                                            q_index_list and
+ *                                            estimated_bitrate_byframe
+ * \param[in] update_type_list array of update_type, one per frame
+ * \param[in] qstep_ratio_list array of qstep_ratio, one per frame
+ * \param[in] stats_list array of transform stats, one per
+ * frame
+ * \param[out] q_index_list array of q_index, one per frame
+ * \param[out] estimated_bitrate_byframe array to keep track of frame
+ * bitrate
+ *
+ * \return The estimated GOP bitrate.
+ *
+ */
+double av1_vbr_rc_info_estimate_gop_bitrate(
+ int base_q_index, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors, int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe);
+
+/*!\brief Estimate the optimal base q index for a GOP.
+ *
+ * This function uses a binary search to find base layer q index to
+ * achieve the specified bit budget.
+ *
+ * \param[in] bit_budget target bit budget
+ * \param[in] bit_depth bit depth
+ * \param[in] update_type_scale_factors array of scale factors for each
+ * update_type
+ * \param[in]       frame_count         size of update_type_list, qstep_ratio_list,
+ * stats_list, q_index_list and
+ * estimated_bitrate_byframe
+ * \param[in] update_type_list array of update_type, one per frame
+ * \param[in] qstep_ratio_list array of qstep_ratio, one per frame
+ * \param[in] stats_list array of transform stats, one per frame
+ * \param[out] q_index_list array of q_index, one per frame
+ * \param[out] estimated_bitrate_byframe Array to keep track of frame
+ * bitrate
+ *
+ * \return Returns the optimal base q index to use.
+ */
+int av1_vbr_rc_info_estimate_base_q(
+ double bit_budget, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors, int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe);
+
+#endif // CONFIG_BITRATE_ACCURACY
+
+#if CONFIG_RD_COMMAND
+typedef enum {
+ RD_OPTION_NONE,
+ RD_OPTION_SET_Q,
+ RD_OPTION_SET_Q_RDMULT
+} RD_OPTION;
+
+typedef struct RD_COMMAND {
+ RD_OPTION option_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ int q_index_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ int rdmult_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ int frame_count;
+ int frame_index;
+} RD_COMMAND;
+
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command);
+#endif // CONFIG_RD_COMMAND
+
+/*!\brief Allocate buffers used by the tpl model
+ *
+ * \param[in]    ppi             Top-level encoder structure
+ * \param[in]    mi_params       Common mode info parameters
+ * \param[in]    width           Frame width
+ * \param[in]    height          Frame height
+ * \param[in]    byte_alignment  Byte alignment of the planes in the frame
+ *                               buffers
+ * \param[in]    lag_in_frames   Number of lookahead frames
+ */
+void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi,
+ CommonModeInfoParams *const mi_params, int width,
+ int height, int byte_alignment, int lag_in_frames);
+
+static AOM_INLINE void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) {
+ aom_free(tpl_tmp_buffers->predictor8);
+ tpl_tmp_buffers->predictor8 = NULL;
+ aom_free(tpl_tmp_buffers->src_diff);
+ tpl_tmp_buffers->src_diff = NULL;
+ aom_free(tpl_tmp_buffers->coeff);
+ tpl_tmp_buffers->coeff = NULL;
+ aom_free(tpl_tmp_buffers->qcoeff);
+ tpl_tmp_buffers->qcoeff = NULL;
+ aom_free(tpl_tmp_buffers->dqcoeff);
+ tpl_tmp_buffers->dqcoeff = NULL;
+}
+
+static AOM_INLINE bool tpl_alloc_temp_buffers(TplBuffers *tpl_tmp_buffers,
+ uint8_t tpl_bsize_1d) {
+ // Number of pixels in a tpl block
+ const int tpl_block_pels = tpl_bsize_1d * tpl_bsize_1d;
+
+ // Allocate temporary buffers used in mode estimation.
+ tpl_tmp_buffers->predictor8 = (uint8_t *)aom_memalign(
+ 32, tpl_block_pels * 2 * sizeof(*tpl_tmp_buffers->predictor8));
+ tpl_tmp_buffers->src_diff = (int16_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->src_diff));
+ tpl_tmp_buffers->coeff = (tran_low_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->coeff));
+ tpl_tmp_buffers->qcoeff = (tran_low_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->qcoeff));
+ tpl_tmp_buffers->dqcoeff = (tran_low_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->dqcoeff));
+
+ if (!(tpl_tmp_buffers->predictor8 && tpl_tmp_buffers->src_diff &&
+ tpl_tmp_buffers->coeff && tpl_tmp_buffers->qcoeff &&
+ tpl_tmp_buffers->dqcoeff)) {
+ tpl_dealloc_temp_buffers(tpl_tmp_buffers);
+ return false;
+ }
+ return true;
+}
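+
+// Usage sketch (hypothetical caller, not part of this header's contract):
+// pair the allocation with the deallocation above on every exit path, e.g.
+//
+//   TplBuffers bufs;
+//   if (!tpl_alloc_temp_buffers(&bufs, tpl_data->tpl_bsize_1d)) return;
+//   // ... run TPL mode estimation using bufs ...
+//   tpl_dealloc_temp_buffers(&bufs);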
+
+/*!\brief Implements temporal dependency modelling for a GOP (GF/ARF
+ * group) and selects between 16 and 32 frame GOP structure.
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] cpi Top - level encoder instance structure
+ * \param[in] gop_eval Flag if it is in the GOP length decision stage
+ * \param[in] frame_params Per frame encoding parameters
+ *
+ * \return Indicates whether or not we should use a longer GOP length.
+ */
+int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval,
+ const struct EncodeFrameParams *const frame_params);
+
+/*!\cond */
+
+void av1_tpl_preload_rc_estimate(
+ struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
+
+int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift);
+
+void av1_init_tpl_stats(TplParams *const tpl_data);
+
+int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index);
+
+void av1_tpl_rdmult_setup(struct AV1_COMP *cpi);
+
+void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x,
+ BLOCK_SIZE sb_size, int mi_row, int mi_col);
+
+void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi,
+ TplTxfmStats *tpl_txfm_stats,
+ TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
+ int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+/*!\brief Compute the entropy of an exponential probability distribution
+ * function (pdf) subjected to uniform quantization.
+ *
+ * pdf(x) = b*exp(-b*x)
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_step quantizer step size
+ * \param[in] b parameter of exponential distribution
+ *
+ * \return entropy cost
+ */
+double av1_exponential_entropy(double q_step, double b);
+
+/*!\brief Compute the entropy of a Laplace probability distribution
+ * function (pdf) subjected to non-uniform quantization.
+ *
+ * pdf(x) = 0.5*b*exp(-0.5*b*|x|)
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_step quantizer step size for non-zero bins
+ * \param[in] b parameter of Laplace distribution
+ * \param[in] zero_bin_ratio zero bin's size is zero_bin_ratio * q_step
+ *
+ * \return entropy cost
+ */
+double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio);
+
+/*!\brief Compute the frame rate using transform block stats
+ *
+ * Assume each position i in the transform block is of Laplace distribution
+ * with mean absolute deviation abs_coeff_mean[i]
+ *
+ * Then we can use av1_laplace_entropy() to compute the expected frame
+ * rate.
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_index quantizer index
+ * \param[in] block_count number of transform blocks
+ * \param[in] abs_coeff_mean array of mean absolute deviation
+ * \param[in] coeff_num number of coefficients per transform block
+ *
+ * \return expected frame rate
+ */
+double av1_laplace_estimate_frame_rate(int q_index, int block_count,
+ const double *abs_coeff_mean,
+ int coeff_num);
+
+/*!\brief Init TplTxfmStats
+ *
+ * \param[in] tpl_txfm_stats a structure for storing transform stats
+ *
+ */
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats);
+
+#if CONFIG_BITRATE_ACCURACY
+/*!\brief Accumulate TplTxfmStats
+ *
+ * \param[in]  sub_stats          a structure for storing sub transform stats
+ * \param[out] accumulated_stats  a structure for storing accumulated
+ *                                transform stats
+ *
+ */
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+ TplTxfmStats *accumulated_stats);
+
+/*!\brief Record a transform block into TplTxfmStats
+ *
+ * \param[out]   tpl_txfm_stats A structure for storing transform stats
+ * \param[in]    coeff          An array of transform coefficients. Its size
+ *                              should equal tpl_txfm_stats.coeff_num.
+ *
+ */
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+ const tran_low_t *coeff);
+
+/*!\brief Update the abs_coeff_mean and ready fields of txfm_stats
+ *
+ * If txfm_block_count > 0, this function will use abs_coeff_sum and
+ * txfm_block_count to compute abs_coeff_mean, and the ready flag will be
+ * set to one.
+ *
+ * \param[in] txfm_stats A structure for storing transform stats
+ */
+void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats);
+#endif // CONFIG_BITRATE_ACCURACY
+
+/*!\brief Estimate coefficient entropy using a Laplace distribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * This function is equivalent to -log2(laplace_prob()), where laplace_prob()
+ * is defined in tpl_model_test.cc
+ *
+ * \param[in]    q_step         quantizer step size without any scaling
+ * \param[in]    b              mean absolute deviation of the Laplace
+ *                              distribution
+ * \param[in]    zero_bin_ratio zero bin's size is zero_bin_ratio * q_step
+ * \param[in]    qcoeff         quantized coefficient
+ *
+ * \return estimated coefficient entropy
+ *
+ */
+double av1_estimate_coeff_entropy(double q_step, double b,
+ double zero_bin_ratio, int qcoeff);
+
+/*!\brief Estimate entropy of a transform block using a Laplace distribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_index quantizer index
+ * \param[in] abs_coeff_mean array of mean absolute deviations
+ * \param[in] qcoeff_arr array of quantized coefficients
+ * \param[in] coeff_num number of coefficients per transform block
+ *
+ * \return estimated transform block entropy
+ *
+ */
+double av1_estimate_txfm_block_entropy(int q_index,
+ const double *abs_coeff_mean,
+ int *qcoeff_arr, int coeff_num);
+
+// TODO(angiebird): Add doxygen description here.
+int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+ int64_t srcrf_dist, int pix_num);
+
+/*!\brief Compute the overlap area between two blocks with the same size
+ *
+ *\ingroup tpl_modelling
+ *
+ * If there is no overlap, this function should return zero.
+ *
+ * \param[in] row_a row position of the first block
+ * \param[in] col_a column position of the first block
+ * \param[in] row_b row position of the second block
+ * \param[in] col_b column position of the second block
+ * \param[in] width width shared by the two blocks
+ * \param[in] height height shared by the two blocks
+ *
+ * \return overlap area of the two blocks
+ */
+int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+ int height);
+
+/*!\brief Get current frame's q_index from tpl stats and leaf_qindex
+ *
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_frame_index current frame index in the GOP
+ * \param[in] leaf_qindex q index of leaf frame
+ * \param[in] bit_depth bit depth
+ *
+ * \return q_index
+ */
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+ int leaf_qindex, aom_bit_depth_t bit_depth);
+
+/*!\brief Compute the frame importance from TPL stats
+ *
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_frame_index current frame index in the GOP
+ *
+ * \return frame_importance
+ */
+double av1_tpl_get_frame_importance(const TplParams *tpl_data,
+ int gf_frame_index);
+
+/*!\brief Compute the ratio between arf q step and the leaf q step based on
+ * TPL stats
+ *
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_frame_index current frame index in the GOP
+ *
+ * \return qstep_ratio
+ */
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index);
+
+/*!\brief Find a q index whose step size is near qstep_ratio * leaf_qstep
+ *
+ * \param[in] leaf_qindex q index of leaf frame
+ * \param[in]       qstep_ratio      ratio between the target q step and the
+ *                                   leaf q step
+ * \param[in]       bit_depth        bit depth
+ *
+ * \return q_index
+ */
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+ aom_bit_depth_t bit_depth);
+
+/*!\brief Improve the motion vector estimation by taking neighbors into
+ * account.
+ *
+ * Use the upper and left neighbor block as the reference MVs.
+ * Compute the minimum difference between current MV and reference MV.
+ *
+ * \param[in] tpl_frame Tpl frame struct
+ * \param[in] row Current row
+ * \param[in] col Current column
+ * \param[in] step Step parameter for av1_tpl_ptr_pos
+ * \param[in] tpl_stride Stride parameter for av1_tpl_ptr_pos
+ * \param[in] right_shift Right shift parameter for
+ * av1_tpl_ptr_pos
+ *
+ * \return The neighbor MV difference with the smaller magnitude, when that
+ *         magnitude is below the current MV's; otherwise the current MV.
+ */
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+ int step, int tpl_stride, int right_shift);
+
+/*!\brief Compute the entropy of motion vectors for a single frame.
+ *
+ * \param[in] tpl_frame TPL frame struct
+ * \param[in] right_shift right shift value for step
+ *
+ * \return Bits used by the motion vectors for one frame.
+ */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+ uint8_t right_shift);
+
+#if CONFIG_RATECTRL_LOG
+typedef struct {
+ int coding_frame_count;
+ int base_q_index;
+
+ // Encode decision
+ int q_index_list[VBR_RC_INFO_MAX_FRAMES];
+ double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES];
+ FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES];
+
+ // Frame stats
+ TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES];
+
+ // Estimated encode results
+ double est_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES];
+
+ // Actual encode results
+ double act_rate_list[VBR_RC_INFO_MAX_FRAMES];
+ double act_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES];
+} RATECTRL_LOG;
+
+static INLINE void rc_log_init(RATECTRL_LOG *rc_log) { av1_zero(*rc_log); }
+
+static INLINE void rc_log_frame_stats(RATECTRL_LOG *rc_log, int coding_index,
+ const TplTxfmStats *txfm_stats) {
+ rc_log->txfm_stats_list[coding_index] = *txfm_stats;
+}
+
+static INLINE void rc_log_frame_encode_param(RATECTRL_LOG *rc_log,
+ int coding_index,
+ double qstep_ratio, int q_index,
+ FRAME_UPDATE_TYPE update_type) {
+ rc_log->qstep_ratio_list[coding_index] = qstep_ratio;
+ rc_log->q_index_list[coding_index] = q_index;
+ rc_log->update_type_list[coding_index] = update_type;
+ const TplTxfmStats *txfm_stats = &rc_log->txfm_stats_list[coding_index];
+ rc_log->est_coeff_rate_list[coding_index] = 0;
+ if (txfm_stats->ready) {
+ rc_log->est_coeff_rate_list[coding_index] = av1_laplace_estimate_frame_rate(
+ q_index, txfm_stats->txfm_block_count, txfm_stats->abs_coeff_mean,
+ txfm_stats->coeff_num);
+ }
+}
+
+static INLINE void rc_log_frame_entropy(RATECTRL_LOG *rc_log, int coding_index,
+ double act_rate,
+ double act_coeff_rate) {
+ rc_log->act_rate_list[coding_index] = act_rate;
+ rc_log->act_coeff_rate_list[coding_index] = act_coeff_rate;
+}
+
+static INLINE void rc_log_record_chunk_info(RATECTRL_LOG *rc_log,
+ int base_q_index,
+ int coding_frame_count) {
+ rc_log->base_q_index = base_q_index;
+ rc_log->coding_frame_count = coding_frame_count;
+}
+
+static INLINE void rc_log_show(const RATECTRL_LOG *rc_log) {
+ printf("= chunk 1\n");
+ printf("coding_frame_count %d base_q_index %d\n", rc_log->coding_frame_count,
+ rc_log->base_q_index);
+ printf("= frame %d\n", rc_log->coding_frame_count);
+ for (int coding_idx = 0; coding_idx < rc_log->coding_frame_count;
+ coding_idx++) {
+ printf(
+ "coding_idx %d update_type %d q %d qstep_ratio %f est_coeff_rate %f "
+ "act_coeff_rate %f act_rate %f\n",
+ coding_idx, rc_log->update_type_list[coding_idx],
+ rc_log->q_index_list[coding_idx], rc_log->qstep_ratio_list[coding_idx],
+ rc_log->est_coeff_rate_list[coding_idx],
+ rc_log->act_coeff_rate_list[coding_idx],
+ rc_log->act_rate_list[coding_idx]);
+ }
+}
+#endif // CONFIG_RATECTRL_LOG
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TPL_MODEL_H_
diff --git a/third_party/aom/av1/encoder/tune_butteraugli.c b/third_party/aom/av1/encoder/tune_butteraugli.c
new file mode 100644
index 0000000000..92fc4b2a92
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_butteraugli.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/encoder/tune_butteraugli.h"
+
+#include "aom_dsp/butteraugli.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/var_based_part.h"
+
+static const int resize_factor = 2;
+
+static void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi,
+ const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *recon,
+ const double K) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const aom_color_range_t color_range =
+ seq_params->color_range != 0 ? AOM_CR_FULL_RANGE : AOM_CR_STUDIO_RANGE;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int width = source->y_crop_width;
+ const int height = source->y_crop_height;
+ const int ss_x = source->subsampling_x;
+ const int ss_y = source->subsampling_y;
+
+ float *diffmap;
+ CHECK_MEM_ERROR(cm, diffmap, aom_malloc(width * height * sizeof(*diffmap)));
+ if (!aom_calc_butteraugli(source, recon, bit_depth,
+ seq_params->matrix_coefficients, color_range,
+ diffmap)) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Failed to calculate Butteraugli distances.");
+ }
+
+ const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize] / resize_factor;
+ const int num_mi_h = mi_size_high[butteraugli_rdo_bsize] / resize_factor;
+ const int num_cols =
+ (mi_params->mi_cols / resize_factor + num_mi_w - 1) / num_mi_w;
+ const int num_rows =
+ (mi_params->mi_rows / resize_factor + num_mi_h - 1) / num_mi_h;
+ const int block_w = num_mi_w << 2;
+ const int block_h = num_mi_h << 2;
+ double log_sum = 0.0;
+ double blk_count = 0.0;
+
+ // Loop through each block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ const int y_start = row * block_h;
+ const int x_start = col * block_w;
+ float dbutteraugli = 0.0f;
+ float dmse = 0.0f;
+ float px_count = 0.0f;
+
+ // Loop through each pixel.
+ for (int y = y_start; y < y_start + block_h && y < height; y++) {
+ for (int x = x_start; x < x_start + block_w && x < width; x++) {
+ dbutteraugli += powf(diffmap[y * width + x], 12.0f);
+ float px_diff = source->y_buffer[y * source->y_stride + x] -
+ recon->y_buffer[y * recon->y_stride + x];
+ dmse += px_diff * px_diff;
+ px_count += 1.0f;
+ }
+ }
+ const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y),
+ (height + ss_y) >> ss_y);
+ for (int y = y_start >> ss_y; y < y_end; y++) {
+ const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x),
+ (width + ss_x) >> ss_x);
+ for (int x = x_start >> ss_x; x < x_end; x++) {
+ const int src_px_index = y * source->uv_stride + x;
+ const int recon_px_index = y * recon->uv_stride + x;
+ const float px_diff_u = (float)(source->u_buffer[src_px_index] -
+ recon->u_buffer[recon_px_index]);
+ const float px_diff_v = (float)(source->v_buffer[src_px_index] -
+ recon->v_buffer[recon_px_index]);
+ dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v;
+ px_count += 2.0f;
+ }
+ }
+
+ dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f);
+ dmse = dmse / px_count;
+ const float eps = 0.01f;
+ double weight;
+ if (dbutteraugli < eps || dmse < eps) {
+ weight = -1.0;
+ } else {
+ blk_count += 1.0;
+ weight = dmse / dbutteraugli;
+ weight = AOMMIN(weight, 5.0);
+ weight += K;
+ log_sum += log(weight);
+ }
+ cpi->butteraugli_info.rdmult_scaling_factors[index] = weight;
+ }
+ }
+  // log_sum now holds the geometric mean of the weights.
+  log_sum = exp(log_sum / blk_count);
+
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ double *weight = &cpi->butteraugli_info.rdmult_scaling_factors[index];
+ if (*weight <= 0.0) {
+ *weight = 1.0;
+ } else {
+ *weight /= log_sum;
+ }
+ *weight = AOMMIN(*weight, 2.5);
+ *weight = AOMMAX(*weight, 0.4);
+ }
+ }
+
+ aom_free(diffmap);
+}
+
+void av1_set_butteraugli_rdmult(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int *rdmult) {
+ assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI);
+ if (!cpi->butteraugli_info.recon_set) {
+ return;
+ }
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize];
+ const int num_mi_h = mi_size_high[butteraugli_rdo_bsize];
+ const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+ const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+ double num_of_mi = 0.0;
+ double geom_mean_of_scale = 0.0;
+
+ for (int row = mi_row / num_mi_w;
+ row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+ for (int col = mi_col / num_mi_h;
+ col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ geom_mean_of_scale +=
+ log(cpi->butteraugli_info.rdmult_scaling_factors[index]);
+ num_of_mi += 1.0;
+ }
+ }
+ geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+ *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+ *rdmult = AOMMAX(*rdmult, 0);
+ av1_set_error_per_bit(&x->errorperbit, *rdmult);
+}
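+
+// Illustrative effect of the scaling above (hypothetical numbers): if the
+// butteraugli scaling factors covered by a block average to 0.4, its rdmult
+// drops to 40% of the frame-level value, steering the RD search toward
+// spending bits where butteraugli reports visible distortion; an average
+// near 2.5 has the opposite effect.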
+
+static void copy_plane(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h) {
+ for (int row = 0; row < h; row++) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void copy_img(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+ int width, int height) {
+ copy_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width,
+ height);
+ const int width_uv = (width + src->subsampling_x) >> src->subsampling_x;
+ const int height_uv = (height + src->subsampling_y) >> src->subsampling_y;
+ copy_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ width_uv, height_uv);
+ copy_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ width_uv, height_uv);
+}
+
+static void zero_plane(uint8_t *dst, int dst_stride, int h) {
+ for (int row = 0; row < h; row++) {
+ memset(dst, 0, dst_stride);
+ dst += dst_stride;
+ }
+}
+
+static void zero_img(YV12_BUFFER_CONFIG *dst) {
+ zero_plane(dst->y_buffer, dst->y_stride, dst->y_height);
+ zero_plane(dst->u_buffer, dst->uv_stride, dst->uv_height);
+ zero_plane(dst->v_buffer, dst->uv_stride, dst->uv_height);
+}
+
+void av1_setup_butteraugli_source(AV1_COMP *cpi) {
+ YV12_BUFFER_CONFIG *const dst = &cpi->butteraugli_info.source;
+ AV1_COMMON *const cm = &cpi->common;
+ const int width = cpi->source->y_crop_width;
+ const int height = cpi->source->y_crop_height;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = cpi->source->subsampling_x;
+ const int ss_y = cpi->source->subsampling_y;
+ if (dst->buffer_alloc_sz == 0) {
+ aom_alloc_frame_buffer(
+ dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+ }
+ av1_copy_and_extend_frame(cpi->source, dst);
+
+ YV12_BUFFER_CONFIG *const resized_dst = &cpi->butteraugli_info.resized_source;
+ if (resized_dst->buffer_alloc_sz == 0) {
+ aom_alloc_frame_buffer(
+ resized_dst, width / resize_factor, height / resize_factor, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ }
+ if (!av1_resize_and_extend_frame_nonnormative(
+ cpi->source, resized_dst, bit_depth, av1_num_planes(cm))) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers during resize");
+ }
+
+ zero_img(cpi->source);
+ copy_img(resized_dst, cpi->source, width / resize_factor,
+ height / resize_factor);
+}
+
+void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) {
+ av1_copy_and_extend_frame(&cpi->butteraugli_info.source, cpi->source);
+ AV1_COMMON *const cm = &cpi->common;
+ const int width = cpi->source->y_crop_width;
+ const int height = cpi->source->y_crop_height;
+ const int ss_x = cpi->source->subsampling_x;
+ const int ss_y = cpi->source->subsampling_y;
+
+ YV12_BUFFER_CONFIG resized_recon;
+ memset(&resized_recon, 0, sizeof(resized_recon));
+ aom_alloc_frame_buffer(
+ &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor,
+ height / resize_factor);
+
+ set_mb_butteraugli_rdmult_scaling(cpi, &cpi->butteraugli_info.resized_source,
+ &resized_recon, K);
+ cpi->butteraugli_info.recon_set = true;
+ aom_free_frame_buffer(&resized_recon);
+}
+
+void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+ const int q_index = 96;
+
+ // Setup necessary params for encoding, including frame source, etc.
+ if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
+ av1_set_frame_size(cpi, cm->superres_upscaled_width,
+ cm->superres_upscaled_height);
+
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter,
+ 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ av1_setup_butteraugli_source(cpi);
+ av1_setup_frame(cpi);
+
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+ const PARTITION_SEARCH_TYPE partition_search_type =
+ cpi->sf.part_sf.partition_search_type;
+ const BLOCK_SIZE fixed_partition_size = cpi->sf.part_sf.fixed_partition_size;
+ // Enable a quicker pass by uncommenting the following lines:
+ // cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+ // cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_index,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+
+ av1_set_variance_partition_thresholds(cpi, q_index, 0);
+ av1_encode_frame(cpi);
+
+ av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.3);
+ cpi->sf.part_sf.partition_search_type = partition_search_type;
+ cpi->sf.part_sf.fixed_partition_size = fixed_partition_size;
+}
diff --git a/third_party/aom/av1/encoder/tune_butteraugli.h b/third_party/aom/av1/encoder/tune_butteraugli.h
new file mode 100644
index 0000000000..bae5d2a882
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_butteraugli.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
+#define AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/block.h"
+
+typedef struct {
+ // Stores the scaling factors for rdmult when tuning for Butteraugli.
+ // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for
+ // 4x4 block at (row, col).
+ double *rdmult_scaling_factors;
+ YV12_BUFFER_CONFIG source, resized_source;
+ bool recon_set;
+} TuneButteraugliInfo;
+
+struct AV1_COMP;
+static const BLOCK_SIZE butteraugli_rdo_bsize = BLOCK_16X16;
+
+void av1_set_butteraugli_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int *rdmult);
+
+void av1_setup_butteraugli_source(struct AV1_COMP *cpi);
+
+// 'K' is used to balance the rate-distortion distribution between PSNR
+// and Butteraugli.
+void av1_setup_butteraugli_rdmult_and_restore_source(struct AV1_COMP *cpi,
+ double K);
+
+void av1_setup_butteraugli_rdmult(struct AV1_COMP *cpi);
+
+#endif // AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
diff --git a/third_party/aom/av1/encoder/tune_vmaf.c b/third_party/aom/av1/encoder/tune_vmaf.c
new file mode 100644
index 0000000000..4e5ffa387c
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_vmaf.c
@@ -0,0 +1,1112 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/tune_vmaf.h"
+
+#include "aom_dsp/psnr.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/rdopt.h"
+#include "config/aom_scale_rtcd.h"
+
+static const double kBaselineVmaf = 97.42773;
+
+static double get_layer_value(const double *array, int layer) {
+ while (array[layer] < 0.0 && layer > 0) layer--;
+ return AOMMAX(array[layer], 0.0);
+}
+
+static void motion_search(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *ref,
+ const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, FULLPEL_MV *ref_mv) {
+ // Block information (ONLY Y-plane is used for motion search).
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int y_stride = src->y_stride;
+ assert(y_stride == ref->y_stride);
+ const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+ // Save input state.
+ MACROBLOCK *const mb = &cpi->td.mb;
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ const struct buf_2d ori_src_buf = mb->plane[0].src;
+ const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+
+ // Parameters used for motion search.
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ const SEARCH_METHODS search_method = NSTEP;
+ const search_site_config *search_site_cfg =
+ cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
+ const int step_param =
+ av1_init_search_range(AOMMAX(src->y_crop_width, src->y_crop_height));
+
+ // Baseline position for motion search (used for rate distortion comparison).
+ const MV baseline_mv = kZeroMv;
+
+ // Setup.
+ mb->plane[0].src.buf = src->y_buffer + y_offset;
+ mb->plane[0].src.stride = y_stride;
+ mbd->plane[0].pre[0].buf = ref->y_buffer + y_offset;
+ mbd->plane[0].pre[0].stride = y_stride;
+
+ // Unused intermediate results for motion search.
+ int cost_list[5];
+
+ // Do motion search.
+ // Only do full search on the entire block.
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
+ &baseline_mv, *ref_mv, search_site_cfg,
+ search_method,
+ /*fine_search_interval=*/0);
+ av1_full_pixel_search(*ref_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list), ref_mv, &best_mv_stats,
+ NULL);
+
+ // Restore input state.
+ mb->plane[0].src = ori_src_buf;
+ mbd->plane[0].pre[0] = ori_pre_buf;
+}
+
+static unsigned int residual_variance(const AV1_COMP *cpi,
+ const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *ref,
+ const BLOCK_SIZE block_size,
+ const int mb_row, const int mb_col,
+ FULLPEL_MV ref_mv, unsigned int *sse) {
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int y_stride = src->y_stride;
+ assert(y_stride == ref->y_stride);
+ const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+ const int mv_offset = ref_mv.row * y_stride + ref_mv.col;
+ const unsigned int var = cpi->ppi->fn_ptr[block_size].vf(
+ ref->y_buffer + y_offset + mv_offset, y_stride, src->y_buffer + y_offset,
+ y_stride, sse);
+ return var;
+}
+
+static double frame_average_variance(const AV1_COMP *const cpi,
+ const YV12_BUFFER_CONFIG *const frame) {
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const uint8_t *const y_buffer = frame->y_buffer;
+ const int y_stride = frame->y_stride;
+ const BLOCK_SIZE block_size = BLOCK_64X64;
+
+ const int block_w = mi_size_wide[block_size] * 4;
+ const int block_h = mi_size_high[block_size] * 4;
+ int row, col;
+ double var = 0.0, var_count = 0.0;
+ const int use_hbd = frame->flags & YV12_FLAG_HIGHBITDEPTH;
+
+ // Loop through each block.
+ for (row = 0; row < frame->y_height / block_h; ++row) {
+ for (col = 0; col < frame->y_width / block_w; ++col) {
+ struct buf_2d buf;
+ const int row_offset_y = row * block_h;
+ const int col_offset_y = col * block_w;
+
+ buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y;
+ buf.stride = y_stride;
+
+ var += av1_get_perpixel_variance(cpi, xd, &buf, block_size, AOM_PLANE_Y,
+ use_hbd);
+ var_count += 1.0;
+ }
+ }
+ var /= var_count;
+ return var;
+}
+
+static double residual_frame_average_variance(AV1_COMP *cpi,
+ const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *ref,
+ FULLPEL_MV *mvs) {
+ if (ref == NULL) return frame_average_variance(cpi, src);
+ const BLOCK_SIZE block_size = BLOCK_16X16;
+ const int frame_height = src->y_height;
+ const int frame_width = src->y_width;
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mb_rows = (frame_height + mb_height - 1) / mb_height;
+ const int mb_cols = (frame_width + mb_width - 1) / mb_width;
+ const int num_planes = av1_num_planes(&cpi->common);
+ const int mi_h = mi_size_high_log2[block_size];
+ const int mi_w = mi_size_wide_log2[block_size];
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+ // Save input state.
+ MACROBLOCK *const mb = &cpi->td.mb;
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ for (int i = 0; i < num_planes; i++) {
+ input_buffer[i] = mbd->plane[i].pre[0].buf;
+ }
+ MB_MODE_INFO **input_mb_mode_info = mbd->mi;
+
+ bool do_motion_search = false;
+ if (mvs == NULL) {
+ do_motion_search = true;
+ CHECK_MEM_ERROR(&cpi->common, mvs,
+ (FULLPEL_MV *)aom_calloc(mb_rows * mb_cols, sizeof(*mvs)));
+ }
+
+ unsigned int variance = 0;
+  // Compute the motion-compensated residual variance block by block.
+ for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
+ av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+ av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ FULLPEL_MV *ref_mv = &mvs[mb_col + mb_row * mb_cols];
+ if (do_motion_search) {
+ motion_search(cpi, src, ref, block_size, mb_row, mb_col, ref_mv);
+ }
+ unsigned int mv_sse;
+ const unsigned int blk_var = residual_variance(
+ cpi, src, ref, block_size, mb_row, mb_col, *ref_mv, &mv_sse);
+ variance += blk_var;
+ }
+ }
+
+  // Restore input state.
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mb_mode_info;
+  // Free the motion vectors if they were allocated locally above.
+  if (do_motion_search) aom_free(mvs);
+  return (double)variance / (double)(mb_rows * mb_cols);
+}
+
+// TODO(sdeng): Add the SIMD implementation.
+static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source,
+ int source_stride,
+ const uint16_t *blurred,
+ int blurred_stride, uint16_t *dst,
+ int dst_stride, int w, int h,
+ double amount, int bit_depth) {
+ const int max_value = (1 << bit_depth) - 1;
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ const double val =
+ (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+ dst[j] = (uint16_t)clamp((int)(val + 0.5), 0, max_value);
+ }
+ source += source_stride;
+ blurred += blurred_stride;
+ dst += dst_stride;
+ }
+}
+
+static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride,
+ const uint8_t *blurred, int blurred_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ double amount) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ const double val =
+ (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+ dst[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255);
+ }
+ source += source_stride;
+ blurred += blurred_stride;
+ dst += dst_stride;
+ }
+}
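+// Worked example of the unsharp mask above (illustrative values): with
+// source = 100, blurred = 90 and amount = 0.5, the output is
+//   100 + 0.5 * (100 - 90) = 105,
+// i.e. the high-frequency detail (source - blurred) is amplified on top of
+// the source and the result is clamped to the valid pixel range.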
+
+static AOM_INLINE void unsharp(const AV1_COMP *const cpi,
+ const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *blurred,
+ const YV12_BUFFER_CONFIG *dst, double amount) {
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ if (cpi->common.seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(dst->flags & YV12_FLAG_HIGHBITDEPTH);
+ highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
+ CONVERT_TO_SHORTPTR(blurred->y_buffer),
+ blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer),
+ dst->y_stride, source->y_width, source->y_height,
+ amount, bit_depth);
+ } else {
+ unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer,
+ blurred->y_stride, dst->y_buffer, dst->y_stride,
+ source->y_width, source->y_height, amount);
+ }
+}
+
+// 8-tap Gaussian convolution filter with sigma = 1.0. The taps sum to 128
+// (matching the convolution's 1 << FILTER_BITS normalization), and all
+// coefficients must be even.
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52,
+ 30, 8, 0, 0 };
+static AOM_INLINE void gaussian_blur(const int bit_depth,
+ const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dst) {
+ const int block_size = BLOCK_128X128;
+ const int block_w = mi_size_wide[block_size] * 4;
+ const int block_h = mi_size_high[block_size] * 4;
+ const int num_cols = (source->y_width + block_w - 1) / block_w;
+ const int num_rows = (source->y_height + block_h - 1) / block_h;
+ int row, col;
+
+ ConvolveParams conv_params = get_conv_params(0, 0, bit_depth);
+ InterpFilterParams filter = { .filter_ptr = gauss_filter,
+ .taps = 8,
+ .interp_filter = EIGHTTAP_REGULAR };
+
+ for (row = 0; row < num_rows; ++row) {
+ for (col = 0; col < num_cols; ++col) {
+ const int row_offset_y = row * block_h;
+ const int col_offset_y = col * block_w;
+
+ uint8_t *src_buf =
+ source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+ uint8_t *dst_buf =
+ dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y;
+
+ if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_convolve_2d_sr(
+ CONVERT_TO_SHORTPTR(src_buf), source->y_stride,
+ CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h,
+ &filter, &filter, 0, 0, &conv_params, bit_depth);
+ } else {
+ av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride,
+ block_w, block_h, &filter, &filter, 0, 0,
+ &conv_params);
+ }
+ }
+ }
+}
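+// Note: the blur is separable; av1_convolve_2d_sr() (and its high-bitdepth
+// variant) applies the same 1-D Gaussian kernel horizontally and vertically
+// via the two &filter arguments above.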
+
+static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi,
+ double source_variance,
+ YV12_BUFFER_CONFIG *const source,
+ YV12_BUFFER_CONFIG *const sharpened) {
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const bool cal_vmaf_neg =
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+ double new_vmaf;
+
+ aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, sharpened, bit_depth,
+ cal_vmaf_neg, &new_vmaf);
+
+ const double sharpened_var = frame_average_variance(cpi, sharpened);
+ return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf);
+}
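+// Illustrative reading of cal_approx_vmaf() (assumed numbers): if sharpening
+// raises the frame variance from 100 to 125 while lifting VMAF above
+// kBaselineVmaf by 2.0, the returned score is 100 / 125 * 2.0 = 1.6, so VMAF
+// gains are discounted when they come with a large variance increase.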
+
+static double find_best_frame_unsharp_amount_loop(
+ const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source,
+ YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened,
+ double best_vmaf, const double baseline_variance,
+ const double unsharp_amount_start, const double step_size,
+ const int max_loop_count, const double max_amount) {
+ const double min_amount = 0.0;
+ int loop_count = 0;
+ double approx_vmaf = best_vmaf;
+ double unsharp_amount = unsharp_amount_start;
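+  // Hill climb: keep stepping the unsharp amount while the approximate VMAF
+  // keeps improving, then step back once past the peak.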
+ do {
+ best_vmaf = approx_vmaf;
+ unsharp_amount += step_size;
+ if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
+ unsharp(cpi, source, blurred, sharpened, unsharp_amount);
+ approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened);
+
+ loop_count++;
+ } while (approx_vmaf > best_vmaf && loop_count < max_loop_count);
+ unsharp_amount =
+ approx_vmaf > best_vmaf ? unsharp_amount : unsharp_amount - step_size;
+ return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount));
+}
+
+static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source,
+ YV12_BUFFER_CONFIG *const blurred,
+ const double unsharp_amount_start,
+ const double step_size,
+ const int max_loop_count,
+ const double max_filter_amount) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int width = source->y_width;
+ const int height = source->y_height;
+ YV12_BUFFER_CONFIG sharpened;
+ memset(&sharpened, 0, sizeof(sharpened));
+ aom_alloc_frame_buffer(
+ &sharpened, width, height, source->subsampling_x, source->subsampling_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ const double baseline_variance = frame_average_variance(cpi, source);
+ double unsharp_amount;
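+  // If there is no usable previous amount, search upward from 0; otherwise
+  // probe the two amounts around the previous one and hill-climb in the
+  // direction that scores better.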
+ if (unsharp_amount_start <= step_size) {
+ unsharp_amount = find_best_frame_unsharp_amount_loop(
+ cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0,
+ step_size, max_loop_count, max_filter_amount);
+ } else {
+ double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start;
+ double v0, v1;
+ unsharp(cpi, source, blurred, &sharpened, a0);
+ v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+ unsharp(cpi, source, blurred, &sharpened, a1);
+ v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+ if (fabs(v0 - v1) < 0.01) {
+ unsharp_amount = a0;
+ } else if (v0 > v1) {
+ unsharp_amount = find_best_frame_unsharp_amount_loop(
+ cpi, source, blurred, &sharpened, v0, baseline_variance, a0,
+ -step_size, max_loop_count, max_filter_amount);
+ } else {
+ unsharp_amount = find_best_frame_unsharp_amount_loop(
+ cpi, source, blurred, &sharpened, v1, baseline_variance, a1,
+ step_size, max_loop_count, max_filter_amount);
+ }
+ }
+
+ aom_free_frame_buffer(&sharpened);
+ return unsharp_amount;
+}
+
+void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int width = source->y_width;
+ const int height = source->y_height;
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double best_frame_unsharp_amount =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+ if (best_frame_unsharp_amount <= 0.0) return;
+
+ YV12_BUFFER_CONFIG blurred;
+ memset(&blurred, 0, sizeof(blurred));
+ aom_alloc_frame_buffer(
+ &blurred, width, height, source->subsampling_x, source->subsampling_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ gaussian_blur(bit_depth, source, &blurred);
+ unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
+ aom_free_frame_buffer(&blurred);
+}
+
+void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int width = source->y_width;
+ const int height = source->y_height;
+
+ YV12_BUFFER_CONFIG source_extended, blurred;
+ memset(&source_extended, 0, sizeof(source_extended));
+ memset(&blurred, 0, sizeof(blurred));
+ aom_alloc_frame_buffer(
+ &source_extended, width, height, source->subsampling_x,
+ source->subsampling_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(
+ &blurred, width, height, source->subsampling_x, source->subsampling_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ av1_copy_and_extend_frame(source, &source_extended);
+ gaussian_blur(bit_depth, &source_extended, &blurred);
+ aom_free_frame_buffer(&source_extended);
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double last_frame_unsharp_amount =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+ const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
+ cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01);
+
+ cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+ best_frame_unsharp_amount;
+
+ unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
+ aom_free_frame_buffer(&blurred);
+}
+
+void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int width = source->y_width;
+ const int height = source->y_height;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = source->subsampling_x;
+ const int ss_y = source->subsampling_y;
+
+ YV12_BUFFER_CONFIG source_extended, blurred;
+ memset(&blurred, 0, sizeof(blurred));
+ memset(&source_extended, 0, sizeof(source_extended));
+ aom_alloc_frame_buffer(
+ &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ av1_copy_and_extend_frame(source, &source_extended);
+ gaussian_blur(bit_depth, &source_extended, &blurred);
+ aom_free_frame_buffer(&source_extended);
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double last_frame_unsharp_amount =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+ const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
+ cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01);
+
+ cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+ best_frame_unsharp_amount;
+
+ const int block_size = BLOCK_64X64;
+ const int block_w = mi_size_wide[block_size] * 4;
+ const int block_h = mi_size_high[block_size] * 4;
+ const int num_cols = (source->y_width + block_w - 1) / block_w;
+ const int num_rows = (source->y_height + block_h - 1) / block_h;
+ double *best_unsharp_amounts =
+ aom_calloc(num_cols * num_rows, sizeof(*best_unsharp_amounts));
+ if (!best_unsharp_amounts) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating vmaf data");
+ }
+
+ YV12_BUFFER_CONFIG source_block, blurred_block;
+ memset(&source_block, 0, sizeof(source_block));
+ memset(&blurred_block, 0, sizeof(blurred_block));
+ aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int row_offset_y = row * block_h;
+ const int col_offset_y = col * block_w;
+ const int block_width = AOMMIN(width - col_offset_y, block_w);
+ const int block_height = AOMMIN(height - row_offset_y, block_h);
+ const int index = col + row * num_cols;
+
+ if (cm->seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
+ uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
+ row_offset_y * source->y_stride +
+ col_offset_y;
+ uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
+ row_offset_y * blurred.y_stride +
+ col_offset_y;
+ uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer);
+ uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer);
+
+ // Copy block from source frame.
+ for (int i = 0; i < block_h; ++i) {
+ for (int j = 0; j < block_w; ++j) {
+ if (i >= block_height || j >= block_width) {
+ src_dst[j] = 0;
+ blurred_dst[j] = 0;
+ } else {
+ src_dst[j] = frame_src_buf[j];
+ blurred_dst[j] = frame_blurred_buf[j];
+ }
+ }
+ frame_src_buf += source->y_stride;
+ frame_blurred_buf += blurred.y_stride;
+ src_dst += source_block.y_stride;
+ blurred_dst += blurred_block.y_stride;
+ }
+ } else {
+ uint8_t *frame_src_buf =
+ source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+ uint8_t *frame_blurred_buf =
+ blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+ uint8_t *blurred_dst = blurred_block.y_buffer;
+ uint8_t *src_dst = source_block.y_buffer;
+
+ // Copy block from source frame.
+ for (int i = 0; i < block_h; ++i) {
+ for (int j = 0; j < block_w; ++j) {
+ if (i >= block_height || j >= block_width) {
+ src_dst[j] = 0;
+ blurred_dst[j] = 0;
+ } else {
+ src_dst[j] = frame_src_buf[j];
+ blurred_dst[j] = frame_blurred_buf[j];
+ }
+ }
+ frame_src_buf += source->y_stride;
+ frame_blurred_buf += blurred.y_stride;
+ src_dst += source_block.y_stride;
+ blurred_dst += blurred_block.y_stride;
+ }
+ }
+
+ best_unsharp_amounts[index] = find_best_frame_unsharp_amount(
+ cpi, &source_block, &blurred_block, best_frame_unsharp_amount, 0.1, 3,
+ 1.5);
+ }
+ }
+
+  // Apply the best per-block unsharp amounts to the source frame.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int row_offset_y = row * block_h;
+ const int col_offset_y = col * block_w;
+ const int block_width = AOMMIN(source->y_width - col_offset_y, block_w);
+ const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
+ const int index = col + row * num_cols;
+
+ if (cm->seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
+ uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
+ row_offset_y * source->y_stride + col_offset_y;
+ uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
+ row_offset_y * blurred.y_stride + col_offset_y;
+ highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf,
+ blurred.y_stride, src_buf, source->y_stride,
+ block_width, block_height,
+ best_unsharp_amounts[index], bit_depth);
+ } else {
+ uint8_t *src_buf =
+ source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+ uint8_t *blurred_buf =
+ blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+ unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride,
+ src_buf, source->y_stride, block_width, block_height,
+ best_unsharp_amounts[index]);
+ }
+ }
+ }
+
+ aom_free_frame_buffer(&source_block);
+ aom_free_frame_buffer(&blurred_block);
+ aom_free_frame_buffer(&blurred);
+ aom_free(best_unsharp_amounts);
+}
+
+void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const int y_width = cpi->source->y_width;
+ const int y_height = cpi->source->y_height;
+ const int resized_block_size = BLOCK_32X32;
+ const int resize_factor = 2;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = cpi->source->subsampling_x;
+ const int ss_y = cpi->source->subsampling_y;
+
+ YV12_BUFFER_CONFIG resized_source;
+ memset(&resized_source, 0, sizeof(resized_source));
+ aom_alloc_frame_buffer(
+ &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x,
+ ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ if (!av1_resize_and_extend_frame_nonnormative(
+ cpi->source, &resized_source, bit_depth, av1_num_planes(cm))) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers during resize");
+ }
+
+ const int resized_y_width = resized_source.y_width;
+ const int resized_y_height = resized_source.y_height;
+ const int resized_block_w = mi_size_wide[resized_block_size] * 4;
+ const int resized_block_h = mi_size_high[resized_block_size] * 4;
+ const int num_cols =
+ (resized_y_width + resized_block_w - 1) / resized_block_w;
+ const int num_rows =
+ (resized_y_height + resized_block_h - 1) / resized_block_h;
+
+ YV12_BUFFER_CONFIG blurred;
+ memset(&blurred, 0, sizeof(blurred));
+ aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x,
+ ss_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ gaussian_blur(bit_depth, &resized_source, &blurred);
+
+ YV12_BUFFER_CONFIG recon;
+ memset(&recon, 0, sizeof(recon));
+ aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_yv12_copy_frame(&resized_source, &recon, 1);
+
+ VmafContext *vmaf_context;
+ const bool cal_vmaf_neg =
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+ aom_init_vmaf_context(&vmaf_context, cpi->vmaf_info.vmaf_model, cal_vmaf_neg);
+ unsigned int *sses = aom_calloc(num_rows * num_cols, sizeof(*sses));
+ if (!sses) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating vmaf data");
+ }
+
+  // Loop through each 'resized_block_size' block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ const int row_offset_y = row * resized_block_h;
+ const int col_offset_y = col * resized_block_w;
+
+ uint8_t *const orig_buf = resized_source.y_buffer +
+ row_offset_y * resized_source.y_stride +
+ col_offset_y;
+ uint8_t *const blurred_buf =
+ blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+
+ cpi->ppi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
+ blurred_buf, blurred.y_stride,
+ &sses[index]);
+
+ uint8_t *const recon_buf =
+ recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y;
+      // Set the recon buf to the blurred block; an unsharp amount of 0.0
+      // reduces the unsharp filter to a plain copy.
+ if (cpi->common.seq_params->use_highbitdepth) {
+ highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+ CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+ CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride,
+ resized_block_w, resized_block_h, 0.0, bit_depth);
+ } else {
+ unsharp_rect(blurred_buf, blurred.y_stride, blurred_buf,
+ blurred.y_stride, recon_buf, recon.y_stride,
+ resized_block_w, resized_block_h, 0.0);
+ }
+
+ aom_read_vmaf_image(vmaf_context, &resized_source, &recon, bit_depth,
+ index);
+
+      // Restore the recon buf by copying the original source block back.
+ if (cpi->common.seq_params->use_highbitdepth) {
+ highbd_unsharp_rect(
+ CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+ CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+ CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w,
+ resized_block_h, 0.0, bit_depth);
+ } else {
+ unsharp_rect(orig_buf, resized_source.y_stride, orig_buf,
+ resized_source.y_stride, recon_buf, recon.y_stride,
+ resized_block_w, resized_block_h, 0.0);
+ }
+ }
+ }
+ aom_flush_vmaf_context(vmaf_context);
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ const double vmaf = aom_calc_vmaf_at_index(
+ vmaf_context, cpi->vmaf_info.vmaf_model, index);
+ const double dvmaf = kBaselineVmaf - vmaf;
+
+ const double mse =
+ (double)sses[index] / (double)(resized_y_width * resized_y_height);
+ double weight;
+ const double eps = 0.01 / (num_rows * num_cols);
+ if (dvmaf < eps || mse < eps) {
+ weight = 1.0;
+ } else {
+ weight = mse / dvmaf;
+ }
+
+      // Normalize the weight with a data-fitted model.
+ weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8;
+ cpi->vmaf_info.rdmult_scaling_factors[index] = weight;
+ }
+ }
+
+ aom_free_frame_buffer(&resized_source);
+ aom_free_frame_buffer(&blurred);
+ aom_close_vmaf_context(vmaf_context);
+ aom_free(sses);
+}
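+// Illustrative numbers for the weight model above (assumed, not from the
+// source): with mse / dvmaf = 20.0, the normalized weight is
+//   6.0 * (1.0 - exp(-0.05 * 20.0)) + 0.8 ~= 6.0 * 0.632 + 0.8 ~= 4.59,
+// so blocks whose blur MSE is large relative to their VMAF drop get a larger
+// rdmult scaling factor.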
+
+void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int *const rdmult) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const int bsize_base = BLOCK_64X64;
+ const int num_mi_w = mi_size_wide[bsize_base];
+ const int num_mi_h = mi_size_high[bsize_base];
+ const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+ const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+ int row, col;
+ double num_of_mi = 0.0;
+ double geom_mean_of_scale = 0.0;
+
+  for (row = mi_row / num_mi_h;
+       row < num_rows && row < mi_row / num_mi_h + num_brows; ++row) {
+    for (col = mi_col / num_mi_w;
+         col < num_cols && col < mi_col / num_mi_w + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ geom_mean_of_scale += log(cpi->vmaf_info.rdmult_scaling_factors[index]);
+ num_of_mi += 1.0;
+ }
+ }
+ geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+ *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+ *rdmult = AOMMAX(*rdmult, 0);
+ av1_set_error_per_bit(&x->errorperbit, *rdmult);
+}
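+// Example (assumed factors): a block covering scaling factors 1.2 and 0.8
+// gets exp((log(1.2) + log(0.8)) / 2) = sqrt(0.96) ~= 0.98, and rdmult is
+// scaled by that geometric mean before rounding.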
+
+// TODO(sdeng): replace them with the SIMD versions.
+static AOM_INLINE double highbd_image_sad_c(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h) {
+ double accum = 0.0;
+ int i, j;
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ double img1px = src[i * src_stride + j];
+ double img2px = ref[i * ref_stride + j];
+
+ accum += fabs(img1px - img2px);
+ }
+ }
+
+ return accum / (double)(h * w);
+}
+
+static AOM_INLINE double image_sad_c(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int w,
+ int h) {
+ double accum = 0.0;
+ int i, j;
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ double img1px = src[i * src_stride + j];
+ double img2px = ref[i * ref_stride + j];
+
+ accum += fabs(img1px - img2px);
+ }
+ }
+
+ return accum / (double)(h * w);
+}
+
+static double calc_vmaf_motion_score(const AV1_COMP *const cpi,
+ const AV1_COMMON *const cm,
+ const YV12_BUFFER_CONFIG *const cur,
+ const YV12_BUFFER_CONFIG *const last,
+ const YV12_BUFFER_CONFIG *const next) {
+ const int y_width = cur->y_width;
+ const int y_height = cur->y_height;
+ YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = cur->subsampling_x;
+ const int ss_y = cur->subsampling_y;
+
+ memset(&blurred_cur, 0, sizeof(blurred_cur));
+ memset(&blurred_last, 0, sizeof(blurred_last));
+ memset(&blurred_next, 0, sizeof(blurred_next));
+
+ aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ gaussian_blur(bit_depth, cur, &blurred_cur);
+ gaussian_blur(bit_depth, last, &blurred_last);
+ if (next) gaussian_blur(bit_depth, next, &blurred_next);
+
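+  // motion2 defaults to a large value so that AOMMIN() below returns motion1
+  // when there is no next frame.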
+ double motion1, motion2 = 65536.0;
+ if (cm->seq_params->use_highbitdepth) {
+ assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH);
+ const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
+ motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
+ blurred_cur.y_stride,
+ CONVERT_TO_SHORTPTR(blurred_last.y_buffer),
+ blurred_last.y_stride, y_width, y_height) *
+ scale_factor;
+ if (next) {
+ assert(blurred_next.flags & YV12_FLAG_HIGHBITDEPTH);
+ motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
+ blurred_cur.y_stride,
+ CONVERT_TO_SHORTPTR(blurred_next.y_buffer),
+ blurred_next.y_stride, y_width, y_height) *
+ scale_factor;
+ }
+ } else {
+ motion1 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride,
+ blurred_last.y_buffer, blurred_last.y_stride, y_width,
+ y_height);
+ if (next) {
+ motion2 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride,
+ blurred_next.y_buffer, blurred_next.y_stride,
+ y_width, y_height);
+ }
+ }
+
+ aom_free_frame_buffer(&blurred_cur);
+ aom_free_frame_buffer(&blurred_last);
+ aom_free_frame_buffer(&blurred_next);
+
+ return AOMMIN(motion1, motion2);
+}
+
+static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG **last,
+ YV12_BUFFER_CONFIG **next) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const int src_index =
+ cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[cpi->gf_frame_index];
+ struct lookahead_entry *last_entry = av1_lookahead_peek(
+ cpi->ppi->lookahead, src_index - 1, cpi->compressor_stage);
+ struct lookahead_entry *next_entry = av1_lookahead_peek(
+ cpi->ppi->lookahead, src_index + 1, cpi->compressor_stage);
+ *next = &next_entry->img;
+ *last = cm->show_frame ? cpi->last_source : &last_entry->img;
+}
+
+// Calculates the new qindex from the VMAF motion score, based on the
+// observation that for the same source and distorted frames, the VMAF score
+// increases as the motion score increases.
+int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) {
+ return current_qindex;
+ }
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double last_frame_ysse =
+ get_layer_value(cpi->vmaf_info.last_frame_ysse, layer_depth);
+ const double last_frame_vmaf =
+ get_layer_value(cpi->vmaf_info.last_frame_vmaf, layer_depth);
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const double approx_sse = last_frame_ysse / (double)((1 << (bit_depth - 8)) *
+ (1 << (bit_depth - 8)));
+ const double approx_dvmaf = kBaselineVmaf - last_frame_vmaf;
+ const double sse_threshold =
+ 0.01 * cpi->source->y_width * cpi->source->y_height;
+ const double vmaf_threshold = 0.01;
+ if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) {
+ return current_qindex;
+ }
+ YV12_BUFFER_CONFIG *cur_buf = cpi->source;
+ if (cm->show_frame == 0) {
+ const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+ struct lookahead_entry *cur_entry = av1_lookahead_peek(
+ cpi->ppi->lookahead, src_index, cpi->compressor_stage);
+ cur_buf = &cur_entry->img;
+ }
+ assert(cur_buf);
+
+ YV12_BUFFER_CONFIG *next_buf, *last_buf;
+ get_neighbor_frames(cpi, &last_buf, &next_buf);
+ assert(last_buf);
+
+ const double motion =
+ calc_vmaf_motion_score(cpi, cm, cur_buf, last_buf, next_buf);
+
+  // Get dVMAF through a data-fitted model.
+ const double dvmaf = 26.11 * (1.0 - exp(-0.06 * motion));
+ const double dsse = dvmaf * approx_sse / approx_dvmaf;
+
+ // Clamping beta to address VQ issue (aomedia:3170).
+ const double beta = AOMMAX(approx_sse / (dsse + approx_sse), 0.5);
+ const int offset =
+ av1_get_deltaq_offset(cm->seq_params->bit_depth, current_qindex, beta);
+ int qindex = current_qindex + offset;
+
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+
+ return qindex;
+}
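+// Illustrative walk-through of av1_get_vmaf_base_qindex() (assumed values):
+// with motion = 10, dvmaf = 26.11 * (1 - exp(-0.6)) ~= 11.78. If approx_sse
+// is 1000 and approx_dvmaf is 5, then dsse = 11.78 * 1000 / 5 ~= 2356 and
+// beta = max(1000 / (2356 + 1000), 0.5) = 0.5; a beta below 1 maps to a
+// positive deltaq offset, i.e. a higher qindex for high-motion frames.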
+
+static AOM_INLINE double cal_approx_score(
+ AV1_COMP *const cpi, double src_variance, double new_variance,
+ double src_score, YV12_BUFFER_CONFIG *const src,
+ YV12_BUFFER_CONFIG *const recon_sharpened) {
+ double score;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ const bool cal_vmaf_neg =
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+ aom_calc_vmaf(cpi->vmaf_info.vmaf_model, src, recon_sharpened, bit_depth,
+ cal_vmaf_neg, &score);
+ return src_variance / new_variance * (score - src_score);
+}
+
+static double find_best_frame_unsharp_amount_loop_neg(
+ AV1_COMP *const cpi, double src_variance, double base_score,
+ YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon,
+ YV12_BUFFER_CONFIG *const ref, YV12_BUFFER_CONFIG *const src_blurred,
+ YV12_BUFFER_CONFIG *const recon_blurred,
+ YV12_BUFFER_CONFIG *const src_sharpened,
+ YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs,
+ double best_score, const double unsharp_amount_start,
+ const double step_size, const int max_loop_count, const double max_amount) {
+ const double min_amount = 0.0;
+ int loop_count = 0;
+ double approx_score = best_score;
+ double unsharp_amount = unsharp_amount_start;
+
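+  // Same hill climb as find_best_frame_unsharp_amount_loop(), but scored by
+  // the approximate VMAF of the sharpened recon instead.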
+ do {
+ best_score = approx_score;
+ unsharp_amount += step_size;
+ if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
+ unsharp(cpi, recon, recon_blurred, recon_sharpened, unsharp_amount);
+ unsharp(cpi, src, src_blurred, src_sharpened, unsharp_amount);
+ const double new_variance =
+ residual_frame_average_variance(cpi, src_sharpened, ref, mvs);
+ approx_score = cal_approx_score(cpi, src_variance, new_variance, base_score,
+ src, recon_sharpened);
+
+ loop_count++;
+ } while (approx_score > best_score && loop_count < max_loop_count);
+ unsharp_amount =
+ approx_score > best_score ? unsharp_amount : unsharp_amount - step_size;
+
+ return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount));
+}
+
+static double find_best_frame_unsharp_amount_neg(
+ AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const src,
+ YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref,
+ double base_score, const double unsharp_amount_start,
+ const double step_size, const int max_loop_count,
+ const double max_filter_amount) {
+ FULLPEL_MV *mvs = NULL;
+ const double src_variance =
+ residual_frame_average_variance(cpi, src, ref, mvs);
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int width = recon->y_width;
+ const int height = recon->y_height;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = recon->subsampling_x;
+ const int ss_y = recon->subsampling_y;
+
+ YV12_BUFFER_CONFIG src_blurred, recon_blurred, src_sharpened, recon_sharpened;
+ memset(&recon_sharpened, 0, sizeof(recon_sharpened));
+ memset(&src_sharpened, 0, sizeof(src_sharpened));
+ memset(&recon_blurred, 0, sizeof(recon_blurred));
+ memset(&src_blurred, 0, sizeof(src_blurred));
+ aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(
+ &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+
+ gaussian_blur(bit_depth, recon, &recon_blurred);
+ gaussian_blur(bit_depth, src, &src_blurred);
+
+ unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_start);
+ unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_start);
+ const double variance_start =
+ residual_frame_average_variance(cpi, &src_sharpened, ref, mvs);
+ const double score_start = cal_approx_score(
+ cpi, src_variance, variance_start, base_score, src, &recon_sharpened);
+
+ const double unsharp_amount_next = unsharp_amount_start + step_size;
+ unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_next);
+ unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_next);
+ const double variance_next =
+ residual_frame_average_variance(cpi, &src_sharpened, ref, mvs);
+ const double score_next = cal_approx_score(cpi, src_variance, variance_next,
+ base_score, src, &recon_sharpened);
+
+ double unsharp_amount;
+ if (score_next > score_start) {
+ unsharp_amount = find_best_frame_unsharp_amount_loop_neg(
+ cpi, src_variance, base_score, src, recon, ref, &src_blurred,
+ &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_next,
+ unsharp_amount_next, step_size, max_loop_count, max_filter_amount);
+ } else {
+ unsharp_amount = find_best_frame_unsharp_amount_loop_neg(
+ cpi, src_variance, base_score, src, recon, ref, &src_blurred,
+ &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_start,
+ unsharp_amount_start, -step_size, max_loop_count, max_filter_amount);
+ }
+
+ aom_free_frame_buffer(&recon_sharpened);
+ aom_free_frame_buffer(&src_sharpened);
+ aom_free_frame_buffer(&recon_blurred);
+ aom_free_frame_buffer(&src_blurred);
+ aom_free(mvs);
+ return unsharp_amount;
+}
+
+void av1_update_vmaf_curve(AV1_COMP *cpi) {
+ YV12_BUFFER_CONFIG *source = cpi->source;
+ YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ double base_score;
+ const bool cal_vmaf_neg =
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+ aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, recon, bit_depth,
+ cal_vmaf_neg, &base_score);
+ cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score;
+ if (cpi->common.seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(recon->flags & YV12_FLAG_HIGHBITDEPTH);
+ cpi->vmaf_info.last_frame_ysse[layer_depth] =
+ (double)aom_highbd_get_y_sse(source, recon);
+ } else {
+ cpi->vmaf_info.last_frame_ysse[layer_depth] =
+ (double)aom_get_y_sse(source, recon);
+ }
+
+ if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ YV12_BUFFER_CONFIG *last, *next;
+ get_neighbor_frames(cpi, &last, &next);
+ double best_unsharp_amount_start =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+ const int max_loop_count = 5;
+ cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+ find_best_frame_unsharp_amount_neg(cpi, source, recon, last, base_score,
+ best_unsharp_amount_start, 0.025,
+ max_loop_count, 1.01);
+ }
+}
diff --git a/third_party/aom/av1/encoder/tune_vmaf.h b/third_party/aom/av1/encoder/tune_vmaf.h
new file mode 100644
index 0000000000..a04a29e6fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_vmaf.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_
+#define AOM_AV1_ENCODER_TUNE_VMAF_H_
+
+#include "aom_dsp/vmaf.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/block.h"
+
+typedef struct {
+ // Stores the scaling factors for rdmult when tuning for VMAF.
+  // rdmult_scaling_factors[row * num_cols + col] stores the scaling factor
+  // for the 64x64 block at (row, col).
+ double *rdmult_scaling_factors;
+
+ // Stores the luma sse of the last frame.
+ double last_frame_ysse[MAX_ARF_LAYERS];
+
+ // Stores the VMAF of the last frame.
+ double last_frame_vmaf[MAX_ARF_LAYERS];
+
+ // Stores the filter strength of the last frame.
+ double last_frame_unsharp_amount[MAX_ARF_LAYERS];
+
+  // Stores the original qindex before scaling.
+ int original_qindex;
+
+  // VMAF model used in VMAF calculations.
+ VmafModel *vmaf_model;
+} TuneVMAFInfo;
+
+struct AV1_COMP;
+
+void av1_vmaf_blk_preprocessing(struct AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG *source);
+
+void av1_vmaf_frame_preprocessing(struct AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG *source);
+
+void av1_vmaf_neg_preprocessing(struct AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG *source);
+
+void av1_set_mb_vmaf_rdmult_scaling(struct AV1_COMP *cpi);
+
+void av1_set_vmaf_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int *rdmult);
+
+int av1_get_vmaf_base_qindex(const struct AV1_COMP *cpi, int current_qindex);
+
+void av1_update_vmaf_curve(struct AV1_COMP *cpi);
+
+#endif // AOM_AV1_ENCODER_TUNE_VMAF_H_
diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h
new file mode 100644
index 0000000000..aab5e1398d
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h
@@ -0,0 +1,3422 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * Contains the details of the ML models used for pruning transform size. This
+ * file is only included by av1/encoder/tx_search.c.
+ */
+#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+/***************************CONFIG_NN_V2 (New)********************************/
+#if CONFIG_NN_V2
+// Tx type model for 4x4 block.
+static float av1_tx_type_nn_4x4_hor_layer0_weights[32] = {
+ -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+ 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f,
+ -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+ 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f,
+ 1.35792f, 0.27733f, 0.88660f, -0.68304f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer0_bias[8] = {
+ 1.38742f, 0.59540f, -1.37622f, 1.92114f,
+ 0.00000f, -0.38998f, -0.32726f, -0.15650f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer1_weights[32] = {
+ 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f,
+ -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f,
+ -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f,
+ 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+ -0.26782f, -0.65416f, -0.10648f, 0.05568f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer1_bias[4] = {
+ 4.07177f,
+ 3.26961f,
+ 0.58083f,
+ 1.21199f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x4_hor_layer0_weights, // weights
+ av1_tx_type_nn_4x4_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x4_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_4x4_hor_layer1_weights,
+ av1_tx_type_nn_4x4_hor_layer1_bias,
+ NONE,
+ av1_tx_type_nn_4x4_hor_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x4_hor_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
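+// Shape of the model above, for reference: layer 0 computes
+// h = RELU(W0 * x + b0) with 4 inputs and 8 hidden units; layer 1 computes
+// the 4 logits as W1 * h + b1; SOFTMAX_CROSS_ENTROPY indicates the logits
+// are meant to be normalized with a softmax by the caller (see
+// av1/encoder/ml.h).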
+
+static float av1_tx_type_nn_4x4_ver_layer0_weights[32] = {
+ -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f,
+ 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f,
+ 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f,
+ 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+ -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer0_bias[8] = {
+ -0.33685f, 0.22025f, 0.28140f, 0.56138f,
+ 0.93489f, -1.77048f, 1.34989f, -0.93747f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer1_weights[32] = {
+ -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f,
+ 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f,
+ -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+ -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f,
+ -0.86315f, -0.53336f, 0.30320f, -1.32331f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer1_bias[4] = {
+ -1.31519f,
+ -3.26321f,
+ 1.71794f,
+ -1.90778f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x4_ver_layer0_weights, // weights
+ av1_tx_type_nn_4x4_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x4_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_4x4_ver_layer1_weights,
+ av1_tx_type_nn_4x4_ver_layer1_bias,
+ NONE,
+ av1_tx_type_nn_4x4_ver_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x4_ver_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 4x8 block.
+static float av1_tx_type_nn_4x8_hor_layer0_weights[32] = {
+ 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f,
+ 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f,
+ -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f,
+ -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+ -1.35896f, -1.17121f, 1.68866f, 0.10357f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer0_bias[8] = {
+ 2.93391f, 0.66831f, -0.21419f, 0.00000f,
+ -0.72878f, 0.15127f, -1.46755f, 0.16658f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer1_weights[32] = {
+ -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f,
+ -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f,
+ 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f,
+ 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+ -0.50191f, 0.18219f, 1.83664f, -0.75276f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer1_bias[4] = {
+ -1.17455f,
+ -2.26089f,
+ -1.79863f,
+ -2.26333f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x8_hor_layer0_weights, // weights
+ av1_tx_type_nn_4x8_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x8_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_4x8_hor_layer1_weights,
+ av1_tx_type_nn_4x8_hor_layer1_bias,
+ NONE,
+ av1_tx_type_nn_4x8_hor_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x8_hor_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_weights[128] = {
+ -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f,
+ -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+ -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f,
+ 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f,
+ 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f,
+ 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f,
+ -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f,
+ -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f,
+ 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f,
+ -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+ -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f,
+ -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f,
+ 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+ 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+ -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f,
+ -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f,
+ 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f,
+ -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f,
+ -0.21958f, 0.05970f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_bias[16] = {
+ 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f,
+ 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f,
+ 0.08288f, 0.18195f, -0.79890f, 0.10047f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer1_weights[64] = {
+ -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f,
+ -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f,
+ -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f,
+ -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f,
+ 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f,
+ 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f,
+ -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+ -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+ -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f,
+ -1.01848f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer1_bias[4] = {
+ -1.45955f,
+ -2.08949f,
+ -1.24813f,
+ -1.55368f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_4x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_4x8_ver_layer0_weights, // weights
+ av1_tx_type_nn_4x8_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x8_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_4x8_ver_layer1_weights,
+ av1_tx_type_nn_4x8_ver_layer1_bias,
+ NONE,
+ av1_tx_type_nn_4x8_ver_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x8_ver_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+/******************************************************************************/
+
+// Tx type model for 8x4 block.
+static float av1_tx_type_nn_8x4_hor_layer0_weights[128] = {
+ -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f,
+ 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f,
+ -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f,
+ -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f,
+ -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f,
+ 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f,
+ 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f,
+ -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+ -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+ 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f,
+ 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f,
+ -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f,
+ -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f,
+ 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f,
+ 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+ 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f,
+ -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+ -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+ -1.85523f, 0.92532f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer0_bias[16] = {
+ 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f,
+ -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f,
+ -0.28958f, -0.32869f, -0.01704f, 0.68171f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer1_weights[64] = {
+ -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f,
+ -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f,
+ 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f,
+ -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+ 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f,
+ -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f,
+ -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f,
+ 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f,
+ 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f,
+ -1.10654f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer1_bias[4] = {
+ -0.92861f,
+ -1.45151f,
+ -1.33588f,
+ -4.33853f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x4_hor_layer0_weights, // weights
+ av1_tx_type_nn_8x4_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x4_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_8x4_hor_layer1_weights,
+ av1_tx_type_nn_8x4_hor_layer1_bias,
+ NONE,
+ av1_tx_type_nn_8x4_hor_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x4_hor_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_weights[32] = {
+ -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+ -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f,
+ -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f,
+ 1.66212f, 1.70826f, 1.55182f, 0.12230f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_bias[8] = {
+ 0.10943f, 2.09789f, 2.16578f, 0.15766f,
+ -0.42461f, 0.00000f, 1.22090f, -1.28717f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer1_weights[32] = {
+ 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f,
+ 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f,
+ 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f,
+ -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f,
+ -1.15005f, -0.39311f, 1.51236f, -1.68973f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer1_bias[4] = {
+ 1.81013f,
+ 1.10517f,
+ 2.90059f,
+ 0.95391f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_8x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_8x4_ver_layer0_weights, // weights
+ av1_tx_type_nn_8x4_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x4_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_8x4_ver_layer1_weights,
+ av1_tx_type_nn_8x4_ver_layer1_bias,
+ NONE,
+ av1_tx_type_nn_8x4_ver_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x4_ver_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 8x8 block.
+static float av1_tx_type_nn_8x8_hor_layer0_weights[128] = {
+ -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f,
+ -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f,
+ 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f,
+ 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+ -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f,
+ -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f,
+ -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f,
+ 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+ 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f,
+ -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+ 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f,
+ -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f,
+ 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f,
+ 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f,
+ 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f,
+ 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f,
+ 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+ -0.99892f, 1.09823f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer0_bias[16] = {
+ -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f,
+ -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f,
+ -0.26319f, 2.65579f, -1.30137f, -0.01487f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer1_weights[64] = {
+ -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f,
+ -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f,
+ 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+ 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f,
+ 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+ -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f,
+ 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f,
+ 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f,
+ 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f,
+ 0.06161f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer1_bias[4] = {
+ 1.70385f,
+ 1.82373f,
+ 1.78496f,
+ 1.80826f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x8_hor_layer0_weights, // weights
+ av1_tx_type_nn_8x8_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x8_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_hor_layer1_weights, // weights
+ av1_tx_type_nn_8x8_hor_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_8x8_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_hor_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_weights[128] = {
+ -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+ 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f,
+ -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f,
+ -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f,
+ 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f,
+ 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f,
+ 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f,
+ -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+ -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f,
+ 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f,
+ 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f,
+ -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f,
+ 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f,
+ 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f,
+ -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f,
+ 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f,
+ -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f,
+ -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+ -1.29848f, 0.39308f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_bias[16] = {
+ -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f,
+ 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f,
+ 0.83015f, 0.06024f, 1.17180f, 0.65122f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer1_weights[64] = {
+ -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f,
+ 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+ 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f,
+ 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+ 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+ 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f,
+ 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f,
+ 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f,
+ -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f,
+ -0.41305f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer1_bias[4] = {
+ 2.14067f,
+ 2.76699f,
+ 2.04233f,
+ 1.34803f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x8_ver_layer0_weights, // weights
+ av1_tx_type_nn_8x8_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x8_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_ver_layer1_weights, // weights
+ av1_tx_type_nn_8x8_ver_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_8x8_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_ver_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 8x16 block.
+static float av1_tx_type_nn_8x16_hor_layer0_weights[128] = {
+ -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+ 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f,
+ -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f,
+ 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+ -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f,
+ 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f,
+ -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f,
+ 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f,
+ -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f,
+ -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f,
+ 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f,
+ 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f,
+ -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f,
+ 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f,
+ -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f,
+ 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f,
+ 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f,
+ -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+ -0.28136f, 0.42556f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer0_bias[16] = {
+ 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f,
+ -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+ 1.81560f, -1.02643f, -0.81690f, 0.08302f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer1_weights[64] = {
+ 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f,
+ -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f,
+ 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f,
+ -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f,
+ 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f,
+ 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f,
+ 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f,
+ 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f,
+ 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f,
+ -1.31243f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer1_bias[4] = {
+ 0.83359f,
+ 1.06875f,
+ 1.77645f,
+ 1.49570f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x16_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x16_hor_layer0_weights, // weights
+ av1_tx_type_nn_8x16_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x16_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_hor_layer1_weights, // weights
+ av1_tx_type_nn_8x16_hor_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_8x16_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_hor_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_weights[128] = {
+ 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f,
+ -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f,
+ -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f,
+ 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+ -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f,
+ 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f,
+ 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f,
+ 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f,
+ -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f,
+ -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f,
+ 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f,
+ 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+ -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+ -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f,
+ -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f,
+ -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+ -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+ 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+ -0.12236f, 0.16075f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_bias[16] = {
+ -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f,
+ -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+ 0.57598f, 0.99819f, 0.75175f, 0.17044f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer1_weights[64] = {
+ -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f,
+ 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f,
+ -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+ 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f,
+ -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+ -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f,
+ -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+ 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f,
+ 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f,
+ 2.20547f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer1_bias[4] = {
+ -0.44080f,
+ -1.67455f,
+ -1.46332f,
+ -6.13206f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x16_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x16_ver_layer0_weights, // weights
+ av1_tx_type_nn_8x16_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x16_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_ver_layer1_weights, // weights
+ av1_tx_type_nn_8x16_ver_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_8x16_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_ver_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 16x8 block.
+static float av1_tx_type_nn_16x8_hor_layer0_weights[128] = {
+ 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f,
+ -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f,
+ -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f,
+ 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f,
+ 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f,
+ 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f,
+ 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f,
+ -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f,
+ -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f,
+ -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f,
+ 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f,
+ -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f,
+ -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f,
+ -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f,
+ 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f,
+ -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+ -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+ 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f,
+ -0.36570f, -0.50757f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer0_bias[16] = {
+ -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+ 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f,
+ -0.12329f, 0.08986f, 1.08117f, -0.00220f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer1_weights[64] = {
+ 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+ 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f,
+ -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f,
+ -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f,
+ -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f,
+ -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f,
+ 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f,
+ 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f,
+ 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f,
+ -0.23347f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer1_bias[4] = {
+ 3.57175f,
+ 2.42612f,
+ 3.31259f,
+ 2.08287f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x8_hor_layer0_weights, // weights
+ av1_tx_type_nn_16x8_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_16x8_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_hor_layer1_weights, // weights
+ av1_tx_type_nn_16x8_hor_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_16x8_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_hor_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_weights[128] = {
+ 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f,
+ 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+ -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f,
+ 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f,
+ 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f,
+ -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f,
+ 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f,
+ -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f,
+ 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f,
+ 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f,
+ 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+ -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+ -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f,
+ -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f,
+ 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f,
+ 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+ -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f,
+ -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f,
+ -0.81945f, -0.41647f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_bias[16] = {
+ 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f,
+ 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f,
+ -0.04510f, 0.48000f, -0.09354f, -0.42422f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer1_weights[64] = {
+ 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f,
+ -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f,
+ 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f,
+ -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f,
+ -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f,
+ 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f,
+ 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f,
+ -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f,
+ 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f,
+ -0.00873f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer1_bias[4] = {
+ 3.34981f,
+ 3.74710f,
+ 1.38339f,
+ 0.45176f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x8_ver_layer0_weights, // weights
+ av1_tx_type_nn_16x8_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_16x8_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_ver_layer1_weights, // weights
+ av1_tx_type_nn_16x8_ver_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_16x8_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_ver_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 16x16 block.
+static float av1_tx_type_nn_16x16_layer0_weights[128] = {
+ 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f,
+ 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f,
+ -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f,
+ -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f,
+ 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f,
+ 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f,
+ 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f,
+ 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f,
+ -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f,
+ 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f,
+ 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f,
+ 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f,
+ -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f,
+ 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f,
+ 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f,
+ -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f,
+ -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f,
+ 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f,
+ 0.50355f, 0.08592f,
+};
+
+static float av1_tx_type_nn_16x16_layer0_bias[16] = {
+ -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f,
+ -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f,
+ -0.14062f, -0.42120f, 0.94573f, -0.09287f,
+};
+
+static float av1_tx_type_nn_16x16_layer1_weights[64] = {
+ -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f,
+ 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f,
+ 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f,
+ 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f,
+ 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f,
+ 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f,
+ -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f,
+ 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f,
+ -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f,
+ 1.08829f,
+};
+
+static float av1_tx_type_nn_16x16_layer1_bias[4] = {
+ 0.81986f,
+ 1.26865f,
+ 0.11118f,
+ 2.48404f,
+};
+
+static float av1_tx_type_nn_16x16_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x16_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x16 = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x16_layer0_weights, // weights
+ av1_tx_type_nn_16x16_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_16x16_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x16_layer1_weights, // weights
+ av1_tx_type_nn_16x16_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_16x16_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x16_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 4x16 block.
+static float av1_tx_type_nn_4x16_hor_layer0_weights[32] = {
+ 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f,
+ 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+ 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f,
+ 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f,
+ -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer0_bias[8] = {
+ -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+ 0.62806f, -0.20675f, 4.91940f, -0.56079f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer1_weights[32] = {
+ -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f,
+ -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f,
+ 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+ 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+ 1.28413f, -0.30326f, 2.45329f, -0.83335f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer1_bias[4] = {
+ 2.33198f,
+ 3.36245f,
+ 1.62603f,
+ 2.91056f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x16_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x16_hor_layer0_weights, // weights
+ av1_tx_type_nn_4x16_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x16_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_hor_layer1_weights, // weights
+ av1_tx_type_nn_4x16_hor_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_4x16_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_hor_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_weights[128] = {
+ 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f,
+ 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f,
+ -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f,
+ -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+ -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f,
+ -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f,
+ 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f,
+ 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f,
+ 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+ -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f,
+ -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f,
+ 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+ 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+ 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f,
+ 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f,
+ -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+ 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f,
+ 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f,
+ -0.27975f, -0.01149f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_bias[16] = {
+ -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f,
+ -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+ -0.32530f, 0.73483f, 0.08322f, -0.23890f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer1_weights[64] = {
+ 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f,
+ -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f,
+ 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f,
+ -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f,
+ 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f,
+ -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f,
+ 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f,
+ 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f,
+ -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f,
+ -0.56513f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer1_bias[4] = {
+ 4.60896f,
+ 4.53551f,
+ 4.53124f,
+ 4.27435f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_4x16_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_4x16_ver_layer0_weights, // weights
+ av1_tx_type_nn_4x16_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x16_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_ver_layer1_weights, // weights
+ av1_tx_type_nn_4x16_ver_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_4x16_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_ver_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 16x4 block.
+static float av1_tx_type_nn_16x4_hor_layer0_weights[128] = {
+ 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f,
+ 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+ -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f,
+ -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f,
+ -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f,
+ -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f,
+ 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f,
+ 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f,
+ 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f,
+ -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f,
+ 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f,
+ -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f,
+ 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f,
+ -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f,
+ -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+ -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f,
+ 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f,
+ 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f,
+ 0.19055f, -1.56413f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer0_bias[16] = {
+ -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f,
+ 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f,
+ 1.14048f, 0.33308f, -1.10886f, 0.41184f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer1_weights[64] = {
+ -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+ 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f,
+ -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f,
+ -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f,
+ 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f,
+ -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f,
+ -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f,
+ 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f,
+ 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f,
+ -0.43819f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer1_bias[4] = {
+ 2.32575f,
+ 2.75703f,
+ 1.12304f,
+ 2.15567f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x4_hor_layer0_weights, // weights
+ av1_tx_type_nn_16x4_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_16x4_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_hor_layer1_weights, // weights
+ av1_tx_type_nn_16x4_hor_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_16x4_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_hor_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_weights[32] = {
+ 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+ 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f,
+ -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f,
+ -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+ -0.17967f, -0.96622f, 0.42635f, -1.04784f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_bias[8] = {
+ -0.52088f, 0.52844f, -1.03655f, -0.30974f,
+ 2.59952f, -1.93604f, 0.00000f, 2.51787f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer1_weights[32] = {
+ 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f,
+ 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f,
+ 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f,
+ -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f,
+ 1.26814f, -1.93873f, -0.00768f, 1.58309f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer1_bias[4] = {
+ 2.34713f,
+ 1.68667f,
+ 1.25488f,
+ 1.69812f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_16x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_16x4_ver_layer0_weights, // weights
+ av1_tx_type_nn_16x4_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_16x4_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_ver_layer1_weights, // weights
+ av1_tx_type_nn_16x4_ver_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_16x4_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_ver_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Map tx_size to its corresponding neural net model for tx type prediction.
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_hor[] = {
+ &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_ver[] = {
+ &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
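Both maps are indexed in TX_SIZE enum order (the per-entry comments spell out that order); a NULL slot means no model was trained for that size, so a caller must check the pointer before predicting. Because every config declares SOFTMAX_CROSS_ENTROPY, the four logits left in a model's layer-1 output buffer are interpreted through a softmax at inference time. The sketch below shows that final step; softmax4_sketch is a hypothetical stand-in for the library's own numerically stable routine.

#include <math.h>

/* Convert the 4 raw logits produced by a tx-type net into probabilities.
 * Subtracting the max logit before expf() keeps the exponentials from
 * overflowing without changing the result. */
static void softmax4_sketch(const float logits[4], float probs[4]) {
  float max_logit = logits[0];
  for (int i = 1; i < 4; ++i)
    if (logits[i] > max_logit) max_logit = logits[i];
  float sum = 0.0f;
  for (int i = 0; i < 4; ++i) {
    probs[i] = expf(logits[i] - max_logit);
    sum += probs[i];
  }
  for (int i = 0; i < 4; ++i) probs[i] /= sum;
}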
+#else
+/******************************CONFIG_NN***************************************/
+// Tx type model for 4x4 block.
+static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = {
+ -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+ 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f,
+ -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+ 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f,
+ 1.35792f, 0.27733f, 0.88660f, -0.68304f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = {
+ 1.38742f, 0.59540f, -1.37622f, 1.92114f,
+ 0.00000f, -0.38998f, -0.32726f, -0.15650f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = {
+ 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f,
+ -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f,
+ -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f,
+ 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+ -0.26782f, -0.65416f, -0.10648f, 0.05568f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = {
+ 4.07177f,
+ 3.26961f,
+ 0.58083f,
+ 1.21199f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_hor_layer0,
+ av1_tx_type_nn_weights_4x4_hor_layer1 },
+ { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = {
+ -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f,
+ 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f,
+ 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f,
+ 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+ -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = {
+ -0.33685f, 0.22025f, 0.28140f, 0.56138f,
+ 0.93489f, -1.77048f, 1.34989f, -0.93747f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = {
+ -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f,
+ 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f,
+ -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+ -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f,
+ -0.86315f, -0.53336f, 0.30320f, -1.32331f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = {
+ -1.31519f,
+ -3.26321f,
+ 1.71794f,
+ -1.90778f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_ver_layer0,
+ av1_tx_type_nn_weights_4x4_ver_layer1 },
+ { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 }
+};
+/******************************************************************************/
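In this fallback branch the same kind of tables are packaged in the simpler NN_CONFIG layout: num_inputs, num_outputs, num_hidden_layers, the hidden-layer widths, then per-layer weight and bias pointers. The sketch below walks the 4x4 horizontal model above (4 inputs, 8 hidden nodes, 4 outputs), assuming RELU hidden activations as in the V2 configs and the same contiguous per-node weight layout; it mirrors the data layout rather than the encoder's actual prediction routine.

/* Illustrative evaluation of the 4x4 horizontal model defined above. */
static void tx_type_nn_4x4_hor_sketch(const float in[4], float out[4]) {
  float hidden[8];
  for (int node = 0; node < 8; ++node) { /* hidden layer: 4 -> 8, RELU */
    float val = av1_tx_type_nn_bias_4x4_hor_layer0[node];
    for (int i = 0; i < 4; ++i)
      val += av1_tx_type_nn_weights_4x4_hor_layer0[node * 4 + i] * in[i];
    hidden[node] = val < 0.0f ? 0.0f : val;
  }
  for (int node = 0; node < 4; ++node) { /* output layer: 8 -> 4, linear */
    float val = av1_tx_type_nn_bias_4x4_hor_layer1[node];
    for (int i = 0; i < 8; ++i)
      val += av1_tx_type_nn_weights_4x4_hor_layer1[node * 8 + i] * hidden[i];
    out[node] = val; /* raw scores; softmax is applied by the caller */
  }
}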
+
+// Tx type model for 4x8 block.
+static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = {
+ 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f,
+ 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f,
+ -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f,
+ -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+ -1.35896f, -1.17121f, 1.68866f, 0.10357f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = {
+ 2.93391f, 0.66831f, -0.21419f, 0.00000f,
+ -0.72878f, 0.15127f, -1.46755f, 0.16658f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = {
+ -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f,
+ -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f,
+ 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f,
+ 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+ -0.50191f, 0.18219f, 1.83664f, -0.75276f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = {
+ -1.17455f,
+ -2.26089f,
+ -1.79863f,
+ -2.26333f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x8_hor_layer0,
+ av1_tx_type_nn_weights_4x8_hor_layer1 },
+ { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = {
+ -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f,
+ -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+ -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f,
+ 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f,
+ 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f,
+ 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f,
+ -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f,
+ -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f,
+ 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f,
+ -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+ -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f,
+ -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f,
+ 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+ 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+ -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f,
+ -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f,
+ 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f,
+ -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f,
+ -0.21958f, 0.05970f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = {
+ 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f,
+ 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f,
+ 0.08288f, 0.18195f, -0.79890f, 0.10047f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = {
+ -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f,
+ -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f,
+ -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f,
+ -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f,
+ 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f,
+ 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f,
+ -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+ -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+ -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f,
+ -1.01848f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = {
+ -1.45955f,
+ -2.08949f,
+ -1.24813f,
+ -1.55368f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x8_ver_layer0,
+ av1_tx_type_nn_weights_4x8_ver_layer1 },
+ { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x4 block.
+static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = {
+ -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f,
+ 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f,
+ -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f,
+ -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f,
+ -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f,
+ 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f,
+ 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f,
+ -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+ -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+ 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f,
+ 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f,
+ -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f,
+ -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f,
+ 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f,
+ 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+ 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f,
+ -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+ -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+ -1.85523f, 0.92532f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = {
+ 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f,
+ -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f,
+ -0.28958f, -0.32869f, -0.01704f, 0.68171f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = {
+ -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f,
+ -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f,
+ 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f,
+ -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+ 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f,
+ -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f,
+ -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f,
+ 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f,
+ 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f,
+ -1.10654f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = {
+ -0.92861f,
+ -1.45151f,
+ -1.33588f,
+ -4.33853f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x4_hor_layer0,
+ av1_tx_type_nn_weights_8x4_hor_layer1 },
+ { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = {
+ -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+ -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f,
+ -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f,
+ 1.66212f, 1.70826f, 1.55182f, 0.12230f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = {
+ 0.10943f, 2.09789f, 2.16578f, 0.15766f,
+ -0.42461f, 0.00000f, 1.22090f, -1.28717f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = {
+ 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f,
+ 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f,
+ 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f,
+ -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f,
+ -1.15005f, -0.39311f, 1.51236f, -1.68973f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = {
+ 1.81013f,
+ 1.10517f,
+ 2.90059f,
+ 0.95391f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x4_ver_layer0,
+ av1_tx_type_nn_weights_8x4_ver_layer1 },
+ { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x8 block.
+static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = {
+ -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f,
+ -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f,
+ 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f,
+ 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+ -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f,
+ -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f,
+ -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f,
+ 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+ 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f,
+ -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+ 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f,
+ -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f,
+ 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f,
+ 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f,
+ 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f,
+ 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f,
+ 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+ -0.99892f, 1.09823f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = {
+ -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f,
+ -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f,
+ -0.26319f, 2.65579f, -1.30137f, -0.01487f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = {
+ -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f,
+ -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f,
+ 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+ 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f,
+ 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+ -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f,
+ 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f,
+ 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f,
+ 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f,
+ 0.06161f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = {
+ 1.70385f,
+ 1.82373f,
+ 1.78496f,
+ 1.80826f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_hor_layer0,
+ av1_tx_type_nn_weights_8x8_hor_layer1 },
+ { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = {
+ -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+ 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f,
+ -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f,
+ -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f,
+ 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f,
+ 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f,
+ 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f,
+ -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+ -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f,
+ 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f,
+ 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f,
+ -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f,
+ 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f,
+ 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f,
+ -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f,
+ 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f,
+ -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f,
+ -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+ -1.29848f, 0.39308f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = {
+ -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f,
+ 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f,
+ 0.83015f, 0.06024f, 1.17180f, 0.65122f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = {
+ -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f,
+ 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+ 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f,
+ 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+ 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+ 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f,
+ 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f,
+ 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f,
+ -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f,
+ -0.41305f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = {
+ 2.14067f,
+ 2.76699f,
+ 2.04233f,
+ 1.34803f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_ver_layer0,
+ av1_tx_type_nn_weights_8x8_ver_layer1 },
+ { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x16 block.
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = {
+ -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+ 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f,
+ -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f,
+ 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+ -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f,
+ 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f,
+ -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f,
+ 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f,
+ -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f,
+ -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f,
+ 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f,
+ 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f,
+ -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f,
+ 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f,
+ -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f,
+ 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f,
+ 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f,
+ -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+ -0.28136f, 0.42556f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = {
+ 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f,
+ -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+ 1.81560f, -1.02643f, -0.81690f, 0.08302f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = {
+ 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f,
+ -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f,
+ 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f,
+ -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f,
+ 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f,
+ 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f,
+ 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f,
+ 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f,
+ 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f,
+ -1.31243f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = {
+ 0.83359f,
+ 1.06875f,
+ 1.77645f,
+ 1.49570f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x16_hor_layer0,
+ av1_tx_type_nn_weights_8x16_hor_layer1 },
+ { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = {
+ 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f,
+ -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f,
+ -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f,
+ 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+ -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f,
+ 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f,
+ 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f,
+ 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f,
+ -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f,
+ -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f,
+ 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f,
+ 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+ -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+ -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f,
+ -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f,
+ -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+ -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+ 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+ -0.12236f, 0.16075f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = {
+ -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f,
+ -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+ 0.57598f, 0.99819f, 0.75175f, 0.17044f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = {
+ -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f,
+ 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f,
+ -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+ 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f,
+ -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+ -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f,
+ -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+ 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f,
+ 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f,
+ 2.20547f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = {
+ -0.44080f,
+ -1.67455f,
+ -1.46332f,
+ -6.13206f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x16_ver_layer0,
+ av1_tx_type_nn_weights_8x16_ver_layer1 },
+ { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 }
+};
+/******************************************************************************/
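+
+// Illustrative sketch, not upstream code: every NN_CONFIG in this file
+// describes one fully connected hidden layer followed by a linear output
+// layer, with each output node's weights stored contiguously (field names as
+// declared in av1/encoder/ml.h). The helper below is a minimal forward pass
+// under those assumptions; the ReLU hidden activation and the name
+// nn_forward_sketch are illustrative choices, and the encoder's actual
+// inference routine is av1_nn_predict() in av1/encoder/ml.c.
+static void nn_forward_sketch(const NN_CONFIG *cfg, const float *input,
+                              float *output) {
+  float hidden[64];  // Upper bound: the largest hidden layer here is 64 wide.
+  const int nh = cfg->num_hidden_nodes[0];  // Single hidden layer assumed.
+  for (int i = 0; i < nh; ++i) {
+    float acc = cfg->bias[0][i];
+    for (int j = 0; j < cfg->num_inputs; ++j)
+      acc += cfg->weights[0][i * cfg->num_inputs + j] * input[j];
+    hidden[i] = acc > 0.0f ? acc : 0.0f;  // ReLU (assumed).
+  }
+  for (int i = 0; i < cfg->num_outputs; ++i) {
+    float acc = cfg->bias[1][i];
+    for (int j = 0; j < nh; ++j) acc += cfg->weights[1][i * nh + j] * hidden[j];
+    output[i] = acc;  // Raw scores; the caller applies softmax or sigmoid.
+  }
+}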
+
+// Tx type model for 16x8 block.
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = {
+ 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f,
+ -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f,
+ -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f,
+ 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f,
+ 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f,
+ 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f,
+ 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f,
+ -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f,
+ -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f,
+ -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f,
+ 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f,
+ -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f,
+ -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f,
+ -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f,
+ 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f,
+ -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+ -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+ 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f,
+ -0.36570f, -0.50757f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = {
+ -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+ 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f,
+ -0.12329f, 0.08986f, 1.08117f, -0.00220f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = {
+ 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+ 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f,
+ -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f,
+ -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f,
+ -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f,
+ -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f,
+ 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f,
+ 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f,
+ 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f,
+ -0.23347f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = {
+ 3.57175f,
+ 2.42612f,
+ 3.31259f,
+ 2.08287f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x8_hor_layer0,
+ av1_tx_type_nn_weights_16x8_hor_layer1 },
+ { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = {
+ 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f,
+ 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+ -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f,
+ 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f,
+ 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f,
+ -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f,
+ 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f,
+ -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f,
+ 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f,
+ 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f,
+ 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+ -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+ -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f,
+ -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f,
+ 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f,
+ 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+ -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f,
+ -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f,
+ -0.81945f, -0.41647f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = {
+ 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f,
+ 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f,
+ -0.04510f, 0.48000f, -0.09354f, -0.42422f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = {
+ 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f,
+ -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f,
+ 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f,
+ -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f,
+ -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f,
+ 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f,
+ 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f,
+ -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f,
+ 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f,
+ -0.00873f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = {
+ 3.34981f,
+ 3.74710f,
+ 1.38339f,
+ 0.45176f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x8_ver_layer0,
+ av1_tx_type_nn_weights_16x8_ver_layer1 },
+ { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x16 block.
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = {
+ 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f,
+ 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f,
+ -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f,
+ -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f,
+ 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f,
+ 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f,
+ 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f,
+ 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f,
+ -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f,
+ 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f,
+ 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f,
+ 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f,
+ -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f,
+ 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f,
+ 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f,
+ -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f,
+ -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f,
+ 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f,
+ 0.50355f, 0.08592f,
+};
+
+static const float av1_tx_type_nn_bias_16x16_layer0[16] = {
+ -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f,
+ -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f,
+ -0.14062f, -0.42120f, 0.94573f, -0.09287f,
+};
+
+static const float av1_tx_type_nn_weights_16x16_layer1[64] = {
+ -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f,
+ 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f,
+ 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f,
+ 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f,
+ 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f,
+ 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f,
+ -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f,
+ 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f,
+ -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f,
+ 1.08829f,
+};
+
+static const float av1_tx_type_nn_bias_16x16_layer1[4] = {
+ 0.81986f,
+ 1.26865f,
+ 0.11118f,
+ 2.48404f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x16 = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_type_nn_weights_16x16_layer0,
+ av1_tx_type_nn_weights_16x16_layer1,
+ },
+ {
+ av1_tx_type_nn_bias_16x16_layer0,
+ av1_tx_type_nn_bias_16x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx type model for 4x16 block.
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = {
+ 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f,
+ 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+ 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f,
+ 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f,
+ -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = {
+ -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+ 0.62806f, -0.20675f, 4.91940f, -0.56079f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = {
+ -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f,
+ -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f,
+ 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+ 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+ 1.28413f, -0.30326f, 2.45329f, -0.83335f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = {
+ 2.33198f,
+ 3.36245f,
+ 1.62603f,
+ 2.91056f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x16_hor_layer0,
+ av1_tx_type_nn_weights_4x16_hor_layer1 },
+ { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = {
+ 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f,
+ 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f,
+ -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f,
+ -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+ -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f,
+ -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f,
+ 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f,
+ 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f,
+ 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+ -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f,
+ -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f,
+ 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+ 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+ 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f,
+ 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f,
+ -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+ 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f,
+ 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f,
+ -0.27975f, -0.01149f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = {
+ -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f,
+ -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+ -0.32530f, 0.73483f, 0.08322f, -0.23890f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = {
+ 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f,
+ -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f,
+ 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f,
+ -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f,
+ 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f,
+ -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f,
+ 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f,
+ 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f,
+ -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f,
+ -0.56513f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = {
+ 4.60896f,
+ 4.53551f,
+ 4.53124f,
+ 4.27435f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x16_ver_layer0,
+ av1_tx_type_nn_weights_4x16_ver_layer1 },
+ { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x4 block.
+static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = {
+ 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f,
+ 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+ -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f,
+ -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f,
+ -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f,
+ -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f,
+ 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f,
+ 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f,
+ 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f,
+ -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f,
+ 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f,
+ -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f,
+ 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f,
+ -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f,
+ -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+ -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f,
+ 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f,
+ 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f,
+ 0.19055f, -1.56413f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = {
+ -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f,
+ 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f,
+ 1.14048f, 0.33308f, -1.10886f, 0.41184f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = {
+ -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+ 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f,
+ -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f,
+ -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f,
+ 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f,
+ -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f,
+ -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f,
+ 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f,
+ 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f,
+ -0.43819f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = {
+ 2.32575f,
+ 2.75703f,
+ 1.12304f,
+ 2.15567f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x4_hor_layer0,
+ av1_tx_type_nn_weights_16x4_hor_layer1 },
+ { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = {
+ 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+ 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f,
+ -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f,
+ -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+ -0.17967f, -0.96622f, 0.42635f, -1.04784f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = {
+ -0.52088f, 0.52844f, -1.03655f, -0.30974f,
+ 2.59952f, -1.93604f, 0.00000f, 2.51787f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = {
+ 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f,
+ 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f,
+ 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f,
+ -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f,
+ 1.26814f, -1.93873f, -0.00768f, 1.58309f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = {
+ 2.34713f,
+ 1.68667f,
+ 1.25488f,
+ 1.69812f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x4_ver_layer0,
+ av1_tx_type_nn_weights_16x4_ver_layer1 },
+ { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Map each tx_size (indexed in TX_SIZE enum order; see the per-entry
+// comments) to its neural net model for tx type prediction. A NULL entry
+// means no model is available for that transform size.
+static const NN_CONFIG *const av1_tx_type_nnconfig_map_hor[] = {
+ &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+
+static const NN_CONFIG *const av1_tx_type_nnconfig_map_ver[] = {
+ &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
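+
+// Illustrative sketch, not upstream code: how the two maps above might be
+// consulted. A NULL entry means no trained model, so tx type pruning would
+// simply be skipped for that size. The helper name, the features argument,
+// and the softmax post-processing are assumptions; nn_forward_sketch is the
+// sketch defined after the 8x16 models above, and expf() needs <math.h>.
+static void tx_type_probs_sketch(TX_SIZE tx_size, int is_hor,
+                                 const float *features, float *probs) {
+  const NN_CONFIG *const cfg = is_hor ? av1_tx_type_nnconfig_map_hor[tx_size]
+                                      : av1_tx_type_nnconfig_map_ver[tx_size];
+  if (cfg == NULL) return;  // No model for this tx_size: nothing to prune.
+  float scores[4];  // Every tx type model above has num_outputs == 4.
+  nn_forward_sketch(cfg, features, scores);
+  // Softmax over the four raw scores gives per-class probabilities for the
+  // candidate 1D transform types along this direction.
+  float maxv = scores[0], sum = 0.0f;
+  for (int i = 1; i < 4; ++i) maxv = scores[i] > maxv ? scores[i] : maxv;
+  for (int i = 0; i < 4; ++i) sum += probs[i] = expf(scores[i] - maxv);
+  for (int i = 0; i < 4; ++i) probs[i] /= sum;
+}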
+#endif // CONFIG_NN_V2
+
+// Tx split model for 4x8 block.
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = {
+ 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f,
+ -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f,
+ 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f,
+ -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f,
+ -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f,
+ 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f,
+ 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f,
+ 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f,
+ 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f,
+ 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f,
+ -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f,
+ 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f,
+ -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f,
+ -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f,
+ 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f,
+ -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f,
+ -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f,
+ 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f,
+ -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f,
+ -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f,
+ 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f,
+ -0.792429f, -0.385862f,
+};
+
+static const float av1_tx_split_nn_bias_4x8_layer0[16] = {
+ 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f,
+ -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f,
+ 0.262171f, -1.598153f, -1.427340f, -1.602306f,
+};
+
+static const float av1_tx_split_nn_weights_4x8_layer1[16] = {
+ -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f,
+ -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f,
+ 0.085082f, 0.614986f, 0.847904f, 0.637578f,
+};
+
+static const float av1_tx_split_nn_bias_4x8_layer1[1] = {
+ 0.20586078f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_4x8 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_4x8_layer0,
+ av1_tx_split_nn_weights_4x8_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_4x8_layer0,
+ av1_tx_split_nn_bias_4x8_layer1,
+ },
+};
+/******************************************************************************/
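+
+// Illustrative sketch, not upstream code: each tx split model ends in a
+// single output node (num_outputs == 1). One plausible reading of that raw
+// score is as a logit for "split this block further", mapped to a
+// probability with a sigmoid and compared against a tunable threshold; the
+// encoder's actual decision rule may differ. This reuses the hypothetical
+// nn_forward_sketch helper from earlier (defined under CONFIG_NN_V2 above,
+// so it would need to move outside that guard to be callable here) and
+// expf() from <math.h>.
+static int tx_split_decision_sketch(const NN_CONFIG *cfg,
+                                    const float *features, float thresh) {
+  float score;
+  nn_forward_sketch(cfg, features, &score);
+  const float p_split = 1.0f / (1.0f + expf(-score));  // Sigmoid (assumed).
+  return p_split > thresh;
+}
+// E.g., for a 4x8 block with an 8-float feature vector f (feature extraction
+// itself is defined elsewhere in the encoder):
+//   const int split = tx_split_decision_sketch(&av1_tx_split_nnconfig_4x8,
+//                                              f, 0.5f);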
+
+// Tx split model for 8x8 block.
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = {
+ 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f,
+ -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f,
+ -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f,
+ -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f,
+ 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f,
+ 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f,
+ 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f,
+ -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f,
+ 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f,
+ 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f,
+ 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f,
+ 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f,
+ -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f,
+ -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f,
+ 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f,
+ -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f,
+ 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f,
+ 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f,
+ -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f,
+ -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f,
+ 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f,
+ -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f,
+ -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f,
+ 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f,
+};
+
+static const float av1_tx_split_nn_bias_8x8_layer0[12] = {
+ 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f,
+ 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f,
+};
+
+static const float av1_tx_split_nn_weights_8x8_layer1[12] = {
+ 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f,
+ -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f,
+};
+
+static const float av1_tx_split_nn_bias_8x8_layer1[1] = {
+ -0.156294f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x8 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 12,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x8_layer0,
+ av1_tx_split_nn_weights_8x8_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x8_layer0,
+ av1_tx_split_nn_bias_8x8_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x16 block.
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = {
+ 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f,
+ 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f,
+ -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f,
+ -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f,
+ -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f,
+ -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f,
+ 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f,
+ 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f,
+ -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f,
+ -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f,
+ -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f,
+ -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f,
+ 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f,
+ 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f,
+ -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f,
+ 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f,
+ 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f,
+ 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f,
+ 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f,
+ -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f,
+ 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f,
+ 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f,
+ 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f,
+ -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f,
+ -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f,
+ 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f,
+ -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f,
+ 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f,
+ 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f,
+ 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f,
+ 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f,
+ 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f,
+ -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f,
+ -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f,
+ 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f,
+ 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f,
+ -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f,
+ -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f,
+ 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f,
+ 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f,
+ 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f,
+ 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f,
+ -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f,
+ -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f,
+ 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f,
+ -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f,
+ 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f,
+ -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f,
+ -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, -0.028900f,
+ 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f,
+ 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f,
+ -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f,
+ 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f,
+ -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f,
+ -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f,
+ 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f,
+ 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f,
+ 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f,
+ -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f,
+ -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f,
+ -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f,
+ 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f,
+ -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f,
+ 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f,
+ -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f,
+ -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f,
+ -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f,
+ -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f,
+ 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f,
+ 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f,
+ 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f,
+ -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f,
+ -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f,
+ -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f,
+ 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f,
+ 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f,
+ 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f,
+ -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f,
+ 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f,
+ -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f,
+ 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f,
+ 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f,
+ 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f,
+ -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f,
+ -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f,
+ -0.408768f, 0.184693f,
+};
+
+static const float av1_tx_split_nn_bias_8x16_layer0[64] = {
+ -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f,
+ -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f,
+ 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f,
+ 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f,
+ 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f,
+ 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f,
+ -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f,
+ -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f,
+ 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f,
+ -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f,
+ -0.255844f, -0.078400f, 0.476752f, 0.643001f,
+};
+
+static const float av1_tx_split_nn_weights_8x16_layer1[64] = {
+ -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, -0.087735f,
+ 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f,
+ 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f,
+ -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f,
+ 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f,
+ 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f,
+ 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f,
+ -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f,
+ 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f,
+ -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f,
+ -0.256734f, 0.177370f, 0.213522f, -0.530158f,
+};
+
+static const float av1_tx_split_nn_bias_8x16_layer1[1] = {
+ 0.14910713f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x16 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x16_layer0,
+ av1_tx_split_nn_weights_8x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x16_layer0,
+ av1_tx_split_nn_bias_8x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x16 block.
+static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = {
+ -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f,
+ 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f,
+ 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f,
+ -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f,
+ 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f,
+ -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f,
+ -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f,
+ -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f,
+ 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f,
+ -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f,
+ 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f,
+ -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f,
+ -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f,
+ 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f,
+ 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f,
+ -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f,
+ -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f,
+ -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f,
+ -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f,
+ -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f,
+ -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f,
+ -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f,
+ 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f,
+ -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f,
+ -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f,
+ -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f,
+ 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f,
+ -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f,
+ 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f,
+ -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f,
+ -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f,
+ 0.087491f, -0.186415f, 0.177785f, -0.195121f, -1.191148f, -0.477102f,
+ 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f,
+ 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f,
+ -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f,
+ -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f,
+ 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f,
+ 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f,
+ -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f,
+ -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f,
+ 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f,
+ 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f,
+ 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f,
+ -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f,
+ -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f,
+ -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f,
+ 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f,
+ 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f,
+};
+
+static const float av1_tx_split_nn_bias_16x16_layer0[24] = {
+ -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f,
+ 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f,
+ -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f,
+ -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f,
+};
+
+static const float av1_tx_split_nn_weights_16x16_layer1[24] = {
+ -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f,
+ -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f,
+ -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f,
+ -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f,
+};
+
+static const float av1_tx_split_nn_bias_16x16_layer1[1] = {
+ 0.184803f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x16 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x16_layer0,
+ av1_tx_split_nn_weights_16x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x16_layer0,
+ av1_tx_split_nn_bias_16x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 32x32 block.
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = {
+ -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f,
+ -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f,
+ 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f,
+ -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f,
+ -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f,
+ 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f,
+ -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f,
+ -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f,
+ -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f,
+ 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f,
+ -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f,
+ 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f,
+ -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f,
+ -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f,
+ -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f,
+ 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f,
+ -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f,
+ 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f,
+ -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f,
+ -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f,
+ 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f,
+ -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f,
+ 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f,
+ -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f,
+ 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f,
+ -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f,
+ -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f,
+ -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f,
+ 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f,
+ -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f,
+ 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f,
+ 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f,
+ 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f,
+ 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f,
+ -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f,
+ 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f,
+ 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f,
+ 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f,
+ 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f,
+ 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f,
+ -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f,
+ 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f,
+ 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f,
+ -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f,
+ -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f,
+ -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f,
+ -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f,
+ -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f,
+ -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 0.009070f,
+ 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f,
+ -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f,
+ -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f,
+ 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f,
+ 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f,
+ -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f,
+ -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f,
+ -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f,
+ 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f,
+ -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f,
+ -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f,
+ 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f,
+ 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f,
+ 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f,
+ 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer0[32] = {
+ 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f,
+ -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f,
+ -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f,
+ -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f,
+ 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f,
+ 0.254942f, -0.017796f,
+};
+
+static const float av1_tx_split_nn_weights_32x32_layer1[32] = {
+ -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f,
+ -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f,
+ 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f,
+ -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f,
+ 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f,
+ -0.068547f, -0.154148f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer1[1] = {
+ 0.316622f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x32 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_32x32_layer0,
+ av1_tx_split_nn_weights_32x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_32x32_layer0,
+ av1_tx_split_nn_bias_32x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 64x64 block.
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = {
+ -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f,
+ 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f,
+ 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f,
+ 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f,
+ -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f,
+ -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f,
+ 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f,
+ -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f,
+ -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f,
+ 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f,
+ -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f,
+ 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f,
+ 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f,
+ -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f,
+ -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f,
+ 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f,
+ 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f,
+ 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f,
+ -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f,
+ -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f,
+ 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f,
+ -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f,
+ 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f,
+ 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f,
+ 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f,
+ 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f,
+ 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f,
+ -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f,
+ 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f,
+ 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f,
+ 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f,
+ 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f,
+ 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f,
+ 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f,
+ -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f,
+ -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f,
+ 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f,
+ -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f,
+ -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f,
+ -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f,
+ -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f,
+ -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f,
+ -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f,
+ -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f,
+ 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f,
+ 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f,
+ 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f,
+ -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f,
+ -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f,
+ 0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f,
+ 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f,
+ 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f,
+ -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f,
+ -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f,
+ -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f,
+ 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f,
+ -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f,
+ -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f,
+ -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f,
+ -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f,
+ 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f,
+ -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f,
+ 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f,
+ 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer0[32] = {
+ 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f,
+ -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f,
+ 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f,
+ 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f,
+ -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f,
+ -0.365437f, 0.229255f,
+};
+
+static const float av1_tx_split_nn_weights_64x64_layer1[32] = {
+ 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f,
+ -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f,
+ -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f,
+ -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f,
+ 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f,
+ 0.207812f, 0.513560f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f };
+
+static const NN_CONFIG av1_tx_split_nnconfig_64x64 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_64x64_layer0,
+ av1_tx_split_nn_weights_64x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_64x64_layer0,
+ av1_tx_split_nn_bias_64x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 4x16 block.
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = {
+ -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f,
+ -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f,
+ -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f,
+ -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f,
+ -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f,
+ -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f,
+ -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f,
+ 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f,
+ 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f,
+ 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f,
+ 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f,
+ -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f,
+ -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f,
+ -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f,
+ -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f,
+ -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f,
+ -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f,
+ -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f,
+ 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f,
+ 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f,
+ 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f,
+ -1.416451f, -0.166467f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer0[16] = {
+ 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f,
+ -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f,
+ -1.191704f, -3.800073f, 4.121552f, -1.399397f,
+};
+
+static const float av1_tx_split_nn_weights_4x16_layer1[16] = {
+ -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f,
+ -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f,
+ 0.462109f, 0.343315f, 1.092593f, 0.483152f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer1[1] = {
+ 0.8205083f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_4x16 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_4x16_layer0,
+ av1_tx_split_nn_weights_4x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_4x16_layer0,
+ av1_tx_split_nn_bias_4x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x32 block.
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = {
+ 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f,
+ 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f,
+ 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f,
+ 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f,
+ -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f,
+ 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f,
+ -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f,
+ -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f,
+ 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f,
+ -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f,
+ 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f,
+ -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f,
+ -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f,
+ 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f,
+ 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f,
+ 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f,
+ -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f,
+ 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f,
+ 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f,
+ 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f,
+ -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f,
+ -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f,
+ -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f,
+ -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f,
+ -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f,
+ -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f,
+ -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f,
+ 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f,
+ 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f,
+ -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f,
+ 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f,
+ -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f,
+ -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f,
+ 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f,
+ -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f,
+ 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f,
+ 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f,
+ -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f,
+ -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f,
+ 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f,
+ 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f,
+ 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f,
+ -0.129147f, 0.045916f, -0.606865f, -0.101378f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer0[32] = {
+ 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f,
+ 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f,
+ 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f,
+ 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f,
+ 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f,
+ -0.294389f, 1.456413f,
+};
+
+static const float av1_tx_split_nn_weights_16x32_layer1[32] = {
+ 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f,
+ -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f,
+ -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f,
+ -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f,
+ -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f,
+ 0.418904f, 1.792187f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer1[1] = {
+ -0.29233751f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x32 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x32_layer0,
+ av1_tx_split_nn_weights_16x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x32_layer0,
+ av1_tx_split_nn_bias_16x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 32x64 block.
+static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = {
+ 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f,
+ -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f,
+ 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f,
+ 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f,
+ 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f,
+ 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f,
+ -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f,
+ 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f,
+ 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f,
+ -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f,
+ -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f,
+ 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f,
+ -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f,
+ -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f,
+ 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f,
+ -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f,
+ -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f,
+ -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f,
+ 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f,
+ 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f,
+ 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f,
+ 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f,
+ -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f,
+ 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f,
+ 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f,
+ -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f,
+ -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f,
+ 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f,
+ -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f,
+ -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f,
+ -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f,
+ -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f,
+ -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f,
+ -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f,
+ 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f,
+ 0.270948f, -0.187966f, 0.342503f, 0.174420f, -0.352105f, 0.088080f,
+ 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f,
+ -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f,
+ 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f,
+ -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f,
+ -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f,
+ 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f,
+ 0.440626f, -0.158048f, -0.461031f, -0.146280f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer0[32] = {
+ 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f,
+ -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f,
+ -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f,
+ 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f,
+ 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f,
+ 0.552712f, 0.299410f,
+};
+
+static const float av1_tx_split_nn_weights_32x64_layer1[32] = {
+ 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f,
+ 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f,
+ -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f,
+ -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f,
+ 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f,
+ 0.352981f, 0.111265f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer1[1] = {
+ -0.18160765f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_32x64_layer0,
+ av1_tx_split_nn_weights_32x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_32x64_layer0,
+ av1_tx_split_nn_bias_32x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x32 block.
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = {
+ -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f,
+ -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f,
+ 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f,
+ 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f,
+ -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f,
+ 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f,
+ 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f,
+ 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f,
+ 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f,
+ 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f,
+ 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f,
+ 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f,
+ 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f,
+ 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f,
+ 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f,
+ 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f,
+ 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f,
+ 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f,
+ -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f,
+ 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f,
+ 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f,
+ -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f,
+ 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f,
+ -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f,
+ 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f,
+ 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f,
+ 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f,
+ 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f,
+ 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f,
+ 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f,
+ -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f,
+ 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer0[24] = {
+ -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f,
+ -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f,
+ -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f,
+ 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f,
+};
+
+static const float av1_tx_split_nn_weights_8x32_layer1[24] = {
+ 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f,
+ -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f,
+ -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f,
+ 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer1[1] = {
+ 0.13435879f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x32 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x32_layer0,
+ av1_tx_split_nn_weights_8x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x32_layer0,
+ av1_tx_split_nn_bias_8x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x64 block.
+static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = {
+ -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f,
+ -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f,
+ -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f,
+ 0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f,
+ -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f,
+ -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f,
+ -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f,
+ 0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f,
+ -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f,
+ -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f,
+ -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f,
+ 0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f,
+ -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f,
+ -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f,
+ 0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f,
+ 0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f,
+ -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f,
+ -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f,
+ -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f,
+ -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f,
+ 0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f,
+ 0.101996f, 0.120878f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer0[16] = {
+ 1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f,
+ -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f,
+ -0.299866f, -0.103079f, -0.190352f, -0.048121f,
+};
+
+static const float av1_tx_split_nn_weights_16x64_layer1[16] = {
+ -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f,
+ 1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f,
+ 0.348337f, -0.205082f, 0.347129f, -0.322277f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer1[1] = {
+ 0.04230947f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x64_layer0,
+ av1_tx_split_nn_weights_16x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x64_layer0,
+ av1_tx_split_nn_bias_16x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Map transform size to its corresponding neural net model for tx split
+// prediction.
+static const NN_CONFIG *const av1_tx_split_nnconfig_map[TX_SIZES_ALL] = {
+ NULL, // TX_4X4,
+ &av1_tx_split_nnconfig_8x8, // TX_8X8,
+ &av1_tx_split_nnconfig_16x16, // TX_16X16,
+ &av1_tx_split_nnconfig_32x32, // TX_32X32,
+ &av1_tx_split_nnconfig_64x64, // TX_64X64,
+ &av1_tx_split_nnconfig_4x8, // TX_4X8,
+ &av1_tx_split_nnconfig_4x8, // TX_8X4,
+ &av1_tx_split_nnconfig_8x16, // TX_8X16,
+ &av1_tx_split_nnconfig_8x16, // TX_16X8,
+ &av1_tx_split_nnconfig_16x32, // TX_16X32,
+ &av1_tx_split_nnconfig_16x32, // TX_32X16,
+ &av1_tx_split_nnconfig_32x64, // TX_32X64,
+ &av1_tx_split_nnconfig_32x64, // TX_64X32,
+ &av1_tx_split_nnconfig_4x16, // TX_4X16,
+ &av1_tx_split_nnconfig_4x16, // TX_16X4,
+ &av1_tx_split_nnconfig_8x32, // TX_8X32,
+ &av1_tx_split_nnconfig_8x32, // TX_32X8,
+ &av1_tx_split_nnconfig_16x64, // TX_16X64,
+ &av1_tx_split_nnconfig_16x64, // TX_64X16,
+};
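+
+// Illustrative usage sketch (not part of this header): a caller would
+// typically look up the model for the current tx size and score its features
+// with av1_nn_predict(). The feature and threshold names below are
+// assumptions, not definitions from this file.
+//
+//   const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
+//   if (nn_config != NULL) {
+//     float score;
+//     av1_nn_predict(features, nn_config, 1, &score);
+//     if (score > split_thresh) try_tx_split = 1;
+//   }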
+
+#if !CONFIG_REALTIME_ONLY
+#define NUM_INTRA_TX_SPLIT_FEATURES 14
+#define NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS 1
+#define NUM_INTRA_TX_SPLIT_HIDDEN_NODES 16
+// Model to prune intra transform depth for intra 8x8 block.
+static const float av1_intra_tx_split_8x8_mean[NUM_INTRA_TX_SPLIT_FEATURES] = {
+ 0.110706f, 18.901518f, 0.250436f, 13.483487f, 0.118141f,
+ 14.318728f, 0.028409f, 14.257664f, 0.045839f, 15.143358f,
+ 9.702971f, 14.300809f, 6.018646f, 3.682534f,
+};
+
+static const float av1_intra_tx_split_8x8_std[NUM_INTRA_TX_SPLIT_FEATURES] = {
+ 13.750575f, 13.440116f, 14.334330f, 12.236641f, 18.415247f,
+ 12.733355f, 18.309339f, 12.858130f, 23.465142f, 13.447014f,
+ 8.625048f, 10.456774f, 1.185447f, 1.810423f,
+};
+
+static const float av1_intra_tx_split_nn_weights_8x8_layer0
+ [NUM_INTRA_TX_SPLIT_FEATURES * NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = {
+ -0.156142f, -0.753623f, 0.026883f, 0.039188f, -0.035310f, 0.106140f,
+ 0.051622f, 0.077838f, 0.101632f, 0.107278f, 0.232200f, 0.269083f,
+ 0.048966f, -1.553293f, -0.113983f, -0.151248f, -0.067369f, 0.787292f,
+ 0.076651f, -0.802634f, 0.266414f, 1.107563f, -0.068848f, -0.956468f,
+ -0.074920f, -0.192258f, 0.006207f, 0.176196f, -0.493442f, 0.152290f,
+ -0.208874f, -0.014658f, 0.297385f, -0.351695f, 0.246295f, -0.178519f,
+ -0.204191f, 0.049663f, -0.330343f, -0.299754f, 0.246215f, -0.014558f,
+ -0.117611f, 0.206445f, 0.045840f, -0.047563f, -0.049679f, 0.406892f,
+ -0.052307f, -1.513404f, 0.166166f, 0.520760f, -0.143320f, -0.593928f,
+ -0.010533f, 0.250752f, 0.076738f, 0.537512f, -0.082619f, -1.534031f,
+ 0.047109f, 0.634247f, -0.089730f, 0.545534f, -0.022742f, -0.779047f,
+ -0.606358f, -0.199145f, -0.051269f, 0.248784f, 0.327545f, -0.851751f,
+ 0.071739f, 0.035975f, 0.387781f, -0.136427f, -0.284436f, 0.578449f,
+ -0.198276f, 0.579950f, 0.600111f, -0.370164f, -0.215297f, 0.517342f,
+ 0.200061f, -2.507660f, -0.030851f, 0.227315f, -0.078289f, 0.276052f,
+ -0.050281f, 0.251481f, -0.139318f, 0.281175f, 0.226524f, 0.058968f,
+ 0.197436f, 0.517294f, -0.105914f, -1.599567f, 0.064985f, 0.043209f,
+ -0.280038f, 0.126874f, 0.330387f, -0.014407f, 0.031241f, 0.237801f,
+ 0.948959f, -0.253791f, -0.022622f, -0.061430f, 0.265852f, 0.750823f,
+ 0.086606f, 0.853527f, -0.180971f, -1.255744f, -0.152979f, -1.022198f,
+ -0.044708f, 0.506424f, -0.501968f, -0.416863f, -0.012688f, 0.193523f,
+ -0.093698f, 0.430875f, 0.007379f, 0.019278f, 0.080890f, 0.462755f,
+ -0.054326f, -0.157611f, -0.004851f, -1.275676f, -0.060528f, -0.508170f,
+ 0.195429f, -0.023534f, 0.355211f, 0.983561f, -0.122036f, -0.911948f,
+ -0.172280f, -1.135245f, -0.043211f, 0.576456f, -0.075247f, 0.429734f,
+ -0.246309f, -0.355575f, -0.048809f, 0.217113f, 0.078385f, 0.720341f,
+ 0.007070f, 0.144617f, -0.167642f, 0.303056f, -0.031425f, 0.123448f,
+ -0.320530f, 0.164070f, -0.497849f, -0.233918f, -0.032123f, 0.084983f,
+ 0.312216f, 0.062609f, -0.389815f, 0.237593f, 0.000157f, -0.642068f,
+ 0.167898f, 0.495234f, -0.083493f, -0.555971f, 0.124437f, 0.381125f,
+ -0.459219f, 0.047924f, -0.138222f, -2.232816f, 0.127585f, -0.102420f,
+ 0.131598f, 0.036837f, -0.163055f, -0.067429f, -0.078521f, -0.055666f,
+ 1.387057f, 0.400154f, -0.003355f, -0.073627f, -0.305098f, -0.413383f,
+ -0.008266f, -0.038329f, 0.209808f, 0.375777f, 0.037274f, -0.050226f,
+ -0.100576f, 0.237441f, 0.237854f, 0.828296f, 0.001149f, -0.093964f,
+ 0.214051f, -0.031486f, -0.561307f, 0.014540f, 0.169357f, 0.323202f,
+ -0.395334f, -0.038941f, 0.476800f, -0.213122f, -0.287521f, -0.420717f,
+ -0.054142f, -0.102266f,
+ };
+
+static const float
+ av1_intra_tx_split_nn_bias_8x8_layer0[NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = {
+ -1.150850f, -0.236404f, 0.184554f, -0.904162f, -0.949979f, 0.427016f,
+ -0.546867f, -0.611094f, -0.676570f, -0.208959f, -0.286384f, 0.562238f,
+ 0.434197f, -0.746518f, 0.123085f, -0.549836f,
+ };
+
+static const float av1_intra_tx_split_nn_weights_8x8_layer1
+ [NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = {
+ 0.749814f, 0.598172f, 0.375611f, 0.751612f, 0.947538f, -0.282228f,
+ -1.457522f, -1.092290f, 0.738657f, 0.575779f, 0.514823f, -0.560616f,
+ -0.491619f, -1.482014f, 0.524625f, -0.533590f,
+ };
+
+static const float av1_intra_tx_split_nn_bias_8x8_layer1[1] = {
+ -0.488888f,
+};
+
+static const NN_CONFIG av1_intra_tx_split_nnconfig_8x8 = {
+ NUM_INTRA_TX_SPLIT_FEATURES, // num_inputs
+ 1, // num_outputs
+ NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS, // num_hidden_layers
+ {
+ NUM_INTRA_TX_SPLIT_HIDDEN_NODES,
+ }, // num_hidden_nodes
+ {
+ av1_intra_tx_split_nn_weights_8x8_layer0,
+ av1_intra_tx_split_nn_weights_8x8_layer1,
+ },
+ {
+ av1_intra_tx_split_nn_bias_8x8_layer0,
+ av1_intra_tx_split_nn_bias_8x8_layer1,
+ },
+};
+
+static const float av1_intra_tx_prune_nn_thresh_8x8[2] = { -0.405465f,
+ 0.405465f };
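+
+// Note: 0.405465 = ln(1.5). If the network output is interpreted as a logit,
+// these thresholds correspond to sigmoid probabilities of 0.4 and 0.6
+// (1 / (1 + exp(-0.405465)) = 0.6).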
+#endif // !CONFIG_REALTIME_ONLY
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/tx_search.c b/third_party/aom/av1/encoder/tx_search.c
new file mode 100644
index 0000000000..7292c01191
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_search.c
@@ -0,0 +1,3830 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/cfl.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/common/idct.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/sorting_network.h"
+#include "av1/encoder/tx_prune_model_weights.h"
+#include "av1/encoder/tx_search.h"
+#include "av1/encoder/txb_rdopt.h"
+
+#define PROB_THRESH_OFFSET_TX_TYPE 100
+
+struct rdcost_block_args {
+ const AV1_COMP *cpi;
+ MACROBLOCK *x;
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE];
+ RD_STATS rd_stats;
+ int64_t current_rd;
+ int64_t best_rd;
+ int exit_early;
+ int incomplete_exit;
+ FAST_TX_SEARCH_MODE ftxs_mode;
+ int skip_trellis;
+};
+
+typedef struct {
+ int64_t rd;
+ int txb_entropy_ctx;
+ TX_TYPE tx_type;
+} TxCandidateInfo;
+
+// origin_threshold * 128 / 100
+static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
+ {
+ 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
+ },
+ {
+ 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
+ },
+ {
+ 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
+ },
+};
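+
+// Example of the scaling noted above: a stored value of 64 corresponds to an
+// original threshold of 50 (50 * 128 / 100 = 64), and 88 to 68.75.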
+
+// lookup table for predict_skip_txfm
+// int max_tx_size = max_txsize_rect_lookup[bsize];
+// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
+// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
+static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
+ TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4,
+ TX_8X8, TX_8X8, TX_16X16, TX_16X16,
+};
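+
+// E.g. BLOCK_16X32's max rect tx, TX_16X32, is taller than 16, so its entry
+// falls back to TX_16X16; BLOCK_8X32 falls back to its largest square tx,
+// TX_8X8.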
+
+// Look-up table for the square root of the number of pixels in a transform
+// block, rounded up to the next integer (ceiling).
+static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4, 8, 16, 32, 32, 6, 6,
+ 12, 12, 23, 23, 32, 32, 8,
+ 8, 16, 16, 23, 23 };
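+// E.g. TX_8X16 covers 128 pixels, and ceil(sqrt(128)) = ceil(11.31) = 12. For
+// tx sizes with a 64-sample dimension the table appears to use the effective
+// 32-sample coefficient extent instead: TX_16X64 -> ceil(sqrt(16 * 32)) = 23.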
+
+static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+ const int16_t *diff = x->plane[0].src_diff;
+ const uint32_t hash =
+ av1_get_crc32c_value(&x->txfm_search_info.mb_rd_record->crc_calculator,
+ (uint8_t *)diff, 2 * rows * cols);
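+  // Fold the block size into the low bits so that identical residues at
+  // different block sizes do not collide.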
+ return (hash << 5) + bsize;
+}
+
+static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record,
+ const int64_t ref_best_rd,
+ const uint32_t hash) {
+ int32_t match_index = -1;
+ if (ref_best_rd != INT64_MAX) {
+ for (int i = 0; i < mb_rd_record->num; ++i) {
+ const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
+ // If there is a match in the mb_rd_record, fetch the RD decision and
+ // terminate early.
+ if (mb_rd_record->mb_rd_info[index].hash_value == hash) {
+ match_index = index;
+ break;
+ }
+ }
+ }
+ return match_index;
+}
+
+static AOM_INLINE void fetch_mb_rd_info(int n4,
+ const MB_RD_INFO *const mb_rd_info,
+ RD_STATS *const rd_stats,
+ MACROBLOCK *const x) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ mbmi->tx_size = mb_rd_info->tx_size;
+ memcpy(x->txfm_search_info.blk_skip, mb_rd_info->blk_skip,
+ sizeof(mb_rd_info->blk_skip[0]) * n4);
+ av1_copy(mbmi->inter_tx_size, mb_rd_info->inter_tx_size);
+ av1_copy_array(xd->tx_type_map, mb_rd_info->tx_type_map, n4);
+ *rd_stats = mb_rd_info->rd_stats;
+}
+
+int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
+ int blk_col, const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8) {
+ int visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+ NULL, &visible_cols, &visible_rows);
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *diff = x->plane[plane].src_diff;
+
+ diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+ uint64_t sse =
+ aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
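+  // If requested, report the per-pixel MSE in Q8 (i.e. scaled by 256).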
+ if (block_mse_q8 != NULL) {
+ if (visible_cols > 0 && visible_rows > 0)
+ *block_mse_q8 =
+ (unsigned int)((256 * sse) / (visible_cols * visible_rows));
+ else
+ *block_mse_q8 = UINT_MAX;
+ }
+ return sse;
+}
+
+// Computes the residual block's SSE and mean on all visible 4x4s in the
+// transform block.
+static INLINE int64_t pixel_diff_stats(
+ MACROBLOCK *x, int plane, int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) {
+ int visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+ NULL, &visible_cols, &visible_rows);
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *diff = x->plane[plane].src_diff;
+
+ diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+ uint64_t sse = 0;
+ int sum = 0;
+ sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum);
+ if (visible_cols > 0 && visible_rows > 0) {
+ double norm_factor = 1.0 / (visible_cols * visible_rows);
+ int sign_sum = sum > 0 ? 1 : -1;
+ // Conversion to transform domain
+ *per_px_mean = (int64_t)(norm_factor * abs(sum)) << 7;
+ *per_px_mean = sign_sum * (*per_px_mean);
+ *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse));
+ *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum));
+ } else {
+ *block_mse_q8 = UINT_MAX;
+ }
+ return sse;
+}
+
+// Uses simple features on top of the DCT coefficients to quickly predict
+// whether the optimal RD decision is to skip encoding the residual.
+// The SSE value is stored in *dist.
+static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+ int reduced_tx_set) {
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
+
+ *dist = av1_pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
+
+ const int64_t mse = *dist / bw / bh;
+ // Normalized quantizer takes the transform upscaling factor (8 for tx size
+ // smaller than 32) into account.
+ const int16_t normalized_dc_q = dc_q >> 3;
+ const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
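+  // Illustration with hypothetical numbers: dc_q = 800 gives
+  // normalized_dc_q = 100 and mse_thresh = 100 * 100 / 8 = 1250.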
+  // For a faster early skip decision, compare dist against the threshold so
+  // that the quality risk of the skip=1 decision is lower. Otherwise, use mse,
+  // since the fwd_txfm coefficient checks below will take care of quality.
+  // TODO(any): Use dist to return 0 when skip_txfm_level is 1.
+ int64_t pred_err = (txfm_params->skip_txfm_level >= 2) ? *dist : mse;
+ // Predict not to skip when error is larger than threshold.
+ if (pred_err > mse_thresh) return 0;
+  // Otherwise, for aggressive early skip, return skip.
+ else if (txfm_params->skip_txfm_level >= 2)
+ return 1;
+
+ const int max_tx_size = max_predict_sf_tx_size[bsize];
+ const int tx_h = tx_size_high[max_tx_size];
+ const int tx_w = tx_size_wide[max_tx_size];
+ DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
+ TxfmParam param;
+ param.tx_type = DCT_DCT;
+ param.tx_size = max_tx_size;
+ param.bd = xd->bd;
+ param.is_hbd = is_cur_buf_hbd(xd);
+ param.lossless = 0;
+ param.tx_set_type = av1_get_ext_tx_set_type(
+ param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+ const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+ const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+ const int16_t *src_diff = x->plane[0].src_diff;
+ const int n_coeff = tx_w * tx_h;
+ const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+ const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+ const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
+ for (int row = 0; row < bh; row += tx_h) {
+ for (int col = 0; col < bw; col += tx_w) {
+ av1_fwd_txfm(src_diff + col, coefs, bw, &param);
+ // Operating on TX domain, not pixels; we want the QTX quantizers
+ const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
+ if (dc_coef >= dc_thresh) return 0;
+ for (int i = 1; i < n_coeff; ++i) {
+ const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
+ if (ac_coef >= ac_thresh) return 0;
+ }
+ }
+ src_diff += tx_h * bw;
+ }
+ return 1;
+}
+
+// Used to set proper context for early termination with skip = 1.
+static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats,
+ BLOCK_SIZE bsize, int64_t dist) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int n4 = bsize_to_num_blk(bsize);
+ const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4);
+ memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
+ mbmi->tx_size = tx_size;
+ for (int i = 0; i < n4; ++i)
+ set_blk_skip(x->txfm_search_info.blk_skip, 0, i, 1);
+ rd_stats->skip_txfm = 1;
+ if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+ rd_stats->dist = rd_stats->sse = (dist << 4);
+  // Although the decision here is to mark the block as skip based on luma
+  // stats, the block may become non-skip after the chroma RD pass. Moreover,
+  // the intermediate non-skip costs computed by the caller would be incorrect
+  // if the rate were set to zero (i.e., if zero_blk_rate were not accounted
+  // for). Hence an intermediate rate is populated here for coding the luma tx
+  // blocks as skip; the caller then sets the final rate based on the final RD
+  // decision (skip vs. non-skip). The rate populated corresponds to coding
+  // all the tx blocks in the current block with zero_blk_rate, based on the
+  // maximum possible tx size. E.g., for a 128x128 block the rate would be
+  // 4 * zero_blk_rate, where zero_blk_rate corresponds to coding one 64x64 tx
+  // block as 'all zeros'.
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
+ ENTROPY_CONTEXT *ta = ctxa;
+ ENTROPY_CONTEXT *tl = ctxl;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx);
+ const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ rd_stats->rate = zero_blk_rate *
+ (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
+ (block_size_high[bsize] >> tx_size_high_log2[tx_size]);
+}
+
+static AOM_INLINE void save_mb_rd_info(int n4, uint32_t hash,
+ const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ MB_RD_RECORD *mb_rd_record) {
+ int index;
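+  // The records form a ring buffer: append while there is room, otherwise
+  // overwrite the oldest entry and advance index_start.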
+ if (mb_rd_record->num < RD_RECORD_BUFFER_LEN) {
+ index =
+ (mb_rd_record->index_start + mb_rd_record->num) % RD_RECORD_BUFFER_LEN;
+ ++mb_rd_record->num;
+ } else {
+ index = mb_rd_record->index_start;
+ mb_rd_record->index_start =
+ (mb_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+ }
+ MB_RD_INFO *const mb_rd_info = &mb_rd_record->mb_rd_info[index];
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ mb_rd_info->hash_value = hash;
+ mb_rd_info->tx_size = mbmi->tx_size;
+ memcpy(mb_rd_info->blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(mb_rd_info->blk_skip[0]) * n4);
+ av1_copy(mb_rd_info->inter_tx_size, mbmi->inter_tx_size);
+ av1_copy_array(mb_rd_info->tx_type_map, xd->tx_type_map, n4);
+ mb_rd_info->rd_stats = *rd_stats;
+}
+
+static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
+ const SPEED_FEATURES *sf,
+ int tx_size_search_method) {
+ if (tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
+
+ if (sf->tx_sf.tx_size_search_lgr_block) {
+ if (mi_width > mi_size_wide[BLOCK_64X64] ||
+ mi_height > mi_size_high[BLOCK_64X64])
+ return MAX_VARTX_DEPTH;
+ }
+
+ if (is_inter) {
+ return (mi_height != mi_width)
+ ? sf->tx_sf.inter_tx_size_search_init_depth_rect
+ : sf->tx_sf.inter_tx_size_search_init_depth_sqr;
+ } else {
+ return (mi_height != mi_width)
+ ? sf->tx_sf.intra_tx_size_search_init_depth_rect
+ : sf->tx_sf.intra_tx_size_search_init_depth_sqr;
+ }
+}
+
+static AOM_INLINE void select_tx_block(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
+ int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode);
+
+// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+// 0: Do not collect any RD stats
+// 1: Collect RD stats for transform units
+// 2: Collect RD stats for partition units
+#if CONFIG_COLLECT_RD_STATS
+
+static AOM_INLINE void get_energy_distribution_fine(
+ const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride, int need_4th, double *hordist,
+ double *verdist) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+ if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) {
+ // Special cases: calculate 'esq' values manually, as we don't have 'vf'
+ // functions for the 16 (very small) sub-blocks of this block.
+ const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3;
+ const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3;
+ assert(bw <= 32);
+ assert(bh <= 32);
+ assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
+ if (cpi->common.seq_params->use_highbitdepth) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] +=
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
+ }
+ } else {
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
+ (src[j + i * src_stride] - dst[j + i * dst_stride]);
+ }
+ }
+ } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks.
+ const int f_index =
+ (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16;
+ assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL);
+ const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;
+ assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
+ assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[1]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[2]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[3]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[5]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[6]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[7]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[9]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[10]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[11]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[13]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[14]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[15]);
+ }
+
+ double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
+ esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] +
+ esq[12] + esq[13] + esq[14] + esq[15];
+ if (total > 0) {
+ const double e_recip = 1.0 / total;
+ hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
+ hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
+ hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
+ if (need_4th) {
+ hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip;
+ }
+ verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
+ verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
+ verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
+ if (need_4th) {
+ verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip;
+ }
+ } else {
+ hordist[0] = verdist[0] = 0.25;
+ hordist[1] = verdist[1] = 0.25;
+ hordist[2] = verdist[2] = 0.25;
+ if (need_4th) {
+ hordist[3] = verdist[3] = 0.25;
+ }
+ }
+}
+
+static double get_sse_norm(const int16_t *diff, int stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ const int err = diff[j * stride + i];
+ sum += err * err;
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static double get_sad_norm(const int16_t *diff, int stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ sum += abs(diff[j * stride + i]);
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static AOM_INLINE void get_2x2_normalized_sses_and_sads(
+ const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src,
+ int src_stride, const uint8_t *const dst, int dst_stride,
+ const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr,
+ double *const sad_norm_arr) {
+ const BLOCK_SIZE tx_bsize_half =
+ get_partition_subsize(tx_bsize, PARTITION_SPLIT);
+ if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats
+ const int half_width = block_size_wide[tx_bsize] / 2;
+ const int half_height = block_size_high[tx_bsize] / 2;
+ for (int row = 0; row < 2; ++row) {
+ for (int col = 0; col < 2; ++col) {
+ const int16_t *const this_src_diff =
+ src_diff + row * half_height * diff_stride + col * half_width;
+ if (sse_norm_arr) {
+ sse_norm_arr[row * 2 + col] =
+ get_sse_norm(this_src_diff, diff_stride, half_width, half_height);
+ }
+ if (sad_norm_arr) {
+ sad_norm_arr[row * 2 + col] =
+ get_sad_norm(this_src_diff, diff_stride, half_width, half_height);
+ }
+ }
+ }
+ } else { // use function pointers to calculate stats
+ const int half_width = block_size_wide[tx_bsize_half];
+ const int half_height = block_size_high[tx_bsize_half];
+ const int num_samples_half = half_width * half_height;
+ for (int row = 0; row < 2; ++row) {
+ for (int col = 0; col < 2; ++col) {
+ const uint8_t *const this_src =
+ src + row * half_height * src_stride + col * half_width;
+ const uint8_t *const this_dst =
+ dst + row * half_height * dst_stride + col * half_width;
+
+ if (sse_norm_arr) {
+ unsigned int this_sse;
+ cpi->ppi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
+ dst_stride, &this_sse);
+ sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
+ }
+
+ if (sad_norm_arr) {
+ const unsigned int this_sad = cpi->ppi->fn_ptr[tx_bsize_half].sdf(
+ this_src, src_stride, this_dst, dst_stride);
+ sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
+ }
+ }
+ }
+ }
+}
+
+#if CONFIG_COLLECT_RD_STATS == 1
+static double get_mean(const int16_t *diff, int stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ sum += diff[j * stride + i];
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+static AOM_INLINE void PrintTransformUnitStats(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ TX_TYPE tx_type, int64_t rd) {
+ if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 21743;
+ if (lcg_rand16(&seed) % 256 > 0) return;
+
+ const char output_file[] = "tu_stats.txt";
+ FILE *fout = fopen(output_file, "a");
+ if (!fout) return;
+
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int txw = tx_size_wide[tx_size];
+ const int txh = tx_size_high[tx_size];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int q_step = p->dequant_QTX[1] >> dequant_shift;
+ const int num_samples = txw * txh;
+
+ const double rate_norm = (double)rd_stats->rate / num_samples;
+ const double dist_norm = (double)rd_stats->dist / num_samples;
+
+ fprintf(fout, "%g %g", rate_norm, dist_norm);
+
+ const int src_stride = p->src.stride;
+ const uint8_t *const src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ unsigned int sse;
+ cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ const double sse_norm = (double)sse / num_samples;
+
+ const unsigned int sad =
+ cpi->ppi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
+ const double sad_norm = (double)sad / num_samples;
+
+ fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *const src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
+
+ double sse_norm_arr[4], sad_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, sad_norm_arr);
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sse_norm_arr[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sad_norm_arr[i]);
+ }
+
+ const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+ const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+
+ fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size],
+ tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col);
+
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rate_norm = (double)model_rate / num_samples;
+ const double model_dist_norm = (double)model_dist / num_samples;
+ fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
+
+ const double mean = get_mean(src_diff, diff_stride, txw, txh);
+ float hor_corr, vert_corr;
+ av1_get_horver_correlation_full(src_diff, diff_stride, txw, txh, &hor_corr,
+ &vert_corr);
+ fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+ double hdist[4] = { 0 }, vdist[4] = { 0 };
+ get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride,
+ 1, hdist, vdist);
+ fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+ hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+ fprintf(fout, " %d %" PRId64, x->rdmult, rd);
+
+ fprintf(fout, "\n");
+ fclose(fout);
+}
+#endif // CONFIG_COLLECT_RD_STATS == 1
+
+#if CONFIG_COLLECT_RD_STATS >= 2
+static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ int64_t total_sse = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+ unsigned int sse;
+
+ if (plane) continue;
+
+ cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, &sse);
+ total_sse += sse;
+ }
+ total_sse <<= 4;
+ return total_sse;
+}
+
+static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ int64_t sse, int *est_residue_cost,
+ int64_t *est_dist) {
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (md->ready) {
+ if (sse < md->dist_mean) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ } else {
+ *est_dist = (int64_t)round(md->dist_mean);
+ const double est_ld = md->a * sse + md->b;
+ // Clamp estimated rate cost by INT_MAX / 2.
+ // TODO(angiebird@google.com): find better solution than clamping.
+ if (fabs(est_ld) < 1e-2) {
+ *est_residue_cost = INT_MAX / 2;
+ } else {
+ double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld);
+ if (est_residue_cost_dbl < 0) {
+ *est_residue_cost = 0;
+ } else {
+ *est_residue_cost =
+ (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2);
+ }
+ }
+ if (*est_residue_cost <= 0) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ }
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static double get_highbd_diff_mean(const uint8_t *src8, int src_stride,
+ const uint8_t *dst8, int dst_stride, int w,
+ int h) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
+ sum += diff;
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static double get_diff_mean(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
+ sum += diff;
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data,
+ MACROBLOCK *x,
+ const RD_STATS *const rd_stats,
+ BLOCK_SIZE plane_bsize) {
+ if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
+ (tile_data == NULL ||
+ !tile_data->inter_mode_rd_models[plane_bsize].ready))
+ return;
+ (void)tile_data;
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 95014;
+
+ if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) !=
+ 1)
+ return;
+
+ const char output_file[] = "pu_stats.txt";
+ FILE *fout = fopen(output_file, "a");
+ if (!fout) return;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int diff_stride = block_size_wide[plane_bsize];
+ int bw, bh;
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+ &bh);
+ const int num_samples = bw * bh;
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int q_step = p->dequant_QTX[1] >> dequant_shift;
+ const int shift = (xd->bd - 8);
+
+ const double rate_norm = (double)rd_stats->rate / num_samples;
+ const double dist_norm = (double)rd_stats->dist / num_samples;
+ const double rdcost_norm =
+ (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples;
+
+ fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);
+
+ const int src_stride = p->src.stride;
+ const uint8_t *const src = p->src.buf;
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst = pd->dst.buf;
+ const int16_t *const src_diff = p->src_diff;
+
+ int64_t sse = calculate_sse(xd, p, pd, bw, bh);
+ const double sse_norm = (double)sse / num_samples;
+
+ const unsigned int sad =
+ cpi->ppi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
+ const double sad_norm =
+ (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);
+
+ fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+ double sse_norm_arr[4], sad_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, sad_norm_arr);
+ if (shift) {
+ for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+ for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sse_norm_arr[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sad_norm_arr[i]);
+ }
+
+ fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh);
+
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rdcost_norm =
+ (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples;
+ const double model_rate_norm = (double)model_rate / num_samples;
+ const double model_dist_norm = (double)model_dist / num_samples;
+ fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
+ model_rdcost_norm);
+
+ double mean;
+ if (is_cur_buf_hbd(xd)) {
+ mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ bw, bh);
+ }
+ mean /= (1 << shift);
+ float hor_corr, vert_corr;
+ av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr,
+ &vert_corr);
+ fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+ double hdist[4] = { 0 }, vdist[4] = { 0 };
+ get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, 1, hdist, vdist);
+ fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+ hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ assert(tile_data->inter_mode_rd_models[plane_bsize].ready);
+ const int64_t overall_sse = get_sse(cpi, x);
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost,
+ &est_dist);
+ const double est_residue_cost_norm = (double)est_residue_cost / num_samples;
+ const double est_dist_norm = (double)est_dist / num_samples;
+ const double est_rdcost_norm =
+ (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples;
+ fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm,
+ est_rdcost_norm);
+ }
+
+ fprintf(fout, "\n");
+ fclose(fout);
+}
+#endif // CONFIG_COLLECT_RD_STATS >= 2
+#endif // CONFIG_COLLECT_RD_STATS
+
+static AOM_INLINE void inverse_transform_block_facade(MACROBLOCK *const x,
+ int plane, int block,
+ int blk_row, int blk_col,
+ int eob,
+ int reduced_tx_set) {
+ if (!eob) return;
+ struct macroblock_plane *const p = &x->plane[plane];
+ MACROBLOCKD *const xd = &x->e_mbd;
+ tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
+ tx_size, reduced_tx_set);
+
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ dst_stride, eob, reduced_tx_set);
+}
+
+static INLINE void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const TXB_CTX *const txb_ctx, int skip_trellis,
+ TX_TYPE best_tx_type, int do_quant,
+ int *rate_cost, uint16_t best_eob) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ if (!is_inter && best_eob &&
+ (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] ||
+ blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) {
+    // If the quantized coefficients are already stored in the dqcoeff buffer,
+    // we do not need to do the transform and quantization again.
+ if (do_quant) {
+ TxfmParam txfm_param_intra;
+ QUANT_PARAM quant_param_intra;
+ av1_setup_xform(cm, x, tx_size, best_tx_type, &txfm_param_intra);
+ av1_setup_quant(tx_size, !skip_trellis,
+ skip_trellis
+ ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
+ : AV1_XFORM_QUANT_FP)
+ : AV1_XFORM_QUANT_FP,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param_intra);
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type,
+ &quant_param_intra);
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+ &txfm_param_intra, &quant_param_intra);
+ if (quant_param_intra.use_optimize_b) {
+ av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx,
+ rate_cost);
+ }
+ }
+
+ inverse_transform_block_facade(x, plane, block, blk_row, blk_col,
+ x->plane[plane].eobs[block],
+ cm->features.reduced_tx_set_used);
+
+    // This may happen because of a hash collision. The eob stored in the hash
+    // table is non-zero, but the real eob is zero. We need to make sure
+    // tx_type is DCT_DCT in this case.
+ if (plane == 0 && x->plane[plane].eobs[block] == 0 &&
+ best_tx_type != DCT_DCT) {
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ }
+ }
+}
+
+static unsigned pixel_dist_visible_only(
+ const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
+ const int src_stride, const uint8_t *dst, const int dst_stride,
+ const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
+ int visible_cols) {
+ unsigned sse;
+
+ if (txb_rows == visible_rows && txb_cols == visible_cols) {
+ cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ return sse;
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const MACROBLOCKD *xd = &x->e_mbd;
+ if (is_cur_buf_hbd(xd)) {
+ uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
+ visible_cols, visible_rows);
+ return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
+ }
+#else
+ (void)x;
+#endif
+ sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
+ visible_rows);
+ return sse;
+}
+
+// Compute the pixel domain distortion from src and dst on all visible 4x4s in
+// the transform block.
+static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
+ int plane, const uint8_t *src, const int src_stride,
+ const uint8_t *dst, const int dst_stride,
+ int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize) {
+ int txb_rows, txb_cols, visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
+ &txb_cols, &txb_rows, &visible_cols, &visible_rows);
+ assert(visible_rows > 0);
+ assert(visible_cols > 0);
+
+ unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst,
+ dst_stride, tx_bsize, txb_rows,
+ txb_cols, visible_rows, visible_cols);
+
+ return sse;
+}
+
+static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
+ int plane, BLOCK_SIZE plane_bsize,
+ int block, int blk_row, int blk_col,
+ TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const uint16_t eob = p->eobs[block];
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const int bsw = block_size_wide[tx_bsize];
+ const int bsh = block_size_high[tx_bsize];
+ const int src_stride = x->plane[plane].src.stride;
+ const int dst_stride = xd->plane[plane].dst.stride;
+ // Scale the transform block index to pixel unit.
+ const int src_idx = (blk_row * src_stride + blk_col) << MI_SIZE_LOG2;
+ const int dst_idx = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2;
+ const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+ const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+ const tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+
+ assert(cpi != NULL);
+ assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+
+ uint8_t *recon;
+ DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ recon = CONVERT_TO_BYTEPTR(recon16);
+ aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride,
+ CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, bsh);
+ } else {
+ recon = (uint8_t *)recon16;
+ aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh);
+ }
+#else
+ recon = (uint8_t *)recon16;
+ aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh);
+#endif
+
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cpi->common.features.reduced_tx_set_used);
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
+ MAX_TX_SIZE, eob,
+ cpi->common.features.reduced_tx_set_used);
+
+ return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+}
+
+// Pruning thresholds for prune_txk_type() and prune_txk_type_separ().
+static const int prune_factors[5] = { 200, 200, 120, 80, 40 }; // scale 1000
+static const int mul_factors[5] = { 80, 80, 70, 50, 30 }; // scale 100
+
+// Insertion-sorts R-D costs in ascending order, keeping the corresponding
+// entries of txk[] in sync.
+static INLINE void sort_rd(int64_t rds[], int txk[], int len) {
+ int i, j, k;
+
+ for (i = 1; i <= len - 1; ++i) {
+ for (j = 0; j < i; ++j) {
+ if (rds[j] > rds[i]) {
+ int64_t temprd;
+ int tempi;
+
+ temprd = rds[i];
+ tempi = txk[i];
+
+ for (k = i; k > j; k--) {
+ rds[k] = rds[k - 1];
+ txk[k] = txk[k - 1];
+ }
+
+ rds[j] = temprd;
+ txk[j] = tempi;
+ break;
+ }
+ }
+ }
+}
+
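+// Quantization-matrix weighted block error: accumulates the weighted squared
+// error ((coeff - dqcoeff) * w)^2 and the weighted squared source energy
+// (coeff * w)^2, where w is the qmatrix weight at each coefficient's scan
+// position; both sums are scaled down by 2 * AOM_QM_BITS. The weighted
+// coefficient energy is returned via *ssz.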
+static INLINE int64_t av1_block_error_qm(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ const qm_val_t *qmatrix,
+ const int16_t *scan, int64_t *ssz) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+
+ for (i = 0; i < block_size; i++) {
+ int64_t weight = qmatrix[scan[i]];
+ int64_t dd = coeff[i] - dqcoeff[i];
+ dd *= weight;
+ int64_t cc = coeff[i];
+ cc *= weight;
+ // The ranges of coeff and dqcoeff are
+ // bd8 : 18 bits (including sign)
+ // bd10: 20 bits (including sign)
+ // bd12: 22 bits (including sign)
+ // As AOM_QM_BITS is 5, the intermediate quantities in the calculation
+ // below should fit in 54 bits, thus no overflow should happen.
+ error += (dd * dd + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+ sqcoeff += (cc * cc + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+ }
+
+ *ssz = sqcoeff;
+ return error;
+}
+
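+// Computes transform-domain distortion and SSE for a transform block,
+// optionally using the quantization-matrix weighted error metric.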
+static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
+ TX_SIZE tx_size,
+ const qm_val_t *qmatrix,
+ const int16_t *scan, int64_t *out_dist,
+ int64_t *out_sse) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ // Transform domain distortion computation is more efficient as it does
+ // not involve an inverse transform, but it is less accurate.
+ const int buffer_length = av1_get_max_eob(tx_size);
+ int64_t this_sse;
+ // TX-domain results need to be shifted down to Q2/D10 to match pixel-domain
+ // distortion values, which are in Q2^2.
+ int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff = p->coeff + block_offset;
+ tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+#if CONFIG_AV1_HIGHBITDEPTH
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if (is_cur_buf_hbd(xd)) {
+ // TODO(veluca): handle use_qm_dist_metric for HBD too.
+ *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
+ xd->bd);
+ } else {
+#endif
+ if (qmatrix == NULL || !x->txfm_search_params.use_qm_dist_metric) {
+ *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+ } else {
+ *out_dist = av1_block_error_qm(coeff, dqcoeff, buffer_length, qmatrix,
+ scan, &this_sse);
+ }
+#if CONFIG_AV1_HIGHBITDEPTH
+ }
+#endif
+
+ *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
+ *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
+}
+
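+// Separable tx type pruning: the 4 horizontal transforms are evaluated first
+// with the vertical transform fixed to DCT, then the 4 vertical transforms
+// are evaluated with the best horizontal transform found. The per-direction
+// RD costs are combined to rank all 16 2D candidates, and a bit mask of
+// pruned tx types is returned (bit set = pruned).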
+uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize, int *txk_map,
+ int16_t allowed_tx_mask, int prune_factor,
+ const TXB_CTX *const txb_ctx,
+ int reduced_tx_set_used, int64_t ref_best_rd,
+ int num_sel) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ int idx;
+
+ int64_t rds_v[4];
+ int64_t rds_h[4];
+ int idx_v[4] = { 0, 1, 2, 3 };
+ int idx_h[4] = { 0, 1, 2, 3 };
+ int skip_v[4] = { 0 };
+ int skip_h[4] = { 0 };
+ const int idx_map[16] = {
+ DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT,
+ ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST,
+ FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+ H_DCT, H_ADST, H_FLIPADST, IDTX
+ };
+
+ const int sel_pattern_v[16] = {
+ 0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 3, 1, 3, 2, 3, 3
+ };
+ const int sel_pattern_h[16] = {
+ 0, 1, 0, 1, 2, 0, 2, 1, 2, 3, 0, 3, 1, 3, 2, 3
+ };
+
+ QUANT_PARAM quant_param;
+ TxfmParam txfm_param;
+ av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
+ &quant_param);
+ int tx_type;
+ // Use EXT_TX_SET_ALL16 so that tx types outside the ext_tx_set of the
+ // current block can also be tried. This function should only be called
+ // when the square-up transform size is at most 16x16 (see the assert
+ // below).
+ assert(txsize_sqr_up_map[tx_size] <= TX_16X16);
+ txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+ int rate_cost = 0;
+ int64_t dist = 0, sse = 0;
+ // Evaluate the horizontal transform candidates with the vertical transform
+ // fixed to DCT.
+ for (idx = 0; idx < 4; ++idx) {
+ tx_type = idx_map[idx];
+ txfm_param.tx_type = tx_type;
+
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &dist, &sse);
+
+ rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+ txb_ctx, reduced_tx_set_used, 0);
+
+ rds_h[idx] = RDCOST(x->rdmult, rate_cost, dist);
+
+ if ((rds_h[idx] - (rds_h[idx] >> 2)) > ref_best_rd) {
+ skip_h[idx] = 1;
+ }
+ }
+ sort_rd(rds_h, idx_h, 4);
+ for (idx = 1; idx < 4; idx++) {
+ if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1;
+ }
+
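+ // If even the best horizontal candidate exceeds the RD budget, prune all
+ // tx types; the caller falls back to DCT_DCT when the allowed mask becomes
+ // empty.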
+ if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF;
+
+ // Evaluate the vertical transform candidates with the best horizontal
+ // transform found above.
+ rds_v[0] = rds_h[0];
+ int start_v = 1, end_v = 4;
+ const int *idx_map_v = idx_map + idx_h[0];
+
+ for (idx = start_v; idx < end_v; ++idx) {
+ tx_type = idx_map_v[idx_v[idx] * 4];
+ txfm_param.tx_type = tx_type;
+
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &dist, &sse);
+
+ rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+ txb_ctx, reduced_tx_set_used, 0);
+
+ rds_v[idx] = RDCOST(x->rdmult, rate_cost, dist);
+
+ if ((rds_v[idx] - (rds_v[idx] >> 2)) > ref_best_rd) {
+ skip_v[idx] = 1;
+ }
+ }
+ sort_rd(rds_v, idx_v, 4);
+ for (idx = 1; idx < 4; idx++) {
+ if (rds_v[idx] > rds_v[0] * 1.2) skip_v[idx_v[idx]] = 1;
+ }
+
+ // Combine rds_h and rds_v to rank and prune the 2D tx candidates.
+ int i_v, i_h;
+ int64_t rds[16];
+ int num_cand = 0, last = TX_TYPES - 1;
+
+ for (int i = 0; i < 16; i++) {
+ i_v = sel_pattern_v[i];
+ i_h = sel_pattern_h[i];
+ tx_type = idx_map[idx_v[i_v] * 4 + idx_h[i_h]];
+ if (!(allowed_tx_mask & (1 << tx_type)) || skip_h[idx_h[i_h]] ||
+ skip_v[idx_v[i_v]]) {
+ txk_map[last] = tx_type;
+ last--;
+ } else {
+ txk_map[num_cand] = tx_type;
+ rds[num_cand] = rds_v[i_v] + rds_h[i_h];
+ if (rds[num_cand] == 0) rds[num_cand] = 1;
+ num_cand++;
+ }
+ }
+ sort_rd(rds, txk_map, num_cand);
+
+ uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
+ num_sel = AOMMIN(num_sel, num_cand);
+
+ for (int i = 1; i < num_sel; i++) {
+ int64_t factor = 1800 * (rds[i] - rds[0]) / (rds[0]);
+ if (factor < (int64_t)prune_factor)
+ prune &= ~(1 << txk_map[i]);
+ else
+ break;
+ }
+ return prune;
+}
+
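+// Exhaustive counterpart of prune_txk_type_separ() for small candidate sets:
+// every allowed tx type is evaluated with an estimated (Laplacian) rate cost
+// and transform-domain distortion, candidates are sorted by RD cost, and all
+// types whose relative RD gap to the best exceeds prune_factor (per mille)
+// are pruned.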
+uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, int *txk_map,
+ uint16_t allowed_tx_mask, int prune_factor,
+ const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int tx_type;
+
+ int64_t rds[TX_TYPES];
+
+ int num_cand = 0;
+ int last = TX_TYPES - 1;
+
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
+ &quant_param);
+
+ for (int idx = 0; idx < TX_TYPES; idx++) {
+ tx_type = idx;
+ int rate_cost = 0;
+ int64_t dist = 0, sse = 0;
+ if (!(allowed_tx_mask & (1 << tx_type))) {
+ txk_map[last] = tx_type;
+ last--;
+ continue;
+ }
+ txfm_param.tx_type = tx_type;
+
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+
+ // do txfm and quantization
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+ // estimate rate cost
+ rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+ txb_ctx, reduced_tx_set_used, 0);
+ // tx domain dist
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &dist, &sse);
+
+ txk_map[num_cand] = tx_type;
+ rds[num_cand] = RDCOST(x->rdmult, rate_cost, dist);
+ if (rds[num_cand] == 0) rds[num_cand] = 1;
+ num_cand++;
+ }
+
+ if (num_cand == 0) return (uint16_t)0xFFFF;
+
+ sort_rd(rds, txk_map, num_cand);
+ uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
+
+ // 0 < prune_factor <= 1000 controls aggressiveness
+ int64_t factor = 0;
+ for (int idx = 1; idx < num_cand; idx++) {
+ factor = 1000 * (rds[idx] - rds[0]) / rds[0];
+ if (factor < (int64_t)prune_factor)
+ prune &= ~(1 << txk_map[idx]);
+ else
+ break;
+ }
+ return prune;
+}
+
+// These thresholds were calibrated to provide a certain number of TX types
+// pruned by the model on average, i.e. selecting a threshold with index i
+// will lead to pruning i+1 TX types on average
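+// (For example, with TX_TYPE_PRUNE_1 on the full 16-type set the
+// aggressiveness resolves to 4, so for TX_8X8 any tx type whose softmax
+// score falls below 0.01697f is pruned, i.e. roughly 5 types on average;
+// the best-scoring type is always kept, see prune_tx_2D().)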
+static const float *prune_2D_adaptive_thresholds[] = {
+ // TX_4X4
+ (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+ 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+ 0.09778f, 0.11780f },
+ // TX_8X8
+ (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+ 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+ 0.10803f, 0.14124f },
+ // TX_16X16
+ (float[]){ 0.01404f, 0.02000f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
+ 0.06897f, 0.07629f, 0.08875f, 0.11169f },
+ // TX_32X32
+ NULL,
+ // TX_64X64
+ NULL,
+ // TX_4X8
+ (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+ 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+ 0.10168f, 0.12585f },
+ // TX_8X4
+ (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+ 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+ 0.10583f, 0.13123f },
+ // TX_8X16
+ (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+ 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+ 0.10730f, 0.14221f },
+ // TX_16X8
+ (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+ 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+ 0.10339f, 0.13464f },
+ // TX_16X32
+ NULL,
+ // TX_32X16
+ NULL,
+ // TX_32X64
+ NULL,
+ // TX_64X32
+ NULL,
+ // TX_4X16
+ (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+ 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+ 0.10242f, 0.12878f },
+ // TX_16X4
+ (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+ 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+ 0.10217f, 0.12610f },
+ // TX_8X32
+ NULL,
+ // TX_32X8
+ NULL,
+ // TX_16X64
+ NULL,
+ // TX_64X16
+ NULL,
+};
+
+static INLINE float get_adaptive_thresholds(
+ TX_SIZE tx_size, TxSetType tx_set_type,
+ TX_TYPE_PRUNE_MODE prune_2d_txfm_mode) {
+ const int prune_aggr_table[5][2] = {
+ { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 }, { 12, 9 }
+ };
+ int pruning_aggressiveness = 0;
+ if (tx_set_type == EXT_TX_SET_ALL16)
+ pruning_aggressiveness =
+ prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][0];
+ else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
+ pruning_aggressiveness =
+ prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][1];
+
+ return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness];
+}
+
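+// Computes normalized horizontal and vertical energy distributions of the
+// residual block; these feed the NN models in prune_tx_2D() as 1D features.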
+static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff,
+ int stride, int bw, int bh,
+ float *hordist,
+ float *verdist) {
+ // First compute downscaled block energy values (esq); downscale factors
+ // are defined by w_shift and h_shift.
+ unsigned int esq[256];
+ const int w_shift = bw <= 8 ? 0 : 1;
+ const int h_shift = bh <= 8 ? 0 : 1;
+ const int esq_w = bw >> w_shift;
+ const int esq_h = bh >> h_shift;
+ const int esq_sz = esq_w * esq_h;
+ int i, j;
+ memset(esq, 0, esq_sz * sizeof(esq[0]));
+ if (w_shift) {
+ for (i = 0; i < bh; i++) {
+ unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+ const int16_t *cur_diff_row = diff + i * stride;
+ for (j = 0; j < bw; j += 2) {
+ cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] +
+ cur_diff_row[j + 1] * cur_diff_row[j + 1]);
+ }
+ }
+ } else {
+ for (i = 0; i < bh; i++) {
+ unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+ const int16_t *cur_diff_row = diff + i * stride;
+ for (j = 0; j < bw; j++) {
+ cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j];
+ }
+ }
+ }
+
+ uint64_t total = 0;
+ for (i = 0; i < esq_sz; i++) total += esq[i];
+
+ // Output hordist and verdist arrays are normalized 1D projections of esq
+ if (total == 0) {
+ float hor_val = 1.0f / esq_w;
+ for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val;
+ float ver_val = 1.0f / esq_h;
+ for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val;
+ return;
+ }
+
+ const float e_recip = 1.0f / (float)total;
+ memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0]));
+ memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0]));
+ const unsigned int *cur_esq_row;
+ for (i = 0; i < esq_h - 1; i++) {
+ cur_esq_row = esq + i * esq_w;
+ for (j = 0; j < esq_w - 1; j++) {
+ hordist[j] += (float)cur_esq_row[j];
+ verdist[i] += (float)cur_esq_row[j];
+ }
+ verdist[i] += (float)cur_esq_row[j];
+ }
+ cur_esq_row = esq + i * esq_w;
+ for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j];
+
+ for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip;
+ for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip;
+}
+
+static AOM_INLINE bool check_bit_mask(uint16_t mask, int val) {
+ return mask & (1 << val);
+}
+
+static AOM_INLINE void set_bit_mask(uint16_t *mask, int val) {
+ *mask |= (1 << val);
+}
+
+static AOM_INLINE void unset_bit_mask(uint16_t *mask, int val) {
+ *mask &= ~(1 << val);
+}
+
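+// NN-based 2D tx type pruning. Energy-distribution and correlation features
+// of the residual are fed to separate horizontal and vertical NN models; the
+// 16 2D scores are formed as outer products of the 4 per-direction scores,
+// softmaxed, and tx types scoring below an adaptive threshold are removed
+// from allowed_tx_mask (the best-scoring type is always kept). txk_map is
+// reordered so that higher-scoring types come first.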
+static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int blk_row, int blk_col, TxSetType tx_set_type,
+ TX_TYPE_PRUNE_MODE prune_2d_txfm_mode, int *txk_map,
+ uint16_t *allowed_tx_mask) {
+ // This table is used because the search order is different from the enum
+ // order.
+ static const int tx_type_table_2D[16] = {
+ DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT,
+ ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST,
+ FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+ H_DCT, H_ADST, H_FLIPADST, IDTX
+ };
+ if (tx_set_type != EXT_TX_SET_ALL16 &&
+ tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT)
+ return;
+#if CONFIG_NN_V2
+ NN_CONFIG_V2 *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+ NN_CONFIG_V2 *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+#else
+ const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+ const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+#endif
+ if (!nn_config_hor || !nn_config_ver) return; // Model not established yet.
+
+ float hfeatures[16], vfeatures[16];
+ float hscores[4], vscores[4];
+ float scores_2D_raw[16];
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ const int hfeatures_num = bw <= 8 ? bw : bw / 2;
+ const int vfeatures_num = bh <= 8 ? bh : bh / 2;
+ assert(hfeatures_num <= 16);
+ assert(vfeatures_num <= 16);
+
+ const struct macroblock_plane *const p = &x->plane[0];
+ const int diff_stride = block_size_wide[bsize];
+ const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
+ vfeatures);
+
+ av1_get_horver_correlation_full(diff, diff_stride, bw, bh,
+ &hfeatures[hfeatures_num - 1],
+ &vfeatures[vfeatures_num - 1]);
+
+#if CONFIG_NN_V2
+ av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores);
+ av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores);
+#else
+ av1_nn_predict(hfeatures, nn_config_hor, 1, hscores);
+ av1_nn_predict(vfeatures, nn_config_ver, 1, vscores);
+#endif
+
+ for (int i = 0; i < 4; i++) {
+ float *cur_scores_2D = scores_2D_raw + i * 4;
+ cur_scores_2D[0] = vscores[i] * hscores[0];
+ cur_scores_2D[1] = vscores[i] * hscores[1];
+ cur_scores_2D[2] = vscores[i] * hscores[2];
+ cur_scores_2D[3] = vscores[i] * hscores[3];
+ }
+
+ assert(TX_TYPES == 16);
+ // This version of the function only works when there are at most 16 classes.
+ // So we will need to change the optimization or use av1_nn_softmax instead if
+ // this ever gets changed.
+ av1_nn_fast_softmax_16(scores_2D_raw, scores_2D_raw);
+
+ const float score_thresh =
+ get_adaptive_thresholds(tx_size, tx_set_type, prune_2d_txfm_mode);
+
+ // Always keep the TX type with the highest score, prune all others with
+ // score below score_thresh.
+ int max_score_i = 0;
+ float max_score = 0.0f;
+ uint16_t allow_bitmask = 0;
+ float sum_score = 0.0;
+ // Calculate the sum of the allowed tx type scores, and populate the allow
+ // bit mask based on score_thresh and allowed_tx_mask.
+ int allow_count = 0;
+ int tx_type_allowed[16] = { TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID };
+ float scores_2D[16] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ };
+ for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) {
+ const int allow_tx_type =
+ check_bit_mask(*allowed_tx_mask, tx_type_table_2D[tx_idx]);
+ if (!allow_tx_type) {
+ continue;
+ }
+ if (scores_2D_raw[tx_idx] > max_score) {
+ max_score = scores_2D_raw[tx_idx];
+ max_score_i = tx_idx;
+ }
+ if (scores_2D_raw[tx_idx] >= score_thresh) {
+ // Set allow mask based on score_thresh
+ set_bit_mask(&allow_bitmask, tx_type_table_2D[tx_idx]);
+
+ // Accumulate score of allowed tx type
+ sum_score += scores_2D_raw[tx_idx];
+
+ scores_2D[allow_count] = scores_2D_raw[tx_idx];
+ tx_type_allowed[allow_count] = tx_type_table_2D[tx_idx];
+ allow_count += 1;
+ }
+ }
+ if (!check_bit_mask(allow_bitmask, tx_type_table_2D[max_score_i])) {
+ // If even the tx_type with max score is pruned, this means that no other
+ // tx_type is feasible. When this happens, we force enable max_score_i and
+ // end the search.
+ set_bit_mask(&allow_bitmask, tx_type_table_2D[max_score_i]);
+ memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D));
+ *allowed_tx_mask = allow_bitmask;
+ return;
+ }
+
+ // Sort tx type probability of all types
+ if (allow_count <= 8) {
+ av1_sort_fi32_8(scores_2D, tx_type_allowed);
+ } else {
+ av1_sort_fi32_16(scores_2D, tx_type_allowed);
+ }
+
+ // Enable more pruning based on tx type probability and number of allowed tx
+ // types
+ if (prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) {
+ float temp_score = 0.0;
+ float score_ratio = 0.0;
+ int tx_idx, tx_count = 0;
+ const float inv_sum_score = 100 / sum_score;
+ // Get allowed tx types based on sorted probability score and tx count
+ for (tx_idx = 0; tx_idx < allow_count; tx_idx++) {
+ // Stop once the cumulative probability exceeds 30% and at least 2 tx
+ // types are already allowed.
+ if (score_ratio > 30.0 && tx_count >= 2) break;
+
+ assert(check_bit_mask(allow_bitmask, tx_type_allowed[tx_idx]));
+ // Calculate cumulative probability
+ temp_score += scores_2D[tx_idx];
+
+ // Calculate percentage of cumulative probability of allowed tx type
+ score_ratio = temp_score * inv_sum_score;
+ tx_count++;
+ }
+ // Set remaining tx types as pruned
+ for (; tx_idx < allow_count; tx_idx++)
+ unset_bit_mask(&allow_bitmask, tx_type_allowed[tx_idx]);
+ }
+
+ memcpy(txk_map, tx_type_allowed, sizeof(tx_type_table_2D));
+ *allowed_tx_mask = allow_bitmask;
+}
+
+static float get_dev(float mean, double x2_sum, int num) {
+ const float e_x2 = (float)(x2_sum / num);
+ const float diff = e_x2 - mean * mean;
+ const float dev = (diff > 0) ? sqrtf(diff) : 0;
+ return dev;
+}
+
+// Writes the features required by the ML model to predict tx split, based on
+// the mean and standard deviation values of the block and its sub-blocks.
+// Returns the number of elements written to the output array, which is at
+// most 12 currently. Hence the 'features' buffer should be able to
+// accommodate at least 12 elements.
+static AOM_INLINE int get_mean_dev_features(const int16_t *data, int stride,
+ int bw, int bh, float *features) {
+ const int16_t *const data_ptr = &data[0];
+ const int subh = (bh >= bw) ? (bh >> 1) : bh;
+ const int subw = (bw >= bh) ? (bw >> 1) : bw;
+ const int num = bw * bh;
+ const int sub_num = subw * subh;
+ int feature_idx = 2;
+ int total_x_sum = 0;
+ int64_t total_x2_sum = 0;
+ int num_sub_blks = 0;
+ double mean2_sum = 0.0;
+ float dev_sum = 0.0f;
+
+ for (int row = 0; row < bh; row += subh) {
+ for (int col = 0; col < bw; col += subw) {
+ int x_sum;
+ int64_t x2_sum;
+ // TODO(any): Write a SIMD version. Clear registers.
+ aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh,
+ &x_sum, &x2_sum);
+ total_x_sum += x_sum;
+ total_x2_sum += x2_sum;
+
+ const float mean = (float)x_sum / sub_num;
+ const float dev = get_dev(mean, (double)x2_sum, sub_num);
+ features[feature_idx++] = mean;
+ features[feature_idx++] = dev;
+ mean2_sum += (double)(mean * mean);
+ dev_sum += dev;
+ num_sub_blks++;
+ }
+ }
+
+ const float lvl0_mean = (float)total_x_sum / num;
+ features[0] = lvl0_mean;
+ features[1] = get_dev(lvl0_mean, (double)total_x2_sum, num);
+
+ // Deviation of means.
+ features[feature_idx++] = get_dev(lvl0_mean, mean2_sum, num_sub_blks);
+ // Mean of deviations.
+ features[feature_idx++] = dev_sum / num_sub_blks;
+
+ return feature_idx;
+}
+
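+// Runs the tx-split NN model on the mean/deviation features of the residual
+// block. Returns the model score scaled by 10000 and clamped to +/-80000, or
+// -1 when no model exists for this tx size. As used in select_tx_block(),
+// scores below -threshold disable the split search.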
+static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
+ int blk_col, TX_SIZE tx_size) {
+ const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
+ if (!nn_config) return -1;
+
+ const int diff_stride = block_size_wide[bsize];
+ const int16_t *diff =
+ x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+
+ float features[64] = { 0.0f };
+ get_mean_dev_features(diff, diff_stride, bw, bh, features);
+
+ float score = 0.0f;
+ av1_nn_predict(features, nn_config, 1, &score);
+
+ int int_score = (int)(score * 10000);
+ return clamp(int_score, -80000, 80000);
+}
+
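+// Builds the bit mask of tx types to evaluate for this block (one bit per
+// TX_TYPE; 1 = allow, 0 = skip). Combines lossless, size and encoder-config
+// restrictions with frame-level tx type statistics and the estimated-RD and
+// NN based pruners above. On return, *allowed_txk_types is either TX_TYPES
+// (more than one type allowed) or the single allowed tx type.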
+static INLINE uint16_t
+get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode,
+ int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const int is_inter = is_inter_block(mbmi);
+ const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
+ // If txk_allowed == TX_TYPES, more than one tx type is allowed; otherwise
+ // (txk_allowed < TX_TYPES) only that specific tx type is allowed.
+ TX_TYPE txk_allowed = TX_TYPES;
+
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int use_actual_frame_probs = 1;
+ const int *tx_type_probs;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ tx_type_probs =
+ (int *)cpi->ppi->temp_frame_probs.tx_type_probs[update_type][tx_size];
+ }
+#endif
+ if (use_actual_frame_probs) {
+ tx_type_probs = cpi->ppi->frame_probs.tx_type_probs[update_type][tx_size];
+ }
+
+ if ((!is_inter && txfm_params->use_default_intra_tx_type) ||
+ (is_inter && txfm_params->default_inter_tx_type_prob_thresh == 0)) {
+ txk_allowed =
+ get_default_tx_type(0, xd, tx_size, cpi->use_screen_content_tools);
+ } else if (is_inter &&
+ txfm_params->default_inter_tx_type_prob_thresh != INT_MAX) {
+ if (tx_type_probs[DEFAULT_INTER_TX_TYPE] >
+ txfm_params->default_inter_tx_type_prob_thresh) {
+ txk_allowed = DEFAULT_INTER_TX_TYPE;
+ } else {
+ int force_tx_type = 0;
+ int max_prob = 0;
+ const int tx_type_prob_threshold =
+ txfm_params->default_inter_tx_type_prob_thresh +
+ PROB_THRESH_OFFSET_TX_TYPE;
+ for (int i = 1; i < TX_TYPES; i++) { // find maximum probability.
+ if (tx_type_probs[i] > max_prob) {
+ max_prob = tx_type_probs[i];
+ force_tx_type = i;
+ }
+ }
+ if (max_prob > tx_type_prob_threshold) // force tx type with max prob.
+ txk_allowed = force_tx_type;
+ else if (x->rd_model == LOW_TXFM_RD) {
+ if (plane == 0) txk_allowed = DCT_DCT;
+ }
+ }
+ } else if (x->rd_model == LOW_TXFM_RD) {
+ if (plane == 0) txk_allowed = DCT_DCT;
+ }
+
+ const TxSetType tx_set_type = av1_get_ext_tx_set_type(
+ tx_size, is_inter, cm->features.reduced_tx_set_used);
+
+ TX_TYPE uv_tx_type = DCT_DCT;
+ if (plane) {
+ // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y
+ uv_tx_type = txk_allowed =
+ av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ }
+ PREDICTION_MODE intra_dir =
+ mbmi->filter_intra_mode_info.use_filter_intra
+ ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]
+ : mbmi->mode;
+ uint16_t ext_tx_used_flag =
+ cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset != 0 &&
+ tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT
+ ? av1_reduced_intra_tx_used_flag[intra_dir]
+ : av1_ext_tx_used_flag[tx_set_type];
+
+ if (cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset == 2)
+ ext_tx_used_flag &= av1_derived_intra_tx_used_flag[intra_dir];
+
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
+ ext_tx_used_flag == 0x0001 ||
+ (is_inter && cpi->oxcf.txfm_cfg.use_inter_dct_only) ||
+ (!is_inter && cpi->oxcf.txfm_cfg.use_intra_dct_only)) {
+ txk_allowed = DCT_DCT;
+ }
+
+ if (cpi->oxcf.txfm_cfg.enable_flip_idtx == 0)
+ ext_tx_used_flag &= DCT_ADST_TX_MASK;
+
+ uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip.
+ if (txk_allowed < TX_TYPES) {
+ allowed_tx_mask = 1 << txk_allowed;
+ allowed_tx_mask &= ext_tx_used_flag;
+ } else if (fast_tx_search) {
+ allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT
+ allowed_tx_mask &= ext_tx_used_flag;
+ } else {
+ assert(plane == 0);
+ allowed_tx_mask = ext_tx_used_flag;
+ int num_allowed = 0;
+ int i;
+
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+ static const int thresh_arr[2][7] = { { 10, 15, 15, 10, 15, 15, 15 },
+ { 10, 17, 17, 10, 17, 17, 17 } };
+ const int thresh =
+ thresh_arr[cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats - 1]
+ [update_type];
+ uint16_t prune = 0;
+ int max_prob = -1;
+ int max_idx = 0;
+ for (i = 0; i < TX_TYPES; i++) {
+ if (tx_type_probs[i] > max_prob && (allowed_tx_mask & (1 << i))) {
+ max_prob = tx_type_probs[i];
+ max_idx = i;
+ }
+ if (tx_type_probs[i] < thresh) prune |= (1 << i);
+ }
+ if ((prune >> max_idx) & 0x01) prune &= ~(1 << max_idx);
+ allowed_tx_mask &= (~prune);
+ }
+ for (i = 0; i < TX_TYPES; i++) {
+ if (allowed_tx_mask & (1 << i)) num_allowed++;
+ }
+ assert(num_allowed > 0);
+
+ if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) {
+ int pf = prune_factors[txfm_params->prune_2d_txfm_mode];
+ int mf = mul_factors[txfm_params->prune_2d_txfm_mode];
+ if (num_allowed <= 7) {
+ const uint16_t prune =
+ prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col,
+ plane_bsize, txk_map, allowed_tx_mask, pf, txb_ctx,
+ cm->features.reduced_tx_set_used);
+ allowed_tx_mask &= (~prune);
+ } else {
+ const int num_sel = (num_allowed * mf + 50) / 100;
+ const uint16_t prune = prune_txk_type_separ(
+ cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize,
+ txk_map, allowed_tx_mask, pf, txb_ctx,
+ cm->features.reduced_tx_set_used, ref_best_rd, num_sel);
+
+ allowed_tx_mask &= (~prune);
+ }
+ } else {
+ assert(num_allowed > 0);
+ int allowed_tx_count =
+ (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) ? 1 : 5;
+ // Reached only when !fast_tx_search, plane == 0, and more than one tx
+ // type is allowed.
+ if (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_1 && is_inter &&
+ num_allowed > allowed_tx_count) {
+ prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+ txfm_params->prune_2d_txfm_mode, txk_map, &allowed_tx_mask);
+ }
+ }
+ }
+
+ // Need to have at least one transform type allowed.
+ if (allowed_tx_mask == 0) {
+ txk_allowed = (plane ? uv_tx_type : DCT_DCT);
+ allowed_tx_mask = (1 << txk_allowed);
+ }
+
+ assert(IMPLIES(txk_allowed < TX_TYPES, allowed_tx_mask == 1 << txk_allowed));
+ *allowed_txk_types = txk_allowed;
+ return allowed_tx_mask;
+}
+
+#if CONFIG_RD_DEBUG
+static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
+ int txb_coeff_cost) {
+ rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
+}
+#endif
+
+static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block,
+ TX_SIZE tx_size, const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ int reduced_tx_set_used) {
+#if TXCOEFF_COST_TIMER
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+ const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type,
+ txb_ctx, reduced_tx_set_used);
+#if TXCOEFF_COST_TIMER
+ AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common;
+ aom_usec_timer_mark(&timer);
+ const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+ tmp_cm->txcoeff_cost_timer += elapsed_time;
+ ++tmp_cm->txcoeff_cost_count;
+#endif
+ return cost;
+}
+
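+// Decides whether trellis coefficient optimization should be skipped for
+// this block by comparing its SATD against a qstep-scaled threshold, and
+// configures quant_param for the chosen quantization path.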
+static int skip_trellis_opt_based_on_satd(MACROBLOCK *x,
+ QUANT_PARAM *quant_param, int plane,
+ int block, TX_SIZE tx_size,
+ int quant_b_adapt, int qstep,
+ unsigned int coeff_opt_satd_threshold,
+ int skip_trellis, int dc_only_blk) {
+ if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX))
+ return skip_trellis;
+
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff_ptr = p->coeff + block_offset;
+ const int n_coeffs = av1_get_max_eob(tx_size);
+ const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size));
+ int satd = (dc_only_blk) ? abs(coeff_ptr[0]) : aom_satd(coeff_ptr, n_coeffs);
+ satd = RIGHT_SIGNED_SHIFT(satd, shift);
+ satd >>= (x->e_mbd.bd - 8);
+
+ const int skip_block_trellis =
+ ((uint64_t)satd >
+ (uint64_t)coeff_opt_satd_threshold * qstep * sqrt_tx_pixels_2d[tx_size]);
+
+ av1_setup_quant(
+ tx_size, !skip_block_trellis,
+ skip_block_trellis
+ ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP)
+ : AV1_XFORM_QUANT_FP,
+ quant_b_adapt, quant_param);
+
+ return skip_block_trellis;
+}
+
+// Predict DC only blocks if the residual variance is below a qstep based
+// threshold. For such blocks, transform type search is bypassed.
+static INLINE void predict_dc_only_block(
+ MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int block, int blk_row, int blk_col, RD_STATS *best_rd_stats,
+ int64_t *block_sse, unsigned int *block_mse_q8, int64_t *per_px_mean,
+ int *dc_only_blk) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+ uint64_t block_var = UINT64_MAX;
+ const int dc_qstep = x->plane[plane].dequant_QTX[0] >> 3;
+ *block_sse = pixel_diff_stats(x, plane, blk_row, blk_col, plane_bsize,
+ txsize_to_bsize[tx_size], block_mse_q8,
+ per_px_mean, &block_var);
+ assert((*block_mse_q8) != UINT_MAX);
+ uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep);
+ if (is_cur_buf_hbd(xd))
+ block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2);
+
+ if (block_var >= var_threshold) return;
+ const unsigned int predict_dc_level = x->txfm_search_params.predict_dc_level;
+ assert(predict_dc_level != 0);
+
+ // Predict a skip block if the residual mean and variance are less
+ // than qstep based thresholds.
+ if ((llabs(*per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) {
+ // If the normalized mean of residual block is less than the dc qstep and
+ // the normalized block variance is less than ac qstep, then the block is
+ // assumed to be a skip block and its rdcost is updated accordingly.
+ best_rd_stats->skip_txfm = 1;
+
+ x->plane[plane].eobs[block] = 0;
+
+ if (is_cur_buf_hbd(xd))
+ *block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2);
+
+ best_rd_stats->dist = (*block_sse) << 4;
+ best_rd_stats->sse = best_rd_stats->dist;
+
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], ctxa, ctxl);
+ ENTROPY_CONTEXT *ta = ctxa;
+ ENTROPY_CONTEXT *tl = ctxl;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx_tmp;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx_tmp);
+ const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][plane_type]
+ .txb_skip_cost[txb_ctx_tmp.txb_skip_ctx][1];
+ best_rd_stats->rate = zero_blk_rate;
+
+ best_rd_stats->rdcost =
+ RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->sse);
+
+ x->plane[plane].txb_entropy_ctx[block] = 0;
+ } else if (predict_dc_level > 1) {
+ // Predict DC only blocks based on residual variance.
+ // For chroma plane, this prediction is disabled for intra blocks.
+ if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) *dc_only_blk = 1;
+ }
+}
+
+// Search for the best transform type for a given transform block.
+// This function can be used for both inter and intra, both luma and chroma.
+static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const TXB_CTX *const txb_ctx,
+ FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis,
+ int64_t ref_best_rd, RD_STATS *best_rd_stats) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ int64_t best_rd = INT64_MAX;
+ uint16_t best_eob = 0;
+ TX_TYPE best_tx_type = DCT_DCT;
+ int rate_cost = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ tran_low_t *orig_dqcoeff = p->dqcoeff;
+ tran_low_t *best_dqcoeff = x->dqcoeff_buf;
+ const int tx_type_map_idx =
+ plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col;
+ av1_invalid_rd_stats(best_rd_stats);
+
+ skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id],
+ DRY_RUN_NORMAL);
+
+ uint8_t best_txb_ctx = 0;
+ // txk_allowed = TX_TYPES: >1 tx types are allowed
+ // txk_allowed < TX_TYPES: only that specific tx type is allowed.
+ TX_TYPE txk_allowed = TX_TYPES;
+ int txk_map[TX_TYPES] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+
+ const uint8_t txw = tx_size_wide[tx_size];
+ const uint8_t txh = tx_size_high[tx_size];
+ int64_t block_sse;
+ unsigned int block_mse_q8;
+ int dc_only_blk = 0;
+ const bool predict_dc_block =
+ txfm_params->predict_dc_level >= 1 && txw != 64 && txh != 64;
+ int64_t per_px_mean = INT64_MAX;
+ if (predict_dc_block) {
+ predict_dc_only_block(x, plane, plane_bsize, tx_size, block, blk_row,
+ blk_col, best_rd_stats, &block_sse, &block_mse_q8,
+ &per_px_mean, &dc_only_blk);
+ if (best_rd_stats->skip_txfm == 1) {
+ const TX_TYPE tx_type = DCT_DCT;
+ if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+ return;
+ }
+ } else {
+ block_sse = av1_pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+ txsize_to_bsize[tx_size], &block_mse_q8);
+ assert(block_mse_q8 != UINT_MAX);
+ }
+
+ // Bit mask to indicate which transform types are allowed in the RD search.
+ uint16_t tx_mask;
+
+ // Use DCT_DCT transform for DC only block.
+ if (dc_only_blk || cpi->sf.rt_sf.dct_only_palette_nonrd == 1)
+ tx_mask = 1 << DCT_DCT;
+ else
+ tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, txb_ctx, ftxs_mode, ref_best_rd,
+ &txk_allowed, txk_map);
+ const uint16_t allowed_tx_mask = tx_mask;
+
+ if (is_cur_buf_hbd(xd)) {
+ block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+ block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
+ }
+ block_sse *= 16;
+ // Use mse / qstep^2 based threshold logic to decide whether to run R-D
+ // optimization of coeffs. For smaller residuals, coeff optimization
+ // tends to be helpful; for larger residuals, it may not be effective.
+ // TODO(any): Experiment with variance and mean based thresholds
+ const int perform_block_coeff_opt =
+ ((uint64_t)block_mse_q8 <=
+ (uint64_t)txfm_params->coeff_opt_thresholds[0] * qstep * qstep);
+ skip_trellis |= !perform_block_coeff_opt;
+
+ // Flag to indicate whether distortion should be calculated in the transform
+ // domain while iterating through the transform type candidates.
+ // Transform domain distortion is accurate for higher residuals.
+ // TODO(any): Experiment with variance and mean based thresholds
+ int use_transform_domain_distortion =
+ (txfm_params->use_transform_domain_distortion > 0) &&
+ (block_mse_q8 >= txfm_params->tx_domain_dist_threshold) &&
+ // A 64-pt transform only preserves half the coefficients.
+ // Therefore transform domain distortion is not valid for these
+ // transform sizes.
+ (txsize_sqr_up_map[tx_size] != TX_64X64) &&
+ // Use pixel domain distortion for DC only blocks
+ !dc_only_blk;
+ // Flag to indicate if an extra calculation of distortion in the pixel domain
+ // should be performed at the end, after the best transform type has been
+ // decided.
+ int calc_pixel_domain_distortion_final =
+ txfm_params->use_transform_domain_distortion == 1 &&
+ use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD;
+ if (calc_pixel_domain_distortion_final &&
+ (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001))
+ calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
+
+ const uint16_t *eobs_ptr = x->plane[plane].eobs;
+
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ int skip_trellis_based_on_satd[TX_TYPES] = { 0 };
+ av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, !skip_trellis,
+ skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
+ : AV1_XFORM_QUANT_FP)
+ : AV1_XFORM_QUANT_FP,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+
+ // Iterate through all transform type candidates.
+ for (int idx = 0; idx < TX_TYPES; ++idx) {
+ const TX_TYPE tx_type = (TX_TYPE)txk_map[idx];
+ if (tx_type == TX_TYPE_INVALID || !check_bit_mask(allowed_tx_mask, tx_type))
+ continue;
+ txfm_param.tx_type = tx_type;
+ if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) {
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+ }
+ if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+ RD_STATS this_rd_stats;
+ av1_invalid_rd_stats(&this_rd_stats);
+
+ if (!dc_only_blk)
+ av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
+ else
+ av1_xform_dc_only(x, plane, block, &txfm_param, per_px_mean);
+
+ skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd(
+ x, &quant_param, plane, block, tx_size, cpi->oxcf.q_cfg.quant_b_adapt,
+ qstep, txfm_params->coeff_opt_thresholds[1], skip_trellis, dc_only_blk);
+
+ av1_quant(x, plane, block, &txfm_param, &quant_param);
+
+ // Calculate rate cost of quantized coefficients.
+ if (quant_param.use_optimize_b) {
+ // TODO(aomedia:3209): update Trellis quantization to take into account
+ // quantization matrices.
+ av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+ &rate_cost);
+ } else {
+ rate_cost = cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
+ cm->features.reduced_tx_set_used);
+ }
+
+ // If rd cost based on coeff rate alone is already more than best_rd,
+ // terminate early.
+ if (RDCOST(x->rdmult, rate_cost, 0) > best_rd) continue;
+
+ // Calculate distortion.
+ if (eobs_ptr[block] == 0) {
+ // When eob is 0, pixel domain distortion is more efficient and accurate.
+ this_rd_stats.dist = this_rd_stats.sse = block_sse;
+ } else if (dc_only_blk) {
+ this_rd_stats.sse = block_sse;
+ this_rd_stats.dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ } else if (use_transform_domain_distortion) {
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+ } else {
+ int64_t sse_diff = INT64_MAX;
+ // high_energy threshold assumes that every pixel within a txfm block
+ // has a residue energy of at least 25% of the maximum, i.e. 128 * 128
+ // for 8 bit.
+ const int64_t high_energy_thresh =
+ ((int64_t)128 * 128 * tx_size_2d[tx_size]);
+ const int is_high_energy = (block_sse >= high_energy_thresh);
+ if (tx_size == TX_64X64 || is_high_energy) {
+ // Because 3 out of 4 quadrants of the transform coefficients are forced
+ // to zero, the inverse transform has a tendency to overflow. sse_diff
+ // is effectively the energy of those 3 quadrants; here we use it
+ // to decide if we should do pixel domain distortion. If the energy
+ // is mostly in the first quadrant, an overflow in the inverse
+ // transform is unlikely.
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+ sse_diff = block_sse - this_rd_stats.sse;
+ }
+ if (tx_size != TX_64X64 || !is_high_energy ||
+ (sse_diff * 2) < this_rd_stats.sse) {
+ const int64_t tx_domain_dist = this_rd_stats.dist;
+ this_rd_stats.dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ // For high energy blocks, the pixel domain distortion can occasionally
+ // be artificially low due to clamping at the reconstruction stage, even
+ // when the inverse transform output is hugely different from the
+ // actual residue.
+ if (is_high_energy && this_rd_stats.dist < tx_domain_dist)
+ this_rd_stats.dist = tx_domain_dist;
+ } else {
+ assert(sse_diff < INT64_MAX);
+ this_rd_stats.dist += sse_diff;
+ }
+ this_rd_stats.sse = block_sse;
+ }
+
+ this_rd_stats.rate = rate_cost;
+
+ const int64_t rd =
+ RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+
+ if (rd < best_rd) {
+ best_rd = rd;
+ *best_rd_stats = this_rd_stats;
+ best_tx_type = tx_type;
+ best_txb_ctx = x->plane[plane].txb_entropy_ctx[block];
+ best_eob = x->plane[plane].eobs[block];
+ // Swap dqcoeff buffers
+ tran_low_t *const tmp_dqcoeff = best_dqcoeff;
+ best_dqcoeff = p->dqcoeff;
+ p->dqcoeff = tmp_dqcoeff;
+ }
+
+#if CONFIG_COLLECT_RD_STATS == 1
+ if (plane == 0) {
+ PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col,
+ plane_bsize, tx_size, tx_type, rd);
+ }
+#endif // CONFIG_COLLECT_RD_STATS == 1
+
+#if COLLECT_TX_SIZE_DATA
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 21743;
+ if (lcg_rand16(&seed) % 200 == 0) {
+ FILE *fp = NULL;
+
+ if (within_border) {
+ fp = fopen(av1_tx_size_data_output_file, "a");
+ }
+
+ if (fp) {
+ // Transform info and RD
+ const int txb_w = tx_size_wide[tx_size];
+ const int txb_h = tx_size_high[tx_size];
+
+ // Residue signal.
+ const int diff_stride = block_size_wide[plane_bsize];
+ struct macroblock_plane *const p = &x->plane[plane];
+ const int16_t *src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) * 4];
+
+ for (int r = 0; r < txb_h; ++r) {
+ for (int c = 0; c < txb_w; ++c) {
+ fprintf(fp, "%d,", src_diff[c]);
+ }
+ src_diff += diff_stride;
+ }
+
+ fprintf(fp, "%d,%d,%d,%" PRId64, txb_w, txb_h, tx_type, rd);
+ fprintf(fp, "\n");
+ fclose(fp);
+ }
+ }
+#endif // COLLECT_TX_SIZE_DATA
+
+ // If the current best RD cost is much worse than the reference RD cost,
+ // terminate early.
+ if (cpi->sf.tx_sf.adaptive_txb_search_level) {
+ if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) >
+ ref_best_rd) {
+ break;
+ }
+ }
+
+ // Terminate transform type search if the block has been quantized to
+ // all zero.
+ if (cpi->sf.tx_sf.tx_type_search.skip_tx_search && !best_eob) break;
+ }
+
+ assert(best_rd != INT64_MAX);
+
+ best_rd_stats->skip_txfm = best_eob == 0;
+ if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
+ x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
+ x->plane[plane].eobs[block] = best_eob;
+ skip_trellis = skip_trellis_based_on_satd[best_tx_type];
+
+ // Point dqcoeff to the quantized coefficients corresponding to the best
+ // transform type, then we can skip transform and quantization, e.g. in the
+ // final pixel domain distortion calculation and recon_intra().
+ p->dqcoeff = best_dqcoeff;
+
+ if (calc_pixel_domain_distortion_final && best_eob) {
+ best_rd_stats->dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ best_rd_stats->sse = block_sse;
+ }
+
+ // Intra mode needs decoded pixels such that the next transform block
+ // can use them for prediction.
+ recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob);
+ p->dqcoeff = orig_dqcoeff;
+}
+
+// Pick transform type for a luma transform block of tx_size. Note this function
+// is used only for inter-predicted blocks.
+static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x,
+ TX_SIZE tx_size, int blk_row, int blk_col,
+ int block, int plane_bsize, TXB_CTX *txb_ctx,
+ RD_STATS *rd_stats,
+ FAST_TX_SEARCH_MODE ftxs_mode,
+ int64_t ref_rdcost) {
+ assert(is_inter_block(x->e_mbd.mi[0]));
+ RD_STATS this_rd_stats;
+ const int skip_trellis = 0;
+ search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size,
+ txb_ctx, ftxs_mode, skip_trellis, ref_rdcost, &this_rd_stats);
+
+ av1_merge_rd_stats(rd_stats, &this_rd_stats);
+}
+
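+// Evaluates keeping the current block as a single transform block (no
+// split): searches for the best tx type, converts the block to a skip block
+// when that is cheaper in RD terms, and records the resulting cost, entropy
+// context and tx type in *no_split.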
+static AOM_INLINE void try_tx_block_no_split(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+ const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
+ int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode, TxCandidateInfo *no_split) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ const int bw = mi_size_wide[plane_bsize];
+ const ENTROPY_CONTEXT *const pta = ta + blk_col;
+ const ENTROPY_CONTEXT *const ptl = tl + blk_row;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
+ const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ rd_stats->zero_rate = zero_blk_rate;
+ const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
+ mbmi->inter_tx_size[index] = tx_size;
+ tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
+ rd_stats, ftxs_mode, ref_best_rd);
+ assert(rd_stats->rate < INT_MAX);
+
+ const int pick_skip_txfm =
+ !xd->lossless[mbmi->segment_id] &&
+ (rd_stats->skip_txfm == 1 ||
+ RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse));
+ if (pick_skip_txfm) {
+#if CONFIG_RD_DEBUG
+ update_txb_coeff_cost(rd_stats, 0, zero_blk_rate - rd_stats->rate);
+#endif // CONFIG_RD_DEBUG
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ p->eobs[block] = 0;
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ }
+ rd_stats->skip_txfm = pick_skip_txfm;
+ set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col,
+ pick_skip_txfm);
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->mode_costs.txfm_partition_cost[txfm_partition_ctx][0];
+
+ no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ no_split->txb_entropy_ctx = p->txb_entropy_ctx[block];
+ no_split->tx_type =
+ xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
+}
+
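+// Evaluates splitting the current block into smaller transform blocks by
+// recursing into select_tx_block() for each sub-block, aborting as soon as
+// the accumulated RD cost exceeds ref_best_rd.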
+static AOM_INLINE void try_tx_block_split(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode, RD_STATS *split_rd_stats) {
+ assert(tx_size < TX_SIZES_ALL);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+ const int txb_width = tx_size_wide_unit[tx_size];
+ const int txb_height = tx_size_high_unit[tx_size];
+ // Transform size after splitting current block.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int sub_txb_width = tx_size_wide_unit[sub_txs];
+ const int sub_txb_height = tx_size_high_unit[sub_txs];
+ const int sub_step = sub_txb_width * sub_txb_height;
+ const int nblks = (txb_height / sub_txb_height) * (txb_width / sub_txb_width);
+ assert(nblks > 0);
+ av1_init_rd_stats(split_rd_stats);
+ split_rd_stats->rate =
+ x->mode_costs.txfm_partition_cost[txfm_partition_ctx][1];
+
+ for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) {
+ const int offsetr = blk_row + r;
+ if (offsetr >= max_blocks_high) break;
+ for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) {
+ assert(blk_idx < 4);
+ const int offsetc = blk_col + c;
+ if (offsetc >= max_blocks_wide) continue;
+
+ RD_STATS this_rd_stats;
+ int this_cost_valid = 1;
+ select_tx_block(cpi, x, offsetr, offsetc, block, sub_txs, depth + 1,
+ plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats,
+ no_split_rd / nblks, ref_best_rd - split_rd_stats->rdcost,
+ &this_cost_valid, ftxs_mode);
+ if (!this_cost_valid) {
+ split_rd_stats->rdcost = INT64_MAX;
+ return;
+ }
+ av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
+ split_rd_stats->rdcost =
+ RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
+ if (split_rd_stats->rdcost > ref_best_rd) {
+ split_rd_stats->rdcost = INT64_MAX;
+ return;
+ }
+ block += sub_step;
+ }
+ }
+}
+
+static float get_var(float mean, double x2_sum, int num) {
+ const float e_x2 = (float)(x2_sum / num);
+ const float diff = e_x2 - mean * mean;
+ return diff;
+}
+
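+// Computes the deviation of the sub-block means and the variance of the
+// sub-block variances, with the full block included as an additional sample.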
+static AOM_INLINE void get_blk_var_dev(const int16_t *data, int stride, int bw,
+ int bh, float *dev_of_mean,
+ float *var_of_vars) {
+ const int16_t *const data_ptr = &data[0];
+ const int subh = (bh >= bw) ? (bh >> 1) : bh;
+ const int subw = (bw >= bh) ? (bw >> 1) : bw;
+ const int num = bw * bh;
+ const int sub_num = subw * subh;
+ int total_x_sum = 0;
+ int64_t total_x2_sum = 0;
+ int blk_idx = 0;
+ float var_sum = 0.0f;
+ float mean_sum = 0.0f;
+ double var2_sum = 0.0;
+ double mean2_sum = 0.0;
+
+ for (int row = 0; row < bh; row += subh) {
+ for (int col = 0; col < bw; col += subw) {
+ int x_sum;
+ int64_t x2_sum;
+ aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh,
+ &x_sum, &x2_sum);
+ total_x_sum += x_sum;
+ total_x2_sum += x2_sum;
+
+ const float mean = (float)x_sum / sub_num;
+ const float var = get_var(mean, (double)x2_sum, sub_num);
+ mean_sum += mean;
+ mean2_sum += (double)(mean * mean);
+ var_sum += var;
+ var2_sum += var * var;
+ blk_idx++;
+ }
+ }
+
+ const float lvl0_mean = (float)total_x_sum / num;
+ const float block_var = get_var(lvl0_mean, (double)total_x2_sum, num);
+ mean_sum += lvl0_mean;
+ mean2_sum += (double)(lvl0_mean * lvl0_mean);
+ var_sum += block_var;
+ var2_sum += block_var * block_var;
+ const float av_mean = mean_sum / 5;
+
+ if (blk_idx > 1) {
+ // Deviation of means.
+ *dev_of_mean = get_dev(av_mean, mean2_sum, (blk_idx + 1));
+ // Variance of variances.
+ const float mean_var = var_sum / (blk_idx + 1);
+ *var_of_vars = get_var(mean_var, var2_sum, (blk_idx + 1));
+ }
+}
+
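+// Uses the statistics from get_blk_var_dev() to prune the transform
+// partition search: a homogeneous block (small deviation of means and small
+// variance of variances relative to the quantizer) skips the split search,
+// while a strongly non-uniform block skips the no-split option.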
+static void prune_tx_split_no_split(MACROBLOCK *x, BLOCK_SIZE bsize,
+ int blk_row, int blk_col, TX_SIZE tx_size,
+ int *try_no_split, int *try_split,
+ int pruning_level) {
+ const int diff_stride = block_size_wide[bsize];
+ const int16_t *diff =
+ x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ float dev_of_means = 0.0f;
+ float var_of_vars = 0.0f;
+
+ // This function calculates the deviation of means and the variance of pixel
+ // variances of the block as well as its sub-blocks.
+ get_blk_var_dev(diff, diff_stride, bw, bh, &dev_of_means, &var_of_vars);
+ const int dc_q = x->plane[0].dequant_QTX[0] >> 3;
+ const int ac_q = x->plane[0].dequant_QTX[1] >> 3;
+ const int no_split_thresh_scales[4] = { 0, 24, 8, 8 };
+ const int no_split_thresh_scale = no_split_thresh_scales[pruning_level];
+ const int split_thresh_scales[4] = { 0, 24, 10, 8 };
+ const int split_thresh_scale = split_thresh_scales[pruning_level];
+
+ if ((dev_of_means <= dc_q) &&
+ (split_thresh_scale * var_of_vars <= ac_q * ac_q)) {
+ *try_split = 0;
+ }
+ if ((dev_of_means > no_split_thresh_scale * dc_q) &&
+ (var_of_vars > no_split_thresh_scale * ac_q * ac_q)) {
+ *try_no_split = 0;
+ }
+}
+
+// Search (recursively) for the best transform partition and type for a given
+// inter-predicted luma block. The selected transform partition will be saved
+// in xd->mi[0], and the corresponding RD stats will be saved in rd_stats.
+static AOM_INLINE void select_tx_block(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
+ int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) {
+ assert(tx_size < TX_SIZES_ALL);
+ av1_init_rd_stats(rd_stats);
+ if (ref_best_rd < 0) {
+ *is_cost_valid = 0;
+ return;
+ }
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ assert(blk_row < max_block_high(xd, plane_bsize, 0) &&
+ blk_col < max_block_wide(xd, plane_bsize, 0));
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+ mbmi->bsize, tx_size);
+ struct macroblock_plane *const p = &x->plane[0];
+
+ int try_no_split = (cpi->oxcf.txfm_cfg.enable_tx64 ||
+ txsize_sqr_up_map[tx_size] != TX_64X64) &&
+ (cpi->oxcf.txfm_cfg.enable_rect_tx ||
+ tx_size_wide[tx_size] == tx_size_high[tx_size]);
+ int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
+ TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
+
+ // Prune tx_split and no-split based on sub-block properties.
+ if (tx_size != TX_4X4 && try_split == 1 && try_no_split == 1 &&
+ cpi->sf.tx_sf.prune_tx_size_level > 0) {
+ prune_tx_split_no_split(x, plane_bsize, blk_row, blk_col, tx_size,
+ &try_no_split, &try_split,
+ cpi->sf.tx_sf.prune_tx_size_level);
+ }
+
+ if (cpi->sf.rt_sf.skip_tx_no_split_var_based_partition) {
+ if (x->try_merge_partition && try_split && p->eobs[block]) try_no_split = 0;
+ }
+
+ // Try using current block as a single transform block without split.
+ if (try_no_split) {
+ try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+ plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
+ ftxs_mode, &no_split);
+
+ // Speed features for early termination.
+ const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
+ if (search_level) {
+ if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) {
+ *is_cost_valid = 0;
+ return;
+ }
+ if (no_split.rd - (no_split.rd >> (2 + search_level)) > prev_level_rd) {
+ try_split = 0;
+ }
+ }
+ if (cpi->sf.tx_sf.txb_split_cap) {
+ if (p->eobs[block] == 0) try_split = 0;
+ }
+ }
+
+ // ML based speed feature to skip searching for split transform blocks.
+ if (x->e_mbd.bd == 8 && try_split &&
+ !(ref_best_rd == INT64_MAX && no_split.rd == INT64_MAX)) {
+ const int threshold = cpi->sf.tx_sf.tx_type_search.ml_tx_split_thresh;
+ if (threshold >= 0) {
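+      // A score far enough below zero means the classifier favors no-split,
+      // so the split search is skipped.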
+ const int split_score =
+ ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size);
+ if (split_score < -threshold) try_split = 0;
+ }
+ }
+
+ RD_STATS split_rd_stats;
+ split_rd_stats.rdcost = INT64_MAX;
+ // Try splitting current block into smaller transform blocks.
+ if (try_split) {
+ try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+ plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
+ AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
+ &split_rd_stats);
+ }
+
+ if (no_split.rd < split_rd_stats.rdcost) {
+ ENTROPY_CONTEXT *pta = ta + blk_col;
+ ENTROPY_CONTEXT *ptl = tl + blk_row;
+ p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx;
+ av1_set_txb_context(x, 0, block, tx_size, pta, ptl);
+ txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+ tx_size);
+ for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+ for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+ const int index =
+ av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx);
+ mbmi->inter_tx_size[index] = tx_size;
+ }
+ }
+ mbmi->tx_size = tx_size;
+ update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type);
+ const int bw = mi_size_wide[plane_bsize];
+ set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col,
+ rd_stats->skip_txfm);
+ } else {
+ *rd_stats = split_rd_stats;
+ if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0;
+ }
+}
+
+static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi,
+ MACROBLOCK *x, RD_STATS *rd_stats,
+ int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ mbmi->tx_size = tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
+
+  // If tx64 is not enabled (but rectangular transforms are), clamp each size
+  // to the largest allowed transform of the same shape.
+ if (!cpi->oxcf.txfm_cfg.enable_tx64 && cpi->oxcf.txfm_cfg.enable_rect_tx) {
+ static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = {
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+ TX_32X32, // 64x64 transform
+ TX_4X8, // 4x8 transform
+ TX_8X4, // 8x4 transform
+ TX_8X16, // 8x16 transform
+ TX_16X8, // 16x8 transform
+ TX_16X32, // 16x32 transform
+ TX_32X16, // 32x16 transform
+ TX_32X32, // 32x64 transform
+ TX_32X32, // 64x32 transform
+ TX_4X16, // 4x16 transform
+ TX_16X4, // 16x4 transform
+ TX_8X32, // 8x32 transform
+ TX_32X8, // 32x8 transform
+ TX_16X32, // 16x64 transform
+ TX_32X16, // 64x16 transform
+ };
+ mbmi->tx_size = tx_size_max_32[mbmi->tx_size];
+ } else if (cpi->oxcf.txfm_cfg.enable_tx64 &&
+ !cpi->oxcf.txfm_cfg.enable_rect_tx) {
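+    // Only square transforms are allowed: map each size to the largest
+    // square transform contained within it.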
+ static const TX_SIZE tx_size_max_square[TX_SIZES_ALL] = {
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+ TX_64X64, // 64x64 transform
+ TX_4X4, // 4x8 transform
+ TX_4X4, // 8x4 transform
+ TX_8X8, // 8x16 transform
+ TX_8X8, // 16x8 transform
+ TX_16X16, // 16x32 transform
+ TX_16X16, // 32x16 transform
+ TX_32X32, // 32x64 transform
+ TX_32X32, // 64x32 transform
+ TX_4X4, // 4x16 transform
+ TX_4X4, // 16x4 transform
+ TX_8X8, // 8x32 transform
+ TX_8X8, // 32x8 transform
+ TX_16X16, // 16x64 transform
+ TX_16X16, // 64x16 transform
+ };
+ mbmi->tx_size = tx_size_max_square[mbmi->tx_size];
+ } else if (!cpi->oxcf.txfm_cfg.enable_tx64 &&
+ !cpi->oxcf.txfm_cfg.enable_rect_tx) {
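+    // Neither 64-point nor rectangular transforms are allowed: cap square
+    // sizes at 32x32 and map rectangular sizes to the largest contained
+    // square.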
+ static const TX_SIZE tx_size_max_32_square[TX_SIZES_ALL] = {
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+ TX_32X32, // 64x64 transform
+ TX_4X4, // 4x8 transform
+ TX_4X4, // 8x4 transform
+ TX_8X8, // 8x16 transform
+ TX_8X8, // 16x8 transform
+ TX_16X16, // 16x32 transform
+ TX_16X16, // 32x16 transform
+ TX_32X32, // 32x64 transform
+ TX_32X32, // 64x32 transform
+ TX_4X4, // 4x16 transform
+ TX_4X4, // 16x4 transform
+ TX_8X8, // 8x32 transform
+ TX_8X8, // 32x8 transform
+ TX_16X16, // 16x64 transform
+ TX_16X16, // 64x16 transform
+ };
+
+ mbmi->tx_size = tx_size_max_32_square[mbmi->tx_size];
+ }
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ // Skip RDcost is used only for Inter blocks
+ const int64_t skip_txfm_rd =
+ is_inter_block(mbmi) ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
+ const int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_rate, 0);
+ const int skip_trellis = 0;
+ av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
+ AOMMIN(no_skip_txfm_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+ mbmi->tx_size, FTXS_NONE, skip_trellis);
+}
+
+static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ mbmi->tx_size = TX_4X4;
+ // TODO(any) : Pass this_rd based on skip/non-skip cost
+ const int skip_trellis = 0;
+ av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size,
+ FTXS_NONE, skip_trellis);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void ml_predict_intra_tx_depth_prune(MACROBLOCK *x, int blk_row,
+ int blk_col, BLOCK_SIZE bsize,
+ TX_SIZE tx_size) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  // Disable the NN-model-based pruning logic for the following cases:
+  // 1) Lossless coding, as only the 4x4 transform is evaluated in this case.
+  // 2) When the transform and current block sizes do not match, as the
+  // features are obtained over the current block.
+  // 3) When the operating bit-depth is not 8-bit, as the input features are
+  // not scaled according to the bit-depth.
+ if (xd->lossless[mbmi->segment_id] || txsize_to_bsize[tx_size] != bsize ||
+ xd->bd != 8)
+ return;
+
+ // Currently NN model based pruning is supported only when largest transform
+ // size is 8x8
+ if (tx_size != TX_8X8) return;
+
+  // The neural network is a sequential model trained with the SGD optimizer.
+  // The model can be further improved in terms of speed/quality by
+  // considering the following experiments:
+  // 1) Retrain the model with balanced data across different learning rates
+  // and optimizers.
+  // 2) Add features related to the statistics of the top and left pixels,
+  // which capture the accuracy of the reconstructed neighbouring pixels for
+  // the 4x4 blocks numbered 1, 2 and 3 in the 8x8 block, the source variance
+  // of the 4x4 sub-blocks, etc.
+  // 3) Train ML models for transform blocks other than 8x8.
+ const NN_CONFIG *const nn_config = &av1_intra_tx_split_nnconfig_8x8;
+ const float *const intra_tx_prune_thresh = av1_intra_tx_prune_nn_thresh_8x8;
+
+ float features[NUM_INTRA_TX_SPLIT_FEATURES] = { 0.0f };
+ const int diff_stride = block_size_wide[bsize];
+
+ const int16_t *diff = x->plane[0].src_diff + MI_SIZE * blk_row * diff_stride +
+ MI_SIZE * blk_col;
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+
+ int feature_idx = get_mean_dev_features(diff, diff_stride, bw, bh, features);
+
+ features[feature_idx++] = log1pf((float)x->source_variance);
+
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ const float log_dc_q_square = log1pf((float)(dc_q * dc_q) / 256.0f);
+ features[feature_idx++] = log_dc_q_square;
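+  // The squared DC quantizer step is scaled by 1/256 before the log,
+  // presumably matching the normalization used when the model was trained.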
+ assert(feature_idx == NUM_INTRA_TX_SPLIT_FEATURES);
+ for (int i = 0; i < NUM_INTRA_TX_SPLIT_FEATURES; i++) {
+ features[i] = (features[i] - av1_intra_tx_split_8x8_mean[i]) /
+ av1_intra_tx_split_8x8_std[i];
+ }
+
+ float score;
+ av1_nn_predict(features, nn_config, 1, &score);
+
+ TxfmSearchParams *const txfm_params = &x->txfm_search_params;
+ if (score <= intra_tx_prune_thresh[0])
+ txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_SPLIT;
+ else if (score > intra_tx_prune_thresh[1])
+ txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_LARGEST;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Search for the best uniform transform size and type for the current coding
+// block.
+static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ av1_invalid_rd_stats(rd_stats);
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ TxfmSearchParams *const txfm_params = &x->txfm_search_params;
+ const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
+ const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT;
+ int start_tx;
+  // The split depth can be at most MAX_TX_DEPTH, so init_depth controls how
+  // many levels of splitting are allowed during the RD search.
+ int init_depth;
+
+ if (tx_select) {
+ start_tx = max_rect_tx_size;
+ init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
+ is_inter_block(mbmi), &cpi->sf,
+ txfm_params->tx_size_search_method);
+ if (init_depth == MAX_TX_DEPTH && !cpi->oxcf.txfm_cfg.enable_tx64 &&
+ txsize_sqr_up_map[start_tx] == TX_64X64) {
+ start_tx = sub_tx_size_map[start_tx];
+ }
+ } else {
+ const TX_SIZE chosen_tx_size =
+ tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
+ start_tx = chosen_tx_size;
+ init_depth = MAX_TX_DEPTH;
+ }
+
+ const int skip_trellis = 0;
+ uint8_t best_txk_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ TX_SIZE best_tx_size = max_rect_tx_size;
+ int64_t best_rd = INT64_MAX;
+ const int num_blks = bsize_to_num_blk(bs);
+ x->rd_model = FULL_TXFM_RD;
+ int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
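+  // rd[] tracks the best RD cost found at each search depth; the three
+  // initializers correspond to MAX_TX_DEPTH == 2.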
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH;
+ depth++, tx_size = sub_tx_size_map[tx_size]) {
+ if ((!cpi->oxcf.txfm_cfg.enable_tx64 &&
+ txsize_sqr_up_map[tx_size] == TX_64X64) ||
+ (!cpi->oxcf.txfm_cfg.enable_rect_tx &&
+ tx_size_wide[tx_size] != tx_size_high[tx_size])) {
+ continue;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_SPLIT) break;
+
+    // Set the flag to enable evaluation of the NN classifier that prunes
+    // transform depths. As the features are based on the intra residual
+    // information of the largest transform, the NN model is evaluated only
+    // for this case.
+ txfm_params->enable_nn_prune_intra_tx_depths =
+ (cpi->sf.tx_sf.prune_intra_tx_depths_using_nn && tx_size == start_tx);
+#endif
+
+ RD_STATS this_rd_stats;
+ // When the speed feature use_rd_based_breakout_for_intra_tx_search is
+ // enabled, use the known minimum best_rd for early termination.
+ const int64_t rd_thresh =
+ cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search
+ ? AOMMIN(ref_best_rd, best_rd)
+ : ref_best_rd;
+ rd[depth] = av1_uniform_txfm_yrd(cpi, x, &this_rd_stats, rd_thresh, bs,
+ tx_size, FTXS_NONE, skip_trellis);
+ if (rd[depth] < best_rd) {
+ av1_copy_array(best_blk_skip, txfm_info->blk_skip, num_blks);
+ av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks);
+ best_tx_size = tx_size;
+ best_rd = rd[depth];
+ *rd_stats = this_rd_stats;
+ }
+ if (tx_size == TX_4X4) break;
+ // If we are searching three depths, prune the smallest size depending
+ // on rd results for the first two depths for low contrast blocks.
+ if (depth > init_depth && depth != MAX_TX_DEPTH &&
+ x->source_variance < 256) {
+ if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break;
+ }
+ }
+
+ if (rd_stats->rate != INT_MAX) {
+ mbmi->tx_size = best_tx_size;
+ av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks);
+ av1_copy_array(txfm_info->blk_skip, best_blk_skip, num_blks);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // Reset the flags to avoid any unintentional evaluation of NN model and
+ // consumption of prune depths.
+ txfm_params->enable_nn_prune_intra_tx_depths = false;
+ txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_NONE;
+#endif
+}
+
+// Search for the best transform type for the given transform block in the
+// given plane/channel, and calculate the corresponding RD cost.
+static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct rdcost_block_args *args = arg;
+ if (args->exit_early) {
+ args->incomplete_exit = 1;
+ return;
+ }
+
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int is_inter = is_inter_block(xd->mi[0]);
+ const AV1_COMP *cpi = args->cpi;
+ ENTROPY_CONTEXT *a = args->t_above + blk_col;
+ ENTROPY_CONTEXT *l = args->t_left + blk_row;
+ const AV1_COMMON *cm = &cpi->common;
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+
+ if (!is_inter) {
+ av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+#if !CONFIG_REALTIME_ONLY
+ const TxfmSearchParams *const txfm_params = &x->txfm_search_params;
+ if (txfm_params->enable_nn_prune_intra_tx_depths) {
+ ml_predict_intra_tx_depth_prune(x, blk_row, blk_col, plane_bsize,
+ tx_size);
+ if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_LARGEST) {
+ av1_invalid_rd_stats(&args->rd_stats);
+ args->exit_early = 1;
+ return;
+ }
+ }
+#endif
+ }
+
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ &txb_ctx, args->ftxs_mode, args->skip_trellis,
+ args->best_rd - args->current_rd, &this_rd_stats);
+
+ if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+ assert(!is_inter || plane_bsize < BLOCK_8X8);
+ cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+ }
+
+#if CONFIG_RD_DEBUG
+ update_txb_coeff_cost(&this_rd_stats, plane, this_rd_stats.rate);
+#endif // CONFIG_RD_DEBUG
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+ const int blk_idx =
+ blk_row * (block_size_wide[plane_bsize] >> MI_SIZE_LOG2) + blk_col;
+
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ if (plane == 0)
+ set_blk_skip(txfm_info->blk_skip, plane, blk_idx,
+ x->plane[plane].eobs[block] == 0);
+ else
+ set_blk_skip(txfm_info->blk_skip, plane, blk_idx, 0);
+
+ int64_t rd;
+ if (is_inter) {
+ const int64_t no_skip_txfm_rd =
+ RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+ const int64_t skip_txfm_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+ rd = AOMMIN(no_skip_txfm_rd, skip_txfm_rd);
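+    // Keep the skip flag only if the block produced no nonzero coefficients.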
+ this_rd_stats.skip_txfm &= !x->plane[plane].eobs[block];
+ } else {
+ // Signal non-skip_txfm for Intra blocks
+ rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+ this_rd_stats.skip_txfm = 0;
+ }
+
+ av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
+
+ args->current_rd += rd;
+ if (args->current_rd > args->best_rd) args->exit_early = 1;
+}
+
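+// Estimate the RD cost of a luma block quickly: only DCT_DCT is evaluated
+// (see av1_setup_xform() below) and distortion is measured in the transform
+// domain, instead of running the full transform-type search.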
+int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ block_signals_txsize(mbmi->bsize);
+ int tx_size_rate = 0;
+ if (tx_select) {
+ const int ctx = txfm_partition_context(
+ xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
+ tx_size_rate = mode_costs->txfm_partition_cost[ctx][0];
+ }
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+ const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, 0);
+ const int64_t no_this_rd =
+ RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
+ mbmi->tx_size = tx_size;
+
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+ const int max_blocks_wide = max_block_wide(xd, bs, 0);
+ const int max_blocks_high = max_block_high(xd, bs, 0);
+
+ struct rdcost_block_args args;
+ av1_zero(args);
+ args.x = x;
+ args.cpi = cpi;
+ args.best_rd = ref_best_rd;
+ args.current_rd = AOMMIN(no_this_rd, skip_txfm_rd);
+ av1_init_rd_stats(&args.rd_stats);
+ av1_get_entropy_contexts(bs, &xd->plane[0], args.t_above, args.t_left);
+ int i = 0;
+ for (int blk_row = 0; blk_row < max_blocks_high && !args.incomplete_exit;
+ blk_row += txh_unit) {
+ for (int blk_col = 0; blk_col < max_blocks_wide; blk_col += txw_unit) {
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+
+ if (args.exit_early) {
+ args.incomplete_exit = 1;
+ break;
+ }
+
+ ENTROPY_CONTEXT *a = args.t_above + blk_col;
+ ENTROPY_CONTEXT *l = args.t_left + blk_row;
+ TXB_CTX txb_ctx;
+ get_txb_ctx(bs, tx_size, 0, a, l, &txb_ctx);
+
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ av1_setup_xform(&cpi->common, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, 0, &quant_param);
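+      // Plain quantization (no coefficient-level RD optimization) is used
+      // here, in line with this being a fast estimate.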
+
+ av1_xform(x, 0, i, blk_row, blk_col, bs, &txfm_param);
+ av1_quant(x, 0, i, &txfm_param, &quant_param);
+
+ this_rd_stats.rate =
+ cost_coeffs(x, 0, i, tx_size, txfm_param.tx_type, &txb_ctx, 0);
+
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, 0, i, tx_size, quant_param.qmatrix,
+ scan_order->scan, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+
+ const int64_t no_skip_txfm_rd =
+ RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+ const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+
+ this_rd_stats.skip_txfm &= !x->plane[0].eobs[i];
+
+ av1_merge_rd_stats(&args.rd_stats, &this_rd_stats);
+ args.current_rd += AOMMIN(no_skip_txfm_rd, skip_rd);
+
+ if (args.current_rd > ref_best_rd) {
+ args.exit_early = 1;
+ break;
+ }
+
+ av1_set_txb_context(x, 0, i, tx_size, a, l);
+ i += step;
+ }
+ }
+
+ if (args.incomplete_exit) av1_invalid_rd_stats(&args.rd_stats);
+
+ *rd_stats = args.rd_stats;
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ int64_t rd;
+ // rdstats->rate should include all the rate except skip/non-skip cost as the
+ // same is accounted in the caller functions after rd evaluation of all
+ // planes. However the decisions should be done after considering the
+ // skip/non-skip header cost
+ if (rd_stats->skip_txfm && is_inter) {
+ rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ } else {
+ // Intra blocks are always signalled as non-skip
+ rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
+ rd_stats->dist);
+ rd_stats->rate += tx_size_rate;
+ }
+ // Check if forcing the block to skip transform leads to smaller RD cost.
+ if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
+ int64_t temp_skip_txfm_rd =
+ RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ if (temp_skip_txfm_rd <= rd) {
+ rd = temp_skip_txfm_rd;
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip_txfm = 1;
+ }
+ }
+
+ return rd;
+}
+
+int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size,
+ FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ block_signals_txsize(mbmi->bsize);
+ int tx_size_rate = 0;
+ if (tx_select) {
+ const int ctx = txfm_partition_context(
+ xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
+ tx_size_rate = is_inter ? mode_costs->txfm_partition_cost[ctx][0]
+ : tx_size_cost(x, bs, tx_size);
+ }
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+ const int64_t skip_txfm_rd =
+ is_inter ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
+ const int64_t no_this_rd =
+ RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
+
+ mbmi->tx_size = tx_size;
+ av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
+ AOMMIN(no_this_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+ tx_size, ftxs_mode, skip_trellis);
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ int64_t rd;
+ // rdstats->rate should include all the rate except skip/non-skip cost as the
+ // same is accounted in the caller functions after rd evaluation of all
+ // planes. However the decisions should be done after considering the
+ // skip/non-skip header cost
+ if (rd_stats->skip_txfm && is_inter) {
+ rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ } else {
+ // Intra blocks are always signalled as non-skip
+ rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
+ rd_stats->dist);
+ rd_stats->rate += tx_size_rate;
+ }
+ // Check if forcing the block to skip transform leads to smaller RD cost.
+ if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
+ int64_t temp_skip_txfm_rd =
+ RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ if (temp_skip_txfm_rd <= rd) {
+ rd = temp_skip_txfm_rd;
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip_txfm = 1;
+ }
+ }
+
+ return rd;
+}
+
+// Search for the best transform type for a luma inter-predicted block, given
+// the transform block partitions.
+// This function is used only when some speed features are enabled.
+static AOM_INLINE void tx_block_yrd(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int depth,
+ ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
+ TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int64_t ref_best_rd,
+ RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode) {
+ assert(tx_size < TX_SIZES_ALL);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(is_inter_block(mbmi));
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+ plane_bsize, blk_row, blk_col)];
+ const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+ mbmi->bsize, tx_size);
+
+ av1_init_rd_stats(rd_stats);
+ if (tx_size == plane_tx_size) {
+ ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+ ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx);
+
+ const int zero_blk_rate =
+ x->coeff_costs.coeff_costs[txs_ctx][get_plane_type(0)]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ rd_stats->zero_rate = zero_blk_rate;
+ tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
+ rd_stats, ftxs_mode, ref_best_rd);
+ const int mi_width = mi_size_wide[plane_bsize];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip_txfm == 1) {
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip_txfm = 1;
+ set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 1);
+ x->plane[0].eobs[block] = 0;
+ x->plane[0].txb_entropy_ctx[block] = 0;
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ } else {
+ rd_stats->skip_txfm = 0;
+ set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 0);
+ }
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][0];
+ av1_set_txb_context(x, 0, block, tx_size, ta, tl);
+ txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+ tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int txb_width = tx_size_wide_unit[sub_txs];
+ const int txb_height = tx_size_high_unit[sub_txs];
+ const int step = txb_height * txb_width;
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+ RD_STATS pn_rd_stats;
+ int64_t this_rd = 0;
+ assert(txb_width > 0 && txb_height > 0);
+
+ for (int row = 0; row < row_end; row += txb_height) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += txb_width) {
+ const int offsetc = blk_col + col;
+
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
+ depth + 1, above_ctx, left_ctx, tx_above, tx_left,
+ ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist);
+ block += step;
+ }
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][1];
+ }
+}
+
+// Search for the transform type of an inter-predicted luma partition block,
+// with the transform sizes already decided. It's used only when some speed
+// features are enabled.
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
+ if (ref_best_rd < 0) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+
+ av1_init_rd_stats(rd_stats);
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int step = bw * bh;
+ const int init_depth = get_search_init_depth(
+ mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+ memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+ int64_t this_rd = 0;
+ for (int idy = 0, block = 0; idy < mi_height; idy += bh) {
+ for (int idx = 0; idx < mi_width; idx += bw) {
+ RD_STATS pn_rd_stats;
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, bsize, init_depth,
+ ctxa, ctxl, tx_above, tx_left, ref_best_rd - this_rd,
+ &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd +=
+ AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+ RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+ block += step;
+ }
+ }
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ this_rd =
+ RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate, rd_stats->dist);
+ if (skip_txfm_rd < this_rd) {
+ this_rd = skip_txfm_rd;
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip_txfm = 1;
+ }
+
+  const int is_cost_valid = this_rd <= ref_best_rd;
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+ return is_cost_valid;
+}
+
+// Search for the best transform size and type for the current inter-predicted
+// luma block with recursive transform block partitioning. The obtained
+// transform selection will be saved in xd->mi[0], the corresponding RD stats
+// will be saved in rd_stats. The returned value is the corresponding RD cost.
+static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ assert(is_inter_block(xd->mi[0]));
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int fast_tx_search = txfm_params->tx_size_search_method > USE_FULL_RD;
+ int64_t rd_thresh = ref_best_rd;
+ if (rd_thresh == 0) {
+ av1_invalid_rd_stats(rd_stats);
+ return INT64_MAX;
+ }
+ if (fast_tx_search && rd_thresh < INT64_MAX) {
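+    // The fast search metric is less accurate, so inflate the threshold by
+    // 12.5% (rd_thresh >> 3), with a guard against int64 overflow.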
+ if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
+ }
+ assert(rd_thresh > 0);
+ const FAST_TX_SEARCH_MODE ftxs_mode =
+ fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+ memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+ const int init_depth = get_search_init_depth(
+ mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int step = bw * bh;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, 0);
+ int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_cost, 0);
+ int block = 0;
+
+ av1_init_rd_stats(rd_stats);
+ for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) {
+ for (int idx = 0; idx < max_block_wide(xd, bsize, 0); idx += bw) {
+ const int64_t best_rd_sofar =
+ (rd_thresh == INT64_MAX)
+ ? INT64_MAX
+ : (rd_thresh - (AOMMIN(skip_txfm_rd, no_skip_txfm_rd)));
+ int is_cost_valid = 1;
+ RD_STATS pn_rd_stats;
+ // Search for the best transform block size and type for the sub-block.
+ select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize,
+ ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX,
+ best_rd_sofar, &is_cost_valid, ftxs_mode);
+ if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return INT64_MAX;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
+ no_skip_txfm_rd =
+ RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
+ block += step;
+ }
+ }
+
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ rd_stats->skip_txfm = (skip_txfm_rd <= no_skip_txfm_rd);
+
+  // If fast_tx_search is true, only DCT and 1D DCT were tested in the
+  // select_tx_block() search above. Do a better search for the tx type with
+  // the tx sizes already decided.
+ if (fast_tx_search && cpi->sf.tx_sf.refine_fast_tx_search_results) {
+ if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE))
+ return INT64_MAX;
+ }
+
+ int64_t final_rd;
+ if (rd_stats->skip_txfm) {
+ final_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
+ } else {
+ final_rd =
+ RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
+ if (!xd->lossless[xd->mi[0]->segment_id]) {
+ final_rd =
+ AOMMIN(final_rd, RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse));
+ }
+ }
+
+ return final_rd;
+}
+
+// Return 1 to terminate the transform search early. The decision is made by
+// comparing the model-estimated RD cost against the reference RD cost.
+static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi,
+ MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ const int level = cpi->sf.tx_sf.model_based_prune_tx_search_level;
+ assert(level >= 0 && level <= 2);
+ int model_rate;
+ int64_t model_dist;
+ uint8_t model_skip;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
+ cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, &model_skip, NULL,
+ NULL, NULL, NULL);
+ if (model_skip) return 0;
+ const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
+ // TODO(debargha, urvang): Improve the model and make the check below
+ // tighter.
+ static const int prune_factor_by8[] = { 3, 5 };
+ const int factor = prune_factor_by8[level - 1];
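+  // E.g. level 1 prunes when model_rd > (8/3) * ref_best_rd, and level 2
+  // when model_rd > (8/5) * ref_best_rd, i.e. higher levels prune more often.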
+ return ((model_rd * factor) >> 3) > ref_best_rd;
+}
+
+void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ assert(is_inter_block(xd->mi[0]));
+
+ av1_invalid_rd_stats(rd_stats);
+
+ // If modeled RD cost is a lot worse than the best so far, terminate early.
+ if (cpi->sf.tx_sf.model_based_prune_tx_search_level &&
+ ref_best_rd != INT64_MAX) {
+ if (model_based_tx_search_prune(cpi, x, bsize, ref_best_rd)) return;
+ }
+
+  // Hashing-based speed feature. If the hash of the prediction residue block
+  // is found in the hash table, use previous search results and terminate
+  // early.
+ uint32_t hash = 0;
+ MB_RD_RECORD *mb_rd_record = NULL;
+ const int mi_row = x->e_mbd.mi_row;
+ const int mi_col = x->e_mbd.mi_col;
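+  // Hash reuse is restricted to blocks fully inside the tile; blocks clipped
+  // at the tile border may be only partially coded, so an identical residue
+  // hash would not guarantee identical RD results.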
+ const int within_border =
+ mi_row >= xd->tile.mi_row_start &&
+ (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
+ mi_col >= xd->tile.mi_col_start &&
+ (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
+ const int is_mb_rd_hash_enabled =
+ (within_border && cpi->sf.rd_sf.use_mb_rd_hash);
+ const int n4 = bsize_to_num_blk(bsize);
+ if (is_mb_rd_hash_enabled) {
+ hash = get_block_residue_hash(x, bsize);
+ mb_rd_record = x->txfm_search_info.mb_rd_record;
+ const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+ if (match_index != -1) {
+ MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index];
+ fetch_mb_rd_info(n4, mb_rd_info, rd_stats, x);
+ return;
+ }
+ }
+
+ // If we predict that skip is the optimal RD decision - set the respective
+ // context and terminate early.
+ int64_t dist;
+ if (txfm_params->skip_txfm_level &&
+ predict_skip_txfm(x, bsize, &dist,
+ cpi->common.features.reduced_tx_set_used)) {
+ set_skip_txfm(x, rd_stats, bsize, dist);
+ // Save the RD search results into mb_rd_record.
+ if (is_mb_rd_hash_enabled)
+ save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+ return;
+ }
+#if CONFIG_SPEED_STATS
+ ++x->txfm_search_info.tx_search_count;
+#endif // CONFIG_SPEED_STATS
+
+ const int64_t rd =
+ select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd);
+
+ if (rd == INT64_MAX) {
+    // We should always find at least one candidate unless ref_best_rd is less
+    // than INT64_MAX (in which case, all the calls to select_tx_size_and_type
+    // might have failed to find something better)
+ assert(ref_best_rd != INT64_MAX);
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+
+ // Save the RD search results into mb_rd_record.
+ if (is_mb_rd_hash_enabled) {
+ assert(mb_rd_record != NULL);
+ save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+ }
+}
+
+void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bs,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const TxfmSearchParams *tx_params = &x->txfm_search_params;
+ assert(bs == mbmi->bsize);
+ const int is_inter = is_inter_block(mbmi);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ av1_init_rd_stats(rd_stats);
+
+  // Hashing-based speed feature for inter blocks. If the hash of the residue
+ // block is found in the table, use previously saved search results and
+ // terminate early.
+ uint32_t hash = 0;
+ MB_RD_RECORD *mb_rd_record = NULL;
+ const int num_blks = bsize_to_num_blk(bs);
+ if (is_inter && cpi->sf.rd_sf.use_mb_rd_hash) {
+ const int within_border =
+ mi_row >= xd->tile.mi_row_start &&
+ (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) &&
+ mi_col >= xd->tile.mi_col_start &&
+ (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end);
+ if (within_border) {
+ hash = get_block_residue_hash(x, bs);
+ mb_rd_record = x->txfm_search_info.mb_rd_record;
+ const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+ if (match_index != -1) {
+ MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index];
+ fetch_mb_rd_info(num_blks, mb_rd_info, rd_stats, x);
+ return;
+ }
+ }
+ }
+
+ // If we predict that skip is the optimal RD decision - set the respective
+ // context and terminate early.
+ int64_t dist;
+ if (tx_params->skip_txfm_level && is_inter &&
+ !xd->lossless[mbmi->segment_id] &&
+ predict_skip_txfm(x, bs, &dist,
+ cpi->common.features.reduced_tx_set_used)) {
+ // Populate rdstats as per skip decision
+ set_skip_txfm(x, rd_stats, bs, dist);
+ // Save the RD search results into mb_rd_record.
+ if (mb_rd_record) {
+ save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
+ }
+ return;
+ }
+
+ if (xd->lossless[mbmi->segment_id]) {
+ // Lossless mode can only pick the smallest (4x4) transform size.
+ choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+ } else if (tx_params->tx_size_search_method == USE_LARGESTALL) {
+ choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+ } else {
+ choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
+ }
+
+ // Save the RD search results into mb_rd_record for possible reuse in future.
+ if (mb_rd_record) {
+ save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
+ }
+}
+
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
+ BLOCK_SIZE bsize, int64_t ref_best_rd) {
+ av1_init_rd_stats(rd_stats);
+ if (ref_best_rd < 0) return 0;
+ if (!x->e_mbd.is_chroma_ref) return 1;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U];
+ const int is_inter = is_inter_block(mbmi);
+ int64_t this_rd = 0, skip_txfm_rd = 0;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ if (is_inter) {
+ for (int plane = 1; plane < MAX_MB_PLANE; ++plane)
+ av1_subtract_plane(x, plane_bsize, plane);
+ }
+
+ const int skip_trellis = 0;
+ const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
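+  // Both chroma planes share the same transform size, so it is derived once
+  // from the U plane and reused for V in the loop below.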
+ int is_cost_valid = 1;
+ for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ RD_STATS this_rd_stats;
+ int64_t chroma_ref_best_rd = ref_best_rd;
+    // For inter blocks, the refined ref_best_rd is used for early exit.
+    // For intra blocks, even though the current rd crosses ref_best_rd, early
+    // exit is not recommended as the current rd is used for gating subsequent
+    // modes as well (say, for angular modes).
+    // TODO(any): Extend the early exit mechanism for intra modes as well
+ if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter &&
+ chroma_ref_best_rd != INT64_MAX)
+ chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_txfm_rd);
+ av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane,
+ plane_bsize, uv_tx_size, FTXS_NONE, skip_trellis);
+ if (this_rd_stats.rate == INT_MAX) {
+ is_cost_valid = 0;
+ break;
+ }
+ av1_merge_rd_stats(rd_stats, &this_rd_stats);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ skip_txfm_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+ if (AOMMIN(this_rd, skip_txfm_rd) > ref_best_rd) {
+ is_cost_valid = 0;
+ break;
+ }
+ }
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+
+ return is_cost_valid;
+}
+
+void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+ int skip_trellis) {
+ assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size));
+
+ if (!cpi->oxcf.txfm_cfg.enable_tx64 &&
+ txsize_sqr_up_map[tx_size] == TX_64X64) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+
+ if (current_rd > ref_best_rd) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct rdcost_block_args args;
+ av1_zero(args);
+ args.x = x;
+ args.cpi = cpi;
+ args.best_rd = ref_best_rd;
+ args.current_rd = current_rd;
+ args.ftxs_mode = ftxs_mode;
+ args.skip_trellis = skip_trellis;
+ av1_init_rd_stats(&args.rd_stats);
+
+ av1_get_entropy_contexts(plane_bsize, pd, args.t_above, args.t_left);
+ av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, block_rd_txfm,
+ &args);
+
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early;
+
+ if (invalid_rd) {
+ av1_invalid_rd_stats(rd_stats);
+ } else {
+ *rd_stats = args.rd_stats;
+ }
+}
+
+int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int skip_txfm_cost[2] = { x->mode_costs.skip_txfm_cost[skip_ctx][0],
+ x->mode_costs.skip_txfm_cost[skip_ctx][1] };
+ const int64_t min_header_rate =
+ mode_rate + AOMMIN(skip_txfm_cost[0], skip_txfm_cost[1]);
+ // Account for minimum skip and non_skip rd.
+ // Eventually either one of them will be added to mode_rate
+ const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
+ if (min_header_rd_possible > ref_best_rd) {
+ av1_invalid_rd_stats(rd_stats_y);
+ return 0;
+ }
+
+ const AV1_COMMON *cm = &cpi->common;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
+ const int64_t rd_thresh =
+ ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_y);
+ rd_stats->rate = mode_rate;
+
+ // cost and distortion
+ av1_subtract_plane(x, bsize, 0);
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ !xd->lossless[mbmi->segment_id]) {
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+#if CONFIG_COLLECT_RD_STATS == 2
+ PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 2
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->height * xd->width; ++i)
+ set_blk_skip(x->txfm_search_info.blk_skip, 0, i, rd_stats_y->skip_txfm);
+ }
+
+ if (rd_stats_y->rate == INT_MAX) return 0;
+
+ av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+ const int64_t non_skip_txfm_rdcosty =
+ RDCOST(x->rdmult, rd_stats->rate + skip_txfm_cost[0], rd_stats->dist);
+ const int64_t skip_txfm_rdcosty =
+ RDCOST(x->rdmult, mode_rate + skip_txfm_cost[1], rd_stats->sse);
+ const int64_t min_rdcosty = AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty);
+ if (min_rdcosty > ref_best_rd) return 0;
+
+ av1_init_rd_stats(rd_stats_uv);
+ const int num_planes = av1_num_planes(cm);
+ if (num_planes > 1) {
+ int64_t ref_best_chroma_rd = ref_best_rd;
+ // Calculate best rd cost possible for chroma
+ if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma &&
+ (ref_best_chroma_rd != INT64_MAX)) {
+ ref_best_chroma_rd = (ref_best_chroma_rd -
+ AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty));
+ }
+ const int is_cost_valid_uv =
+ av1_txfm_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd);
+ if (!is_cost_valid_uv) return 0;
+ av1_merge_rd_stats(rd_stats, rd_stats_uv);
+ }
+
+ int choose_skip_txfm = rd_stats->skip_txfm;
+ if (!choose_skip_txfm && !xd->lossless[mbmi->segment_id]) {
+ const int64_t rdcost_no_skip_txfm = RDCOST(
+ x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_txfm_cost[0],
+ rd_stats->dist);
+ const int64_t rdcost_skip_txfm =
+ RDCOST(x->rdmult, skip_txfm_cost[1], rd_stats->sse);
+ if (rdcost_no_skip_txfm >= rdcost_skip_txfm) choose_skip_txfm = 1;
+ }
+ if (choose_skip_txfm) {
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->rate = mode_rate + skip_txfm_cost[1];
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->dist = rd_stats_y->sse;
+ rd_stats_uv->dist = rd_stats_uv->sse;
+ mbmi->skip_txfm = 1;
+ if (rd_stats->skip_txfm) {
+ const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) return 0;
+ }
+ } else {
+ rd_stats->rate += skip_txfm_cost[0];
+ mbmi->skip_txfm = 0;
+ }
+
+ return 1;
+}
diff --git a/third_party/aom/av1/encoder/tx_search.h b/third_party/aom/av1/encoder/tx_search.h
new file mode 100644
index 0000000000..ed95c1cd98
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_search.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
+#define AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Set this macro to 1 to collect data about tx size selection.
+#define COLLECT_TX_SIZE_DATA 0
+
+#if COLLECT_TX_SIZE_DATA
+static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
+#endif
+
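+// Bit flags that reduce the transform search: restrict the candidate set to
+// DCT and 1D DCT, disable trellis coefficient optimization, and/or measure
+// distortion in the transform domain instead of the pixel domain.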
+enum {
+ FTXS_NONE = 0,
+ FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,
+ FTXS_DISABLE_TRELLIS_OPT = 1 << 1,
+ FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
+} UENUM1BYTE(FAST_TX_SEARCH_MODE);
+
+static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ TX_SIZE tx_size) {
+ assert(bsize == x->e_mbd.mi[0]->bsize);
+ if (x->txfm_search_params.tx_mode_search_type != TX_MODE_SELECT ||
+ !block_signals_txsize(bsize))
+ return 0;
+
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+ const int depth = tx_size_to_depth(tx_size, bsize);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int tx_size_ctx = get_tx_size_context(xd);
+ return x->mode_costs.tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+}
+
+/*!\brief Compute the pixel domain distortion.
+ *
+ * \ingroup transform_search
+ * Compute the pixel domain distortion from diff on all visible 4x4s in the
+ * transform block.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane Plane index
+ * \param[in] blk_row Block row index
+ * \param[in] blk_col Block col index
+ * \param[in] plane_bsize Current plane block size
+ * \param[in] tx_bsize Transform size
+ * \param[out]   block_mse_q8    Output block MSE (Q8 precision)
+ * \return An int64_t value that is the block sse.
+ */
+int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
+ int blk_col, const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8);
+
+int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size);
+
+/*!\brief Transform type search for luma macroblock with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and return the transform coefficients RD
+ * cost of current luma macroblock with the given uniform transform size.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \param[in] bs Size of the current macroblock
+ * \param[in] tx_size The given transform size
+ * \param[in] ftxs_mode Transform search mode specifying desired speed
+ and quality tradeoff
+ * \param[in] skip_trellis Binary flag indicating if trellis optimization
+ should be skipped
+ * \return An int64_t value that is the best RD cost found.
+ */
+int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size,
+ FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis);
+
+/*!\brief Recursive transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for best transform size and type for luma inter blocks. The transform
+ * block partitioning can be recursive resulting in non-uniform transform sizes.
+ * The best transform size and type, if found, will be saved in the MB_MODE_INFO
+ * structure, and the corresponding RD stats will be saved in rd_stats.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] bsize Current macroblock size
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \remark Nothing is returned. The selected transform size and type will
+ be saved in the MB_MODE_INFO structure
+ */
+void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd);
+
+/*!\brief Uniform transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for the best transform size and type for the current macroblock,
+ * with the assumption that all the transform blocks have a uniform size
+ * (VP9 style). The selected transform size and type will be saved in the
+ * MB_MODE_INFO structure; the corresponding RD stats will be saved in rd_stats.
+ * This function may be used for both intra and inter predicted blocks.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] bs Current macroblock size
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \remark Nothing is returned. The selected transform size and type will
+ be saved in the MB_MODE_INFO structure
+ */
+void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bs,
+ int64_t ref_best_rd);
+
+/*!\brief Chroma block transform search.
+ *
+ * \ingroup transform_search
+ * Calculate the transform coefficient RD cost for the given chroma macroblock
+ * If the current mode is intra, then this function will compute the predictor.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] bsize Current macroblock size
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \return An integer value is returned. 0: early termination triggered,
+ no valid rd cost available; 1: rd cost values are valid.
+ */
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
+ BLOCK_SIZE bsize, int64_t ref_best_rd);
+
+/*!\brief Transform type search with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and calculate the transform coefficients
+ * RD cost of the current transform block with the specified (uniform) transform
+ * size and plane. The RD results will be saved in rd_stats.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \param[in] current_rd Current RD cost for this block so far
+ * \param[in] plane Plane index
+ * \param[in] plane_bsize Size of the current macroblock considering
+                               sub-sampling
+ * \param[in] tx_size The given transform size
+ * \param[in] ftxs_mode Transform search mode specifying desired speed
+ and quality tradeoff
+ * \param[in] skip_trellis Binary flag indicating if trellis optimization
+ should be skipped
+ *
+ * \remark Nothing is returned. The RD results will be saved in rd_stats.
+ */
+void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+ int skip_trellis);
+
+/*!\brief Recursive transform size and type search.
+ *
+ * \ingroup transform_search
+ * This function combines y and uv planes' transform search processes together
+ * for inter-predicted blocks (including IntraBC), when the prediction is
+ * already generated. It first does subtraction to obtain the prediction error.
+ * Then it calls
+ * av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and
+ * av1_txfm_uvrd sequentially and handles possible early terminations.
+ * The RD metrics are calculated and stored in rd_stats/_y/_uv.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] bsize Current macroblock size
+ * \param[in]    rd_stats       Pointer to struct to keep track of the overall RD
+ stats
+ * \param[in] rd_stats_y Pointer to struct to keep track of the RD
+ stats for the luma plane
+ * \param[in] rd_stats_uv Pointer to struct to keep track of the RD
+ stats for the chroma planes
+ * \param[in] mode_rate Rate cost to encode the prediction mode info. of
+ the current macroblock
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ *
+ * \return An integer value is returned indicating if a valid transform
+ candidate is found (1) or not (0).
+ */
+int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/txb_rdopt.c b/third_party/aom/av1/encoder/txb_rdopt.c
new file mode 100644
index 0000000000..e551e8aa12
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt.c
@@ -0,0 +1,659 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/txb_rdopt.h"
+#include "av1/encoder/txb_rdopt_utils.h"
+
+#include "av1/common/idct.h"
+
+static INLINE void update_coeff_general(
+ int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size,
+ TX_CLASS tx_class, int bhl, int width, int64_t rdmult, int shift,
+ int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
+ const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels,
+ const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int is_last = si == (eob - 1);
+ const int coeff_ctx = get_lower_levels_ctx_general(
+ is_last, si, bhl, width, levels, ci, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ const int sign = (qc < 0) ? 1 : 0;
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+ const int rate =
+ get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bhl, tx_class, levels);
+ const int64_t rd = RDCOST(rdmult, rate, dist);
+
+ tran_low_t qc_low, dqc_low;
+ tran_low_t abs_qc_low;
+ int64_t dist_low, rd_low;
+ int rate_low;
+ if (abs_qc == 1) {
+ abs_qc_low = qc_low = dqc_low = 0;
+ dist_low = dist0;
+ rate_low = txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ abs_qc_low = abs_qc - 1;
+ dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci);
+ rate_low =
+ get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bhl, tx_class, levels);
+ }
+
+ rd_low = RDCOST(rdmult, rate_low, dist_low);
+ if (rd_low < rd) {
+ qcoeff[ci] = qc_low;
+ dqcoeff[ci] = dqc_low;
+ levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ *accu_rate += rate_low;
+ *accu_dist += dist_low - dist0;
+ } else {
+ *accu_rate += rate;
+ *accu_dist += dist - dist0;
+ }
+ }
+}
+
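+// Faster variant for coefficients that are neither the DC nor the last
+// nonzero one. If the dequantized value already undershoots the original
+// coefficient, lowering the level cannot reduce distortion, so the current
+// level is kept; otherwise the lowered level is tried as above. Only the
+// rate is accumulated here.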
+static AOM_FORCE_INLINE void update_coeff_simple(
+ int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+ int bhl, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
+ const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ uint8_t *levels, const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+ (void)eob;
+ // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+ // and not the last (scan_idx != eob - 1)
+ assert(si != eob - 1);
+ assert(si > 0);
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int coeff_ctx =
+ get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t abs_tqc = abs(tcoeff[ci]);
+ const tran_low_t abs_dqc = abs(dqcoeff[ci]);
+ int rate_low = 0;
+ const int rate = get_two_coeff_cost_simple(
+ ci, abs_qc, coeff_ctx, txb_costs, bhl, tx_class, levels, &rate_low);
+ if (abs_dqc < abs_tqc) {
+ *accu_rate += rate;
+ return;
+ }
+
+ const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci);
+ const int64_t rd = RDCOST(rdmult, rate, dist);
+
+ const tran_low_t abs_qc_low = abs_qc - 1;
+ const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+ const int64_t dist_low =
+ get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci);
+ const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+
+ if (rd_low < rd) {
+ const int sign = (qc < 0) ? 1 : 0;
+ qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
+ dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
+ levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ *accu_rate += rate_low;
+ } else {
+ *accu_rate += rate;
+ }
+ }
+}
+
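+// Same decision as above, but additionally evaluates truncating the block at
+// this position: making this coefficient the new last nonzero one (new eob)
+// and zeroing the coefficients coded after it, whenever that lowers the
+// overall RD cost.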
+static AOM_FORCE_INLINE void update_coeff_eob(
+ int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
+ int si, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int width,
+ int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
+ const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
+ const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+ assert(si != *eob - 1);
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int coeff_ctx =
+ get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ int lower_level = 0;
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int sign = (qc < 0) ? 1 : 0;
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+ int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0;
+ int rate =
+ get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
+ txb_costs, bhl, tx_class, levels);
+ int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
+
+ tran_low_t qc_low, dqc_low;
+ tran_low_t abs_qc_low;
+ int64_t dist_low, rd_low;
+ int rate_low;
+
+ if (abs_qc == 1) {
+ abs_qc_low = 0;
+ dqc_low = qc_low = 0;
+ dist_low = 0;
+ rate_low = txb_costs->base_cost[coeff_ctx][0];
+ rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
+ } else {
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ abs_qc_low = abs_qc - 1;
+ dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0;
+ rate_low =
+ get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bhl, tx_class, levels);
+ rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+ }
+
+ int lower_level_new_eob = 0;
+ const int new_eob = si + 1;
+ const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bhl, width, si);
+ const int new_eob_cost =
+ get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
+ int rate_coeff_eob =
+ new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob,
+ dc_sign_ctx, txb_costs, bhl,
+ tx_class);
+ int64_t dist_new_eob = dist;
+ int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
+
+ if (abs_qc_low > 0) {
+ const int rate_coeff_eob_low =
+ new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign,
+ coeff_ctx_new_eob, dc_sign_ctx,
+ txb_costs, bhl, tx_class);
+ const int64_t dist_new_eob_low = dist_low;
+ const int64_t rd_new_eob_low =
+ RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
+ if (rd_new_eob_low < rd_new_eob) {
+ lower_level_new_eob = 1;
+ rd_new_eob = rd_new_eob_low;
+ rate_coeff_eob = rate_coeff_eob_low;
+ dist_new_eob = dist_new_eob_low;
+ }
+ }
+
+ if (sharpness == 0 || abs_qc > 1) {
+ if (rd_low < rd) {
+ lower_level = 1;
+ rd = rd_low;
+ rate = rate_low;
+ dist = dist_low;
+ }
+ }
+
+ if (sharpness == 0 && rd_new_eob < rd) {
+ for (int ni = 0; ni < *nz_num; ++ni) {
+ int last_ci = nz_ci[ni];
+ levels[get_padded_idx(last_ci, bhl)] = 0;
+ qcoeff[last_ci] = 0;
+ dqcoeff[last_ci] = 0;
+ }
+ *eob = new_eob;
+ *nz_num = 0;
+ *accu_rate = rate_coeff_eob;
+ *accu_dist = dist_new_eob;
+ lower_level = lower_level_new_eob;
+ } else {
+ *accu_rate += rate;
+ *accu_dist += dist;
+ }
+
+ if (lower_level) {
+ qcoeff[ci] = qc_low;
+ dqcoeff[ci] = dqc_low;
+ levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ }
+ if (qcoeff[ci]) {
+ nz_ci[*nz_num] = ci;
+ ++*nz_num;
+ }
+ }
+}
+
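+// Final all-or-nothing decision: if signaling the transform block as skipped
+// is cheaper in RD terms than coding the remaining nonzero coefficients,
+// zero them out and reset the eob to 0.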
+static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
+ int nz_num, int *nz_ci, int64_t rdmult,
+ int skip_cost, int non_skip_cost,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff) {
+ const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
+ const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);
+ if (rd_new_eob < rd) {
+ for (int i = 0; i < nz_num; ++i) {
+ const int ci = nz_ci[i];
+ qcoeff[ci] = 0;
+ dqcoeff[ci] = 0;
+ // no need to set up levels because this is the last step
+ // levels[get_padded_idx(ci, bhl)] = 0;
+ }
+ *accu_rate = 0;
+ *eob = 0;
+ }
+}
+
+// TODO(angiebird): use this function whenever possible
+static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd,
+ int plane, TX_SIZE tx_size, TX_TYPE tx_type,
+ int reduced_tx_set_used) {
+ if (plane > 0) return 0;
+
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
+ !xd->lossless[xd->mi[0]->segment_id]) {
+ const int ext_tx_set =
+ get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
+ if (is_inter) {
+ if (ext_tx_set > 0)
+ return x->mode_costs
+ .inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
+ } else {
+ if (ext_tx_set > 0) {
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+ return x->mode_costs.intra_tx_type_costs[ext_tx_set][square_tx_size]
+ [intra_dir][tx_type];
+ }
+ }
+ }
+ return 0;
+}
+
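+// Level-map trellis optimization over one transform block. The pass below:
+//   1. costs the last nonzero coefficient (si == eob - 1),
+//   2. walks down in reverse scan order with update_coeff_eob while a
+//      shorter eob is still plausible (nz_num <= max_nz_num),
+//   3. optionally applies the all-skip decision (update_skip),
+//   4. finishes the remaining AC coefficients with update_coeff_simple,
+//   5. handles the DC coefficient with update_coeff_general.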
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblock_plane *p = &x->plane[plane];
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ const int shift = av1_get_tx_scale(tx_size);
+ int eob = p->eobs[block];
+ const int16_t *dequant = p->dequant_QTX;
+ const qm_val_t *iqmatrix =
+ av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
+ const qm_val_t *qmatrix =
+ cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR
+ ? av1_get_qmatrix(&cpi->common.quant_params, xd, plane, tx_size,
+ tx_type)
+ : NULL;
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + block_offset;
+ tran_low_t *dqcoeff = p->dqcoeff + block_offset;
+ const tran_low_t *tcoeff = p->coeff + block_offset;
+ const CoeffCosts *coeff_costs = &x->coeff_costs;
+
+ // This function is not called if eob = 0.
+ assert(eob > 0);
+
+ const AV1_COMMON *cm = &cpi->common;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ assert(height == (1 << bhl));
+ const int is_inter = is_inter_block(mbmi);
+ const LV_MAP_COEFF_COST *txb_costs =
+ &coeff_costs->coeff_costs[txs_ctx][plane_type];
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *txb_eob_costs =
+ &coeff_costs->eob_costs[eob_multi_size][plane_type];
+
+ const int rshift = 2;
+
+ const int64_t rdmult =
+ (((int64_t)x->rdmult *
+ (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
+ 2) >>
+ rshift;
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+
+ if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels);
+
+ // TODO(angiebird): check iqmatrix
+
+ const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
+ const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
+ int accu_rate = eob_cost;
+ int64_t accu_dist = 0;
+ int si = eob - 1;
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const tran_low_t abs_qc = abs(qc);
+ const int sign = qc < 0;
+ const int max_nz_num = 2;
+ int nz_num = 1;
+ int nz_ci[3] = { ci, 0, 0 };
+ if (abs_qc >= 2) {
+ update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
+ bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx,
+ dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels, iqmatrix, qmatrix);
+ --si;
+ } else {
+ assert(abs_qc == 1);
+ const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, si);
+ accu_rate +=
+ get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx,
+ txb_costs, bhl, tx_class);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+ accu_dist += dist - dist0;
+ --si;
+ }
+
+#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ for (; si >= 0 && nz_num <= max_nz_num; --si) { \
+ update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \
+ tx_size, tx_class_literal, bhl, width, \
+ txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
+ txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \
+ levels, sharpness, iqmatrix, qmatrix); \
+ } \
+ break
+ switch (tx_class) {
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ);
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_EOB_CASE
+ default: assert(false);
+ }
+
+ if (si == -1 && nz_num <= max_nz_num && sharpness == 0) {
+ update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
+ non_skip_cost, qcoeff, dqcoeff);
+ }
+
+#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ for (; si >= 1; --si) { \
+ update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bhl, \
+ rdmult, shift, dequant, scan, txb_costs, tcoeff, \
+ qcoeff, dqcoeff, levels, iqmatrix, qmatrix); \
+ } \
+ break
+ switch (tx_class) {
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ);
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_SIMPLE_CASE
+ default: assert(false);
+ }
+
+ // DC position
+ if (si == 0) {
+ // no need to update accu_dist because it's not used after this point
+ int64_t dummy_dist = 0;
+ update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
+ bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx,
+ dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels, iqmatrix, qmatrix);
+ }
+
+ const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type,
+ cm->features.reduced_tx_set_used);
+ if (eob == 0)
+ accu_rate += skip_cost;
+ else
+ accu_rate += non_skip_cost + tx_type_cost;
+
+ p->eobs[block] = eob;
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]);
+
+ *rate_cost = accu_rate;
+ return eob;
+}
+
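+// Exact rate computation for a nonzero transform block: costs the last
+// nonzero coefficient with its eob context, then the intermediate
+// coefficients in reverse scan order, and finally the DC coefficient with
+// its sign context.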
+static AOM_FORCE_INLINE int warehouse_efficients_txb(
+ const MACROBLOCK *x, const int plane, const int block,
+ const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
+ const struct macroblock_plane *p, const int eob,
+ const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+ const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+ int reduced_tx_set_used) {
+ const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+ const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *const eob_costs =
+ &x->coeff_costs.eob_costs[eob_multi_size][plane_type];
+ int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+ av1_txb_init_levels(qcoeff, width, height, levels);
+
+ cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+
+ cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] =
+ coeff_costs->lps_cost;
+ int c = eob - 1;
+ {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const int sign = AOMSIGN(v);
+ const int level = (v ^ sign) - sign;
+ const int coeff_ctx = coeff_contexts[pos];
+ cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
+
+ if (v) {
+ // sign bit cost
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx_eob(pos, bhl, tx_class);
+ cost += get_br_cost(level, lps_cost[ctx]);
+ }
+ if (c) {
+ cost += av1_cost_literal(1);
+ } else {
+ const int sign01 = (sign ^ sign) - sign;
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+ return cost;
+ }
+ }
+ }
+ const int(*base_cost)[8] = coeff_costs->base_cost;
+ for (c = eob - 2; c >= 1; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = qcoeff[pos];
+ const int level = abs(v);
+ cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+ if (v) {
+ // sign bit cost
+ cost += av1_cost_literal(1);
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ cost += get_br_cost(level, lps_cost[ctx]);
+ }
+ }
+ }
+ // c == 0 after previous loop
+ {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const int coeff_ctx = coeff_contexts[pos];
+ const int sign = AOMSIGN(v);
+ const int level = (v ^ sign) - sign;
+ cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+
+ if (v) {
+ // sign bit cost
+ const int sign01 = (sign ^ sign) - sign;
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ cost += get_br_cost(level, lps_cost[ctx]);
+ }
+ }
+ }
+ return cost;
+}
+
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type) {
+ assert(plane == 0);
+
+ int cost = 0;
+ const struct macroblock_plane *p = &x->plane[plane];
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+
+ int eob = p->eobs[block];
+
+ // coeffs
+ int c = eob - 1;
+ // eob
+ {
+ const int pos = scan[c];
+ const tran_low_t v = abs(qcoeff[pos]) - 1;
+ cost += (v << (AV1_PROB_COST_SHIFT + 2));
+ }
+ // other coeffs
+ for (c = eob - 2; c >= 0; c--) {
+ const int pos = scan[c];
+ const tran_low_t v = abs(qcoeff[pos]);
+ const int idx = AOMMIN(v, 14);
+
+ cost += costLUT[idx];
+ }
+
+ // const_term is not applied to the DC coefficient, and loge_par is not
+ // applied to the eob coefficient, so both are scaled by (eob - 1).
+ cost += (const_term + loge_par) * (eob - 1);
+
+ return cost;
+}
+
+static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian(
+ const MACROBLOCK *x, const int plane, const int block,
+ const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob,
+ const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+ const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+ int reduced_tx_set_used) {
+ const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *const eob_costs =
+ &x->coeff_costs.eob_costs[eob_multi_size][plane_type];
+ int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+ cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+
+ cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+ cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type);
+ return cost;
+}
+
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+ const TX_SIZE tx_size, const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
+ const struct macroblock_plane *p = &x->plane[plane];
+ const int eob = p->eobs[block];
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const LV_MAP_COEFF_COST *const coeff_costs =
+ &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
+ if (eob == 0) {
+ return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ }
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+ return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob,
+ plane_type, coeff_costs, xd, tx_type,
+ tx_class, reduced_tx_set_used);
+}
+
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ const int reduced_tx_set_used,
+ const int adjust_eob) {
+ const struct macroblock_plane *p = &x->plane[plane];
+ int eob = p->eobs[block];
+
+ if (adjust_eob) {
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+ tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan,
+ tcoeff, qcoeff, dqcoeff);
+ p->eobs[block] = eob;
+ }
+
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const LV_MAP_COEFF_COST *const coeff_costs =
+ &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
+ if (eob == 0) {
+ return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ }
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+ return warehouse_efficients_txb_laplacian(
+ x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd,
+ tx_type, tx_class, reduced_tx_set_used);
+}
diff --git a/third_party/aom/av1/encoder/txb_rdopt.h b/third_party/aom/av1/encoder/txb_rdopt.h
new file mode 100644
index 0000000000..70b322a2e1
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TXB_RDOPT_H_
+#define AOM_AV1_ENCODER_TXB_RDOPT_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Adjust the magnitude of quantized coefficients to achieve better
+ * rate-distortion (RD) trade-off.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function goes through each coefficient and greedily chooses whether
+ * to lower the coefficient magnitude by 1 based on the RD score.
+ *
+ * The coefficients are processed in reverse scan order.
+ *
+ * Note that the end of block position (eob) may change if the original last
+ * coefficient is lowered to zero.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock
+ * \param[in] tx_size The transform size
+ * \param[in] tx_type The transform type
+ * \param[in] txb_ctx Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[out] rate_cost The entropy cost of coding the transform block
+ * after adjustment of coefficients.
+ * \param[in] sharpness When sharpness > 0, the function will be less
+ * aggressive towards lowering the magnitude of coefficients.
+ * In this way, the transform block will contain more high-frequency
+ * coefficients and therefore will preserve the sharpness of the reconstructed
+ * block.
+ *
+ * \return The new end of block position (eob) after the adjustment.
+ */
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness);
+
+/*!\brief Compute the entropy cost of coding coefficients in a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in] x Pointer to structure holding the data for
+ the current encoding macroblock.
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block
+ in the
+ * macroblock. It is defined as the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in] tx_size The transform size.
+ * \param[in] tx_type The transform type.
+ * \param[in] txb_ctx Context info for entropy coding transform
+ block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in] reduced_tx_set_used Whether the transform type is chosen from
+ * a reduced set.
+ *
+ * \return The entropy cost of coding the transform block.
+ */
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+ const TX_SIZE tx_size, const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int reduced_tx_set_used);
+
+/*!\brief Estimate the entropy cost of coding a transform block using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function computes the entropy costs of the end of block position (eob)
+ * and the transform type (tx_type) precisely.
+ *
+ * It then uses \ref av1_cost_coeffs_txb_estimate to estimate the entropy
+ * costs of the coefficients in the transform block.
+ *
+ * In the end, the function returns the sum of entropy costs of end of block
+ * position (eob), transform type (tx_type) and coefficients.
+ *
+ * Compared to \ref av1_cost_coeffs_txb, this function is much faster but less
+ * accurate.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It is defined as the number of 4x4 units that have been coded
+ * before the current transform block
+ * \param[in] tx_size The transform size
+ * \param[in] tx_type The transform type
+ * \param[in] txb_ctx Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in] reduced_tx_set_used Whether the transform type is chosen from
+ * a reduced set.
+ * \param[in] adjust_eob Whether to adjust the end of block position
+ (eob)
+ * or not.
+ * \return int Estimated entropy cost of coding the transform
+ block.
+ */
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ const int reduced_tx_set_used,
+ const int adjust_eob);
+
+/*!\brief Estimate the entropy cost of transform coefficients using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function assumes each transform coefficient is of its own Laplacian
+ * distribution and the coefficient is the only observation of the Laplacian
+ * distribution.
+ *
+ * Based on that, each coefficient's coding cost can be estimated by computing
+ * the entropy of the corresponding Laplacian distribution.
+ *
+ * This function then returns the sum of the estimated entropy costs of all
+ * coefficients in the transform block.
+ *
+ * Note that the entropy costs of the end of block position (eob) and the
+ * transform type (tx_type) are not included.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It is defined as the number of 4x4 units that have been coded
+ * before the current transform block
+ * \param[in] tx_size The transform size
+ * \param[in] tx_type The transform type
+ * \return int Estimated entropy cost of coefficients in the
+ * transform block.
+ */
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_TXB_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/txb_rdopt_utils.h b/third_party/aom/av1/encoder/txb_rdopt_utils.h
new file mode 100644
index 0000000000..b9f08aacf0
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt_utils.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
+
+#include "av1/encoder/encodetxb.h"
+
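+// Precomputed Exp-Golomb suffix costs in av1_cost_literal units (512 per
+// bit); entry r holds 512 * (2 * (get_msb(r) + 1) - 1), matching
+// get_golomb_cost below for r in [1, 31].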
+static const int golomb_bits_cost[32] = {
+ 0, 512, 512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
+ 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
+ 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
+ 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
+};
+
+static const int golomb_cost_diff[32] = {
+ 0, 512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0,
+ 512 * 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+// Lookup table of the individual cost of a coefficient by its quantization
+// level, determined from a Laplacian distribution conditioned on the
+// estimated context.
+static const int costLUT[15] = { -1143, 53, 545, 825, 1031,
+ 1209, 1393, 1577, 1763, 1947,
+ 2132, 2317, 2501, 2686, 2871 };
+
+static const int const_term = (1 << AV1_PROB_COST_SHIFT);
+
+static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000;
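+// Together with costLUT above, these constants model the estimated cost of a
+// single coefficient; av1_cost_coeffs_txb_estimate adds
+// (const_term + loge_par) scaled by (eob - 1) on top of the per-level costLUT
+// costs.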
+
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+ const qm_val_t *iqmatrix) {
+ int dqv = dequant[!!coeff_idx];
+ if (iqmatrix != NULL)
+ dqv =
+ ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ return dqv;
+}
+
+static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
+ int shift, const qm_val_t *qmatrix,
+ int coeff_idx) {
+ int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
+ if (qmatrix == NULL) {
+ return diff * diff;
+ }
+ // When AOM_DIST_METRIC_QM_PSNR is enabled, this mirrors the rate-distortion
+ // computation done in av1_block_error_qm, improving visual quality.
+ // The maximum value of `shift` is 2, `tcoeff` and `dqcoeff` are at most 22
+ // bits, and AOM_QM_BITS is 5, so `diff` should fit in 29-bits. The
+ // multiplication `diff * diff` then does not risk overflowing.
+ diff *= qmatrix[coeff_idx];
+ const int64_t error =
+ (diff * diff + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+ return error;
+}
+
+static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs,
+ const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) {
+ int eob_extra;
+ const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
+ int eob_cost = 0;
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1];
+
+ if (av1_eob_offset_bits[eob_pt] > 0) {
+ const int eob_ctx = eob_pt - 3;
+ const int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
+ const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit];
+ const int offset_bits = av1_eob_offset_bits[eob_pt];
+ if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1);
+ }
+ return eob_cost;
+}
+
+static INLINE int get_golomb_cost(int abs_qc) {
+ if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
+ const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+ const int length = get_msb(r) + 1;
+ return av1_cost_literal(2 * length - 1);
+ }
+ return 0;
+}
+
+static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) {
+ const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ return coeff_lps[base_range] + get_golomb_cost(level);
+}
+
+static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps,
+ int *diff) {
+ const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ int golomb_bits = 0;
+ if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS)
+ *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1];
+
+ if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) {
+ int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+ if (r < 32) {
+ golomb_bits = golomb_bits_cost[r];
+ *diff += golomb_cost_diff[r];
+ } else {
+ golomb_bits = get_golomb_cost(level);
+ *diff += (r & (r - 1)) == 0 ? 1024 : 0;
+ }
+ }
+
+ return coeff_lps[base_range] + golomb_bits;
+}
+
+static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
+ int ci, tran_low_t abs_qc, int coeff_ctx,
+ const LV_MAP_COEFF_COST *txb_costs, int bhl, TX_CLASS tx_class,
+ const uint8_t *levels, int *cost_low) {
+ // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+ // and not the last (scan_idx != eob - 1)
+ assert(ci > 0);
+ int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+ int diff = 0;
+ if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4];
+ if (abs_qc) {
+ cost += av1_cost_literal(1);
+ if (abs_qc > NUM_BASE_LEVELS) {
+ const int br_ctx = get_br_ctx(levels, ci, bhl, tx_class);
+ int brcost_diff = 0;
+ cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx],
+ &brcost_diff);
+ diff += brcost_diff;
+ }
+ }
+ *cost_low = cost - diff;
+
+ return cost;
+}
+
+static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign,
+ int coeff_ctx, int dc_sign_ctx,
+ const LV_MAP_COEFF_COST *txb_costs,
+ int bhl, TX_CLASS tx_class) {
+ int cost = 0;
+ cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+ if (abs_qc != 0) {
+ if (ci == 0) {
+ cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+ } else {
+ cost += av1_cost_literal(1);
+ }
+ if (abs_qc > NUM_BASE_LEVELS) {
+ int br_ctx;
+ br_ctx = get_br_ctx_eob(ci, bhl, tx_class);
+ cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
+ }
+ }
+ return cost;
+}
+
+static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
+ int sign, int coeff_ctx,
+ int dc_sign_ctx,
+ const LV_MAP_COEFF_COST *txb_costs,
+ int bhl, TX_CLASS tx_class,
+ const uint8_t *levels) {
+ int cost = 0;
+ if (is_last) {
+ cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+ } else {
+ cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+ }
+ if (abs_qc != 0) {
+ if (ci == 0) {
+ cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+ } else {
+ cost += av1_cost_literal(1);
+ }
+ if (abs_qc > NUM_BASE_LEVELS) {
+ int br_ctx;
+ if (is_last)
+ br_ctx = get_br_ctx_eob(ci, bhl, tx_class);
+ else
+ br_ctx = get_br_ctx(levels, ci, bhl, tx_class);
+ cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
+ }
+ }
+ return cost;
+}
+
+static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv,
+ int shift, tran_low_t *qc_low,
+ tran_low_t *dqc_low) {
+ tran_low_t abs_qc_low = abs_qc - 1;
+ *qc_low = (-sign ^ abs_qc_low) + sign;
+ assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low);
+ tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+ *dqc_low = (-sign ^ abs_dqc_low) + sign;
+ assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low);
+}
+
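+// Fast eob trimming: walk back from the current eob and zero trailing
+// coefficients that either quantized to zero or whose scaled magnitude
+// (abs_coeff << (1 + shift)) falls below an approximate zero-bin of
+// dequant * 198/128, stopping at the first coefficient that is kept.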
+static INLINE void update_coeff_eob_fast(int *eob, int shift,
+ const int16_t *dequant_ptr,
+ const int16_t *scan,
+ const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr) {
+ // TODO(sarahparker) make this work for aomqm
+ int eob_out = *eob;
+ int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
+ dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
+
+ for (int i = *eob - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
+ eob_out--;
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ } else {
+ break;
+ }
+ }
+
+ *eob = eob_out;
+}
+#endif // AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
diff --git a/third_party/aom/av1/encoder/var_based_part.c b/third_party/aom/av1/encoder/var_based_part.c
new file mode 100644
index 0000000000..f664795153
--- /dev/null
+++ b/third_party/aom/av1/encoder/var_based_part.c
@@ -0,0 +1,1914 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/rdopt_utils.h"
+
+// Possible values for the force_split variable while evaluating
+// variance-based partitioning.
+enum {
+ // Evaluate all partition types
+ PART_EVAL_ALL = 0,
+ // Force PARTITION_SPLIT
+ PART_EVAL_ONLY_SPLIT = 1,
+ // Force PARTITION_NONE
+ PART_EVAL_ONLY_NONE = 2
+} UENUM1BYTE(PART_EVAL_STATUS);
+
+typedef struct {
+ VPVariance *part_variances;
+ VPartVar *split[4];
+} variance_node;
+
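+// Collapse a size-specific variance tree node (VP128x128 down to VP4x4) into
+// a generic variance_node view: the whole-block variances plus the four
+// split children's variances.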
+static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize,
+ variance_node *node) {
+ node->part_variances = NULL;
+ switch (bsize) {
+ case BLOCK_128X128: {
+ VP128x128 *vt = (VP128x128 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_64X64: {
+ VP64x64 *vt = (VP64x64 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_32X32: {
+ VP32x32 *vt = (VP32x32 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_16X16: {
+ VP16x16 *vt = (VP16x16 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_8X8: {
+ VP8x8 *vt = (VP8x8 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ default: {
+ VP4x4 *vt = (VP4x4 *)data;
+ assert(bsize == BLOCK_4X4);
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx];
+ break;
+ }
+ }
+}
+
+// Set variance values given sum square error, sum error, count.
+static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c,
+ VPartVar *v) {
+ v->sum_square_error = s2;
+ v->sum_error = s;
+ v->log2_count = c;
+}
+
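+// Finalize a VPartVar from its accumulated sums:
+//   variance = 256 * (sum_square_error - sum_error^2 / 2^log2_count)
+//                  / 2^log2_count,
+// i.e. 256 * (E[x^2] - E[x]^2) over the 2^log2_count samples.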
+static AOM_INLINE void get_variance(VPartVar *v) {
+ v->variance =
+ (int)(256 * (v->sum_square_error -
+ (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
+ v->log2_count)) >>
+ v->log2_count);
+}
+
+static AOM_INLINE void sum_2_variances(const VPartVar *a, const VPartVar *b,
+ VPartVar *r) {
+ assert(a->log2_count == b->log2_count);
+ fill_variance(a->sum_square_error + b->sum_square_error,
+ a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
+static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+ variance_node node;
+ memset(&node, 0, sizeof(node));
+ tree_to_node(data, bsize, &node);
+ sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
+ sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
+ sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
+ sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
+ sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
+ &node.part_variances->none);
+}
+
+static AOM_INLINE void set_block_size(AV1_COMP *const cpi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ if (cpi->common.mi_params.mi_cols > mi_col &&
+ cpi->common.mi_params.mi_rows > mi_row) {
+ CommonModeInfoParams *mi_params = &cpi->common.mi_params;
+ const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+ const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col);
+ MB_MODE_INFO *mi = mi_params->mi_grid_base[mi_grid_idx] =
+ &mi_params->mi_alloc[mi_alloc_idx];
+ mi->bsize = bsize;
+ }
+}
+
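+// Try to terminate partitioning at `bsize` for the block at (mi_row, mi_col):
+// returns 1 (writing the chosen size into the mi grid) when force_split
+// allows it and the none or vert/horz variances pass `threshold`, subject to
+// tile-boundary checks; returns 0 to request a further four-way split.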
+static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, void *data,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int64_t threshold, BLOCK_SIZE bsize_min,
+ PART_EVAL_STATUS force_split) {
+ AV1_COMMON *const cm = &cpi->common;
+ variance_node vt;
+ const int block_width = mi_size_wide[bsize];
+ const int block_height = mi_size_high[bsize];
+ int bs_width_check = block_width;
+ int bs_height_check = block_height;
+ int bs_width_vert_check = block_width >> 1;
+ int bs_height_horiz_check = block_height >> 1;
+ // On the right and bottom boundaries we only need to check
+ // whether half the bsize fits, because the boundary is extended
+ // up to 64. So do this check only for sb_size = 64X64.
+ if (cm->seq_params->sb_size == BLOCK_64X64) {
+ if (tile->mi_col_end == cm->mi_params.mi_cols) {
+ bs_width_check = (block_width >> 1) + 1;
+ bs_width_vert_check = (block_width >> 2) + 1;
+ }
+ if (tile->mi_row_end == cm->mi_params.mi_rows) {
+ bs_height_check = (block_height >> 1) + 1;
+ bs_height_horiz_check = (block_height >> 2) + 1;
+ }
+ }
+
+ assert(block_height == block_width);
+ tree_to_node(data, bsize, &vt);
+
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_check <= tile->mi_row_end &&
+ force_split == PART_EVAL_ONLY_NONE) {
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ return 1;
+ }
+ if (force_split == PART_EVAL_ONLY_SPLIT) return 0;
+
+ // For bsize == bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select this
+ // size if the variance is below the threshold; otherwise split is selected.
+ // There is no vert/horiz split check, as there are too few samples for the
+ // variance.
+ if (bsize == bsize_min) {
+ // Variance already computed to set the force_split.
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_check <= tile->mi_row_end &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ return 1;
+ }
+ return 0;
+ } else if (bsize > bsize_min) {
+ // Variance already computed to set the force_split.
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+ // For key frame: take split for bsize above 32X32 or very high variance.
+ if (frame_is_intra_only(cm) &&
+ (bsize > BLOCK_32X32 ||
+ vt.part_variances->none.variance > (threshold << 4))) {
+ return 0;
+ }
+ // If variance is low, take the bsize (no split).
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_check <= tile->mi_row_end &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ return 1;
+ }
+ // Check vertical split.
+ if (mi_row + bs_height_check <= tile->mi_row_end &&
+ mi_col + bs_width_vert_check <= tile->mi_col_end) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+ BLOCK_SIZE plane_bsize =
+ get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ get_variance(&vt.part_variances->vert[0]);
+ get_variance(&vt.part_variances->vert[1]);
+ if (vt.part_variances->vert[0].variance < threshold &&
+ vt.part_variances->vert[1].variance < threshold &&
+ plane_bsize < BLOCK_INVALID) {
+ set_block_size(cpi, mi_row, mi_col, subsize);
+ set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize);
+ return 1;
+ }
+ }
+ // Check horizontal split.
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_horiz_check <= tile->mi_row_end) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ BLOCK_SIZE plane_bsize =
+ get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ get_variance(&vt.part_variances->horz[0]);
+ get_variance(&vt.part_variances->horz[1]);
+ if (vt.part_variances->horz[0].variance < threshold &&
+ vt.part_variances->horz[1].variance < threshold &&
+ plane_bsize < BLOCK_INVALID) {
+ set_block_size(cpi, mi_row, mi_col, subsize);
+ set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize);
+ return 1;
+ }
+ }
+ return 0;
+ }
+ return 0;
+}
+
+static AOM_INLINE int all_blks_inside(int x16_idx, int y16_idx, int pixels_wide,
+ int pixels_high) {
+ int all_inside = 1;
+ for (int idx = 0; idx < 4; idx++) {
+ all_inside &= ((x16_idx + GET_BLK_IDX_X(idx, 3)) < pixels_wide);
+ all_inside &= ((y16_idx + GET_BLK_IDX_Y(idx, 3)) < pixels_high);
+ }
+ return all_inside;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// TODO(yunqingwang): Perform average of four 8x8 blocks similar to lowbd
+static AOM_INLINE void fill_variance_8x8avg_highbd(
+ const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf,
+ int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide,
+ int pixels_high) {
+ for (int idx = 0; idx < 4; idx++) {
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int src_avg = aom_highbd_avg_8x8(src_buf + y8_idx * src_stride + x8_idx,
+ src_stride);
+ int dst_avg = aom_highbd_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx,
+ dst_stride);
+
+ sum = src_avg - dst_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[idx].part_variances.none);
+ }
+}
+#endif
+
+static AOM_INLINE void fill_variance_8x8avg_lowbd(
+ const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf,
+ int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide,
+ int pixels_high) {
+ unsigned int sse[4] = { 0 };
+ int sum[4] = { 0 };
+
+ if (all_blks_inside(x16_idx, y16_idx, pixels_wide, pixels_high)) {
+ int src_avg[4];
+ int dst_avg[4];
+ aom_avg_8x8_quad(src_buf, src_stride, x16_idx, y16_idx, src_avg);
+ aom_avg_8x8_quad(dst_buf, dst_stride, x16_idx, y16_idx, dst_avg);
+ for (int idx = 0; idx < 4; idx++) {
+ sum[idx] = src_avg[idx] - dst_avg[idx];
+ sse[idx] = sum[idx] * sum[idx];
+ }
+ } else {
+ for (int idx = 0; idx < 4; idx++) {
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3);
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int src_avg =
+ aom_avg_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride);
+ int dst_avg =
+ aom_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx, dst_stride);
+ sum[idx] = src_avg - dst_avg;
+ sse[idx] = sum[idx] * sum[idx];
+ }
+ }
+ }
+
+ for (int idx = 0; idx < 4; idx++) {
+ fill_variance(sse[idx], sum[idx], 0, &vst->split[idx].part_variances.none);
+ }
+}
+
+// Obtain the parameters required to calculate variance (such as sum, sse,
+// etc.) at the 8x8 sub-block level for a given 16x16 block.
+// The function can be called only when is_key_frame is false since sum is
+// computed between source and reference frames.
+static AOM_INLINE void fill_variance_8x8avg(
+ const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf,
+ int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int highbd_flag,
+ int pixels_wide, int pixels_high) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd_flag) {
+ fill_variance_8x8avg_highbd(src_buf, src_stride, dst_buf, dst_stride,
+ x16_idx, y16_idx, vst, pixels_wide,
+ pixels_high);
+ return;
+ }
+#else
+ (void)highbd_flag;
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ fill_variance_8x8avg_lowbd(src_buf, src_stride, dst_buf, dst_stride, x16_idx,
+ y16_idx, vst, pixels_wide, pixels_high);
+}
+
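+// Return the spread (max - min over the four 8x8 sub-blocks) of the per-8x8
+// (max - min) pixel difference between source and reconstruction for a
+// 16x16 block.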
+static int compute_minmax_8x8(const uint8_t *src_buf, int src_stride,
+ const uint8_t *dst_buf, int dst_stride,
+ int x16_idx, int y16_idx,
+#if CONFIG_AV1_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide, int pixels_high) {
+ int minmax_max = 0;
+ int minmax_min = 255;
+ // Loop over the 4 8x8 subblocks.
+ for (int idx = 0; idx < 4; idx++) {
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3);
+ int min = 0;
+ int max = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_minmax_8x8(
+ src_buf + y8_idx * src_stride + x8_idx, src_stride,
+ dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, &max);
+ } else {
+ aom_minmax_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride,
+ dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min,
+ &max);
+ }
+#else
+ aom_minmax_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride,
+ dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min,
+ &max);
+#endif
+ if ((max - min) > minmax_max) minmax_max = (max - min);
+ if ((max - min) < minmax_min) minmax_min = (max - min);
+ }
+ }
+ return (minmax_max - minmax_min);
+}
+
+// Function to compute the average and variance of a 4x4 sub-block.
+// The function can be called only when is_key_frame is true since sum is
+// computed using source frame only.
+static AOM_INLINE void fill_variance_4x4avg(const uint8_t *src_buf,
+ int src_stride, int x8_idx,
+ int y8_idx, VP8x8 *vst,
+#if CONFIG_AV1_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide, int pixels_high,
+ int border_offset_4x4) {
+ for (int idx = 0; idx < 4; idx++) {
+ const int x4_idx = x8_idx + GET_BLK_IDX_X(idx, 2);
+ const int y4_idx = y8_idx + GET_BLK_IDX_Y(idx, 2);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x4_idx < pixels_wide - border_offset_4x4 &&
+ y4_idx < pixels_high - border_offset_4x4) {
+ int src_avg;
+ int dst_avg = 128;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ src_avg = aom_highbd_avg_4x4(src_buf + y4_idx * src_stride + x4_idx,
+ src_stride);
+ } else {
+ src_avg =
+ aom_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride);
+ }
+#else
+ src_avg = aom_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride);
+#endif
+
+ sum = src_avg - dst_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[idx].part_variances.none);
+ }
+}
+
+// TODO(kyslov) Bring back threshold adjustment based on content state
+static int64_t scale_part_thresh_content(int64_t threshold_base, int speed,
+ int width, int height,
+ int non_reference_frame) {
+ (void)width;
+ (void)height;
+ int64_t threshold = threshold_base;
+ if (non_reference_frame) threshold = (3 * threshold) >> 1;
+ if (speed >= 8) {
+ return (5 * threshold) >> 2;
+ }
+ return threshold;
+}
+
+// Tune the thresholds more or less aggressively to prefer larger partitions.
+static AOM_INLINE void tune_thresh_based_on_qindex(
+ AV1_COMP *cpi, int64_t thresholds[], uint64_t block_sad, int current_qindex,
+ int num_pixels, bool is_segment_id_boosted, int source_sad_nonrd,
+ int lighting_change) {
+ double weight;
+ if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 3) {
+ const int win = 20;
+ if (current_qindex < QINDEX_LARGE_BLOCK_THR - win)
+ weight = 1.0;
+ else if (current_qindex > QINDEX_LARGE_BLOCK_THR + win)
+ weight = 0.0;
+ else
+ weight =
+ 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + win) / (2.0 * win);
+ if (num_pixels > RESOLUTION_480P) {
+ for (int i = 0; i < 4; i++) {
+ thresholds[i] <<= 1;
+ }
+ }
+ if (num_pixels <= RESOLUTION_288P) {
+ thresholds[3] = INT64_MAX;
+ if (is_segment_id_boosted == false) {
+ thresholds[1] <<= 2;
+ thresholds[2] <<= (source_sad_nonrd <= kLowSad) ? 5 : 4;
+ } else {
+ thresholds[1] <<= 1;
+ thresholds[2] <<= 3;
+ }
+ // Allow a split down to 8x8 for superblocks where part of the block has a
+ // moving boundary: allow it for superblocks with source_sad above the
+ // threshold, but avoid very large source_sad or high source content, so
+ // as not to produce too many 8x8 blocks within a superblock.
+ uint64_t avg_source_sad_thresh = 25000;
+ uint64_t block_sad_low = 25000;
+ uint64_t block_sad_high = 50000;
+ if (cpi->svc.temporal_layer_id == 0 &&
+ cpi->svc.number_temporal_layers > 1) {
+ // Increase the sad thresholds for base TL0, as reference/LAST is
+ // 2/4 frames behind (for 2/3 #TL).
+ avg_source_sad_thresh = 40000;
+ block_sad_high = 70000;
+ }
+ if (is_segment_id_boosted == false &&
+ cpi->rc.avg_source_sad < avg_source_sad_thresh &&
+ block_sad > block_sad_low && block_sad < block_sad_high &&
+ !lighting_change) {
+ thresholds[2] = (3 * thresholds[2]) >> 2;
+ thresholds[3] = thresholds[2] << 3;
+ }
+ // Condition the increase of partition thresholds on the segment
+ // and the content. Avoid the increase for superblocks which have
+ // high source sad, unless the whole frame has very high motion
+ // (i.e., cpi->rc.avg_source_sad is very large, in which case all blocks
+ // have high source sad).
+ } else if (num_pixels > RESOLUTION_480P && is_segment_id_boosted == false &&
+ (source_sad_nonrd != kHighSad ||
+ cpi->rc.avg_source_sad > 50000)) {
+ thresholds[0] = (3 * thresholds[0]) >> 1;
+ thresholds[3] = INT64_MAX;
+ if (current_qindex > QINDEX_LARGE_BLOCK_THR) {
+ thresholds[1] =
+ (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]);
+ thresholds[2] =
+ (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]);
+ }
+ } else if (current_qindex > QINDEX_LARGE_BLOCK_THR &&
+ is_segment_id_boosted == false &&
+ (source_sad_nonrd != kHighSad ||
+ cpi->rc.avg_source_sad > 50000)) {
+ thresholds[1] =
+ (int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]);
+ thresholds[2] =
+ (int)((1 - weight) * (thresholds[2] << 4) + weight * thresholds[2]);
+ thresholds[3] = INT64_MAX;
+ }
+ } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 2) {
+ thresholds[1] <<= (source_sad_nonrd <= kLowSad) ? 2 : 0;
+ thresholds[2] =
+ (source_sad_nonrd <= kLowSad) ? (3 * thresholds[2]) : thresholds[2];
+ } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 1) {
+ const int fac = (source_sad_nonrd <= kLowSad) ? 2 : 1;
+ if (current_qindex < QINDEX_LARGE_BLOCK_THR - 45)
+ weight = 1.0;
+ else if (current_qindex > QINDEX_LARGE_BLOCK_THR + 45)
+ weight = 0.0;
+ else
+ weight = 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + 45) / (2.0 * 45);
+ thresholds[1] =
+ (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]);
+ thresholds[2] =
+ (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]);
+ thresholds[3] =
+ (int)((1 - weight) * (thresholds[3] << fac) + weight * thresholds[3]);
+ }
+ if (cpi->sf.part_sf.disable_8x8_part_based_on_qidx && (current_qindex < 128))
+ thresholds[3] = INT64_MAX;
+}
+
+static void set_vbp_thresholds_key_frame(AV1_COMP *cpi, int64_t thresholds[],
+ int64_t threshold_base,
+ int threshold_left_shift,
+ int num_pixels) {
+ if (cpi->sf.rt_sf.force_large_partition_blocks_intra) {
+ const int shift_steps =
+ threshold_left_shift - (cpi->oxcf.mode == ALLINTRA ? 7 : 8);
+ assert(shift_steps >= 0);
+ threshold_base <<= shift_steps;
+ }
+ thresholds[0] = threshold_base;
+ thresholds[1] = threshold_base;
+ if (num_pixels < RESOLUTION_720P) {
+ thresholds[2] = threshold_base / 3;
+ thresholds[3] = threshold_base >> 1;
+ } else {
+ int shift_val = 2;
+ if (cpi->sf.rt_sf.force_large_partition_blocks_intra) {
+ shift_val = 0;
+ }
+
+ thresholds[2] = threshold_base >> shift_val;
+ thresholds[3] = threshold_base >> shift_val;
+ }
+ thresholds[4] = threshold_base << 2;
+}
+
+static AOM_INLINE void tune_thresh_based_on_resolution(
+ AV1_COMP *cpi, int64_t thresholds[], int64_t threshold_base,
+ int current_qindex, int source_sad_rd, int num_pixels) {
+ if (num_pixels >= RESOLUTION_720P) thresholds[3] = thresholds[3] << 1;
+ if (num_pixels <= RESOLUTION_288P) {
+ const int qindex_thr[5][2] = {
+ { 200, 220 }, { 140, 170 }, { 120, 150 }, { 200, 210 }, { 170, 220 },
+ };
+ int th_idx = 0;
+ if (cpi->sf.rt_sf.var_part_based_on_qidx >= 1)
+ th_idx =
+ (source_sad_rd <= kLowSad) ? cpi->sf.rt_sf.var_part_based_on_qidx : 0;
+ if (cpi->sf.rt_sf.var_part_based_on_qidx >= 3)
+ th_idx = cpi->sf.rt_sf.var_part_based_on_qidx;
+ const int qindex_low_thr = qindex_thr[th_idx][0];
+ const int qindex_high_thr = qindex_thr[th_idx][1];
+ if (current_qindex >= qindex_high_thr) {
+ threshold_base = (5 * threshold_base) >> 1;
+ thresholds[1] = threshold_base >> 3;
+ thresholds[2] = threshold_base << 2;
+ thresholds[3] = threshold_base << 5;
+ } else if (current_qindex < qindex_low_thr) {
+ thresholds[1] = threshold_base >> 3;
+ thresholds[2] = threshold_base >> 1;
+ thresholds[3] = threshold_base << 3;
+ } else {
+ int64_t qi_diff_low = current_qindex - qindex_low_thr;
+ int64_t qi_diff_high = qindex_high_thr - current_qindex;
+ int64_t threshold_diff = qindex_high_thr - qindex_low_thr;
+ int64_t threshold_base_high = (5 * threshold_base) >> 1;
+
+ threshold_diff = threshold_diff > 0 ? threshold_diff : 1;
+ threshold_base =
+ (qi_diff_low * threshold_base_high + qi_diff_high * threshold_base) /
+ threshold_diff;
+ thresholds[1] = threshold_base >> 3;
+ thresholds[2] = ((qi_diff_low * threshold_base) +
+ qi_diff_high * (threshold_base >> 1)) /
+ threshold_diff;
+ thresholds[3] = ((qi_diff_low * (threshold_base << 5)) +
+ qi_diff_high * (threshold_base << 3)) /
+ threshold_diff;
+ }
+ } else if (num_pixels < RESOLUTION_720P) {
+ thresholds[2] = (5 * threshold_base) >> 2;
+ } else if (num_pixels < RESOLUTION_1080P) {
+ thresholds[2] = threshold_base << 1;
+ } else {
+ // num_pixels >= RESOLUTION_1080P
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ if (num_pixels < RESOLUTION_1440P) {
+ thresholds[2] = (5 * threshold_base) >> 1;
+ } else {
+ thresholds[2] = (7 * threshold_base) >> 1;
+ }
+ } else {
+ if (cpi->oxcf.speed > 7) {
+ thresholds[2] = 6 * threshold_base;
+ } else {
+ thresholds[2] = 3 * threshold_base;
+ }
+ }
+ }
+}
+
+// Increase the partition thresholds for noisy content. Apply it only for
+// superblocks where the sumdiff is low, as we assume the sumdiff of a
+// superblock whose only change is due to noise will be low (i.e., the noise
+// will average out over a large block).
+static AOM_INLINE int64_t tune_thresh_noisy_content(AV1_COMP *cpi,
+ int64_t threshold_base,
+ int content_lowsumdiff,
+ int num_pixels) {
+ AV1_COMMON *const cm = &cpi->common;
+ int64_t updated_thresh_base = threshold_base;
+ if (cpi->noise_estimate.enabled && content_lowsumdiff &&
+ num_pixels > RESOLUTION_480P && cm->current_frame.frame_number > 60) {
+ NOISE_LEVEL noise_level =
+ av1_noise_estimate_extract_level(&cpi->noise_estimate);
+ if (noise_level == kHigh)
+ updated_thresh_base = (5 * updated_thresh_base) >> 1;
+ else if (noise_level == kMedium &&
+ !cpi->sf.rt_sf.prefer_large_partition_blocks)
+ updated_thresh_base = (5 * updated_thresh_base) >> 2;
+ }
+  // TODO(kyslov) Enable var based partition adjustment on temporal denoising
+#if 0 // CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow)
+ updated_thresh_base =
+ av1_scale_part_thresh(updated_thresh_base, cpi->denoiser.denoising_level,
+ content_state, cpi->svc.temporal_layer_id);
+ else
+    updated_thresh_base =
+ scale_part_thresh_content(updated_thresh_base, cpi->oxcf.speed, cm->width,
+ cm->height, cpi->ppi->rtc_ref.non_reference_frame);
+#else
+ // Increase base variance threshold based on content_state/sum_diff level.
+ updated_thresh_base = scale_part_thresh_content(
+ updated_thresh_base, cpi->oxcf.speed, cm->width, cm->height,
+ cpi->ppi->rtc_ref.non_reference_frame);
+#endif
+ return updated_thresh_base;
+}
+
+static AOM_INLINE void set_vbp_thresholds(
+ AV1_COMP *cpi, int64_t thresholds[], uint64_t blk_sad, int qindex,
+ int content_lowsumdiff, int source_sad_nonrd, int source_sad_rd,
+ bool is_segment_id_boosted, int lighting_change) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_key_frame = frame_is_intra_only(cm);
+ const int threshold_multiplier = is_key_frame ? 120 : 1;
+ const int ac_q = av1_ac_quant_QTX(qindex, 0, cm->seq_params->bit_depth);
+ int64_t threshold_base = (int64_t)(threshold_multiplier * ac_q);
+ const int current_qindex = cm->quant_params.base_qindex;
+ const int threshold_left_shift = cpi->sf.rt_sf.var_part_split_threshold_shift;
+ const int num_pixels = cm->width * cm->height;
+
+ if (is_key_frame) {
+ set_vbp_thresholds_key_frame(cpi, thresholds, threshold_base,
+ threshold_left_shift, num_pixels);
+ return;
+ }
+
+ threshold_base = tune_thresh_noisy_content(cpi, threshold_base,
+ content_lowsumdiff, num_pixels);
+ thresholds[0] = threshold_base >> 1;
+ thresholds[1] = threshold_base;
+ thresholds[3] = threshold_base << threshold_left_shift;
+
+ tune_thresh_based_on_resolution(cpi, thresholds, threshold_base,
+ current_qindex, source_sad_rd, num_pixels);
+
+ tune_thresh_based_on_qindex(cpi, thresholds, blk_sad, current_qindex,
+ num_pixels, is_segment_id_boosted,
+ source_sad_nonrd, lighting_change);
+}
+
+// Set the temporal variance low flag for a 64x64 superblock.
+// Only the first 25 entries in the array are used in this case.
+static AOM_INLINE void set_low_temp_var_flag_64x64(
+ CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info,
+ MACROBLOCKD *xd, VP64x64 *vt, const int64_t thresholds[], int mi_col,
+ int mi_row) {
+ if (xd->mi[0]->bsize == BLOCK_64X64) {
+ if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
+ part_info->variance_low[0] = 1;
+ } else if (xd->mi[0]->bsize == BLOCK_64X32) {
+ for (int part_idx = 0; part_idx < 2; part_idx++) {
+ if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2))
+ part_info->variance_low[part_idx + 1] = 1;
+ }
+ } else if (xd->mi[0]->bsize == BLOCK_32X64) {
+ for (int part_idx = 0; part_idx < 2; part_idx++) {
+ if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2))
+ part_info->variance_low[part_idx + 3] = 1;
+ }
+ } else {
+ static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+ const int idx_str = mi_params->mi_stride * (mi_row + idx[lvl1_idx][0]) +
+ mi_col + idx[lvl1_idx][1];
+ MB_MODE_INFO **this_mi = mi_params->mi_grid_base + idx_str;
+
+ if (mi_params->mi_cols <= mi_col + idx[lvl1_idx][1] ||
+ mi_params->mi_rows <= mi_row + idx[lvl1_idx][0])
+ continue;
+
+ if (*this_mi == NULL) continue;
+
+ if ((*this_mi)->bsize == BLOCK_32X32) {
+ int64_t threshold_32x32 = (5 * thresholds[1]) >> 3;
+ if (vt->split[lvl1_idx].part_variances.none.variance < threshold_32x32)
+ part_info->variance_low[lvl1_idx + 5] = 1;
+ } else {
+ // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+ // inside.
+ if ((*this_mi)->bsize == BLOCK_16X16 ||
+ (*this_mi)->bsize == BLOCK_32X16 ||
+ (*this_mi)->bsize == BLOCK_16X32) {
+ for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+ if (vt->split[lvl1_idx]
+ .split[lvl2_idx]
+ .part_variances.none.variance < (thresholds[2] >> 8))
+ part_info->variance_low[(lvl1_idx << 2) + lvl2_idx + 9] = 1;
+ }
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void set_low_temp_var_flag_128x128(
+ CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info,
+ MACROBLOCKD *xd, VP128x128 *vt, const int64_t thresholds[], int mi_col,
+ int mi_row) {
+ if (xd->mi[0]->bsize == BLOCK_128X128) {
+ if (vt->part_variances.none.variance < (thresholds[0] >> 1))
+ part_info->variance_low[0] = 1;
+ } else if (xd->mi[0]->bsize == BLOCK_128X64) {
+ for (int part_idx = 0; part_idx < 2; part_idx++) {
+ if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2))
+ part_info->variance_low[part_idx + 1] = 1;
+ }
+ } else if (xd->mi[0]->bsize == BLOCK_64X128) {
+ for (int part_idx = 0; part_idx < 2; part_idx++) {
+ if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2))
+ part_info->variance_low[part_idx + 3] = 1;
+ }
+ } else {
+ static const int idx64[4][2] = {
+ { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 }
+ };
+ static const int idx32[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+ const int idx_str = mi_params->mi_stride * (mi_row + idx64[lvl1_idx][0]) +
+ mi_col + idx64[lvl1_idx][1];
+ MB_MODE_INFO **mi_64 = mi_params->mi_grid_base + idx_str;
+ if (*mi_64 == NULL) continue;
+ if (mi_params->mi_cols <= mi_col + idx64[lvl1_idx][1] ||
+ mi_params->mi_rows <= mi_row + idx64[lvl1_idx][0])
+ continue;
+ const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3;
+ if ((*mi_64)->bsize == BLOCK_64X64) {
+ if (vt->split[lvl1_idx].part_variances.none.variance < threshold_64x64)
+ part_info->variance_low[5 + lvl1_idx] = 1;
+ } else if ((*mi_64)->bsize == BLOCK_64X32) {
+ for (int part_idx = 0; part_idx < 2; part_idx++)
+ if (vt->split[lvl1_idx].part_variances.horz[part_idx].variance <
+ (threshold_64x64 >> 1))
+ part_info->variance_low[9 + (lvl1_idx << 1) + part_idx] = 1;
+ } else if ((*mi_64)->bsize == BLOCK_32X64) {
+ for (int part_idx = 0; part_idx < 2; part_idx++)
+ if (vt->split[lvl1_idx].part_variances.vert[part_idx].variance <
+ (threshold_64x64 >> 1))
+ part_info->variance_low[17 + (lvl1_idx << 1) + part_idx] = 1;
+ } else {
+ for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+ const int idx_str1 =
+ mi_params->mi_stride * idx32[lvl2_idx][0] + idx32[lvl2_idx][1];
+ MB_MODE_INFO **mi_32 = mi_params->mi_grid_base + idx_str + idx_str1;
+ if (*mi_32 == NULL) continue;
+
+ if (mi_params->mi_cols <=
+ mi_col + idx64[lvl1_idx][1] + idx32[lvl2_idx][1] ||
+ mi_params->mi_rows <=
+ mi_row + idx64[lvl1_idx][0] + idx32[lvl2_idx][0])
+ continue;
+ const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3;
+ if ((*mi_32)->bsize == BLOCK_32X32) {
+ if (vt->split[lvl1_idx]
+ .split[lvl2_idx]
+ .part_variances.none.variance < threshold_32x32)
+ part_info->variance_low[25 + (lvl1_idx << 2) + lvl2_idx] = 1;
+ } else {
+ // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+ // inside.
+ if ((*mi_32)->bsize == BLOCK_16X16 ||
+ (*mi_32)->bsize == BLOCK_32X16 ||
+ (*mi_32)->bsize == BLOCK_16X32) {
+ for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) {
+ VPartVar *none_var = &vt->split[lvl1_idx]
+ .split[lvl2_idx]
+ .split[lvl3_idx]
+ .part_variances.none;
+ if (none_var->variance < (thresholds[3] >> 8))
+ part_info->variance_low[41 + (lvl1_idx << 4) +
+ (lvl2_idx << 2) + lvl3_idx] = 1;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void set_low_temp_var_flag(
+ AV1_COMP *cpi, PartitionSearchInfo *part_info, MACROBLOCKD *xd,
+ VP128x128 *vt, int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition,
+ int mi_col, int mi_row, const bool is_small_sb) {
+ AV1_COMMON *const cm = &cpi->common;
+  // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected.
+  // If the temporal variance is small, set the variance_low flag for the
+  // block. The variance threshold can be adjusted; the higher it is, the
+  // more aggressive the skipping.
+ if (ref_frame_partition == LAST_FRAME) {
+ if (is_small_sb)
+ set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd,
+ &(vt->split[0]), thresholds, mi_col, mi_row);
+ else
+ set_low_temp_var_flag_128x128(&cm->mi_params, part_info, xd, vt,
+ thresholds, mi_col, mi_row);
+ }
+}
+
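+// Maps the (row, col) position of a 16x16 block inside a 64x64 superblock to
+// its index in the variance_low[] array (entries 9..24), matching the
+// quad-tree scan order used in set_low_temp_var_flag_64x64(): the 32x32
+// quadrant selects the base index, the 16x16 sub-quadrant the offset.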
+static const int pos_shift_16x16[4][4] = {
+ { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
+};
+
+int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ // Relative indices of MB inside the superblock.
+ const int mi_x = mi_row & 0xF;
+ const int mi_y = mi_col & 0xF;
+ // Relative indices of 16x16 block inside the superblock.
+ const int i = mi_x >> 2;
+ const int j = mi_y >> 2;
+ int force_skip_low_temp_var = 0;
+ // Set force_skip_low_temp_var based on the block size and block offset.
+ switch (bsize) {
+ case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break;
+ case BLOCK_64X32:
+ if (!mi_y && !mi_x) {
+ force_skip_low_temp_var = variance_low[1];
+ } else if (!mi_y && mi_x) {
+ force_skip_low_temp_var = variance_low[2];
+ }
+ break;
+ case BLOCK_32X64:
+ if (!mi_y && !mi_x) {
+ force_skip_low_temp_var = variance_low[3];
+ } else if (mi_y && !mi_x) {
+ force_skip_low_temp_var = variance_low[4];
+ }
+ break;
+ case BLOCK_32X32:
+ if (!mi_y && !mi_x) {
+ force_skip_low_temp_var = variance_low[5];
+ } else if (mi_y && !mi_x) {
+ force_skip_low_temp_var = variance_low[6];
+ } else if (!mi_y && mi_x) {
+ force_skip_low_temp_var = variance_low[7];
+ } else if (mi_y && mi_x) {
+ force_skip_low_temp_var = variance_low[8];
+ }
+ break;
+ case BLOCK_32X16:
+ case BLOCK_16X32:
+ case BLOCK_16X16:
+ force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
+ break;
+ default: break;
+ }
+
+ return force_skip_low_temp_var;
+}
+
+int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ int force_skip_low_temp_var = 0;
+ int x, y;
+ x = (mi_col & 0x1F) >> 4;
+ // y = (mi_row & 0x1F) >> 4;
+ // const int idx64 = (y << 1) + x;
+ y = (mi_row & 0x17) >> 3;
+ const int idx64 = y + x;
+
+ x = (mi_col & 0xF) >> 3;
+ // y = (mi_row & 0xF) >> 3;
+ // const int idx32 = (y << 1) + x;
+ y = (mi_row & 0xB) >> 2;
+ const int idx32 = y + x;
+
+ x = (mi_col & 0x7) >> 2;
+ // y = (mi_row & 0x7) >> 2;
+ // const int idx16 = (y << 1) + x;
+ y = (mi_row & 0x5) >> 1;
+ const int idx16 = y + x;
+ // Set force_skip_low_temp_var based on the block size and block offset.
+ switch (bsize) {
+ case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break;
+ case BLOCK_128X64:
+ assert((mi_col & 0x1F) == 0);
+ force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)];
+ break;
+ case BLOCK_64X128:
+ assert((mi_row & 0x1F) == 0);
+ force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)];
+ break;
+ case BLOCK_64X64:
+ // Location of this 64x64 block inside the 128x128 superblock
+ force_skip_low_temp_var = variance_low[5 + idx64];
+ break;
+ case BLOCK_64X32:
+ x = (mi_col & 0x1F) >> 4;
+ y = (mi_row & 0x1F) >> 3;
+ /*
+ .---------------.---------------.
+      | x=0,y=0,idx=0 | x=1,y=0,idx=2 |
+ :---------------+---------------:
+ | x=0,y=1,idx=1 | x=1,y=1,idx=3 |
+ :---------------+---------------:
+ | x=0,y=2,idx=4 | x=1,y=2,idx=6 |
+ :---------------+---------------:
+ | x=0,y=3,idx=5 | x=1,y=3,idx=7 |
+ '---------------'---------------'
+ */
+ const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2);
+ force_skip_low_temp_var = variance_low[9 + idx64x32];
+ break;
+ case BLOCK_32X64:
+ x = (mi_col & 0x1F) >> 3;
+ y = (mi_row & 0x1F) >> 4;
+ const int idx32x64 = (y << 2) + x;
+ force_skip_low_temp_var = variance_low[17 + idx32x64];
+ break;
+ case BLOCK_32X32:
+ force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32];
+ break;
+ case BLOCK_32X16:
+ case BLOCK_16X32:
+ case BLOCK_16X16:
+ force_skip_low_temp_var =
+ variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16];
+ break;
+ default: break;
+ }
+ return force_skip_low_temp_var;
+}
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int qindex,
+ int content_lowsumdiff) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) {
+ return;
+ } else {
+ set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, 0, qindex,
+ content_lowsumdiff, 0, 0, 0, 0);
+ // The threshold below is not changed locally.
+ cpi->vbp_info.threshold_minmax = 15 + (qindex >> 3);
+ }
+}
+
+static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int y_sad,
+ unsigned int y_sad_g,
+ unsigned int y_sad_alt, bool is_key_frame,
+ bool zero_motion, unsigned int *uv_sad) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ int shift_upper_limit = 1;
+ int shift_lower_limit = 3;
+ int fac_uv = 6;
+ if (is_key_frame || cpi->oxcf.tool_cfg.enable_monochrome) return;
+
+  // Use a lower threshold (more conservative in setting the color flag) for
+  // higher-resolution non-screen content, which tends to have more camera
+  // noise. Since this may be used to skip compound mode in nonrd pickmode,
+  // which is generally more effective for higher resolutions, it is better
+  // to be more conservative.
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+ if (cpi->common.width * cpi->common.height >= RESOLUTION_1080P)
+ fac_uv = 3;
+ else
+ fac_uv = 5;
+ }
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->rc.high_source_sad) {
+ shift_lower_limit = 7;
+ } else if (source_sad_nonrd >= kMedSad && x->source_variance > 500 &&
+ cpi->common.width * cpi->common.height >= 640 * 360) {
+ shift_upper_limit = 2;
+ shift_lower_limit = source_sad_nonrd > kMedSad ? 5 : 4;
+ }
+
+ MB_MODE_INFO *mi = xd->mi[0];
+ const AV1_COMMON *const cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ const YV12_BUFFER_CONFIG *yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, LAST_FRAME);
+ struct buf_2d dst;
+ unsigned int uv_sad_g = 0;
+ unsigned int uv_sad_alt = 0;
+
+ for (int plane = AOM_PLANE_U; plane < MAX_MB_PLANE; ++plane) {
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ if (bs != BLOCK_INVALID) {
+ // For last:
+ if (zero_motion) {
+ if (mi->ref_frame[0] == LAST_FRAME) {
+ uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride);
+ } else {
+ uint8_t *src = (plane == 1) ? yv12->u_buffer : yv12->v_buffer;
+ setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12->uv_crop_width,
+ yv12->uv_crop_height, yv12->uv_stride, xd->mi_row,
+ xd->mi_col, sf, xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y);
+
+ uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, dst.buf, dst.stride);
+ }
+ } else {
+ uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride);
+ }
+
+ // For golden:
+ if (y_sad_g != UINT_MAX) {
+ uint8_t *src = (plane == 1) ? yv12_g->u_buffer : yv12_g->v_buffer;
+ setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_g->uv_crop_width,
+ yv12_g->uv_crop_height, yv12_g->uv_stride, xd->mi_row,
+ xd->mi_col, sf, xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y);
+ uv_sad_g = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, dst.buf,
+ dst.stride);
+ }
+
+ // For altref:
+ if (y_sad_alt != UINT_MAX) {
+ uint8_t *src = (plane == 1) ? yv12_alt->u_buffer : yv12_alt->v_buffer;
+ setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_alt->uv_crop_width,
+ yv12_alt->uv_crop_height, yv12_alt->uv_stride,
+ xd->mi_row, xd->mi_col, sf,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y);
+ uv_sad_alt = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+ dst.buf, dst.stride);
+ }
+ }
+
+ if (uv_sad[plane - 1] > (y_sad >> shift_upper_limit))
+ x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 1;
+ else if (uv_sad[plane - 1] < (y_sad >> shift_lower_limit))
+ x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 0;
+ // Borderline case: to be refined at coding block level in nonrd_pickmode,
+ // for coding block size < sb_size.
+ else
+ x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 2;
+
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(plane)] =
+ uv_sad_g > y_sad_g / fac_uv;
+ x->color_sensitivity_sb_alt[COLOR_SENS_IDX(plane)] =
+ uv_sad_alt > y_sad_alt / fac_uv;
+ }
+}
+
+static void fill_variance_tree_leaves(
+ AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, PART_EVAL_STATUS *force_split,
+ int avg_16x16[][4], int maxvar_16x16[][4], int minvar_16x16[][4],
+ int64_t *thresholds, const uint8_t *src_buf, int src_stride,
+ const uint8_t *dst_buf, int dst_stride, bool is_key_frame,
+ const bool is_small_sb) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int num_64x64_blocks = is_small_sb ? 1 : 4;
+ // TODO(kyslov) Bring back compute_minmax_variance with content type detection
+ const int compute_minmax_variance = 0;
+ const int segment_id = xd->mi[0]->segment_id;
+ int pixels_wide = 128, pixels_high = 128;
+ int border_offset_4x4 = 0;
+ int temporal_denoising = cpi->sf.rt_sf.use_rtc_tf;
+ // dst_buf pointer is not used for is_key_frame, so it should be NULL.
+ assert(IMPLIES(is_key_frame, dst_buf == NULL));
+ if (is_small_sb) {
+ pixels_wide = 64;
+ pixels_high = 64;
+ }
+ if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
+ if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ temporal_denoising |= cpi->oxcf.noise_sensitivity;
+#endif
+  // If temporal filtering or the temporal denoiser is enabled: since the
+  // source is modified, we need to avoid the 4x4 avg along the superblock
+  // boundary, since the SIMD code will load 8 pixels for the 4x4 avg and so
+  // can access source data outside the superblock (while it is being
+  // modified by the temporal filter). Temporal filtering is never done on
+  // key frames.
+ if (!is_key_frame && temporal_denoising) border_offset_4x4 = 4;
+ for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; blk64_idx++) {
+ const int x64_idx = GET_BLK_IDX_X(blk64_idx, 6);
+ const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 6);
+ const int blk64_scale_idx = blk64_idx << 2;
+ force_split[blk64_idx + 1] = PART_EVAL_ALL;
+
+ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+ const int x32_idx = x64_idx + GET_BLK_IDX_X(lvl1_idx, 5);
+ const int y32_idx = y64_idx + GET_BLK_IDX_Y(lvl1_idx, 5);
+ const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ALL;
+ avg_16x16[blk64_idx][lvl1_idx] = 0;
+ maxvar_16x16[blk64_idx][lvl1_idx] = 0;
+ minvar_16x16[blk64_idx][lvl1_idx] = INT_MAX;
+ for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+ const int x16_idx = x32_idx + GET_BLK_IDX_X(lvl2_idx, 4);
+ const int y16_idx = y32_idx + GET_BLK_IDX_Y(lvl2_idx, 4);
+ const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+ VP16x16 *vst = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+ force_split[split_index] = PART_EVAL_ALL;
+ if (is_key_frame) {
+ // Go down to 4x4 down-sampling for variance.
+ for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) {
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(lvl3_idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(lvl3_idx, 3);
+ VP8x8 *vst2 = &vst->split[lvl3_idx];
+ fill_variance_4x4avg(src_buf, src_stride, x8_idx, y8_idx, vst2,
+#if CONFIG_AV1_HIGHBITDEPTH
+ xd->cur_buf->flags,
+#endif
+ pixels_wide, pixels_high, border_offset_4x4);
+ }
+ } else {
+ fill_variance_8x8avg(src_buf, src_stride, dst_buf, dst_stride,
+ x16_idx, y16_idx, vst, is_cur_buf_hbd(xd),
+ pixels_wide, pixels_high);
+
+ fill_variance_tree(vst, BLOCK_16X16);
+ VPartVar *none_var = &vt->split[blk64_idx]
+ .split[lvl1_idx]
+ .split[lvl2_idx]
+ .part_variances.none;
+ get_variance(none_var);
+ const int val_none_var = none_var->variance;
+ avg_16x16[blk64_idx][lvl1_idx] += val_none_var;
+ minvar_16x16[blk64_idx][lvl1_idx] =
+ AOMMIN(minvar_16x16[blk64_idx][lvl1_idx], val_none_var);
+ maxvar_16x16[blk64_idx][lvl1_idx] =
+ AOMMAX(maxvar_16x16[blk64_idx][lvl1_idx], val_none_var);
+ if (val_none_var > thresholds[3]) {
+ // 16X16 variance is above threshold for split, so force split to
+ // 8x8 for this 16x16 block (this also forces splits for upper
+ // levels).
+ force_split[split_index] = PART_EVAL_ONLY_SPLIT;
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ } else if (!cyclic_refresh_segment_id_boosted(segment_id) &&
+ compute_minmax_variance && val_none_var > thresholds[2]) {
+ // We have some nominal amount of 16x16 variance (based on average),
+ // compute the minmax over the 8x8 sub-blocks, and if above
+ // threshold, force split to 8x8 block for this 16x16 block.
+ int minmax = compute_minmax_8x8(src_buf, src_stride, dst_buf,
+ dst_stride, x16_idx, y16_idx,
+#if CONFIG_AV1_HIGHBITDEPTH
+ xd->cur_buf->flags,
+#endif
+ pixels_wide, pixels_high);
+ const int thresh_minmax = (int)cpi->vbp_info.threshold_minmax;
+ if (minmax > thresh_minmax) {
+ force_split[split_index] = PART_EVAL_ONLY_SPLIT;
+ force_split[5 + blk64_scale_idx + lvl1_idx] =
+ PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void set_ref_frame_for_partition(
+ AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ MV_REFERENCE_FRAME *ref_frame_partition, MB_MODE_INFO *mi,
+ unsigned int *y_sad, unsigned int *y_sad_g, unsigned int *y_sad_alt,
+ const YV12_BUFFER_CONFIG *yv12_g, const YV12_BUFFER_CONFIG *yv12_alt,
+ int mi_row, int mi_col, int num_planes) {
+ AV1_COMMON *const cm = &cpi->common;
+ const bool is_set_golden_ref_frame =
+ *y_sad_g < 0.9 * *y_sad && *y_sad_g < *y_sad_alt;
+ const bool is_set_altref_ref_frame =
+ *y_sad_alt < 0.9 * *y_sad && *y_sad_alt < *y_sad_g;
+
+ if (is_set_golden_ref_frame) {
+ av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
+ mi->ref_frame[0] = GOLDEN_FRAME;
+ mi->mv[0].as_int = 0;
+ *y_sad = *y_sad_g;
+ *ref_frame_partition = GOLDEN_FRAME;
+ x->nonrd_prune_ref_frame_search = 0;
+ x->sb_me_partition = 0;
+ } else if (is_set_altref_ref_frame) {
+ av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col,
+ get_ref_scale_factors(cm, ALTREF_FRAME), num_planes);
+ mi->ref_frame[0] = ALTREF_FRAME;
+ mi->mv[0].as_int = 0;
+ *y_sad = *y_sad_alt;
+ *ref_frame_partition = ALTREF_FRAME;
+ x->nonrd_prune_ref_frame_search = 0;
+ x->sb_me_partition = 0;
+ } else {
+ *ref_frame_partition = LAST_FRAME;
+ x->nonrd_prune_ref_frame_search =
+ cpi->sf.rt_sf.nonrd_prune_ref_frame_search;
+ }
+}
+
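+// Returns the L1 (Manhattan) distance between two full-pel motion vectors.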
+static AOM_FORCE_INLINE int mv_distance(const FULLPEL_MV *mv0,
+ const FULLPEL_MV *mv1) {
+ return abs(mv0->row - mv1->row) + abs(mv0->col - mv1->col);
+}
+
+static AOM_INLINE void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x,
+ unsigned int *y_sad,
+ bool is_small_sb,
+ int est_motion) {
+ const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ // TODO(yunqingwang@google.com): test if this condition works with other
+ // speeds.
+ if (est_motion > 2 && source_sad_nonrd > kMedSad) return;
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+ MB_MODE_INFO *mi = xd->mi[0];
+
+ unsigned int above_y_sad = UINT_MAX;
+ unsigned int left_y_sad = UINT_MAX;
+ FULLPEL_MV above_mv = kZeroFullMv;
+ FULLPEL_MV left_mv = kZeroFullMv;
+ SubpelMvLimits subpel_mv_limits;
+ const MV dummy_mv = { 0, 0 };
+ av1_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, &dummy_mv);
+
+ // Current best MV
+ FULLPEL_MV best_mv = get_fullmv_from_mv(&mi->mv[0].as_mv);
+ const int multi = (est_motion > 2 && source_sad_nonrd > kLowSad) ? 7 : 8;
+
+ if (xd->up_available) {
+ const MB_MODE_INFO *above_mbmi = xd->above_mbmi;
+ if (above_mbmi->mode >= INTRA_MODE_END &&
+ above_mbmi->ref_frame[0] == LAST_FRAME) {
+ MV temp = above_mbmi->mv[0].as_mv;
+ clamp_mv(&temp, &subpel_mv_limits);
+ above_mv = get_fullmv_from_mv(&temp);
+
+ if (mv_distance(&best_mv, &above_mv) > 0) {
+ uint8_t const *ref_buf =
+ get_buf_from_fullmv(&xd->plane[0].pre[0], &above_mv);
+ above_y_sad = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, ref_buf,
+ xd->plane[0].pre[0].stride);
+ }
+ }
+ }
+ if (xd->left_available) {
+ const MB_MODE_INFO *left_mbmi = xd->left_mbmi;
+ if (left_mbmi->mode >= INTRA_MODE_END &&
+ left_mbmi->ref_frame[0] == LAST_FRAME) {
+ MV temp = left_mbmi->mv[0].as_mv;
+ clamp_mv(&temp, &subpel_mv_limits);
+ left_mv = get_fullmv_from_mv(&temp);
+
+ if (mv_distance(&best_mv, &left_mv) > 0 &&
+ mv_distance(&above_mv, &left_mv) > 0) {
+ uint8_t const *ref_buf =
+ get_buf_from_fullmv(&xd->plane[0].pre[0], &left_mv);
+ left_y_sad = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, ref_buf,
+ xd->plane[0].pre[0].stride);
+ }
+ }
+ }
+
+ if (above_y_sad < ((multi * *y_sad) >> 3) && above_y_sad < left_y_sad) {
+ *y_sad = above_y_sad;
+ mi->mv[0].as_mv = get_mv_from_fullmv(&above_mv);
+ clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits);
+ }
+ if (left_y_sad < ((multi * *y_sad) >> 3) && left_y_sad < above_y_sad) {
+ *y_sad = left_y_sad;
+ mi->mv[0].as_mv = get_mv_from_fullmv(&left_mv);
+ clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits);
+ }
+}
+
+static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
+ unsigned int *y_sad_g, unsigned int *y_sad_alt,
+ unsigned int *y_sad_last,
+ MV_REFERENCE_FRAME *ref_frame_partition,
+ struct scale_factors *sf_no_scale, int mi_row,
+ int mi_col, bool is_small_sb, bool scaled_ref_last) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int num_planes = av1_num_planes(cm);
+ bool scaled_ref_golden = false;
+ bool scaled_ref_alt = false;
+ BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+ MB_MODE_INFO *mi = xd->mi[0];
+ const YV12_BUFFER_CONFIG *yv12 =
+ scaled_ref_last ? av1_get_scaled_ref_frame(cpi, LAST_FRAME)
+ : get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ assert(yv12 != NULL);
+ const YV12_BUFFER_CONFIG *yv12_g = NULL;
+ const YV12_BUFFER_CONFIG *yv12_alt = NULL;
+  // Check if LAST is a reference. For spatial layers, always use it as the
+  // reference (it may require reference scaling).
+ int use_last_ref = (cpi->ref_frame_flags & AOM_LAST_FLAG) ||
+ cpi->svc.number_spatial_layers > 1;
+ int use_golden_ref = cpi->ref_frame_flags & AOM_GOLD_FLAG;
+ int use_alt_ref = cpi->ppi->rtc_ref.set_ref_frame_config ||
+ cpi->sf.rt_sf.use_nonrd_altref_frame ||
+ (cpi->sf.rt_sf.use_comp_ref_nonrd &&
+ cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 1);
+
+ // For 1 spatial layer: GOLDEN is another temporal reference.
+ // Check if it should be used as reference for partitioning.
+ if (cpi->svc.number_spatial_layers == 1 && use_golden_ref &&
+ (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) {
+ yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ if (yv12_g && (yv12_g->y_crop_height != cm->height ||
+ yv12_g->y_crop_width != cm->width)) {
+ yv12_g = av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+ scaled_ref_golden = true;
+ }
+ if (yv12_g && yv12_g != yv12) {
+ av1_setup_pre_planes(
+ xd, 0, yv12_g, mi_row, mi_col,
+ scaled_ref_golden ? NULL : get_ref_scale_factors(cm, GOLDEN_FRAME),
+ num_planes);
+ *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+ xd->plane[AOM_PLANE_Y].pre[0].buf,
+ xd->plane[AOM_PLANE_Y].pre[0].stride);
+ }
+ }
+
+ // For 1 spatial layer: ALTREF is another temporal reference.
+ // Check if it should be used as reference for partitioning.
+ if (cpi->svc.number_spatial_layers == 1 && use_alt_ref &&
+ (cpi->ref_frame_flags & AOM_ALT_FLAG) &&
+ (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) {
+ yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+ if (yv12_alt && (yv12_alt->y_crop_height != cm->height ||
+ yv12_alt->y_crop_width != cm->width)) {
+ yv12_alt = av1_get_scaled_ref_frame(cpi, ALTREF_FRAME);
+ scaled_ref_alt = true;
+ }
+ if (yv12_alt && yv12_alt != yv12) {
+ av1_setup_pre_planes(
+ xd, 0, yv12_alt, mi_row, mi_col,
+ scaled_ref_alt ? NULL : get_ref_scale_factors(cm, ALTREF_FRAME),
+ num_planes);
+ *y_sad_alt = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+ xd->plane[AOM_PLANE_Y].pre[0].buf,
+ xd->plane[AOM_PLANE_Y].pre[0].stride);
+ }
+ }
+
+ if (use_last_ref) {
+ const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ av1_setup_pre_planes(
+ xd, 0, yv12, mi_row, mi_col,
+ scaled_ref_last ? NULL : get_ref_scale_factors(cm, LAST_FRAME),
+ num_planes);
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+ mi->bsize = cm->seq_params->sb_size;
+ mi->mv[0].as_int = 0;
+ mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+
+ int est_motion = cpi->sf.rt_sf.estimate_motion_for_var_based_partition;
+ // TODO(b/290596301): Look into adjusting this condition.
+    // There is a regression on color content when
+    // estimate_motion_for_var_based_partition = 3 and motion is high,
+    // so for now force it to 2 based on the superblock sad.
+ if (est_motion > 2 && source_sad_nonrd > kMedSad) est_motion = 2;
+
+ if (est_motion == 1 || est_motion == 2) {
+ if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
+        // For screen, only do int_pro_motion when the spatial variance is
+        // above a threshold and the motion level is above LowSad.
+ if (x->source_variance > 100 && source_sad_nonrd > kLowSad) {
+ int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ int me_search_size_col =
+ is_screen ? 96 : block_size_wide[cm->seq_params->sb_size] >> 1;
+          // For screen, use a larger search size in the row direction to
+          // capture vertical scroll, which can be larger motion.
+ int me_search_size_row =
+ is_screen ? 192 : block_size_high[cm->seq_params->sb_size] >> 1;
+ unsigned int y_sad_zero;
+ *y_sad = av1_int_pro_motion_estimation(
+ cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv,
+ &y_sad_zero, me_search_size_col, me_search_size_row);
+          // The logic below selects whether the motion estimated in
+ // int_pro_motion() will be used in nonrd_pickmode. Only do this
+ // for screen for now.
+ if (is_screen) {
+ unsigned int thresh_sad =
+ (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
+ if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
+ x->sb_me_partition = 1;
+ x->sb_me_mv.as_int = mi->mv[0].as_int;
+ } else {
+ x->sb_me_partition = 0;
+ // Fall back to using zero motion.
+ *y_sad = y_sad_zero;
+ mi->mv[0].as_int = 0;
+ }
+ }
+ }
+ }
+ }
+
+ if (*y_sad == UINT_MAX) {
+ *y_sad = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+ xd->plane[AOM_PLANE_Y].pre[0].buf,
+ xd->plane[AOM_PLANE_Y].pre[0].stride);
+ }
+
+    // Evaluate if the neighbours' MVs give better predictions. The zero MV is
+    // tested already, so only non-zero MVs are tested here. Here the
+    // neighbour blocks are the first blocks above and to the left of this
+    // superblock.
+ if (est_motion >= 2 && (xd->up_available || xd->left_available))
+ evaluate_neighbour_mvs(cpi, x, y_sad, is_small_sb, est_motion);
+
+ *y_sad_last = *y_sad;
+ }
+
+  // Pick the ref frame for partitioning; use the golden or altref frame only
+  // if its sad is lower, with a bias to LAST by a factor of 0.9.
+ set_ref_frame_for_partition(cpi, x, xd, ref_frame_partition, mi, y_sad,
+ y_sad_g, y_sad_alt, yv12_g, yv12_alt, mi_row,
+ mi_col, num_planes);
+
+ // Only calculate the predictor for non-zero MV.
+ if (mi->mv[0].as_int != 0) {
+ if (!scaled_ref_last) {
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+ } else {
+ xd->block_ref_scale_factors[0] = sf_no_scale;
+ xd->block_ref_scale_factors[1] = sf_no_scale;
+ }
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
+ cm->seq_params->sb_size, AOM_PLANE_Y,
+ num_planes - 1);
+ }
+}
+
+// Decides whether to split or merge a 16x16 partition block in variance based
+// partitioning based on the 8x8 sub-block variances.
+static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var(
+ VP16x16 *var_16x16_info, int64_t threshold16) {
+ int max_8x8_var = 0, min_8x8_var = INT_MAX;
+ for (int split_idx = 0; split_idx < 4; split_idx++) {
+ get_variance(&var_16x16_info->split[split_idx].part_variances.none);
+ int this_8x8_var =
+ var_16x16_info->split[split_idx].part_variances.none.variance;
+ max_8x8_var = AOMMAX(this_8x8_var, max_8x8_var);
+ min_8x8_var = AOMMIN(this_8x8_var, min_8x8_var);
+ }
+ // If the difference between maximum and minimum sub-block variances is high,
+ // then only evaluate PARTITION_SPLIT for the 16x16 block. Otherwise, evaluate
+ // only PARTITION_NONE. The shift factor for threshold16 has been derived
+ // empirically.
+ return ((max_8x8_var - min_8x8_var) > (threshold16 << 2))
+ ? PART_EVAL_ONLY_SPLIT
+ : PART_EVAL_ONLY_NONE;
+}
+
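+// Maps the set_zeromv_skip_based_on_source_sad speed-feature level to a
+// source-SAD cutoff: 1 -> kZeroSad, 2 -> kVeryLowSad, 3 (and above) ->
+// kLowSad; level 0 disables the skip.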
+static AOM_INLINE bool is_set_force_zeromv_skip_based_on_src_sad(
+ int set_zeromv_skip_based_on_source_sad, SOURCE_SAD source_sad_nonrd) {
+ if (set_zeromv_skip_based_on_source_sad == 0) return false;
+
+ if (set_zeromv_skip_based_on_source_sad >= 3)
+ return source_sad_nonrd <= kLowSad;
+ else if (set_zeromv_skip_based_on_source_sad >= 2)
+ return source_sad_nonrd <= kVeryLowSad;
+ else if (set_zeromv_skip_based_on_source_sad >= 1)
+ return source_sad_nonrd == kZeroSad;
+
+ return false;
+}
+
+static AOM_INLINE bool set_force_zeromv_skip_for_sb(
+ AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP128x128 *vt,
+ unsigned int *uv_sad, int mi_row, int mi_col, unsigned int y_sad,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (!is_set_force_zeromv_skip_based_on_src_sad(
+ cpi->sf.rt_sf.set_zeromv_skip_based_on_source_sad,
+ x->content_state_sb.source_sad_nonrd))
+ return false;
+ int shift = cpi->sf.rt_sf.increase_source_sad_thresh ? 1 : 0;
+ const int block_width = mi_size_wide[cm->seq_params->sb_size];
+ const int block_height = mi_size_high[cm->seq_params->sb_size];
+ const unsigned int thresh_exit_part_y =
+ cpi->zeromv_skip_thresh_exit_part[bsize] << shift;
+ unsigned int thresh_exit_part_uv =
+ CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y) << shift;
+  // Be more aggressive in the UV threshold if source_sad >= VeryLowSad
+  // to suppress visual artifacts caused by the speed feature:
+  // set_zeromv_skip_based_on_source_sad = 2. For now only for
+  // part_early_exit_zeromv = 1.
+ if (x->content_state_sb.source_sad_nonrd >= kVeryLowSad &&
+ cpi->sf.rt_sf.part_early_exit_zeromv == 1)
+ thresh_exit_part_uv = thresh_exit_part_uv >> 3;
+ if (mi_col + block_width <= tile->mi_col_end &&
+ mi_row + block_height <= tile->mi_row_end && y_sad < thresh_exit_part_y &&
+ uv_sad[0] < thresh_exit_part_uv && uv_sad[1] < thresh_exit_part_uv) {
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ x->force_zeromv_skip_for_sb = 1;
+ aom_free(vt);
+ // Partition shape is set here at SB level.
+ // Exit needs to happen from av1_choose_var_based_partitioning().
+ return true;
+ } else if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
+ cpi->sf.rt_sf.part_early_exit_zeromv >= 2)
+ x->force_zeromv_skip_for_sb = 2;
+ return false;
+}
+
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ ThreadData *td, MACROBLOCK *x, int mi_row,
+ int mi_col) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, choose_var_based_partitioning_time);
+#endif
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds;
+ PART_EVAL_STATUS force_split[85];
+ int avg_64x64;
+ int max_var_32x32[4];
+ int min_var_32x32[4];
+ int var_32x32;
+ int var_64x64;
+ int min_var_64x64 = INT_MAX;
+ int max_var_64x64 = 0;
+ int avg_16x16[4][4];
+ int maxvar_16x16[4][4];
+ int minvar_16x16[4][4];
+ const uint8_t *src_buf;
+ const uint8_t *dst_buf;
+ int dst_stride;
+ unsigned int uv_sad[MAX_MB_PLANE - 1];
+ NOISE_LEVEL noise_level = kLow;
+ bool is_zero_motion = true;
+ bool scaled_ref_last = false;
+ struct scale_factors sf_no_scale;
+ av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+ cm->width, cm->height);
+
+ bool is_key_frame =
+ (frame_is_intra_only(cm) ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
+
+ assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+ cm->seq_params->sb_size == BLOCK_128X128);
+ const bool is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+ const int num_64x64_blocks = is_small_sb ? 1 : 4;
+
+ unsigned int y_sad = UINT_MAX;
+ unsigned int y_sad_g = UINT_MAX;
+ unsigned int y_sad_alt = UINT_MAX;
+ unsigned int y_sad_last = UINT_MAX;
+ BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+
+ // Ref frame used in partitioning.
+ MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
+
+ int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1],
+ vbp_thresholds[2], vbp_thresholds[3],
+ vbp_thresholds[4] };
+
+ const int segment_id = xd->mi[0]->segment_id;
+ uint64_t blk_sad = 0;
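+  // cpi->src_sad_blk_64x64 stores source SAD per 64x64 block; it is indexed
+  // on a 64x64 grid regardless of the superblock size (hence mib_size >> 1
+  // for 128x128 superblocks).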
+ if (cpi->src_sad_blk_64x64 != NULL &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols =
+ (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sbi_col = mi_col / sb_size_by_mb;
+ const int sbi_row = mi_row / sb_size_by_mb;
+ blk_sad = cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+ }
+
+ const bool is_segment_id_boosted =
+ cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+ cyclic_refresh_segment_id_boosted(segment_id);
+ const int qindex =
+ is_segment_id_boosted
+ ? av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex)
+ : cm->quant_params.base_qindex;
+ set_vbp_thresholds(
+ cpi, thresholds, blk_sad, qindex, x->content_state_sb.low_sumdiff,
+ x->content_state_sb.source_sad_nonrd, x->content_state_sb.source_sad_rd,
+ is_segment_id_boosted, x->content_state_sb.lighting_change);
+
+ src_buf = x->plane[AOM_PLANE_Y].src.buf;
+ int src_stride = x->plane[AOM_PLANE_Y].src.stride;
+
+  // Index for force_split: 0 for the whole superblock (128x128), 1-4 for the
+  // 64x64 blocks, 5-20 for the 32x32 blocks, and 21-84 for the 16x16 blocks.
+ force_split[0] = PART_EVAL_ALL;
+ memset(x->part_search_info.variance_low, 0,
+ sizeof(x->part_search_info.variance_low));
+
+  // Check if the LAST frame is NULL, and if so, treat this frame
+  // as a key frame for the purpose of the superblock partitioning.
+  // LAST == NULL can happen in cases where enhancement spatial layers are
+  // enabled dynamically and the only reference is the spatial one (GOLDEN).
+  // If the LAST frame has a different resolution: set the scaled_ref_last
+  // flag and check if ref_scaled is NULL.
+ if (!frame_is_intra_only(cm)) {
+ const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ if (ref == NULL) {
+ is_key_frame = true;
+ } else if (ref->y_crop_height != cm->height ||
+ ref->y_crop_width != cm->width) {
+ scaled_ref_last = true;
+ const YV12_BUFFER_CONFIG *ref_scaled =
+ av1_get_scaled_ref_frame(cpi, LAST_FRAME);
+ if (ref_scaled == NULL) is_key_frame = true;
+ }
+ }
+
+ x->source_variance = UINT_MAX;
+  // For nonrd_pickmode: compute source_variance, only for superblocks with
+  // some motion for now. This input can then be used to bias the partitioning
+  // or the chroma_check.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ x->content_state_sb.source_sad_nonrd > kLowSad)
+ x->source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, cm->seq_params->sb_size, AOM_PLANE_Y);
+
+ if (!is_key_frame) {
+ setup_planes(cpi, x, &y_sad, &y_sad_g, &y_sad_alt, &y_sad_last,
+ &ref_frame_partition, &sf_no_scale, mi_row, mi_col,
+ is_small_sb, scaled_ref_last);
+
+ MB_MODE_INFO *mi = xd->mi[0];
+ // Use reference SB directly for zero mv.
+ if (mi->mv[0].as_int != 0) {
+ dst_buf = xd->plane[AOM_PLANE_Y].dst.buf;
+ dst_stride = xd->plane[AOM_PLANE_Y].dst.stride;
+ is_zero_motion = false;
+ } else {
+ dst_buf = xd->plane[AOM_PLANE_Y].pre[0].buf;
+ dst_stride = xd->plane[AOM_PLANE_Y].pre[0].stride;
+ }
+ } else {
+ dst_buf = NULL;
+ dst_stride = 0;
+ }
+
+ // check and set the color sensitivity of sb.
+ av1_zero(uv_sad);
+ chroma_check(cpi, x, bsize, y_sad_last, y_sad_g, y_sad_alt, is_key_frame,
+ is_zero_motion, uv_sad);
+
+ x->force_zeromv_skip_for_sb = 0;
+
+ VP128x128 *vt;
+ AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt)));
+ vt->split = td->vt64x64;
+
+  // If the superblock is completely static (zero source sad) and the y_sad
+  // (relative to the LAST ref) is very small, take the sb_size partition and
+  // exit, and force zeromv_last skip mode for nonrd_pickmode.
+  // Only do this on the base segment (so the QP-boosted segment, if applied,
+  // can still continue cleaning/ramping up the quality).
+  // A condition on the color uv_sad is also added.
+ if (!is_key_frame && cpi->sf.rt_sf.part_early_exit_zeromv &&
+ cpi->rc.frames_since_key > 30 && segment_id == CR_SEGMENT_ID_BASE &&
+ ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0) {
+ // Exit here, if zero mv skip flag is set at SB level.
+ if (set_force_zeromv_skip_for_sb(cpi, x, tile, vt, uv_sad, mi_row, mi_col,
+ y_sad, bsize))
+ return 0;
+ }
+
+ if (cpi->noise_estimate.enabled)
+ noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+
+ // Fill in the entire tree of 8x8 (for inter frames) or 4x4 (for key frames)
+ // variances for splits.
+ fill_variance_tree_leaves(cpi, x, vt, force_split, avg_16x16, maxvar_16x16,
+ minvar_16x16, thresholds, src_buf, src_stride,
+ dst_buf, dst_stride, is_key_frame, is_small_sb);
+
+ avg_64x64 = 0;
+ for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) {
+ max_var_32x32[blk64_idx] = 0;
+ min_var_32x32[blk64_idx] = INT_MAX;
+ const int blk64_scale_idx = blk64_idx << 2;
+ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+ const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+ for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+ if (!is_key_frame) continue;
+ VP16x16 *vtemp = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+ for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++)
+ fill_variance_tree(&vtemp->split[lvl3_idx], BLOCK_8X8);
+ fill_variance_tree(vtemp, BLOCK_16X16);
+ // If variance of this 16x16 block is above the threshold, force block
+ // to split. This also forces a split on the upper levels.
+ get_variance(&vtemp->part_variances.none);
+ if (vtemp->part_variances.none.variance > thresholds[3]) {
+ const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+ force_split[split_index] =
+ cpi->sf.rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var
+ ? get_part_eval_based_on_sub_blk_var(vtemp, thresholds[3])
+ : PART_EVAL_ONLY_SPLIT;
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+ }
+ fill_variance_tree(&vt->split[blk64_idx].split[lvl1_idx], BLOCK_32X32);
+      // If the variance of this 32x32 block is above the threshold, or if
+      // it's above (some threshold of) the average variance over the
+      // sub-16x16 blocks, then force this block to split. This also forces a
+      // split on the upper (64x64) level.
+ uint64_t frame_sad_thresh = 20000;
+ const int is_360p_or_smaller = cm->width * cm->height <= RESOLUTION_360P;
+ if (cpi->svc.number_temporal_layers > 2 &&
+ cpi->svc.temporal_layer_id == 0)
+ frame_sad_thresh = frame_sad_thresh << 1;
+ if (force_split[5 + blk64_scale_idx + lvl1_idx] == PART_EVAL_ALL) {
+ get_variance(&vt->split[blk64_idx].split[lvl1_idx].part_variances.none);
+ var_32x32 =
+ vt->split[blk64_idx].split[lvl1_idx].part_variances.none.variance;
+ max_var_32x32[blk64_idx] = AOMMAX(var_32x32, max_var_32x32[blk64_idx]);
+ min_var_32x32[blk64_idx] = AOMMIN(var_32x32, min_var_32x32[blk64_idx]);
+ const int max_min_var_16X16_diff = (maxvar_16x16[blk64_idx][lvl1_idx] -
+ minvar_16x16[blk64_idx][lvl1_idx]);
+
+ if (var_32x32 > thresholds[2] ||
+ (!is_key_frame && var_32x32 > (thresholds[2] >> 1) &&
+ var_32x32 > (avg_16x16[blk64_idx][lvl1_idx] >> 1))) {
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ } else if (!is_key_frame && is_360p_or_smaller &&
+ ((max_min_var_16X16_diff > (thresholds[2] >> 1) &&
+ maxvar_16x16[blk64_idx][lvl1_idx] > thresholds[2]) ||
+ (cpi->sf.rt_sf.prefer_large_partition_blocks &&
+ x->content_state_sb.source_sad_nonrd > kLowSad &&
+ cpi->rc.frame_source_sad < frame_sad_thresh &&
+ maxvar_16x16[blk64_idx][lvl1_idx] > (thresholds[2] >> 4) &&
+ maxvar_16x16[blk64_idx][lvl1_idx] >
+ (minvar_16x16[blk64_idx][lvl1_idx] << 2)))) {
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+ }
+ }
+ if (force_split[1 + blk64_idx] == PART_EVAL_ALL) {
+ fill_variance_tree(&vt->split[blk64_idx], BLOCK_64X64);
+ get_variance(&vt->split[blk64_idx].part_variances.none);
+ var_64x64 = vt->split[blk64_idx].part_variances.none.variance;
+ max_var_64x64 = AOMMAX(var_64x64, max_var_64x64);
+ min_var_64x64 = AOMMIN(var_64x64, min_var_64x64);
+      // If the difference of the max-min variances of the sub-blocks, or the
+      // max variance of a sub-block, is above some threshold, then force
+      // this block to split. Only check this for noise level >= medium, if
+      // the encoder is in SVC, or if we already prefer large partition
+      // blocks.
+ const int max_min_var_32x32_diff =
+ max_var_32x32[blk64_idx] - min_var_32x32[blk64_idx];
+ const int check_max_var = max_var_32x32[blk64_idx] > thresholds[1] >> 1;
+ const bool check_noise_lvl = noise_level >= kMedium ||
+ cpi->ppi->use_svc ||
+ cpi->sf.rt_sf.prefer_large_partition_blocks;
+ const int64_t set_threshold = 3 * (thresholds[1] >> 3);
+
+ if (!is_key_frame && max_min_var_32x32_diff > set_threshold &&
+ check_max_var && check_noise_lvl) {
+ force_split[1 + blk64_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+ avg_64x64 += var_64x64;
+ }
+ if (is_small_sb) force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+
+ if (force_split[0] == PART_EVAL_ALL) {
+ fill_variance_tree(vt, BLOCK_128X128);
+ get_variance(&vt->part_variances.none);
+ const int set_avg_64x64 = (9 * avg_64x64) >> 5;
+ if (!is_key_frame && vt->part_variances.none.variance > set_avg_64x64)
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+
+ if (!is_key_frame &&
+ (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) &&
+ max_var_64x64 > thresholds[0] >> 1)
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+
+ if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end ||
+ !set_vt_partitioning(cpi, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
+ thresholds[0], BLOCK_16X16, force_split[0])) {
+ for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) {
+ const int x64_idx = GET_BLK_IDX_X(blk64_idx, 4);
+ const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 4);
+ const int blk64_scale_idx = blk64_idx << 2;
+
+ // Now go through the entire structure, splitting every block size until
+ // we get to one that's got a variance lower than our threshold.
+ if (set_vt_partitioning(cpi, xd, tile, &vt->split[blk64_idx], BLOCK_64X64,
+ mi_row + y64_idx, mi_col + x64_idx, thresholds[1],
+ BLOCK_16X16, force_split[1 + blk64_idx]))
+ continue;
+ for (int lvl1_idx = 0; lvl1_idx < 4; ++lvl1_idx) {
+ const int x32_idx = GET_BLK_IDX_X(lvl1_idx, 3);
+ const int y32_idx = GET_BLK_IDX_Y(lvl1_idx, 3);
+ const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+ if (set_vt_partitioning(
+ cpi, xd, tile, &vt->split[blk64_idx].split[lvl1_idx],
+ BLOCK_32X32, (mi_row + y64_idx + y32_idx),
+ (mi_col + x64_idx + x32_idx), thresholds[2], BLOCK_16X16,
+ force_split[5 + blk64_scale_idx + lvl1_idx]))
+ continue;
+ for (int lvl2_idx = 0; lvl2_idx < 4; ++lvl2_idx) {
+ const int x16_idx = GET_BLK_IDX_X(lvl2_idx, 2);
+ const int y16_idx = GET_BLK_IDX_Y(lvl2_idx, 2);
+ const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+ VP16x16 *vtemp =
+ &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+ if (set_vt_partitioning(cpi, xd, tile, vtemp, BLOCK_16X16,
+ mi_row + y64_idx + y32_idx + y16_idx,
+ mi_col + x64_idx + x32_idx + x16_idx,
+ thresholds[3], BLOCK_8X8,
+ force_split[split_index]))
+ continue;
+ for (int lvl3_idx = 0; lvl3_idx < 4; ++lvl3_idx) {
+ const int x8_idx = GET_BLK_IDX_X(lvl3_idx, 1);
+ const int y8_idx = GET_BLK_IDX_Y(lvl3_idx, 1);
+ set_block_size(cpi, (mi_row + y64_idx + y32_idx + y16_idx + y8_idx),
+ (mi_col + x64_idx + x32_idx + x16_idx + x8_idx),
+ BLOCK_8X8);
+ }
+ }
+ }
+ }
+ }
+
+ if (cpi->sf.rt_sf.short_circuit_low_temp_var) {
+ set_low_temp_var_flag(cpi, &x->part_search_info, xd, vt, thresholds,
+ ref_frame_partition, mi_col, mi_row, is_small_sb);
+ }
+
+ aom_free(vt);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, choose_var_based_partitioning_time);
+#endif
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/var_based_part.h b/third_party/aom/av1/encoder/var_based_part.h
new file mode 100644
index 0000000000..f912458307
--- /dev/null
+++ b/third_party/aom/av1/encoder/var_based_part.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
+#define AOM_AV1_ENCODER_VAR_BASED_PART_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+
+// Calculate block index x and y from split level and index
+#define GET_BLK_IDX_X(idx, level) (((idx) & (0x01)) << (level))
+#define GET_BLK_IDX_Y(idx, level) (((idx) >> (0x01)) << (level))
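+// For a raster quadrant index idx in 0..3, bit 0 selects the column and bit 1
+// selects the row; level sets the unit. E.g., GET_BLK_IDX_X(3, 4) = 16 and
+// GET_BLK_IDX_Y(3, 4) = 16: the bottom-right 64x64 quadrant of a 128x128
+// superblock starts 16 MI units right of and below the superblock origin.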
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define QINDEX_LARGE_BLOCK_THR \
+ 100 // Use increased thresholds for midres for speed 9 when qindex is above
+ // this threshold
+
+#define CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part) \
+ ((3 * (thresh_exit_part)) >> 2)
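+// i.e., the chroma exit threshold is 3/4 of the luma exit threshold.
+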
+/*!\brief Set the thresholds for variance based partition.
+ *
+ * Set the variance split thresholds for the following block sizes:
+ * 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
+ * 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition)
+ * is currently only used on key frames. The thresholds are based on Q,
+ * resolution, noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] q q index
+ * \param[in] content_lowsumdiff Low sumdiff flag for superblock
+ *
+ * \remark Returns the set of thresholds in \c cpi->vbp_info.thresholds.
+ */
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+ int content_lowsumdiff);
+
+/*!\brief Variance based partition selection.
+ *
+ * Select the partitioning based on the variance of the residual signal,
+ * where the residual is generated as the difference between the source and
+ * the prediction. The prediction is the reconstructed LAST or reconstructed
+ * GOLDEN, whichever has the lower y sad. For LAST, an option exists (speed
+ * feature) to use motion compensation based on superblock motion via
+ * int_pro_motion_estimation. For key frames the reference is a fixed 128
+ * level, so the variance is the source variance. The variance is computed
+ * for downsampled inputs (8x8 or 4x4 downsampled), and selection is done
+ * top-down via a set of partition thresholds, defined for each block level
+ * and set based on Q, resolution, noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] tile Pointer to TileInfo
+ * \param[in] td Pointer to ThreadData
+ * \param[in] x Pointer to MACROBLOCK
 * \param[in]   mi_row        Row coordinate of the superblock in a step
 *                            size of MI_SIZE
 * \param[in]   mi_col        Column coordinate of the superblock in a step
 *                            size of MI_SIZE
+ *
+ * \return Returns the partition in \c xd->mi[0]->sb_type. Also sets the low
+ * temporal variance flag and the color sensitivity flag (both used in
+ * nonrd_pickmode).
+ */
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ ThreadData *td, MACROBLOCK *x, int mi_row,
+ int mi_col);
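+
+// A minimal usage sketch (hypothetical caller; assumes cpi, tile, td, and x
+// have already been set up by the encoder, and that the thresholds are
+// refreshed once per frame before per-superblock partitioning):
+//   av1_set_variance_partition_thresholds(cpi, base_qindex,
+//                                         /*content_lowsumdiff=*/0);
+//   for (mi_row = 0; mi_row < mi_rows; mi_row += sb_step)
+//     for (mi_col = 0; mi_col < mi_cols; mi_col += sb_step)
+//       av1_choose_var_based_partitioning(cpi, tile, td, x, mi_row, mi_col);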
+
+// Read out the block's temporal variance for the 64x64 SB case.
+int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+// Read out the block's temporal variance for the 128x128 SB case.
+int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row,
+ int mi_col, BLOCK_SIZE bsize);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_VAR_BASED_PART_H_
diff --git a/third_party/aom/av1/encoder/wedge_utils.c b/third_party/aom/av1/encoder/wedge_utils.c
new file mode 100644
index 0000000000..40670178d7
--- /dev/null
+++ b/third_party/aom/av1/encoder/wedge_utils.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes the SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 using blending with a mask.
+ *
+ * r1: Residuals of p1.
+ * (source - p1)
+ * d: Difference of p1 and p0.
+ * (p1 - p0)
+ * m: The blending mask
+ * N: Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ * where r0 is (source - p0) and r1 is (source - p1), which in turn is
+ * equivalent to:
+ * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ * which is the SSE of the residuals of the compound predictor scaled up by
+ * MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * to facilitate an equivalent SIMD implementation. It should have no effect
+ * if residuals fit in 16 - WEDGE_WEIGHT_BITS (= 10) signed bits, which
+ * always holds for 8 bit input; on real input it should hold practically
+ * always, as residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ uint64_t csse = 0;
+ int i;
+
+ for (i = 0; i < N; i++) {
+ int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+ t = clamp(t, INT16_MIN, INT16_MAX);
+ csse += t * t;
+ }
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
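+
+#if 0
+// Reference sketch (kept out of the build; not part of libaom): the same
+// quantity computed directly from the residual vectors r0 and r1, without
+// the 16-bit clamp the function above applies for SIMD equivalence.
+static uint64_t wedge_sse_reference(const int16_t *r0, const int16_t *r1,
+                                    const uint8_t *m, int N) {
+  uint64_t sse = 0;
+  for (int i = 0; i < N; i++) {
+    // Blend the two residuals with the mask; the sum is the compound
+    // residual scaled up by MAX_MASK_VALUE.
+    const int64_t t =
+        (int64_t)m[i] * r0[i] + (int64_t)(MAX_MASK_VALUE - m[i]) * r1[i];
+    sse += (uint64_t)(t * t);
+  }
+  return ROUND_POWER_OF_TWO(sse, 2 * WEDGE_WEIGHT_BITS);
+}
+#endif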
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds: Difference of the squares of the residuals.
+ * r0**2 - r1**2
+ * m: The blending mask
+ * N: Number of pixels
+ * limit: Pre-computed threshold value.
+ * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * 'ds' and 'm' are contiguous.
+ *
+ * Returns true if the negated mask has lower SSE compared to the positive
+ * mask. Computation is based on:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
+ * >
+ * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
+ *
+ * which can be simplified to:
+ *
+ * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * The right hand side does not depend on the mask, and needs to be passed as
+ * the 'limit' parameter.
+ *
+ * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
+ * hand side is simply a scalar product between an int16_t and uint8_t vector.
+ *
+ * Note that for efficiency, ds is stored in 16 bits. Since real input
+ * residuals are small, this should not cause a noticeable issue.
+ */
+int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc = 0;
+
+ do {
+ acc += *ds++ * *m++;
+ } while (--N);
+
+ return acc > limit;
+}
+
+/**
+ * Compute the element-wise difference of the squares of 2 arrays.
+ *
+ * d: Difference of the squares of the inputs: a**2 - b**2
+ * a: First input array
+ * b: Second input array
+ * N: Number of elements
+ *
+ * 'd', 'a', and 'b' are contiguous.
+ *
+ * The result is saturated to signed 16 bits.
+ */
+void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ int i;
+
+ for (i = 0; i < N; i++)
+ d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX);
+}
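+
+#if 0
+// Illustrative sketch (kept out of the build): how the helpers above combine
+// to pick a wedge sign. Per the comment on av1_wedge_sign_from_residuals_c,
+// the caller precomputes `limit` as MAX_MASK_VALUE/2 * (sum(r0**2) -
+// sum(r1**2)); the helper name here is hypothetical.
+static int8_t choose_wedge_sign_sketch(const int16_t *r0, const int16_t *r1,
+                                       const uint8_t *m, int16_t *ds, int N) {
+  int64_t sum0 = 0, sum1 = 0;
+  for (int i = 0; i < N; i++) {
+    sum0 += r0[i] * r0[i];
+    sum1 += r1[i] * r1[i];
+  }
+  const int64_t limit = (MAX_MASK_VALUE / 2) * (sum0 - sum1);
+  // ds holds the saturated per-pixel r0^2 - r1^2 values.
+  av1_wedge_compute_delta_squares_c(ds, r0, r1, N);
+  return av1_wedge_sign_from_residuals_c(ds, m, N, limit);
+}
+#endif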
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
new file mode 100644
index 0000000000..494b0fdf15
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -0,0 +1,1409 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+
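+// 32-point forward DCT on four 32-bit lanes per __m128i. Inputs are paired
+// symmetrically (row 0 with row 31, 1 with 30, ...) by walking startidx up
+// and endidx down in units of `stride`, so the stage-1 butterflies need no
+// gather or shuffle.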
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int stride) {
+ __m128i buf0[32];
+ __m128i buf1[32];
+ const int32_t *cospi;
+
+ int startidx = 0 * stride;
+ int endidx = 31 * stride;
+ // stage 0
+ // stage 1
+ buf1[0] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[1] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[2] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[3] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[4] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[5] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[6] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[7] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[8] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[9] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[10] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[11] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[12] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[13] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[14] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[15] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ cospi = cospi_arr(cos_bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+ cos_bit);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+ buf0[31], cos_bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+
+ startidx = 0 * stride;
+ endidx = 31 * stride;
+ // stage 9
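+  // The outputs are written in bit-reversed index order (0, 16, 8, 24, ...),
+  // filling the destination from both ends at once.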
+ output[startidx] = buf0[0];
+ output[endidx] = buf0[31];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[16];
+ output[endidx] = buf0[15];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[8];
+ output[endidx] = buf0[23];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[24];
+ output[endidx] = buf0[7];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[4];
+ output[endidx] = buf0[27];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[20];
+ output[endidx] = buf0[11];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[12];
+ output[endidx] = buf0[19];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[28];
+ output[endidx] = buf0[3];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[2];
+ output[endidx] = buf0[29];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[18];
+ output[endidx] = buf0[13];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[10];
+ output[endidx] = buf0[21];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[26];
+ output[endidx] = buf0[5];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[6];
+ output[endidx] = buf0[25];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[22];
+ output[endidx] = buf0[9];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[14];
+ output[endidx] = buf0[17];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[30];
+ output[endidx] = buf0[1];
+}
+
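+// 4-point forward ADST on four 32-bit lanes. With txfm_size == num_per_128
+// == 4, col_num is 1, so the column loop body runs exactly once.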
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 4;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[4];
+ __m128i buf1[4];
+ int col_num = txfm_size / num_per_128;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ // stage 0;
+ int j;
+ for (j = 0; j < 4; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+
+ // stage 1
+ buf1[0] = buf0[3];
+ buf1[1] = buf0[0];
+ buf1[2] = buf0[1];
+ buf1[3] = buf0[2];
+
+ // stage 2
+ cospi = cospi_arr(cos_bit);
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+ cos_bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+ buf0[3], cos_bit);
+
+ // stage 3
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+ buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+ buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+ // stage 4
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], cos_bit);
+
+ // stage 5
+ buf1[0] = buf0[0];
+ buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]);
+ buf1[2] = buf0[3];
+ buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]);
+
+ for (j = 0; j < 4; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
+
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+ const int instride, const int outstride) {
+ const int32_t *cospi = cospi_arr(cos_bit);
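+  // 1 << (cos_bit - 1) is the rounding offset applied before each
+  // butterfly's right shift by cos_bit, giving round-to-nearest behavior.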
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]);
+ __m128i cospi_p32 = _mm_set1_epi32(cospi[32]);
+ __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]);
+ __m128i cospi_p48 = _mm_set1_epi32(cospi[48]);
+ __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]);
+ __m128i cospi_p16 = _mm_set1_epi32(cospi[16]);
+ __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]);
+ __m128i cospi_p56 = _mm_set1_epi32(cospi[56]);
+ __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]);
+ __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]);
+ __m128i cospi_p24 = _mm_set1_epi32(cospi[24]);
+ __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]);
+ __m128i cospi_p08 = _mm_set1_epi32(cospi[8]);
+ __m128i cospi_p40 = _mm_set1_epi32(cospi[40]);
+ __m128i cospi_p60 = _mm_set1_epi32(cospi[60]);
+ __m128i cospi_p04 = _mm_set1_epi32(cospi[4]);
+ __m128i cospi_p28 = _mm_set1_epi32(cospi[28]);
+ __m128i cospi_p36 = _mm_set1_epi32(cospi[36]);
+ __m128i cospi_p44 = _mm_set1_epi32(cospi[44]);
+ __m128i cospi_p20 = _mm_set1_epi32(cospi[20]);
+ __m128i cospi_p12 = _mm_set1_epi32(cospi[12]);
+ __m128i cospi_p52 = _mm_set1_epi32(cospi[52]);
+ __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]);
+ __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]);
+ __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]);
+ __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]);
+ __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]);
+ __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]);
+ __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]);
+ __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]);
+ __m128i cospi_p62 = _mm_set1_epi32(cospi[62]);
+ __m128i cospi_p02 = _mm_set1_epi32(cospi[2]);
+ __m128i cospi_p30 = _mm_set1_epi32(cospi[30]);
+ __m128i cospi_p34 = _mm_set1_epi32(cospi[34]);
+ __m128i cospi_p46 = _mm_set1_epi32(cospi[46]);
+ __m128i cospi_p18 = _mm_set1_epi32(cospi[18]);
+ __m128i cospi_p14 = _mm_set1_epi32(cospi[14]);
+ __m128i cospi_p50 = _mm_set1_epi32(cospi[50]);
+ __m128i cospi_p54 = _mm_set1_epi32(cospi[54]);
+ __m128i cospi_p10 = _mm_set1_epi32(cospi[10]);
+ __m128i cospi_p22 = _mm_set1_epi32(cospi[22]);
+ __m128i cospi_p42 = _mm_set1_epi32(cospi[42]);
+ __m128i cospi_p38 = _mm_set1_epi32(cospi[38]);
+ __m128i cospi_p26 = _mm_set1_epi32(cospi[26]);
+ __m128i cospi_p06 = _mm_set1_epi32(cospi[6]);
+ __m128i cospi_p58 = _mm_set1_epi32(cospi[58]);
+ __m128i cospi_p63 = _mm_set1_epi32(cospi[63]);
+ __m128i cospi_p01 = _mm_set1_epi32(cospi[1]);
+ __m128i cospi_p31 = _mm_set1_epi32(cospi[31]);
+ __m128i cospi_p33 = _mm_set1_epi32(cospi[33]);
+ __m128i cospi_p47 = _mm_set1_epi32(cospi[47]);
+ __m128i cospi_p17 = _mm_set1_epi32(cospi[17]);
+ __m128i cospi_p15 = _mm_set1_epi32(cospi[15]);
+ __m128i cospi_p49 = _mm_set1_epi32(cospi[49]);
+ __m128i cospi_p55 = _mm_set1_epi32(cospi[55]);
+ __m128i cospi_p09 = _mm_set1_epi32(cospi[9]);
+ __m128i cospi_p23 = _mm_set1_epi32(cospi[23]);
+ __m128i cospi_p41 = _mm_set1_epi32(cospi[41]);
+ __m128i cospi_p39 = _mm_set1_epi32(cospi[39]);
+ __m128i cospi_p25 = _mm_set1_epi32(cospi[25]);
+ __m128i cospi_p07 = _mm_set1_epi32(cospi[7]);
+ __m128i cospi_p57 = _mm_set1_epi32(cospi[57]);
+ __m128i cospi_p59 = _mm_set1_epi32(cospi[59]);
+ __m128i cospi_p05 = _mm_set1_epi32(cospi[5]);
+ __m128i cospi_p27 = _mm_set1_epi32(cospi[27]);
+ __m128i cospi_p37 = _mm_set1_epi32(cospi[37]);
+ __m128i cospi_p43 = _mm_set1_epi32(cospi[43]);
+ __m128i cospi_p21 = _mm_set1_epi32(cospi[21]);
+ __m128i cospi_p11 = _mm_set1_epi32(cospi[11]);
+ __m128i cospi_p53 = _mm_set1_epi32(cospi[53]);
+ __m128i cospi_p51 = _mm_set1_epi32(cospi[51]);
+ __m128i cospi_p13 = _mm_set1_epi32(cospi[13]);
+ __m128i cospi_p19 = _mm_set1_epi32(cospi[19]);
+ __m128i cospi_p45 = _mm_set1_epi32(cospi[45]);
+ __m128i cospi_p35 = _mm_set1_epi32(cospi[35]);
+ __m128i cospi_p29 = _mm_set1_epi32(cospi[29]);
+ __m128i cospi_p03 = _mm_set1_epi32(cospi[3]);
+ __m128i cospi_p61 = _mm_set1_epi32(cospi[61]);
+
+ int startidx = 0 * instride;
+ int endidx = 63 * instride;
+ // stage 1
+ __m128i x1[64];
+ x1[0] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[63] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[1] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[62] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[2] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[61] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[3] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[60] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[4] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[59] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[5] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[58] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[6] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[57] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[7] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[56] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[8] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[55] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[9] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[54] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[10] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[53] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[11] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[52] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[12] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[51] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[13] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[50] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[14] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[49] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[15] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[48] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[16] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[47] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[17] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[46] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[18] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[45] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[19] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[44] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[20] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[43] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[21] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[42] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[22] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[41] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[23] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[40] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[24] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[39] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[25] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[38] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[26] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[37] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[27] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[36] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[28] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[35] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[29] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[34] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[30] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[33] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[31] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[32] = _mm_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_add_epi32(x1[0], x1[31]);
+ x2[31] = _mm_sub_epi32(x1[0], x1[31]);
+ x2[1] = _mm_add_epi32(x1[1], x1[30]);
+ x2[30] = _mm_sub_epi32(x1[1], x1[30]);
+ x2[2] = _mm_add_epi32(x1[2], x1[29]);
+ x2[29] = _mm_sub_epi32(x1[2], x1[29]);
+ x2[3] = _mm_add_epi32(x1[3], x1[28]);
+ x2[28] = _mm_sub_epi32(x1[3], x1[28]);
+ x2[4] = _mm_add_epi32(x1[4], x1[27]);
+ x2[27] = _mm_sub_epi32(x1[4], x1[27]);
+ x2[5] = _mm_add_epi32(x1[5], x1[26]);
+ x2[26] = _mm_sub_epi32(x1[5], x1[26]);
+ x2[6] = _mm_add_epi32(x1[6], x1[25]);
+ x2[25] = _mm_sub_epi32(x1[6], x1[25]);
+ x2[7] = _mm_add_epi32(x1[7], x1[24]);
+ x2[24] = _mm_sub_epi32(x1[7], x1[24]);
+ x2[8] = _mm_add_epi32(x1[8], x1[23]);
+ x2[23] = _mm_sub_epi32(x1[8], x1[23]);
+ x2[9] = _mm_add_epi32(x1[9], x1[22]);
+ x2[22] = _mm_sub_epi32(x1[9], x1[22]);
+ x2[10] = _mm_add_epi32(x1[10], x1[21]);
+ x2[21] = _mm_sub_epi32(x1[10], x1[21]);
+ x2[11] = _mm_add_epi32(x1[11], x1[20]);
+ x2[20] = _mm_sub_epi32(x1[11], x1[20]);
+ x2[12] = _mm_add_epi32(x1[12], x1[19]);
+ x2[19] = _mm_sub_epi32(x1[12], x1[19]);
+ x2[13] = _mm_add_epi32(x1[13], x1[18]);
+ x2[18] = _mm_sub_epi32(x1[13], x1[18]);
+ x2[14] = _mm_add_epi32(x1[14], x1[17]);
+ x2[17] = _mm_sub_epi32(x1[14], x1[17]);
+ x2[15] = _mm_add_epi32(x1[15], x1[16]);
+ x2[16] = _mm_sub_epi32(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48],
+ __rounding, cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_add_epi32(x2[0], x2[15]);
+ x3[15] = _mm_sub_epi32(x2[0], x2[15]);
+ x3[1] = _mm_add_epi32(x2[1], x2[14]);
+ x3[14] = _mm_sub_epi32(x2[1], x2[14]);
+ x3[2] = _mm_add_epi32(x2[2], x2[13]);
+ x3[13] = _mm_sub_epi32(x2[2], x2[13]);
+ x3[3] = _mm_add_epi32(x2[3], x2[12]);
+ x3[12] = _mm_sub_epi32(x2[3], x2[12]);
+ x3[4] = _mm_add_epi32(x2[4], x2[11]);
+ x3[11] = _mm_sub_epi32(x2[4], x2[11]);
+ x3[5] = _mm_add_epi32(x2[5], x2[10]);
+ x3[10] = _mm_sub_epi32(x2[5], x2[10]);
+ x3[6] = _mm_add_epi32(x2[6], x2[9]);
+ x3[9] = _mm_sub_epi32(x2[6], x2[9]);
+ x3[7] = _mm_add_epi32(x2[7], x2[8]);
+ x3[8] = _mm_sub_epi32(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24],
+ __rounding, cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_add_epi32(x2[32], x2[47]);
+ x3[47] = _mm_sub_epi32(x2[32], x2[47]);
+ x3[33] = _mm_add_epi32(x2[33], x2[46]);
+ x3[46] = _mm_sub_epi32(x2[33], x2[46]);
+ x3[34] = _mm_add_epi32(x2[34], x2[45]);
+ x3[45] = _mm_sub_epi32(x2[34], x2[45]);
+ x3[35] = _mm_add_epi32(x2[35], x2[44]);
+ x3[44] = _mm_sub_epi32(x2[35], x2[44]);
+ x3[36] = _mm_add_epi32(x2[36], x2[43]);
+ x3[43] = _mm_sub_epi32(x2[36], x2[43]);
+ x3[37] = _mm_add_epi32(x2[37], x2[42]);
+ x3[42] = _mm_sub_epi32(x2[37], x2[42]);
+ x3[38] = _mm_add_epi32(x2[38], x2[41]);
+ x3[41] = _mm_sub_epi32(x2[38], x2[41]);
+ x3[39] = _mm_add_epi32(x2[39], x2[40]);
+ x3[40] = _mm_sub_epi32(x2[39], x2[40]);
+ x3[48] = _mm_sub_epi32(x2[63], x2[48]);
+ x3[63] = _mm_add_epi32(x2[63], x2[48]);
+ x3[49] = _mm_sub_epi32(x2[62], x2[49]);
+ x3[62] = _mm_add_epi32(x2[62], x2[49]);
+ x3[50] = _mm_sub_epi32(x2[61], x2[50]);
+ x3[61] = _mm_add_epi32(x2[61], x2[50]);
+ x3[51] = _mm_sub_epi32(x2[60], x2[51]);
+ x3[60] = _mm_add_epi32(x2[60], x2[51]);
+ x3[52] = _mm_sub_epi32(x2[59], x2[52]);
+ x3[59] = _mm_add_epi32(x2[59], x2[52]);
+ x3[53] = _mm_sub_epi32(x2[58], x2[53]);
+ x3[58] = _mm_add_epi32(x2[58], x2[53]);
+ x3[54] = _mm_sub_epi32(x2[57], x2[54]);
+ x3[57] = _mm_add_epi32(x2[57], x2[54]);
+ x3[55] = _mm_sub_epi32(x2[56], x2[55]);
+ x3[56] = _mm_add_epi32(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_add_epi32(x3[0], x3[7]);
+ x4[7] = _mm_sub_epi32(x3[0], x3[7]);
+ x4[1] = _mm_add_epi32(x3[1], x3[6]);
+ x4[6] = _mm_sub_epi32(x3[1], x3[6]);
+ x4[2] = _mm_add_epi32(x3[2], x3[5]);
+ x4[5] = _mm_sub_epi32(x3[2], x3[5]);
+ x4[3] = _mm_add_epi32(x3[3], x3[4]);
+ x4[4] = _mm_sub_epi32(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12],
+ __rounding, cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_add_epi32(x3[16], x3[23]);
+ x4[23] = _mm_sub_epi32(x3[16], x3[23]);
+ x4[17] = _mm_add_epi32(x3[17], x3[22]);
+ x4[22] = _mm_sub_epi32(x3[17], x3[22]);
+ x4[18] = _mm_add_epi32(x3[18], x3[21]);
+ x4[21] = _mm_sub_epi32(x3[18], x3[21]);
+ x4[19] = _mm_add_epi32(x3[19], x3[20]);
+ x4[20] = _mm_sub_epi32(x3[19], x3[20]);
+ x4[24] = _mm_sub_epi32(x3[31], x3[24]);
+ x4[31] = _mm_add_epi32(x3[31], x3[24]);
+ x4[25] = _mm_sub_epi32(x3[30], x3[25]);
+ x4[30] = _mm_add_epi32(x3[30], x3[25]);
+ x4[26] = _mm_sub_epi32(x3[29], x3[26]);
+ x4[29] = _mm_add_epi32(x3[29], x3[26]);
+ x4[27] = _mm_sub_epi32(x3[28], x3[27]);
+ x4[28] = _mm_add_epi32(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52],
+ __rounding, cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_add_epi32(x4[0], x4[3]);
+ x5[3] = _mm_sub_epi32(x4[0], x4[3]);
+ x5[1] = _mm_add_epi32(x4[1], x4[2]);
+ x5[2] = _mm_sub_epi32(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6],
+ __rounding, cos_bit);
+ x5[7] = x4[7];
+ x5[8] = _mm_add_epi32(x4[8], x4[11]);
+ x5[11] = _mm_sub_epi32(x4[8], x4[11]);
+ x5[9] = _mm_add_epi32(x4[9], x4[10]);
+ x5[10] = _mm_sub_epi32(x4[9], x4[10]);
+ x5[12] = _mm_sub_epi32(x4[15], x4[12]);
+ x5[15] = _mm_add_epi32(x4[15], x4[12]);
+ x5[13] = _mm_sub_epi32(x4[14], x4[13]);
+ x5[14] = _mm_add_epi32(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26],
+ __rounding, cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_add_epi32(x4[32], x4[39]);
+ x5[39] = _mm_sub_epi32(x4[32], x4[39]);
+ x5[33] = _mm_add_epi32(x4[33], x4[38]);
+ x5[38] = _mm_sub_epi32(x4[33], x4[38]);
+ x5[34] = _mm_add_epi32(x4[34], x4[37]);
+ x5[37] = _mm_sub_epi32(x4[34], x4[37]);
+ x5[35] = _mm_add_epi32(x4[35], x4[36]);
+ x5[36] = _mm_sub_epi32(x4[35], x4[36]);
+ x5[40] = _mm_sub_epi32(x4[47], x4[40]);
+ x5[47] = _mm_add_epi32(x4[47], x4[40]);
+ x5[41] = _mm_sub_epi32(x4[46], x4[41]);
+ x5[46] = _mm_add_epi32(x4[46], x4[41]);
+ x5[42] = _mm_sub_epi32(x4[45], x4[42]);
+ x5[45] = _mm_add_epi32(x4[45], x4[42]);
+ x5[43] = _mm_sub_epi32(x4[44], x4[43]);
+ x5[44] = _mm_add_epi32(x4[44], x4[43]);
+ x5[48] = _mm_add_epi32(x4[48], x4[55]);
+ x5[55] = _mm_sub_epi32(x4[48], x4[55]);
+ x5[49] = _mm_add_epi32(x4[49], x4[54]);
+ x5[54] = _mm_sub_epi32(x4[49], x4[54]);
+ x5[50] = _mm_add_epi32(x4[50], x4[53]);
+ x5[53] = _mm_sub_epi32(x4[50], x4[53]);
+ x5[51] = _mm_add_epi32(x4[51], x4[52]);
+ x5[52] = _mm_sub_epi32(x4[51], x4[52]);
+ x5[56] = _mm_sub_epi32(x4[63], x4[56]);
+ x5[63] = _mm_add_epi32(x4[63], x4[56]);
+ x5[57] = _mm_sub_epi32(x4[62], x4[57]);
+ x5[62] = _mm_add_epi32(x4[62], x4[57]);
+ x5[58] = _mm_sub_epi32(x4[61], x4[58]);
+ x5[61] = _mm_add_epi32(x4[61], x4[58]);
+ x5[59] = _mm_sub_epi32(x4[60], x4[59]);
+ x5[60] = _mm_add_epi32(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3],
+ __rounding, cos_bit);
+ x6[4] = _mm_add_epi32(x5[4], x5[5]);
+ x6[5] = _mm_sub_epi32(x5[4], x5[5]);
+ x6[6] = _mm_sub_epi32(x5[7], x5[6]);
+ x6[7] = _mm_add_epi32(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13],
+ __rounding, cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_add_epi32(x5[16], x5[19]);
+ x6[19] = _mm_sub_epi32(x5[16], x5[19]);
+ x6[17] = _mm_add_epi32(x5[17], x5[18]);
+ x6[18] = _mm_sub_epi32(x5[17], x5[18]);
+ x6[20] = _mm_sub_epi32(x5[23], x5[20]);
+ x6[23] = _mm_add_epi32(x5[23], x5[20]);
+ x6[21] = _mm_sub_epi32(x5[22], x5[21]);
+ x6[22] = _mm_add_epi32(x5[22], x5[21]);
+ x6[24] = _mm_add_epi32(x5[24], x5[27]);
+ x6[27] = _mm_sub_epi32(x5[24], x5[27]);
+ x6[25] = _mm_add_epi32(x5[25], x5[26]);
+ x6[26] = _mm_sub_epi32(x5[25], x5[26]);
+ x6[28] = _mm_sub_epi32(x5[31], x5[28]);
+ x6[31] = _mm_add_epi32(x5[31], x5[28]);
+ x6[29] = _mm_sub_epi32(x5[30], x5[29]);
+ x6[30] = _mm_add_epi32(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58],
+ __rounding, cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50],
+ __rounding, cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6],
+ __rounding, cos_bit);
+ x7[8] = _mm_add_epi32(x6[8], x6[9]);
+ x7[9] = _mm_sub_epi32(x6[8], x6[9]);
+ x7[10] = _mm_sub_epi32(x6[11], x6[10]);
+ x7[11] = _mm_add_epi32(x6[11], x6[10]);
+ x7[12] = _mm_add_epi32(x6[12], x6[13]);
+ x7[13] = _mm_sub_epi32(x6[12], x6[13]);
+ x7[14] = _mm_sub_epi32(x6[15], x6[14]);
+ x7[15] = _mm_add_epi32(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29],
+ __rounding, cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25],
+ __rounding, cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_add_epi32(x6[32], x6[35]);
+ x7[35] = _mm_sub_epi32(x6[32], x6[35]);
+ x7[33] = _mm_add_epi32(x6[33], x6[34]);
+ x7[34] = _mm_sub_epi32(x6[33], x6[34]);
+ x7[36] = _mm_sub_epi32(x6[39], x6[36]);
+ x7[39] = _mm_add_epi32(x6[39], x6[36]);
+ x7[37] = _mm_sub_epi32(x6[38], x6[37]);
+ x7[38] = _mm_add_epi32(x6[38], x6[37]);
+ x7[40] = _mm_add_epi32(x6[40], x6[43]);
+ x7[43] = _mm_sub_epi32(x6[40], x6[43]);
+ x7[41] = _mm_add_epi32(x6[41], x6[42]);
+ x7[42] = _mm_sub_epi32(x6[41], x6[42]);
+ x7[44] = _mm_sub_epi32(x6[47], x6[44]);
+ x7[47] = _mm_add_epi32(x6[47], x6[44]);
+ x7[45] = _mm_sub_epi32(x6[46], x6[45]);
+ x7[46] = _mm_add_epi32(x6[46], x6[45]);
+ x7[48] = _mm_add_epi32(x6[48], x6[51]);
+ x7[51] = _mm_sub_epi32(x6[48], x6[51]);
+ x7[49] = _mm_add_epi32(x6[49], x6[50]);
+ x7[50] = _mm_sub_epi32(x6[49], x6[50]);
+ x7[52] = _mm_sub_epi32(x6[55], x6[52]);
+ x7[55] = _mm_add_epi32(x6[55], x6[52]);
+ x7[53] = _mm_sub_epi32(x6[54], x6[53]);
+ x7[54] = _mm_add_epi32(x6[54], x6[53]);
+ x7[56] = _mm_add_epi32(x6[56], x6[59]);
+ x7[59] = _mm_sub_epi32(x6[56], x6[59]);
+ x7[57] = _mm_add_epi32(x6[57], x6[58]);
+ x7[58] = _mm_sub_epi32(x6[57], x6[58]);
+ x7[60] = _mm_sub_epi32(x6[63], x6[60]);
+ x7[63] = _mm_add_epi32(x6[63], x6[60]);
+ x7[61] = _mm_sub_epi32(x6[62], x6[61]);
+ x7[62] = _mm_add_epi32(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12],
+ __rounding, cos_bit);
+ x8[16] = _mm_add_epi32(x7[16], x7[17]);
+ x8[17] = _mm_sub_epi32(x7[16], x7[17]);
+ x8[18] = _mm_sub_epi32(x7[19], x7[18]);
+ x8[19] = _mm_add_epi32(x7[19], x7[18]);
+ x8[20] = _mm_add_epi32(x7[20], x7[21]);
+ x8[21] = _mm_sub_epi32(x7[20], x7[21]);
+ x8[22] = _mm_sub_epi32(x7[23], x7[22]);
+ x8[23] = _mm_add_epi32(x7[23], x7[22]);
+ x8[24] = _mm_add_epi32(x7[24], x7[25]);
+ x8[25] = _mm_sub_epi32(x7[24], x7[25]);
+ x8[26] = _mm_sub_epi32(x7[27], x7[26]);
+ x8[27] = _mm_add_epi32(x7[27], x7[26]);
+ x8[28] = _mm_add_epi32(x7[28], x7[29]);
+ x8[29] = _mm_sub_epi32(x7[28], x7[29]);
+ x8[30] = _mm_sub_epi32(x7[31], x7[30]);
+ x8[31] = _mm_add_epi32(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+ __rounding, cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+ __rounding, cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+ __rounding, cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+ __rounding, cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24],
+ __rounding, cos_bit);
+ x9[32] = _mm_add_epi32(x8[32], x8[33]);
+ x9[33] = _mm_sub_epi32(x8[32], x8[33]);
+ x9[34] = _mm_sub_epi32(x8[35], x8[34]);
+ x9[35] = _mm_add_epi32(x8[35], x8[34]);
+ x9[36] = _mm_add_epi32(x8[36], x8[37]);
+ x9[37] = _mm_sub_epi32(x8[36], x8[37]);
+ x9[38] = _mm_sub_epi32(x8[39], x8[38]);
+ x9[39] = _mm_add_epi32(x8[39], x8[38]);
+ x9[40] = _mm_add_epi32(x8[40], x8[41]);
+ x9[41] = _mm_sub_epi32(x8[40], x8[41]);
+ x9[42] = _mm_sub_epi32(x8[43], x8[42]);
+ x9[43] = _mm_add_epi32(x8[43], x8[42]);
+ x9[44] = _mm_add_epi32(x8[44], x8[45]);
+ x9[45] = _mm_sub_epi32(x8[44], x8[45]);
+ x9[46] = _mm_sub_epi32(x8[47], x8[46]);
+ x9[47] = _mm_add_epi32(x8[47], x8[46]);
+ x9[48] = _mm_add_epi32(x8[48], x8[49]);
+ x9[49] = _mm_sub_epi32(x8[48], x8[49]);
+ x9[50] = _mm_sub_epi32(x8[51], x8[50]);
+ x9[51] = _mm_add_epi32(x8[51], x8[50]);
+ x9[52] = _mm_add_epi32(x8[52], x8[53]);
+ x9[53] = _mm_sub_epi32(x8[52], x8[53]);
+ x9[54] = _mm_sub_epi32(x8[55], x8[54]);
+ x9[55] = _mm_add_epi32(x8[55], x8[54]);
+ x9[56] = _mm_add_epi32(x8[56], x8[57]);
+ x9[57] = _mm_sub_epi32(x8[56], x8[57]);
+ x9[58] = _mm_sub_epi32(x8[59], x8[58]);
+ x9[59] = _mm_add_epi32(x8[59], x8[58]);
+ x9[60] = _mm_add_epi32(x8[60], x8[61]);
+ x9[61] = _mm_sub_epi32(x8[60], x8[61]);
+ x9[62] = _mm_sub_epi32(x8[63], x8[62]);
+ x9[63] = _mm_add_epi32(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32],
+ x10[63], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33],
+ x10[62], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34],
+ x10[61], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35],
+ x10[60], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36],
+ x10[59], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37],
+ x10[58], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38],
+ x10[57], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39],
+ x10[56], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40],
+ x10[55], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41],
+ x10[54], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42],
+ x10[53], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43],
+ x10[52], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44],
+ x10[51], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45],
+ x10[50], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46],
+ x10[49], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47],
+ x10[48], __rounding, cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 63 * outstride;
+ // stage 11
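+  // As in the 32-point transform, outputs land in bit-reversed index order
+  // (0, 32, 16, 48, ...), written from both ends of the destination.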
+ output[startidx] = x10[0];
+ output[endidx] = x10[63];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[32];
+ output[endidx] = x10[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[16];
+ output[endidx] = x10[47];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[48];
+ output[endidx] = x10[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[8];
+ output[endidx] = x10[55];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[40];
+ output[endidx] = x10[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[24];
+ output[endidx] = x10[39];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[56];
+ output[endidx] = x10[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[4];
+ output[endidx] = x10[59];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[36];
+ output[endidx] = x10[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[20];
+ output[endidx] = x10[43];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[52];
+ output[endidx] = x10[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[12];
+ output[endidx] = x10[51];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[44];
+ output[endidx] = x10[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[28];
+ output[endidx] = x10[35];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[60];
+ output[endidx] = x10[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[2];
+ output[endidx] = x10[61];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[34];
+ output[endidx] = x10[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[18];
+ output[endidx] = x10[45];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[50];
+ output[endidx] = x10[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[10];
+ output[endidx] = x10[53];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[42];
+ output[endidx] = x10[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[26];
+ output[endidx] = x10[37];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[58];
+ output[endidx] = x10[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[6];
+ output[endidx] = x10[57];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[38];
+ output[endidx] = x10[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[22];
+ output[endidx] = x10[41];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[54];
+ output[endidx] = x10[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[14];
+ output[endidx] = x10[49];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[46];
+ output[endidx] = x10[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[30];
+ output[endidx] = x10[33];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[62];
+ output[endidx] = x10[1];
+}
+
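+// Identity "transform" for a length-32 column: each coefficient is scaled by
+// 4 (left shift by 2), the standard scaling of the length-32 identity
+// transform in the C path.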
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int col_num) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; i++) {
+ output[i * col_num] = _mm_slli_epi32(input[i * col_num], 2);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
new file mode 100644
index 0000000000..b143df3523
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -0,0 +1,3010 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+
+ // stage 1
+ __m256i x1[16];
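+  // Each btf_16_adds_subs_out_avx2 emits the saturating 16-bit sum and
+  // difference of a mirrored input pair: x1[i] = in[i] + in[15-i],
+  // x1[15-i] = in[i] - in[15-i].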
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 7
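+  // Store the results in bit-reversed index order: output[k] = x1[bitrev4(k)]
+  // (e.g. output[1] = x1[8], output[3] = x1[12]); the same pattern closes
+  // each of the forward DCTs in this file.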
+ output[0] = x1[0];
+ output[1] = x1[8];
+ output[2] = x1[4];
+ output[3] = x1[12];
+ output[4] = x1[2];
+ output[5] = x1[10];
+ output[6] = x1[6];
+ output[7] = x1[14];
+ output[8] = x1[1];
+ output[9] = x1[9];
+ output[10] = x1[5];
+ output[11] = x1[13];
+ output[12] = x1[3];
+ output[13] = x1[11];
+ output[14] = x1[7];
+ output[15] = x1[15];
+}
+
+static INLINE void fdct16x32_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
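+  // 32-point forward DCT, computed on sixteen 16-bit columns per __m256i.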
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+
+ // stage 1
+ __m256i x1[32];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+
+ // stage 4
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
+static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
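+  // 64-point forward DCT, again on sixteen 16-bit columns per __m256i.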
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+ __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+ __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+ __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
+ __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
+ __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
+ __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
+ __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
+ __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
+ __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
+ __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
+ __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
+ __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
+ __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
+ __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
+ __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
+ __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
+ __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
+ __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
+ __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
+ __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
+ __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
+ __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
+ __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
+ __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
+ __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
+ __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
+ __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
+ __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
+ __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
+ __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
+ __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
+ __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
+ __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
+ __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
+ __m256i x1[64];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[31]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[30]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[47]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[37], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[48]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[58], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[55]);
+
+ // stage 4
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[39]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[45], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[55]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[50], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[56]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[59]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[35]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[41], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[54], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[59]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[60]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[61]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[33]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[43], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[52], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[61]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[62]);
+
+ // stage 10
+ btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
+
+static INLINE void fdct32_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ __m256i x1[32];
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
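+  // 32-point forward DCT in 32-bit precision (eight int32 lanes per
+  // __m256i), using the btf_32_* helpers that take scalar cospi arguments.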
+ // stage 0
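+  // (stage 0 is a pass-through in the scalar reference and is skipped here)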
+ // stage 1
+ btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+
+ // stage 4
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5
+ btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+
+ // stage 6
+ btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7
+ btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+
+ // stage 8
+ btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
+static INLINE void fdct64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
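+  // All cosine-table constants are broadcast to vectors once up front; the
+  // *_new butterfly variants below consume these vector operands directly.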
+
+ __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+ __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+ __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+ __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+ __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+ __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+ __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+ __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+ __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+ __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+ __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+ __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+ __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+ __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+ __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+ __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+ __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+ __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+ __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+ __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+ __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+ __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+ __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+ __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+ __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+ __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+ __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+ __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+ __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+ __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+ __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+ __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+ __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+ __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+ __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+ __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+ __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+ __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+ __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+ __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+ __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+ __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+ __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+ __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+ __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+ __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+ __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+ __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+ __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+ __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+ __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+ __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+ __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+ __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+ __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+ __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+ __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+ __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+ __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+ __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+ __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+ __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+ __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+ __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+ __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+ __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+ __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+ __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+ __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+ __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+ __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+ __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+ __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+ __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+ __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+ __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+ __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+ __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+ // stage 1
+ __m256i x1[64];
+ btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2
+ btf_32_add_sub_avx2(&x1[0], &x1[31]);
+ btf_32_add_sub_avx2(&x1[1], &x1[30]);
+ btf_32_add_sub_avx2(&x1[2], &x1[29]);
+ btf_32_add_sub_avx2(&x1[3], &x1[28]);
+ btf_32_add_sub_avx2(&x1[4], &x1[27]);
+ btf_32_add_sub_avx2(&x1[5], &x1[26]);
+ btf_32_add_sub_avx2(&x1[6], &x1[25]);
+ btf_32_add_sub_avx2(&x1[7], &x1[24]);
+ btf_32_add_sub_avx2(&x1[8], &x1[23]);
+ btf_32_add_sub_avx2(&x1[9], &x1[22]);
+ btf_32_add_sub_avx2(&x1[10], &x1[21]);
+ btf_32_add_sub_avx2(&x1[11], &x1[20]);
+ btf_32_add_sub_avx2(&x1[12], &x1[19]);
+ btf_32_add_sub_avx2(&x1[13], &x1[18]);
+ btf_32_add_sub_avx2(&x1[14], &x1[17]);
+ btf_32_add_sub_avx2(&x1[15], &x1[16]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[47]);
+ btf_32_add_sub_avx2(&x1[33], &x1[46]);
+ btf_32_add_sub_avx2(&x1[34], &x1[45]);
+ btf_32_add_sub_avx2(&x1[35], &x1[44]);
+ btf_32_add_sub_avx2(&x1[36], &x1[43]);
+ btf_32_add_sub_avx2(&x1[37], &x1[42]);
+ btf_32_add_sub_avx2(&x1[38], &x1[41]);
+ btf_32_add_sub_avx2(&x1[39], &x1[40]);
+ btf_32_add_sub_avx2(&x1[63], &x1[48]);
+ btf_32_add_sub_avx2(&x1[62], &x1[49]);
+ btf_32_add_sub_avx2(&x1[61], &x1[50]);
+ btf_32_add_sub_avx2(&x1[60], &x1[51]);
+ btf_32_add_sub_avx2(&x1[59], &x1[52]);
+ btf_32_add_sub_avx2(&x1[58], &x1[53]);
+ btf_32_add_sub_avx2(&x1[57], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[55]);
+
+ // stage 4
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[39]);
+ btf_32_add_sub_avx2(&x1[33], &x1[38]);
+ btf_32_add_sub_avx2(&x1[34], &x1[37]);
+ btf_32_add_sub_avx2(&x1[35], &x1[36]);
+ btf_32_add_sub_avx2(&x1[47], &x1[40]);
+ btf_32_add_sub_avx2(&x1[46], &x1[41]);
+ btf_32_add_sub_avx2(&x1[45], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[43]);
+ btf_32_add_sub_avx2(&x1[48], &x1[55]);
+ btf_32_add_sub_avx2(&x1[49], &x1[54]);
+ btf_32_add_sub_avx2(&x1[50], &x1[53]);
+ btf_32_add_sub_avx2(&x1[51], &x1[52]);
+ btf_32_add_sub_avx2(&x1[63], &x1[56]);
+ btf_32_add_sub_avx2(&x1[62], &x1[57]);
+ btf_32_add_sub_avx2(&x1[61], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[59]);
+
+ // stage 6
+ btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7
+ btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[35]);
+ btf_32_add_sub_avx2(&x1[33], &x1[34]);
+ btf_32_add_sub_avx2(&x1[39], &x1[36]);
+ btf_32_add_sub_avx2(&x1[38], &x1[37]);
+ btf_32_add_sub_avx2(&x1[40], &x1[43]);
+ btf_32_add_sub_avx2(&x1[41], &x1[42]);
+ btf_32_add_sub_avx2(&x1[47], &x1[44]);
+ btf_32_add_sub_avx2(&x1[46], &x1[45]);
+ btf_32_add_sub_avx2(&x1[48], &x1[51]);
+ btf_32_add_sub_avx2(&x1[49], &x1[50]);
+ btf_32_add_sub_avx2(&x1[55], &x1[52]);
+ btf_32_add_sub_avx2(&x1[54], &x1[53]);
+ btf_32_add_sub_avx2(&x1[56], &x1[59]);
+ btf_32_add_sub_avx2(&x1[57], &x1[58]);
+ btf_32_add_sub_avx2(&x1[63], &x1[60]);
+ btf_32_add_sub_avx2(&x1[62], &x1[61]);
+
+ // stage 8
+ btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+ btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9
+ btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[33]);
+ btf_32_add_sub_avx2(&x1[35], &x1[34]);
+ btf_32_add_sub_avx2(&x1[36], &x1[37]);
+ btf_32_add_sub_avx2(&x1[39], &x1[38]);
+ btf_32_add_sub_avx2(&x1[40], &x1[41]);
+ btf_32_add_sub_avx2(&x1[43], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[45]);
+ btf_32_add_sub_avx2(&x1[47], &x1[46]);
+ btf_32_add_sub_avx2(&x1[48], &x1[49]);
+ btf_32_add_sub_avx2(&x1[51], &x1[50]);
+ btf_32_add_sub_avx2(&x1[52], &x1[53]);
+ btf_32_add_sub_avx2(&x1[55], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[57]);
+ btf_32_add_sub_avx2(&x1[59], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[61]);
+ btf_32_add_sub_avx2(&x1[63], &x1[62]);
+
+ // stage 10
+ btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
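+// The stage-11 write-out above is the 6-bit bit-reversal permutation that
+// reorders the butterfly results into natural frequency order: output[k] =
+// x1[bitrev6(k)]. A minimal scalar sketch (bitrev6 is a hypothetical helper,
+// not part of libaom):
+#if 0
+static int bitrev6(int k) {
+  int r = 0;
+  for (int b = 0; b < 6; ++b) r |= ((k >> b) & 1) << (5 - b);
+  return r;
+}
+// e.g. bitrev6(1) == 32, bitrev6(2) == 16, bitrev6(3) == 48, matching the
+// assignments above.
+#endif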
+
+static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+ __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm256_subs_epi16(__zero, input[15]);
+ x1[2] = _mm256_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm256_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm256_subs_epi16(__zero, input[11]);
+ x1[8] = _mm256_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm256_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm256_subs_epi16(__zero, input[13]);
+ x1[14] = _mm256_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[2]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[15]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[4]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[15]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_avx2(&x1[0], &x1[8]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[15]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 9
+ output[0] = x1[1];
+ output[1] = x1[14];
+ output[2] = x1[3];
+ output[3] = x1[12];
+ output[4] = x1[5];
+ output[5] = x1[10];
+ output[6] = x1[7];
+ output[7] = x1[8];
+ output[8] = x1[9];
+ output[9] = x1[6];
+ output[10] = x1[11];
+ output[11] = x1[4];
+ output[12] = x1[13];
+ output[13] = x1[2];
+ output[14] = x1[15];
+ output[15] = x1[0];
+}
+
+static INLINE void fidentity16x16_new_avx2(const __m256i *input,
+ __m256i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (int i = 0; i < 16; ++i) {
+ const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one);
+ const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one);
+ const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
+ output[i] = _mm256_packs_epi32(b_lo, b_hi);
+ }
+}
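+// The 16-point identity transform scales every sample by 2*sqrt(2) in fixed
+// point; interleaving the samples with the constant 1 is what lets
+// scale_round_avx2 (defined in the AVX2 transform headers) fold the rounding
+// offset into the same multiply-add before packing back to 16 bits. A scalar
+// sketch of the per-sample arithmetic (assuming NewSqrt2 == 5793 and
+// NewSqrt2Bits == 12 from av1_txfm.h):
+#if 0
+static int16_t identity16_scale(int16_t v) {
+  const int32_t two_sqrt2_q12 = 2 * 5793;  // 2 * sqrt(2) in Q12
+  // The SIMD version saturates via _mm256_packs_epi32; this sketch truncates.
+  return (int16_t)((v * two_sqrt2_q12 + (1 << 11)) >> 12);
+}
+#endif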
+
+static INLINE void fidentity16x32_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; ++i) {
+ output[i] = _mm256_slli_epi16(input[i], 2);
+ }
+}
+
+static INLINE void store_output_32bit_w16(int32_t *const out,
+ const __m256i *const in1,
+ const __m256i *const in2,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm256_store_si256((__m256i *)(out + stride * i), in1[i]);
+ _mm256_store_si256((__m256i *)(out + stride * i + 8), in2[i]);
+ }
+}
+
+// Store 16 16-bit values per row, sign extending them to 32 bits.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+ int32_t *out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm256_store_si256((__m256i *)(out),
+ _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i])));
+ _mm256_store_si256(
+ (__m256i *)(out + 8),
+ _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1)));
+ out += stride;
+ }
+}
+
+static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a,
+ int32_t *const b) {
+ const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a_reorder = _mm256_permute4x64_epi64(a, 0xd8);
+  const __m256i a_lo = _mm256_unpacklo_epi16(a_reorder, one);
+  const __m256i a_hi = _mm256_unpackhi_epi16(a_reorder, one);
+ const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
+ _mm256_store_si256((__m256i *)b, b_lo);
+ _mm256_store_si256((__m256i *)(b + 8), b_hi);
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2(
+ const __m256i *const in, int32_t *const out, const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit_avx2(in[i], out + i * stride);
+ }
+}
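+// Rectangular (2:1 aspect) transforms need a sqrt(2) fixed-point gain factor
+// to compensate the non-square normalization; the two store_rect helpers
+// above fold it into the widening store as v * NewSqrt2 >> NewSqrt2Bits. A
+// scalar sketch of that step (assuming NewSqrt2 == 5793 and NewSqrt2Bits ==
+// 12 from av1_txfm.h):
+#if 0
+static int32_t rect_scale_round(int16_t v) {
+  return (v * 5793 + (1 << 11)) >> 12;  // v * sqrt(2) in Q12, rounded
+}
+#endif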
+
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
+ int8_t cos_bit);
+
+static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_avx2, // IDTX
+ fdct16x32_avx2, // V_DCT
+ fidentity16x32_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_avx2, // IDTX
+ fidentity16x32_avx2, // V_DCT
+ fdct16x32_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
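+// NULL entries mark combinations with no 32-point 1-D kernel: AV1 defines
+// ADST only for 4-, 8- and 16-point transforms, so the 32-tall paths are
+// reachable only for the DCT- and identity-based tx_types above. A defensive
+// dispatch sketch (illustrative only):
+#if 0
+const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+assert(col_txfm != NULL && row_txfm != NULL);  // tx_type must be supported
+#endif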
+
+static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fadst16x16_new_avx2, // ADST_DCT
+ fdct16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fadst16x16_new_avx2, // FLIPADST_DCT
+ fdct16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fdct16x16_new_avx2, // V_DCT
+ fidentity16x16_new_avx2, // H_DCT
+ fadst16x16_new_avx2, // V_ADST
+ fidentity16x16_new_avx2, // H_ADST
+ fadst16x16_new_avx2, // V_FLIPADST
+ fidentity16x16_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fdct16x16_new_avx2, // ADST_DCT
+ fadst16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fdct16x16_new_avx2, // FLIPADST_DCT
+ fadst16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fidentity16x16_new_avx2, // V_DCT
+ fdct16x16_new_avx2, // H_DCT
+ fidentity16x16_new_avx2, // V_ADST
+ fadst16x16_new_avx2, // H_ADST
+ fidentity16x16_new_avx2, // V_FLIPADST
+ fadst16x16_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fadst8x8_new_sse2, // ADST_DCT
+ fdct8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fadst8x8_new_sse2, // FLIPADST_DCT
+ fdct8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct8x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst8x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst8x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2, // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fdct8x8_new_sse2, // ADST_DCT
+ fadst8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fdct8x8_new_sse2, // FLIPADST_DCT
+ fadst8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct8x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst8x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst8x8_new_sse2 // H_FLIPADST
+};
+
+static INLINE void load_buffer_and_round_shift(const int16_t *in, int stride,
+ __m128i *out, int bit) {
+ out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ out[2] = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ out[3] = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ out[4] = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ out[5] = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ out[6] = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ out[7] = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ out[0] = _mm_slli_epi16(out[0], bit);
+ out[1] = _mm_slli_epi16(out[1], bit);
+ out[2] = _mm_slli_epi16(out[2], bit);
+ out[3] = _mm_slli_epi16(out[3], bit);
+ out[4] = _mm_slli_epi16(out[4], bit);
+ out[5] = _mm_slli_epi16(out[5], bit);
+ out[6] = _mm_slli_epi16(out[6], bit);
+ out[7] = _mm_slli_epi16(out[7], bit);
+}
+
+static INLINE void load_buffer_and_flip_round_shift(const int16_t *in,
+ int stride, __m128i *out,
+ int bit) {
+ out[7] = load_16bit_to_16bit(in + 0 * stride);
+ out[6] = load_16bit_to_16bit(in + 1 * stride);
+ out[5] = load_16bit_to_16bit(in + 2 * stride);
+ out[4] = load_16bit_to_16bit(in + 3 * stride);
+ out[3] = load_16bit_to_16bit(in + 4 * stride);
+ out[2] = load_16bit_to_16bit(in + 5 * stride);
+ out[1] = load_16bit_to_16bit(in + 6 * stride);
+ out[0] = load_16bit_to_16bit(in + 7 * stride);
+ out[7] = _mm_slli_epi16(out[7], bit);
+ out[6] = _mm_slli_epi16(out[6], bit);
+ out[5] = _mm_slli_epi16(out[5], bit);
+ out[4] = _mm_slli_epi16(out[4], bit);
+ out[3] = _mm_slli_epi16(out[3], bit);
+ out[2] = _mm_slli_epi16(out[2], bit);
+ out[1] = _mm_slli_epi16(out[1], bit);
+ out[0] = _mm_slli_epi16(out[0], bit);
+}
+
+#define TRANSPOSE_8X8_AVX2() \
+ { \
+    /* Unpack 16 bit elements resulting in: */                                \
+    /* aa0: 00 10 01 11 02 12 03 13 | 40 50 41 51 42 52 43 53*/               \
+ /* aa1: 04 14 05 15 06 16 07 17 | 44 54 45 55 46 56 47 57*/ \
+ /* aa2: 20 30 21 31 22 32 23 33 | 60 70 61 71 62 72 63 73*/ \
+ /* aa3: 24 34 25 35 26 36 27 37 | 64 74 65 75 66 76 67 77*/ \
+ const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1); \
+ const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1); \
+ const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3); \
+ const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3); \
+ /* Unpack 32 bit elements resulting in: */ \
+ /* bb0: 00 10 20 30 01 11 21 31 | 40 50 60 70 41 51 61 71*/ \
+ /* bb1: 02 12 22 32 03 13 23 33 | 42 52 62 72 43 53 63 73*/ \
+ /* bb2: 04 14 24 34 05 15 25 35 | 44 54 64 74 45 55 65 75*/ \
+    /* bb3: 06 16 26 36 07 17 27 37 | 46 56 66 76 47 57 67 77*/               \
+ const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2); \
+ const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2); \
+ const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3); \
+ const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3); \
+    /* Permute 64 bit elements (0xd8) resulting in: */                        \
+    /* c0: 00 10 20 30 40 50 60 70 | 01 11 21 31 41 51 61 71*/                \
+    /* c1: 02 12 22 32 42 52 62 72 | 03 13 23 33 43 53 63 73*/                \
+    /* c2: 04 14 24 34 44 54 64 74 | 05 15 25 35 45 55 65 75*/                \
+    /* c3: 06 16 26 36 46 56 66 76 | 07 17 27 37 47 57 67 77*/                \
+ c0 = _mm256_permute4x64_epi64(bb0, 0xd8); \
+ c1 = _mm256_permute4x64_epi64(bb1, 0xd8); \
+ c2 = _mm256_permute4x64_epi64(bb2, 0xd8); \
+ c3 = _mm256_permute4x64_epi64(bb3, 0xd8); \
+ }
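+// The macro above is the standard unpack-based transpose of an 8x8 block of
+// 16-bit values held as two rows per 256-bit register; the trailing
+// permute4x64(0xd8) swaps the middle 64-bit lanes so each transposed row
+// lands contiguously. Scalar equivalent (illustrative only):
+#if 0
+static void transpose_8x8_c(const int16_t in[8][8], int16_t out[8][8]) {
+  for (int i = 0; i < 8; ++i)
+    for (int j = 0; j < 8; ++j) out[j][i] = in[i][j];
+}
+#endif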
+
+static INLINE void transpose_round_shift_flip_8x8(__m128i *const in,
+ __m128i *const out, int bit) {
+ __m256i c0, c1, c2, c3;
+ bit = -bit;
+ const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
+ const __m256i s04 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
+ const __m256i s15 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+ const __m256i s26 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+ const __m256i s37 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+ const __m256i a0 = _mm256_adds_epi16(s04, rounding);
+ const __m256i a1 = _mm256_adds_epi16(s15, rounding);
+ const __m256i a2 = _mm256_adds_epi16(s26, rounding);
+ const __m256i a3 = _mm256_adds_epi16(s37, rounding);
+
+ // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
+ // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
+ // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
+ // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
+ const __m256i b0 = _mm256_srai_epi16(a0, bit);
+ const __m256i b1 = _mm256_srai_epi16(a1, bit);
+ const __m256i b2 = _mm256_srai_epi16(a2, bit);
+ const __m256i b3 = _mm256_srai_epi16(a3, bit);
+
+ TRANSPOSE_8X8_AVX2()
+
+  // Extract the two 128-bit halves of each row, resulting in:
+ // out[7]: 00 10 20 30 40 50 60 70
+ // out[6]: 01 11 21 31 41 51 61 71
+ // out[5]: 02 12 22 32 42 52 62 72
+ // out[4]: 03 13 23 33 43 53 63 73
+ // out[3]: 04 14 24 34 44 54 64 74
+ // out[2]: 05 15 25 35 45 55 65 75
+ // out[1]: 06 16 26 36 46 56 66 76
+ // out[0]: 07 17 27 37 47 57 67 77
+ out[7] = _mm256_castsi256_si128(c0);
+ out[6] = _mm256_extractf128_si256(c0, 1);
+ out[5] = _mm256_castsi256_si128(c1);
+ out[4] = _mm256_extractf128_si256(c1, 1);
+ out[3] = _mm256_castsi256_si128(c2);
+ out[2] = _mm256_extractf128_si256(c2, 1);
+ out[1] = _mm256_castsi256_si128(c3);
+ out[0] = _mm256_extractf128_si256(c3, 1);
+}
+
+static INLINE void transpose_round_shift_8x8(__m128i *const in,
+ __m128i *const out, int bit) {
+ __m256i c0, c1, c2, c3;
+ bit = -bit;
+ const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
+ const __m256i s04 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
+ const __m256i s15 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+ const __m256i s26 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+ const __m256i s37 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+ const __m256i a0 = _mm256_adds_epi16(s04, rounding);
+ const __m256i a1 = _mm256_adds_epi16(s15, rounding);
+ const __m256i a2 = _mm256_adds_epi16(s26, rounding);
+ const __m256i a3 = _mm256_adds_epi16(s37, rounding);
+
+ // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
+ // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
+ // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
+ // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
+ const __m256i b0 = _mm256_srai_epi16(a0, bit);
+ const __m256i b1 = _mm256_srai_epi16(a1, bit);
+ const __m256i b2 = _mm256_srai_epi16(a2, bit);
+ const __m256i b3 = _mm256_srai_epi16(a3, bit);
+
+ TRANSPOSE_8X8_AVX2()
+  // Extract the two 128-bit halves of each row, resulting in:
+ // out[7]: 00 10 20 30 40 50 60 70
+ // out[6]: 01 11 21 31 41 51 61 71
+ // out[5]: 02 12 22 32 42 52 62 72
+ // out[4]: 03 13 23 33 43 53 63 73
+ // out[3]: 04 14 24 34 44 54 64 74
+ // out[2]: 05 15 25 35 45 55 65 75
+ // out[1]: 06 16 26 36 46 56 66 76
+ // out[0]: 07 17 27 37 47 57 67 77
+ out[0] = _mm256_castsi256_si128(c0);
+ out[1] = _mm256_extractf128_si256(c0, 1);
+ out[2] = _mm256_castsi256_si128(c1);
+ out[3] = _mm256_extractf128_si256(c1, 1);
+ out[4] = _mm256_castsi256_si128(c2);
+ out[5] = _mm256_extractf128_si256(c2, 1);
+ out[6] = _mm256_castsi256_si128(c3);
+ out[7] = _mm256_extractf128_si256(c3, 1);
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm256_store_si256((__m256i *)(out + i * stride),
+ _mm256_cvtepi16_epi32(in[i]));
+ }
+}
+
+static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // The shift-sign check is skipped while round shifting because shift[0] is
+  // assumed to always be positive.
+ assert(shift[0] > 0);
+ if (ud_flip)
+ load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]);
+ else
+ load_buffer_and_round_shift(input, stride, buf0, shift[0]);
+
+ col_txfm(buf0, buf0, cos_bit_col);
+  // The shift-sign check is skipped while round shifting because shift[1] is
+  // assumed to always be negative.
+ assert(shift[1] < 0);
+
+ if (lr_flip) {
+ transpose_round_shift_flip_8x8(buf0, buf1, shift[1]);
+ } else {
+ transpose_round_shift_8x8(buf0, buf1, shift[1]);
+ }
+
+ buf = buf1;
+ row_txfm(buf, buf, cos_bit_row);
+
+  // The round-and-shift step is skipped here because shift[2] is assumed to
+  // always be zero.
+ assert(shift[2] == 0);
+ store_buffer_16bit_to_32bit_w8_avx2(buf, output, 8, 8);
+}
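+// Every lowbd 2-D wrapper in this file follows the same pipeline: load (with
+// an optional up/down flip) and shift by shift[0], run the column 1-D
+// transform, transpose while round-shifting by shift[1] (with an optional
+// left/right flip), run the row 1-D transform, then store with shift[2]. A
+// self-contained scalar model of the data flow, using the 8-point identity
+// (x -> 2x) as both 1-D kernels and assuming the TX_8X8 shifts {2, -1, 0}:
+#if 0
+static void fwd_idtx2d_8x8_model(const int16_t *input, int32_t *output,
+                                 int stride) {
+  int32_t col[8][8], row[8][8];
+  for (int r = 0; r < 8; ++r)
+    for (int c = 0; c < 8; ++c)
+      col[r][c] = (input[r * stride + c] << 2) * 2;  // shift[0], col kernel
+  for (int r = 0; r < 8; ++r)
+    for (int c = 0; c < 8; ++c)
+      row[c][r] = (col[r][c] + 1) >> 1;              // transpose, shift[1]
+  for (int r = 0; r < 8; ++r)
+    for (int c = 0; c < 8; ++c)
+      output[r * 8 + c] = row[r][c] * 2;             // row kernel, shift[2]==0
+}
+#endif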
+
+static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_16X16;
+ __m256i buf0[16], buf1[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int32_t i = 0;
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
+}
+
+static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_32X32;
+ __m256i buf0[32], buf1[128];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+ height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
+ transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
+ }
+}
+
+static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[64];
+ __m256i bufB[64];
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ fdct64_new_avx2(bufA, bufA, cos_bit_row);
+ fdct64_new_avx2(bufB, bufB, cos_bit_row);
+ round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
+ round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
+ store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
+ }
+}
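+// For 64-point dimensions AV1 retains only the 32 lowest-frequency
+// coefficients in each direction, which is why the row pass above covers
+// AOMMIN(2, height_div16) strips and the result is stored as a 32x32 block
+// with stride 32. A sketch of how the packed output is addressed
+// (illustrative only):
+#if 0
+static int32_t coeff_64x64(const int32_t *output, int r, int c) {
+  // Coefficients with r >= 32 or c >= 32 are implicitly zero.
+  return (r < 32 && c < 32) ? output[r * 32 + c] : 0;
+}
+#endif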
+
+static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_16X32;
+ __m256i buf0[32], buf1[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1);
+ transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);
+
+ for (int i = 0; i < 2; i++) {
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height,
+ width);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m256i buf0[32], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 16;
+ const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+ height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+ }
+
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, height, width);
+}
+
+static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_64X32;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+ assert(tx_type == DCT_DCT);
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[64];
+ __m256i bufB[64];
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ fdct64_new_avx2(bufA, bufA, cos_bit_row);
+ fdct64_new_avx2(bufB, bufB, cos_bit_row);
+ round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
+ round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[32];
+ __m256i bufB[32];
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ fdct32_avx2(bufA, bufA, cos_bit_row);
+ fdct32_avx2(bufB, bufB, cos_bit_row);
+ round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
+ round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_16X64;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + width * i, 32, width);
+ }
+}
+
+static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X16;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < height_div16; i++) {
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * i, 16, 32);
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
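+// A scalar equivalent of the memset above (illustrative only): of the
+// 16x64 = 1024 output entries, only the first half is computed, and the
+// remaining high-frequency half is cleared.
+#if 0
+for (int k = 16 * 32; k < 16 * 64; ++k) output[k] = 0;
+#endif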
+
+static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0,
+ __m256i *in1, __m128i *out0, __m128i *out1,
+ __m128i *out2, __m128i *out3,
+ const __m256i *__rounding, int8_t *cos_bit) {
+ __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+ __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+ __m256i u0 = _mm256_madd_epi16(t0, *w0);
+ __m256i u1 = _mm256_madd_epi16(t1, *w0);
+ __m256i v0 = _mm256_madd_epi16(t0, *w1);
+ __m256i v1 = _mm256_madd_epi16(t1, *w1);
+
+ __m256i a0 = _mm256_add_epi32(u0, *__rounding);
+ __m256i a1 = _mm256_add_epi32(u1, *__rounding);
+ __m256i b0 = _mm256_add_epi32(v0, *__rounding);
+ __m256i b1 = _mm256_add_epi32(v1, *__rounding);
+
+ __m256i c0 = _mm256_srai_epi32(a0, *cos_bit);
+ __m256i c1 = _mm256_srai_epi32(a1, *cos_bit);
+ __m256i d0 = _mm256_srai_epi32(b0, *cos_bit);
+ __m256i d1 = _mm256_srai_epi32(b1, *cos_bit);
+
+ __m256i temp0 = _mm256_packs_epi32(c0, c1);
+ __m256i temp1 = _mm256_packs_epi32(d0, d1);
+
+ *out0 = _mm256_castsi256_si128(temp0);
+ *out1 = _mm256_castsi256_si128(temp1);
+ *out2 = _mm256_extracti128_si256(temp0, 0x01);
+ *out3 = _mm256_extracti128_si256(temp1, 0x01);
+}
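+// btf_16_avx2 is the 16-bit butterfly kernel: interleaving in0/in1 lets one
+// madd per weight vector compute a full 2-point rotation, which is then
+// rounded, shifted and saturated back to 16 bits. Scalar model with weight
+// pairs w0 = (a, b) and w1 = (c, d) (illustrative only; the SIMD version
+// saturates via _mm256_packs_epi32):
+#if 0
+static void btf_16_c(int32_t a, int32_t b, int32_t c, int32_t d, int16_t in0,
+                     int16_t in1, int16_t *out0, int16_t *out1,
+                     int8_t cos_bit) {
+  const int32_t r = 1 << (cos_bit - 1);
+  *out0 = (int16_t)((a * in0 + b * in1 + r) >> cos_bit);  // madd with w0
+  *out1 = (int16_t)((c * in0 + d * in1 + r) >> cos_bit);  // madd with w1
+}
+#endif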
+
+static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = _mm256_adds_epi16(input[0], input[7]);
+ x1[7] = _mm256_subs_epi16(input[0], input[7]);
+ x1[1] = _mm256_adds_epi16(input[1], input[6]);
+ x1[6] = _mm256_subs_epi16(input[1], input[6]);
+ x1[2] = _mm256_adds_epi16(input[2], input[5]);
+ x1[5] = _mm256_subs_epi16(input[2], input[5]);
+ x1[3] = _mm256_adds_epi16(input[3], input[4]);
+ x1[4] = _mm256_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
+ cos_bit);
+ x2[5] = x1[5];
+ x2[6] = x1[6];
+ x2[7] = x1[7];
+
+ // stage 3
+ __m256i x3[8];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
+ cos_bit);
+ x3[0] = x2[0];
+ x3[1] = x2[1];
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
+ cos_bit);
+ x3[2] = x2[2];
+ x3[3] = x2[3];
+ x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
+ cos_bit);
+ x4[4] = x3[4];
+ x4[7] = x3[7];
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
+ cos_bit);
+ x4[5] = x3[5];
+ x4[6] = x3[6];
+ // stage 5
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
+static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm256_subs_epi16(__zero, input[7]);
+ x1[2] = _mm256_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm256_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm256_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
+ cos_bit);
+ x2[2] = x1[2];
+ x2[3] = x1[3];
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
+ cos_bit);
+ x2[6] = x1[6];
+ x2[7] = x1[7];
+
+ // stage 3
+ __m256i x3[8];
+ x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
+ cos_bit);
+ x4[4] = x3[4];
+ x4[5] = x3[5];
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
+ cos_bit);
+ x4[6] = x3[6];
+ x4[7] = x3[7];
+
+ // stage 5
+ __m256i x5[8];
+ x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
+
+ // stage 6
+ __m256i x6[8];
+ btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
+ cos_bit);
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
+ cos_bit);
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
+ cos_bit);
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
+ cos_bit);
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+
+ // stage 7
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
+static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+
+ output[0] = _mm256_adds_epi16(input[0], input[0]);
+ output[1] = _mm256_adds_epi16(input[1], input[1]);
+ output[2] = _mm256_adds_epi16(input[2], input[2]);
+ output[3] = _mm256_adds_epi16(input[3], input[3]);
+ output[4] = _mm256_adds_epi16(input[4], input[4]);
+ output[5] = _mm256_adds_epi16(input[5], input[5]);
+ output[6] = _mm256_adds_epi16(input[6], input[6]);
+ output[7] = _mm256_adds_epi16(input[7], input[7]);
+}
+
+static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ __m128i temp0, temp1, temp2, temp3;
+ __m256i in0, in1;
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+ __m256i cospi_arr[12];
+
+ cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
+ cospi_m32_p32, 0x1);
+ cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);
+ cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p48_p16, 0x1);
+ cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_m16_p48, 0x1);
+ cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
+ cospi_m48_m16, 0x1);
+ cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
+ cospi_m16_p48, 0x1);
+ cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
+ cospi_p24_p40, 0x1);
+ cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
+ cospi_m40_p24, 0x1);
+ cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
+ cospi_p28_p36, 0x1);
+ cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
+ cospi_m36_p28, 0x1);
+ cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
+ cospi_p12_p52, 0x1);
+ cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
+ cospi_m52_p12, 0x1);
+
+ __m256i x[8];
+ x[0] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
+ x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
+ 0x1);
+ x[2] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
+ x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
+ 0x1);
+ x[4] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
+ x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
+ 0x1);
+ x[6] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
+ x[7] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = _mm256_adds_epi16(x[0], x[1]);
+ x1[7] = _mm256_subs_epi16(x[0], x[1]);
+ x1[1] = _mm256_adds_epi16(x[2], x[3]);
+ x1[6] = _mm256_subs_epi16(x[2], x[3]);
+ x1[2] = _mm256_adds_epi16(x[4], x[5]);
+ x1[5] = _mm256_subs_epi16(x[4], x[5]);
+ x1[3] = _mm256_adds_epi16(x[6], x[7]);
+ x1[4] = _mm256_subs_epi16(x[6], x[7]);
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+ x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+ x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
+ x2[2] = x1[4];
+ x2[3] = x1[7];
+ btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1,
+ &temp2, &temp3, &__rounding_256, &cos_bit);
+ x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
+ x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
+
+ // stage 3
+ __m256i x3[8];
+ x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
+ x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+ x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+ x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
+ _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
+ x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
+ x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
+ x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
+ x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
+ x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
+ x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
+ btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0],
+ &output[8], &output[4], &output[12], &__rounding_256, &cos_bit);
+ x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
+ x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
+ x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
+ x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
+ in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
+ in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
+ btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+
+ x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
+ x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
+
+ // stage 5
+ __m256i x5[4];
+ in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
+ in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
+ btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14],
+ &output[10], &output[6], &__rounding_256, &cos_bit);
+ x5[0] = _mm256_adds_epi16(x4[4], x4[6]);
+ x5[1] = _mm256_subs_epi16(x4[4], x4[6]);
+ x5[2] = _mm256_adds_epi16(x4[5], x4[7]);
+ x5[3] = _mm256_subs_epi16(x4[5], x4[7]);
+
+ // stage 6
+ in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20);
+ in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31);
+ btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15],
+ &output[9], &output[7], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31);
+ in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20);
+ btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5],
+ &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
+}
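+// fdct8x16_new_avx2 packs two different 8-wide __m128i rows into each
+// 256-bit register and gives each 128-bit lane its own cospi weights, so a
+// single btf_16_avx2 call performs two distinct butterflies at once. The
+// packing step in isolation (illustrative only):
+#if 0
+static __m256i pack_two_lanes(__m128i lo, __m128i hi) {
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 0x1);
+}
+#endif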
+
+static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
+ __m256i in0, in1;
+ __m128i temp0, temp1, temp2, temp3;
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ __m256i cospi_arr[20];
+
+ cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);
+ cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_p32_m32, 0x1);
+ cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);
+ cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_p32_m32, 0x1);
+ cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+ cospi_m48_p16, 0x1);
+ cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+ cospi_p16_p48, 0x1);
+ cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+ cospi_m48_p16, 0x1);
+ cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+ cospi_p16_p48, 0x1);
+ cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+ cospi_p40_p24, 0x1);
+ cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
+ cospi_p24_m40, 0x1);
+ cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
+ cospi_m24_p40, 0x1);
+ cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+ cospi_p40_p24, 0x1);
+ cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
+ cospi_p10_p54, 0x1);
+ cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
+ cospi_p54_m10, 0x1);
+ cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
+ cospi_p26_p38, 0x1);
+ cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
+ cospi_p38_m26, 0x1);
+ cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
+ cospi_p42_p22, 0x1);
+ cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
+ cospi_p22_m42, 0x1);
+ cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
+ cospi_p58_p06, 0x1);
+ cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
+ cospi_p06_m58, 0x1);
+
+ __m256i x[8];
+ x[0] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
+ x[1] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
+ x[2] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
+ x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
+ 0x1);
+ x[4] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
+ x[5] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
+ x[6] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
+ x[7] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = x[0];
+ x1[1] = _mm256_subs_epi16(__zero, x[7]);
+ x1[2] = x[2];
+ x1[3] = _mm256_subs_epi16(__zero, x[5]);
+ x1[4] = _mm256_subs_epi16(__zero, x[4]);
+ x1[5] = x[3];
+ x1[6] = _mm256_subs_epi16(__zero, x[6]);
+ x1[7] = x[1];
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
+ x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
+ x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
+ x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
+ in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
+ in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
+ btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
+ in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
+ btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 3
+ __m256i x3[8];
+ x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+ x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+ x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
+ x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
+ x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
+ x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[4] = x3[4];
+ x4[5] = x3[5];
+ in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
+ in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
+ btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
+ btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 5
+ __m256i x5[8];
+ x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
+ x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
+ x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
+ x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
+ x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
+ x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
+ x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
+ x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
+
+ // stage 6
+ __m256i x6[8];
+ x6[0] = x5[0];
+ x6[1] = x5[2];
+ x6[2] = x5[1];
+ x6[3] = x5[3];
+ in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
+ in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
+ btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
+ btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1,
+ &temp2, &temp3, &__rounding_256, &cos_bit);
+ x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 7
+ __m256i x7[8];
+ x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
+ x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
+ x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
+ x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
+ x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
+ x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
+ x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
+ x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
+
+ // stage 8
+ in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
+ btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15],
+ &output[0], &output[13], &output[2], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
+ btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11],
+ &output[4], &output[9], &output[6], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
+ btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7],
+ &output[8], &output[5], &output[10], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
+ btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
+ &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
+}
+
+static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i temp;
+ for (int i = 0; i < 16; i += 2) {
+ temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
+ input[i + 1], 0x1);
+ const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
+ const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
+ const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
+ temp = _mm256_packs_epi32(b_lo, b_hi);
+ output[i] = _mm256_castsi256_si128(temp);
+ output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
+ }
+}
+
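+// 1D kernel tables for the rectangular 8x16 and 16x8 paths, indexed by
+// TX_TYPE. The first half of the 2D type name selects the column (vertical)
+// kernel and the second half the row (horizontal) kernel; FLIPADST reuses the
+// ADST kernels, with the flips applied at load/store time via get_flip_cfg().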
+static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x8_new_avx2, // DCT_DCT
+ fdct8x8_new_avx2, // ADST_DCT
+ fadst8x8_new_avx2, // DCT_ADST
+ fadst8x8_new_avx2, // ADST_ADST
+ fdct8x8_new_avx2, // FLIPADST_DCT
+ fadst8x8_new_avx2, // DCT_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_FLIPADST
+ fadst8x8_new_avx2, // ADST_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_ADST
+ fidentity8x8_new_avx2, // IDTX
+ fidentity8x8_new_avx2, // V_DCT
+ fdct8x8_new_avx2, // H_DCT
+ fidentity8x8_new_avx2, // V_ADST
+ fadst8x8_new_avx2, // H_ADST
+ fidentity8x8_new_avx2, // V_FLIPADST
+ fadst8x8_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_avx2, // DCT_DCT
+ fadst8x16_new_avx2, // ADST_DCT
+ fdct8x16_new_avx2, // DCT_ADST
+ fadst8x16_new_avx2, // ADST_ADST
+ fadst8x16_new_avx2, // FLIPADST_DCT
+ fdct8x16_new_avx2, // DCT_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_FLIPADST
+ fadst8x16_new_avx2, // ADST_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_ADST
+ fidentity8x16_new_avx2, // IDTX
+ fdct8x16_new_avx2, // V_DCT
+ fidentity8x16_new_avx2, // H_DCT
+ fadst8x16_new_avx2, // V_ADST
+ fidentity8x16_new_avx2, // H_ADST
+ fadst8x16_new_avx2, // V_FLIPADST
+ fidentity8x16_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
+ fdct8x8_new_avx2, // DCT_DCT
+ fadst8x8_new_avx2, // ADST_DCT
+ fdct8x8_new_avx2, // DCT_ADST
+ fadst8x8_new_avx2, // ADST_ADST
+ fadst8x8_new_avx2, // FLIPADST_DCT
+ fdct8x8_new_avx2, // DCT_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_FLIPADST
+ fadst8x8_new_avx2, // ADST_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_ADST
+ fidentity8x8_new_avx2, // IDTX
+ fdct8x8_new_avx2, // V_DCT
+ fidentity8x8_new_avx2, // H_DCT
+ fadst8x8_new_avx2, // V_ADST
+ fidentity8x8_new_avx2, // H_ADST
+ fadst8x8_new_avx2, // V_FLIPADST
+ fidentity8x8_new_avx2, // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
+ fdct8x16_new_avx2, // DCT_DCT
+ fdct8x16_new_avx2, // ADST_DCT
+ fadst8x16_new_avx2, // DCT_ADST
+ fadst8x16_new_avx2, // ADST_ADST
+ fdct8x16_new_avx2, // FLIPADST_DCT
+ fadst8x16_new_avx2, // DCT_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_FLIPADST
+ fadst8x16_new_avx2, // ADST_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_ADST
+ fidentity8x16_new_avx2, // IDTX
+ fidentity8x16_new_avx2, // V_DCT
+ fdct8x16_new_avx2, // H_DCT
+ fidentity8x16_new_avx2, // V_ADST
+ fadst8x16_new_avx2, // H_ADST
+ fidentity8x16_new_avx2, // V_FLIPADST
+ fadst8x16_new_avx2 // H_FLIPADST
+};
+
+static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ __m256i buf2[8];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+ __m128i *bufl, *bufu;
+ if (lr_flip) {
+ bufl = buf0;
+ bufu = buf0 + 8;
+ flip_buf_sse2(buf1 + width * 0, bufl, width);
+ flip_buf_sse2(buf1 + width * 1, bufu, width);
+ } else {
+ bufl = buf1 + width * 0;
+ bufu = buf1 + width * 1;
+ }
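+  // Pack the two 8-row halves side by side so the 8-point row transform runs
+  // on all 16 rows at once in 256-bit registers.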
+ pack_reg(bufl, bufu, buf2);
+ row_txfm(buf2, buf2, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf2, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf2, output, height, width);
+}
+
+static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ __m256i buf2[8];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 8;
+ const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
+ load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
+ load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
+ }
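+  // Pack the left and right 8-column halves into 256-bit registers so the
+  // 8-point column transform covers all 16 columns at once.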
+ pack_reg(buf0, &buf0[8], buf2);
+ round_shift_16bit_w16_avx2(buf2, height, shift[0]);
+ col_txfm(buf2, buf2, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf2, height, shift[1]);
+ transpose_16bit_16x8_avx2(buf2, buf2);
+ extract_reg(buf2, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
+}
+
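+// Per-TX_SIZE dispatch table: sizes without a dedicated AVX2 kernel fall
+// back to the SSE2 implementations.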
+static const FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_avx2, // 8x8 transform
+ lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform
+ lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform
+ lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform
+ lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform
+ lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform
+ lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
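+  // Lossless 4x4 blocks use the Walsh-Hadamard transform, which only the C
+  // path implements.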
+ if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ } else {
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
new file mode 100644
index 0000000000..825da8d7b4
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+ const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+ int r, c;
+ for (r = 0; r < txfm1d_size; r++) {
+ for (c = 0; c < txfm1d_size; c++) {
+ output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+ }
+ }
+}
+
+static INLINE void store_output_32bit_w8(int32_t *const out,
+ const __m128i *const in1,
+ const __m128i *const in2,
+ const int stride, const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm_store_si128((__m128i *)(out + stride * i), in1[i]);
+ _mm_store_si128((__m128i *)(out + stride * i + 4), in2[i]);
+ }
+}
+
+typedef void (*TxfmFuncSSE2)(__m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+static void fdct32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int txfm_size = 32;
+ const int num_per_128 = 4;
+ int col_num = txfm_size / num_per_128;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ av1_fdct32_sse4_1((input + col), (output + col), cos_bit, col_num);
+ }
+}
+
+static void fdct64_new_sse4_1(__m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 64;
+ const int num_per_128 = 4;
+ int col_num = txfm_size / num_per_128;
+ (void)stage_range;
+ for (int col = 0; col < col_num; col++) {
+ av1_fdct64_sse4_1((input + col), (output + col), cos_bit, col_num, col_num);
+ }
+}
+
+static void idtx32x32_sse4_1(__m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ (void)stage_range;
+
+ for (int i = 0; i < 8; i++) {
+ av1_idtx32_sse4_1(&input[i * 32], &output[i * 32], cos_bit, 1);
+ }
+}
+
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT32: return fdct32_sse4_1;
+ case TXFM_TYPE_DCT64: return fdct64_new_sse4_1;
+ case TXFM_TYPE_IDENTITY32: return idtx32x32_sse4_1;
+ default: assert(0);
+ }
+ return NULL;
+}
+
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+ const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+  // TODO(sarahparker): This does not currently support rectangular transforms
+  // and will break unless txfm_size is split into separate row and column
+  // sizes. Rectangular transforms use the C code only, so this is fine for
+  // now; it will be corrected once SSE implementations of the rectangular
+  // transforms exist.
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->stage_range_row;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+ int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+ txfm_size);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+ txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+ av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]);
+}
+
+static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
+ int32_t *output, const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+
+ const int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+ int col_num = txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+ txfm_size);
+  // Column-wise transform.
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+
+  // Row-wise transform: only half of the columns are processed, because just
+  // the top-left 32x32 block of a 64x64 transform's coefficients is retained.
+ for (int col = 0; col < (col_num >> 1); col++) {
+ av1_fdct64_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, col_num,
+ (col_num >> 1));
+ }
+
+ txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+ av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]);
+}
+
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+ (void)bd;
+ fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+ (void)bd;
+ fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
+static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m128i buf0[64], buf1[512];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
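+  // Column pass over all 64 columns; only the first 32 transposed rows
+  // (AOMMIN(4, height_div8) groups of 8) feed the row pass, since the
+  // high-frequency half of a 64-point transform is not coded.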
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+ store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
+
+static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X32;
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[32];
+ __m128i bufB[32];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct32_sse4_1(bufA, bufA, cos_bit_row, 1);
+ av1_fdct32_sse4_1(bufB, bufB, cos_bit_row, 1);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
+
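+// Per-TX_SIZE dispatch table: only the 64-point sizes have dedicated SSE4.1
+// kernels; all other sizes reuse the SSE2 implementations.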
+static const FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+ if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ } else {
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
new file mode 100644
index 0000000000..aaad76e5ae
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#include <immintrin.h>
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
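+// Per 32-bit lane, with _r assumed to hold the usual 1 << (cos_bit - 1)
+// rounding constant, this is equivalent to the plain C sketch:
+//   out0 = (in0 * w0 + in1 * w1 + (1 << (cos_bit - 1))) >> cos_bit;
+//   out1 = (in0 * w1 - in1 * w0 + (1 << (cos_bit - 1))) >> cos_bit;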
+static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r, const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i ww0 = _mm256_set1_epi32(w0);
+ const __m256i ww1 = _mm256_set1_epi32(w1);
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
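+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1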
+static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r, const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i ww0 = _mm256_set1_epi32(w0);
+ const __m256i ww1 = _mm256_set1_epi32(w1);
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r,
+ const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r,
+ const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
new file mode 100644
index 0000000000..a4def754b0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -0,0 +1,2673 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible).
+
+static void fdct4x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ __m128i u[4], v[4];
+
+ u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u[1] = _mm_unpacklo_epi16(input[3], input[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
+  u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // DCT output row 0
+  u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // DCT output row 2
+  u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // DCT output row 1
+  u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // DCT output row 3
+
+ v[0] = _mm_add_epi32(u[0], __rounding);
+ v[1] = _mm_add_epi32(u[1], __rounding);
+ v[2] = _mm_add_epi32(u[2], __rounding);
+ v[3] = _mm_add_epi32(u[3], __rounding);
+ u[0] = _mm_srai_epi32(v[0], cos_bit);
+ u[1] = _mm_srai_epi32(v[1], cos_bit);
+ u[2] = _mm_srai_epi32(v[2], cos_bit);
+ u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u[0], u[1]);
+ output[1] = _mm_packs_epi32(u[2], u[3]);
+ output[2] = _mm_srli_si128(output[0], 8);
+ output[3] = _mm_srli_si128(output[1], 8);
+}
+
+static void fdct8x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x1[4];
+ x1[0] = _mm_adds_epi16(input[0], input[3]);
+ x1[3] = _mm_subs_epi16(input[0], input[3]);
+ x1[1] = _mm_adds_epi16(input[1], input[2]);
+ x1[2] = _mm_subs_epi16(input[1], input[2]);
+
+ // stage 2
+ __m128i x2[4];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]);
+
+ // stage 3
+ output[0] = x2[0];
+ output[1] = x2[2];
+ output[2] = x2[1];
+ output[3] = x2[3];
+}
+
+static void fdct4x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5],
+ &x1[6], &x2[5], &x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3
+ __m128i x3[8];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0],
+ &x2[1], &x3[0], &x3[1]);
+ btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2],
+ &x2[3], &x3[2], &x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4],
+ &x3[7], &x4[4], &x4[7]);
+ btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5],
+ &x3[6], &x4[5], &x4[6]);
+
+ // stage 5
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
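+// The fdct8xN kernels below apply a 1D N-point forward DCT to eight columns
+// at a time: each __m128i holds one row of eight int16 samples.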
+static void fdct8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+ // stage 1
+ __m128i x1[16];
+ x1[0] = _mm_adds_epi16(input[0], input[15]);
+ x1[15] = _mm_subs_epi16(input[0], input[15]);
+ x1[1] = _mm_adds_epi16(input[1], input[14]);
+ x1[14] = _mm_subs_epi16(input[1], input[14]);
+ x1[2] = _mm_adds_epi16(input[2], input[13]);
+ x1[13] = _mm_subs_epi16(input[2], input[13]);
+ x1[3] = _mm_adds_epi16(input[3], input[12]);
+ x1[12] = _mm_subs_epi16(input[3], input[12]);
+ x1[4] = _mm_adds_epi16(input[4], input[11]);
+ x1[11] = _mm_subs_epi16(input[4], input[11]);
+ x1[5] = _mm_adds_epi16(input[5], input[10]);
+ x1[10] = _mm_subs_epi16(input[5], input[10]);
+ x1[6] = _mm_adds_epi16(input[6], input[9]);
+ x1[9] = _mm_subs_epi16(input[6], input[9]);
+ x1[7] = _mm_adds_epi16(input[7], input[8]);
+ x1[8] = _mm_subs_epi16(input[7], input[8]);
+
+ // stage 2
+ __m128i x2[16];
+ x2[0] = _mm_adds_epi16(x1[0], x1[7]);
+ x2[7] = _mm_subs_epi16(x1[0], x1[7]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[6]);
+ x2[6] = _mm_subs_epi16(x1[1], x1[6]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[5]);
+ x2[5] = _mm_subs_epi16(x1[2], x1[5]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[4]);
+ x2[4] = _mm_subs_epi16(x1[3], x1[4]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]);
+ x2[14] = x1[14];
+ x2[15] = x1[15];
+
+ // stage 3
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[0], x2[3]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[1], x2[2]);
+ x3[4] = x2[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]);
+ x3[7] = x2[7];
+ x3[8] = _mm_adds_epi16(x2[8], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[8], x2[11]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[9], x2[10]);
+ x3[12] = _mm_subs_epi16(x2[15], x2[12]);
+ x3[15] = _mm_adds_epi16(x2[15], x2[12]);
+ x3[13] = _mm_subs_epi16(x2[14], x2[13]);
+ x3[14] = _mm_adds_epi16(x2[14], x2[13]);
+
+ // stage 4
+ __m128i x4[16];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]);
+ x4[4] = _mm_adds_epi16(x3[4], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[4], x3[5]);
+ x4[6] = _mm_subs_epi16(x3[7], x3[6]);
+ x4[7] = _mm_adds_epi16(x3[7], x3[6]);
+ x4[8] = x3[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]);
+ x4[11] = x3[11];
+ x4[12] = x3[12];
+ x4[15] = x3[15];
+
+ // stage 5
+ __m128i x5[16];
+ x5[0] = x4[0];
+ x5[1] = x4[1];
+ x5[2] = x4[2];
+ x5[3] = x4[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[9]);
+ x5[9] = _mm_subs_epi16(x4[8], x4[9]);
+ x5[10] = _mm_subs_epi16(x4[11], x4[10]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[10]);
+ x5[12] = _mm_adds_epi16(x4[12], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[12], x4[13]);
+ x5[14] = _mm_subs_epi16(x4[15], x4[14]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[14]);
+
+ // stage 6
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]);
+
+ // stage 7
+ output[0] = x6[0];
+ output[1] = x6[8];
+ output[2] = x6[4];
+ output[3] = x6[12];
+ output[4] = x6[2];
+ output[5] = x6[10];
+ output[6] = x6[6];
+ output[7] = x6[14];
+ output[8] = x6[1];
+ output[9] = x6[9];
+ output[10] = x6[5];
+ output[11] = x6[13];
+ output[12] = x6[3];
+ output[13] = x6[11];
+ output[14] = x6[7];
+ output[15] = x6[15];
+}
+
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+
+ // stage 1
+ __m128i x1[32];
+ x1[0] = _mm_adds_epi16(input[0], input[31]);
+ x1[31] = _mm_subs_epi16(input[0], input[31]);
+ x1[1] = _mm_adds_epi16(input[1], input[30]);
+ x1[30] = _mm_subs_epi16(input[1], input[30]);
+ x1[2] = _mm_adds_epi16(input[2], input[29]);
+ x1[29] = _mm_subs_epi16(input[2], input[29]);
+ x1[3] = _mm_adds_epi16(input[3], input[28]);
+ x1[28] = _mm_subs_epi16(input[3], input[28]);
+ x1[4] = _mm_adds_epi16(input[4], input[27]);
+ x1[27] = _mm_subs_epi16(input[4], input[27]);
+ x1[5] = _mm_adds_epi16(input[5], input[26]);
+ x1[26] = _mm_subs_epi16(input[5], input[26]);
+ x1[6] = _mm_adds_epi16(input[6], input[25]);
+ x1[25] = _mm_subs_epi16(input[6], input[25]);
+ x1[7] = _mm_adds_epi16(input[7], input[24]);
+ x1[24] = _mm_subs_epi16(input[7], input[24]);
+ x1[8] = _mm_adds_epi16(input[8], input[23]);
+ x1[23] = _mm_subs_epi16(input[8], input[23]);
+ x1[9] = _mm_adds_epi16(input[9], input[22]);
+ x1[22] = _mm_subs_epi16(input[9], input[22]);
+ x1[10] = _mm_adds_epi16(input[10], input[21]);
+ x1[21] = _mm_subs_epi16(input[10], input[21]);
+ x1[11] = _mm_adds_epi16(input[11], input[20]);
+ x1[20] = _mm_subs_epi16(input[11], input[20]);
+ x1[12] = _mm_adds_epi16(input[12], input[19]);
+ x1[19] = _mm_subs_epi16(input[12], input[19]);
+ x1[13] = _mm_adds_epi16(input[13], input[18]);
+ x1[18] = _mm_subs_epi16(input[13], input[18]);
+ x1[14] = _mm_adds_epi16(input[14], input[17]);
+ x1[17] = _mm_subs_epi16(input[14], input[17]);
+ x1[15] = _mm_adds_epi16(input[15], input[16]);
+ x1[16] = _mm_subs_epi16(input[15], input[16]);
+
+ // stage 2
+ __m128i x2[32];
+ x2[0] = _mm_adds_epi16(x1[0], x1[15]);
+ x2[15] = _mm_subs_epi16(x1[0], x1[15]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[14]);
+ x2[14] = _mm_subs_epi16(x1[1], x1[14]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[13]);
+ x2[13] = _mm_subs_epi16(x1[2], x1[13]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[12]);
+ x2[12] = _mm_subs_epi16(x1[3], x1[12]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[11]);
+ x2[11] = _mm_subs_epi16(x1[4], x1[11]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[10]);
+ x2[10] = _mm_subs_epi16(x1[5], x1[10]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[9]);
+ x2[9] = _mm_subs_epi16(x1[6], x1[9]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[8]);
+ x2[8] = _mm_subs_epi16(x1[7], x1[8]);
+ x2[16] = x1[16];
+ x2[17] = x1[17];
+ x2[18] = x1[18];
+ x2[19] = x1[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]);
+ x2[28] = x1[28];
+ x2[29] = x1[29];
+ x2[30] = x1[30];
+ x2[31] = x1[31];
+
+ // stage 3
+ __m128i x3[32];
+ x3[0] = _mm_adds_epi16(x2[0], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[0], x2[7]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[1], x2[6]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[2], x2[5]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[4]);
+ x3[4] = _mm_subs_epi16(x2[3], x2[4]);
+ x3[8] = x2[8];
+ x3[9] = x2[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]);
+ x3[14] = x2[14];
+ x3[15] = x2[15];
+ x3[16] = _mm_adds_epi16(x2[16], x2[23]);
+ x3[23] = _mm_subs_epi16(x2[16], x2[23]);
+ x3[17] = _mm_adds_epi16(x2[17], x2[22]);
+ x3[22] = _mm_subs_epi16(x2[17], x2[22]);
+ x3[18] = _mm_adds_epi16(x2[18], x2[21]);
+ x3[21] = _mm_subs_epi16(x2[18], x2[21]);
+ x3[19] = _mm_adds_epi16(x2[19], x2[20]);
+ x3[20] = _mm_subs_epi16(x2[19], x2[20]);
+ x3[24] = _mm_subs_epi16(x2[31], x2[24]);
+ x3[31] = _mm_adds_epi16(x2[31], x2[24]);
+ x3[25] = _mm_subs_epi16(x2[30], x2[25]);
+ x3[30] = _mm_adds_epi16(x2[30], x2[25]);
+ x3[26] = _mm_subs_epi16(x2[29], x2[26]);
+ x3[29] = _mm_adds_epi16(x2[29], x2[26]);
+ x3[27] = _mm_subs_epi16(x2[28], x2[27]);
+ x3[28] = _mm_adds_epi16(x2[28], x2[27]);
+
+ // stage 4
+ __m128i x4[32];
+ x4[0] = _mm_adds_epi16(x3[0], x3[3]);
+ x4[3] = _mm_subs_epi16(x3[0], x3[3]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[2]);
+ x4[2] = _mm_subs_epi16(x3[1], x3[2]);
+ x4[4] = x3[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]);
+ x4[7] = x3[7];
+ x4[8] = _mm_adds_epi16(x3[8], x3[11]);
+ x4[11] = _mm_subs_epi16(x3[8], x3[11]);
+ x4[9] = _mm_adds_epi16(x3[9], x3[10]);
+ x4[10] = _mm_subs_epi16(x3[9], x3[10]);
+ x4[12] = _mm_subs_epi16(x3[15], x3[12]);
+ x4[15] = _mm_adds_epi16(x3[15], x3[12]);
+ x4[13] = _mm_subs_epi16(x3[14], x3[13]);
+ x4[14] = _mm_adds_epi16(x3[14], x3[13]);
+ x4[16] = x3[16];
+ x4[17] = x3[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]);
+ x4[22] = x3[22];
+ x4[23] = x3[23];
+ x4[24] = x3[24];
+ x4[25] = x3[25];
+ x4[30] = x3[30];
+ x4[31] = x3[31];
+
+ // stage 5
+ __m128i x5[32];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]);
+ x5[4] = _mm_adds_epi16(x4[4], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[4], x4[5]);
+ x5[6] = _mm_subs_epi16(x4[7], x4[6]);
+ x5[7] = _mm_adds_epi16(x4[7], x4[6]);
+ x5[8] = x4[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]);
+ x5[11] = x4[11];
+ x5[12] = x4[12];
+ x5[15] = x4[15];
+ x5[16] = _mm_adds_epi16(x4[16], x4[19]);
+ x5[19] = _mm_subs_epi16(x4[16], x4[19]);
+ x5[17] = _mm_adds_epi16(x4[17], x4[18]);
+ x5[18] = _mm_subs_epi16(x4[17], x4[18]);
+ x5[20] = _mm_subs_epi16(x4[23], x4[20]);
+ x5[23] = _mm_adds_epi16(x4[23], x4[20]);
+ x5[21] = _mm_subs_epi16(x4[22], x4[21]);
+ x5[22] = _mm_adds_epi16(x4[22], x4[21]);
+ x5[24] = _mm_adds_epi16(x4[24], x4[27]);
+ x5[27] = _mm_subs_epi16(x4[24], x4[27]);
+ x5[25] = _mm_adds_epi16(x4[25], x4[26]);
+ x5[26] = _mm_subs_epi16(x4[25], x4[26]);
+ x5[28] = _mm_subs_epi16(x4[31], x4[28]);
+ x5[31] = _mm_adds_epi16(x4[31], x4[28]);
+ x5[29] = _mm_subs_epi16(x4[30], x4[29]);
+ x5[30] = _mm_adds_epi16(x4[30], x4[29]);
+
+ // stage 6
+ __m128i x6[32];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]);
+ x6[8] = _mm_adds_epi16(x5[8], x5[9]);
+ x6[9] = _mm_subs_epi16(x5[8], x5[9]);
+ x6[10] = _mm_subs_epi16(x5[11], x5[10]);
+ x6[11] = _mm_adds_epi16(x5[11], x5[10]);
+ x6[12] = _mm_adds_epi16(x5[12], x5[13]);
+ x6[13] = _mm_subs_epi16(x5[12], x5[13]);
+ x6[14] = _mm_subs_epi16(x5[15], x5[14]);
+ x6[15] = _mm_adds_epi16(x5[15], x5[14]);
+ x6[16] = x5[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]);
+ x6[19] = x5[19];
+ x6[20] = x5[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]);
+ x6[23] = x5[23];
+ x6[24] = x5[24];
+ x6[27] = x5[27];
+ x6[28] = x5[28];
+ x6[31] = x5[31];
+
+ // stage 7
+ __m128i x7[32];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ x7[4] = x6[4];
+ x7[5] = x6[5];
+ x7[6] = x6[6];
+ x7[7] = x6[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]);
+ x7[16] = _mm_adds_epi16(x6[16], x6[17]);
+ x7[17] = _mm_subs_epi16(x6[16], x6[17]);
+ x7[18] = _mm_subs_epi16(x6[19], x6[18]);
+ x7[19] = _mm_adds_epi16(x6[19], x6[18]);
+ x7[20] = _mm_adds_epi16(x6[20], x6[21]);
+ x7[21] = _mm_subs_epi16(x6[20], x6[21]);
+ x7[22] = _mm_subs_epi16(x6[23], x6[22]);
+ x7[23] = _mm_adds_epi16(x6[23], x6[22]);
+ x7[24] = _mm_adds_epi16(x6[24], x6[25]);
+ x7[25] = _mm_subs_epi16(x6[24], x6[25]);
+ x7[26] = _mm_subs_epi16(x6[27], x6[26]);
+ x7[27] = _mm_adds_epi16(x6[27], x6[26]);
+ x7[28] = _mm_adds_epi16(x6[28], x6[29]);
+ x7[29] = _mm_subs_epi16(x6[28], x6[29]);
+ x7[30] = _mm_subs_epi16(x6[31], x6[30]);
+ x7[31] = _mm_adds_epi16(x6[31], x6[30]);
+
+ // stage 8
+ __m128i x8[32];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ x8[8] = x7[8];
+ x8[9] = x7[9];
+ x8[10] = x7[10];
+ x8[11] = x7[11];
+ x8[12] = x7[12];
+ x8[13] = x7[13];
+ x8[14] = x7[14];
+ x8[15] = x7[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]);
+
+ // stage 9
+ output[0] = x8[0];
+ output[1] = x8[16];
+ output[2] = x8[8];
+ output[3] = x8[24];
+ output[4] = x8[4];
+ output[5] = x8[20];
+ output[6] = x8[12];
+ output[7] = x8[28];
+ output[8] = x8[2];
+ output[9] = x8[18];
+ output[10] = x8[10];
+ output[11] = x8[26];
+ output[12] = x8[6];
+ output[13] = x8[22];
+ output[14] = x8[14];
+ output[15] = x8[30];
+ output[16] = x8[1];
+ output[17] = x8[17];
+ output[18] = x8[9];
+ output[19] = x8[25];
+ output[20] = x8[5];
+ output[21] = x8[21];
+ output[22] = x8[13];
+ output[23] = x8[29];
+ output[24] = x8[3];
+ output[25] = x8[19];
+ output[26] = x8[11];
+ output[27] = x8[27];
+ output[28] = x8[7];
+ output[29] = x8[23];
+ output[30] = x8[15];
+ output[31] = x8[31];
+}
+
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+ __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+ __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+ __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]);
+ __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]);
+ __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]);
+ __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]);
+ __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]);
+ __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]);
+ __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]);
+ __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]);
+ __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]);
+ __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]);
+ __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]);
+ __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]);
+ __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]);
+ __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]);
+ __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]);
+ __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]);
+ __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]);
+ __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]);
+ __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]);
+ __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]);
+ __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]);
+ __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]);
+ __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]);
+ __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]);
+ __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]);
+ __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]);
+ __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]);
+ __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]);
+ __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]);
+ __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]);
+ __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]);
+ __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
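+  // Fold the input about its midpoint: the sums x1[i] = in[i] + in[63-i]
+  // feed the even-indexed outputs, the differences the odd-indexed ones.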
+ __m128i x1[64];
+ x1[0] = _mm_adds_epi16(input[0], input[63]);
+ x1[63] = _mm_subs_epi16(input[0], input[63]);
+ x1[1] = _mm_adds_epi16(input[1], input[62]);
+ x1[62] = _mm_subs_epi16(input[1], input[62]);
+ x1[2] = _mm_adds_epi16(input[2], input[61]);
+ x1[61] = _mm_subs_epi16(input[2], input[61]);
+ x1[3] = _mm_adds_epi16(input[3], input[60]);
+ x1[60] = _mm_subs_epi16(input[3], input[60]);
+ x1[4] = _mm_adds_epi16(input[4], input[59]);
+ x1[59] = _mm_subs_epi16(input[4], input[59]);
+ x1[5] = _mm_adds_epi16(input[5], input[58]);
+ x1[58] = _mm_subs_epi16(input[5], input[58]);
+ x1[6] = _mm_adds_epi16(input[6], input[57]);
+ x1[57] = _mm_subs_epi16(input[6], input[57]);
+ x1[7] = _mm_adds_epi16(input[7], input[56]);
+ x1[56] = _mm_subs_epi16(input[7], input[56]);
+ x1[8] = _mm_adds_epi16(input[8], input[55]);
+ x1[55] = _mm_subs_epi16(input[8], input[55]);
+ x1[9] = _mm_adds_epi16(input[9], input[54]);
+ x1[54] = _mm_subs_epi16(input[9], input[54]);
+ x1[10] = _mm_adds_epi16(input[10], input[53]);
+ x1[53] = _mm_subs_epi16(input[10], input[53]);
+ x1[11] = _mm_adds_epi16(input[11], input[52]);
+ x1[52] = _mm_subs_epi16(input[11], input[52]);
+ x1[12] = _mm_adds_epi16(input[12], input[51]);
+ x1[51] = _mm_subs_epi16(input[12], input[51]);
+ x1[13] = _mm_adds_epi16(input[13], input[50]);
+ x1[50] = _mm_subs_epi16(input[13], input[50]);
+ x1[14] = _mm_adds_epi16(input[14], input[49]);
+ x1[49] = _mm_subs_epi16(input[14], input[49]);
+ x1[15] = _mm_adds_epi16(input[15], input[48]);
+ x1[48] = _mm_subs_epi16(input[15], input[48]);
+ x1[16] = _mm_adds_epi16(input[16], input[47]);
+ x1[47] = _mm_subs_epi16(input[16], input[47]);
+ x1[17] = _mm_adds_epi16(input[17], input[46]);
+ x1[46] = _mm_subs_epi16(input[17], input[46]);
+ x1[18] = _mm_adds_epi16(input[18], input[45]);
+ x1[45] = _mm_subs_epi16(input[18], input[45]);
+ x1[19] = _mm_adds_epi16(input[19], input[44]);
+ x1[44] = _mm_subs_epi16(input[19], input[44]);
+ x1[20] = _mm_adds_epi16(input[20], input[43]);
+ x1[43] = _mm_subs_epi16(input[20], input[43]);
+ x1[21] = _mm_adds_epi16(input[21], input[42]);
+ x1[42] = _mm_subs_epi16(input[21], input[42]);
+ x1[22] = _mm_adds_epi16(input[22], input[41]);
+ x1[41] = _mm_subs_epi16(input[22], input[41]);
+ x1[23] = _mm_adds_epi16(input[23], input[40]);
+ x1[40] = _mm_subs_epi16(input[23], input[40]);
+ x1[24] = _mm_adds_epi16(input[24], input[39]);
+ x1[39] = _mm_subs_epi16(input[24], input[39]);
+ x1[25] = _mm_adds_epi16(input[25], input[38]);
+ x1[38] = _mm_subs_epi16(input[25], input[38]);
+ x1[26] = _mm_adds_epi16(input[26], input[37]);
+ x1[37] = _mm_subs_epi16(input[26], input[37]);
+ x1[27] = _mm_adds_epi16(input[27], input[36]);
+ x1[36] = _mm_subs_epi16(input[27], input[36]);
+ x1[28] = _mm_adds_epi16(input[28], input[35]);
+ x1[35] = _mm_subs_epi16(input[28], input[35]);
+ x1[29] = _mm_adds_epi16(input[29], input[34]);
+ x1[34] = _mm_subs_epi16(input[29], input[34]);
+ x1[30] = _mm_adds_epi16(input[30], input[33]);
+ x1[33] = _mm_subs_epi16(input[30], input[33]);
+ x1[31] = _mm_adds_epi16(input[31], input[32]);
+ x1[32] = _mm_subs_epi16(input[31], input[32]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_adds_epi16(x1[0], x1[31]);
+ x2[31] = _mm_subs_epi16(x1[0], x1[31]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[30]);
+ x2[30] = _mm_subs_epi16(x1[1], x1[30]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[29]);
+ x2[29] = _mm_subs_epi16(x1[2], x1[29]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[28]);
+ x2[28] = _mm_subs_epi16(x1[3], x1[28]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[27]);
+ x2[27] = _mm_subs_epi16(x1[4], x1[27]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[26]);
+ x2[26] = _mm_subs_epi16(x1[5], x1[26]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[25]);
+ x2[25] = _mm_subs_epi16(x1[6], x1[25]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[24]);
+ x2[24] = _mm_subs_epi16(x1[7], x1[24]);
+ x2[8] = _mm_adds_epi16(x1[8], x1[23]);
+ x2[23] = _mm_subs_epi16(x1[8], x1[23]);
+ x2[9] = _mm_adds_epi16(x1[9], x1[22]);
+ x2[22] = _mm_subs_epi16(x1[9], x1[22]);
+ x2[10] = _mm_adds_epi16(x1[10], x1[21]);
+ x2[21] = _mm_subs_epi16(x1[10], x1[21]);
+ x2[11] = _mm_adds_epi16(x1[11], x1[20]);
+ x2[20] = _mm_subs_epi16(x1[11], x1[20]);
+ x2[12] = _mm_adds_epi16(x1[12], x1[19]);
+ x2[19] = _mm_subs_epi16(x1[12], x1[19]);
+ x2[13] = _mm_adds_epi16(x1[13], x1[18]);
+ x2[18] = _mm_subs_epi16(x1[13], x1[18]);
+ x2[14] = _mm_adds_epi16(x1[14], x1[17]);
+ x2[17] = _mm_subs_epi16(x1[14], x1[17]);
+ x2[15] = _mm_adds_epi16(x1[15], x1[16]);
+ x2[16] = _mm_subs_epi16(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
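+  // btf_16_sse2(w0, w1, in0, in1, out0, out1) is the 16-bit butterfly
+  // rotation: with w0 = pair_set_epi16(a, b), it computes
+  // out0 = (in0 * a + in1 * b + __rounding) >> cos_bit per lane via
+  // _mm_madd_epi16, and out1 likewise with w1.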
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_adds_epi16(x2[0], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[0], x2[15]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[1], x2[14]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[13]);
+ x3[13] = _mm_subs_epi16(x2[2], x2[13]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[12]);
+ x3[12] = _mm_subs_epi16(x2[3], x2[12]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[4], x2[11]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[5], x2[10]);
+ x3[6] = _mm_adds_epi16(x2[6], x2[9]);
+ x3[9] = _mm_subs_epi16(x2[6], x2[9]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[8]);
+ x3[8] = _mm_subs_epi16(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_adds_epi16(x2[32], x2[47]);
+ x3[47] = _mm_subs_epi16(x2[32], x2[47]);
+ x3[33] = _mm_adds_epi16(x2[33], x2[46]);
+ x3[46] = _mm_subs_epi16(x2[33], x2[46]);
+ x3[34] = _mm_adds_epi16(x2[34], x2[45]);
+ x3[45] = _mm_subs_epi16(x2[34], x2[45]);
+ x3[35] = _mm_adds_epi16(x2[35], x2[44]);
+ x3[44] = _mm_subs_epi16(x2[35], x2[44]);
+ x3[36] = _mm_adds_epi16(x2[36], x2[43]);
+ x3[43] = _mm_subs_epi16(x2[36], x2[43]);
+ x3[37] = _mm_adds_epi16(x2[37], x2[42]);
+ x3[42] = _mm_subs_epi16(x2[37], x2[42]);
+ x3[38] = _mm_adds_epi16(x2[38], x2[41]);
+ x3[41] = _mm_subs_epi16(x2[38], x2[41]);
+ x3[39] = _mm_adds_epi16(x2[39], x2[40]);
+ x3[40] = _mm_subs_epi16(x2[39], x2[40]);
+ x3[48] = _mm_subs_epi16(x2[63], x2[48]);
+ x3[63] = _mm_adds_epi16(x2[63], x2[48]);
+ x3[49] = _mm_subs_epi16(x2[62], x2[49]);
+ x3[62] = _mm_adds_epi16(x2[62], x2[49]);
+ x3[50] = _mm_subs_epi16(x2[61], x2[50]);
+ x3[61] = _mm_adds_epi16(x2[61], x2[50]);
+ x3[51] = _mm_subs_epi16(x2[60], x2[51]);
+ x3[60] = _mm_adds_epi16(x2[60], x2[51]);
+ x3[52] = _mm_subs_epi16(x2[59], x2[52]);
+ x3[59] = _mm_adds_epi16(x2[59], x2[52]);
+ x3[53] = _mm_subs_epi16(x2[58], x2[53]);
+ x3[58] = _mm_adds_epi16(x2[58], x2[53]);
+ x3[54] = _mm_subs_epi16(x2[57], x2[54]);
+ x3[57] = _mm_adds_epi16(x2[57], x2[54]);
+ x3[55] = _mm_subs_epi16(x2[56], x2[55]);
+ x3[56] = _mm_adds_epi16(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_adds_epi16(x3[0], x3[7]);
+ x4[7] = _mm_subs_epi16(x3[0], x3[7]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[6]);
+ x4[6] = _mm_subs_epi16(x3[1], x3[6]);
+ x4[2] = _mm_adds_epi16(x3[2], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[2], x3[5]);
+ x4[3] = _mm_adds_epi16(x3[3], x3[4]);
+ x4[4] = _mm_subs_epi16(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_adds_epi16(x3[16], x3[23]);
+ x4[23] = _mm_subs_epi16(x3[16], x3[23]);
+ x4[17] = _mm_adds_epi16(x3[17], x3[22]);
+ x4[22] = _mm_subs_epi16(x3[17], x3[22]);
+ x4[18] = _mm_adds_epi16(x3[18], x3[21]);
+ x4[21] = _mm_subs_epi16(x3[18], x3[21]);
+ x4[19] = _mm_adds_epi16(x3[19], x3[20]);
+ x4[20] = _mm_subs_epi16(x3[19], x3[20]);
+ x4[24] = _mm_subs_epi16(x3[31], x3[24]);
+ x4[31] = _mm_adds_epi16(x3[31], x3[24]);
+ x4[25] = _mm_subs_epi16(x3[30], x3[25]);
+ x4[30] = _mm_adds_epi16(x3[30], x3[25]);
+ x4[26] = _mm_subs_epi16(x3[29], x3[26]);
+ x4[29] = _mm_adds_epi16(x3[29], x3[26]);
+ x4[27] = _mm_subs_epi16(x3[28], x3[27]);
+ x4[28] = _mm_adds_epi16(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_adds_epi16(x4[0], x4[3]);
+ x5[3] = _mm_subs_epi16(x4[0], x4[3]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[2]);
+ x5[2] = _mm_subs_epi16(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]);
+ x5[7] = x4[7];
+ x5[8] = _mm_adds_epi16(x4[8], x4[11]);
+ x5[11] = _mm_subs_epi16(x4[8], x4[11]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[10]);
+ x5[10] = _mm_subs_epi16(x4[9], x4[10]);
+ x5[12] = _mm_subs_epi16(x4[15], x4[12]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[12]);
+ x5[13] = _mm_subs_epi16(x4[14], x4[13]);
+ x5[14] = _mm_adds_epi16(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_adds_epi16(x4[32], x4[39]);
+ x5[39] = _mm_subs_epi16(x4[32], x4[39]);
+ x5[33] = _mm_adds_epi16(x4[33], x4[38]);
+ x5[38] = _mm_subs_epi16(x4[33], x4[38]);
+ x5[34] = _mm_adds_epi16(x4[34], x4[37]);
+ x5[37] = _mm_subs_epi16(x4[34], x4[37]);
+ x5[35] = _mm_adds_epi16(x4[35], x4[36]);
+ x5[36] = _mm_subs_epi16(x4[35], x4[36]);
+ x5[40] = _mm_subs_epi16(x4[47], x4[40]);
+ x5[47] = _mm_adds_epi16(x4[47], x4[40]);
+ x5[41] = _mm_subs_epi16(x4[46], x4[41]);
+ x5[46] = _mm_adds_epi16(x4[46], x4[41]);
+ x5[42] = _mm_subs_epi16(x4[45], x4[42]);
+ x5[45] = _mm_adds_epi16(x4[45], x4[42]);
+ x5[43] = _mm_subs_epi16(x4[44], x4[43]);
+ x5[44] = _mm_adds_epi16(x4[44], x4[43]);
+ x5[48] = _mm_adds_epi16(x4[48], x4[55]);
+ x5[55] = _mm_subs_epi16(x4[48], x4[55]);
+ x5[49] = _mm_adds_epi16(x4[49], x4[54]);
+ x5[54] = _mm_subs_epi16(x4[49], x4[54]);
+ x5[50] = _mm_adds_epi16(x4[50], x4[53]);
+ x5[53] = _mm_subs_epi16(x4[50], x4[53]);
+ x5[51] = _mm_adds_epi16(x4[51], x4[52]);
+ x5[52] = _mm_subs_epi16(x4[51], x4[52]);
+ x5[56] = _mm_subs_epi16(x4[63], x4[56]);
+ x5[63] = _mm_adds_epi16(x4[63], x4[56]);
+ x5[57] = _mm_subs_epi16(x4[62], x4[57]);
+ x5[62] = _mm_adds_epi16(x4[62], x4[57]);
+ x5[58] = _mm_subs_epi16(x4[61], x4[58]);
+ x5[61] = _mm_adds_epi16(x4[61], x4[58]);
+ x5[59] = _mm_subs_epi16(x4[60], x4[59]);
+ x5[60] = _mm_adds_epi16(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]);
+ x6[4] = _mm_adds_epi16(x5[4], x5[5]);
+ x6[5] = _mm_subs_epi16(x5[4], x5[5]);
+ x6[6] = _mm_subs_epi16(x5[7], x5[6]);
+ x6[7] = _mm_adds_epi16(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_adds_epi16(x5[16], x5[19]);
+ x6[19] = _mm_subs_epi16(x5[16], x5[19]);
+ x6[17] = _mm_adds_epi16(x5[17], x5[18]);
+ x6[18] = _mm_subs_epi16(x5[17], x5[18]);
+ x6[20] = _mm_subs_epi16(x5[23], x5[20]);
+ x6[23] = _mm_adds_epi16(x5[23], x5[20]);
+ x6[21] = _mm_subs_epi16(x5[22], x5[21]);
+ x6[22] = _mm_adds_epi16(x5[22], x5[21]);
+ x6[24] = _mm_adds_epi16(x5[24], x5[27]);
+ x6[27] = _mm_subs_epi16(x5[24], x5[27]);
+ x6[25] = _mm_adds_epi16(x5[25], x5[26]);
+ x6[26] = _mm_subs_epi16(x5[25], x5[26]);
+ x6[28] = _mm_subs_epi16(x5[31], x5[28]);
+ x6[31] = _mm_adds_epi16(x5[31], x5[28]);
+ x6[29] = _mm_subs_epi16(x5[30], x5[29]);
+ x6[30] = _mm_adds_epi16(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]);
+ x7[8] = _mm_adds_epi16(x6[8], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[8], x6[9]);
+ x7[10] = _mm_subs_epi16(x6[11], x6[10]);
+ x7[11] = _mm_adds_epi16(x6[11], x6[10]);
+ x7[12] = _mm_adds_epi16(x6[12], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[12], x6[13]);
+ x7[14] = _mm_subs_epi16(x6[15], x6[14]);
+ x7[15] = _mm_adds_epi16(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_adds_epi16(x6[32], x6[35]);
+ x7[35] = _mm_subs_epi16(x6[32], x6[35]);
+ x7[33] = _mm_adds_epi16(x6[33], x6[34]);
+ x7[34] = _mm_subs_epi16(x6[33], x6[34]);
+ x7[36] = _mm_subs_epi16(x6[39], x6[36]);
+ x7[39] = _mm_adds_epi16(x6[39], x6[36]);
+ x7[37] = _mm_subs_epi16(x6[38], x6[37]);
+ x7[38] = _mm_adds_epi16(x6[38], x6[37]);
+ x7[40] = _mm_adds_epi16(x6[40], x6[43]);
+ x7[43] = _mm_subs_epi16(x6[40], x6[43]);
+ x7[41] = _mm_adds_epi16(x6[41], x6[42]);
+ x7[42] = _mm_subs_epi16(x6[41], x6[42]);
+ x7[44] = _mm_subs_epi16(x6[47], x6[44]);
+ x7[47] = _mm_adds_epi16(x6[47], x6[44]);
+ x7[45] = _mm_subs_epi16(x6[46], x6[45]);
+ x7[46] = _mm_adds_epi16(x6[46], x6[45]);
+ x7[48] = _mm_adds_epi16(x6[48], x6[51]);
+ x7[51] = _mm_subs_epi16(x6[48], x6[51]);
+ x7[49] = _mm_adds_epi16(x6[49], x6[50]);
+ x7[50] = _mm_subs_epi16(x6[49], x6[50]);
+ x7[52] = _mm_subs_epi16(x6[55], x6[52]);
+ x7[55] = _mm_adds_epi16(x6[55], x6[52]);
+ x7[53] = _mm_subs_epi16(x6[54], x6[53]);
+ x7[54] = _mm_adds_epi16(x6[54], x6[53]);
+ x7[56] = _mm_adds_epi16(x6[56], x6[59]);
+ x7[59] = _mm_subs_epi16(x6[56], x6[59]);
+ x7[57] = _mm_adds_epi16(x6[57], x6[58]);
+ x7[58] = _mm_subs_epi16(x6[57], x6[58]);
+ x7[60] = _mm_subs_epi16(x6[63], x6[60]);
+ x7[63] = _mm_adds_epi16(x6[63], x6[60]);
+ x7[61] = _mm_subs_epi16(x6[62], x6[61]);
+ x7[62] = _mm_adds_epi16(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]);
+ x8[16] = _mm_adds_epi16(x7[16], x7[17]);
+ x8[17] = _mm_subs_epi16(x7[16], x7[17]);
+ x8[18] = _mm_subs_epi16(x7[19], x7[18]);
+ x8[19] = _mm_adds_epi16(x7[19], x7[18]);
+ x8[20] = _mm_adds_epi16(x7[20], x7[21]);
+ x8[21] = _mm_subs_epi16(x7[20], x7[21]);
+ x8[22] = _mm_subs_epi16(x7[23], x7[22]);
+ x8[23] = _mm_adds_epi16(x7[23], x7[22]);
+ x8[24] = _mm_adds_epi16(x7[24], x7[25]);
+ x8[25] = _mm_subs_epi16(x7[24], x7[25]);
+ x8[26] = _mm_subs_epi16(x7[27], x7[26]);
+ x8[27] = _mm_adds_epi16(x7[27], x7[26]);
+ x8[28] = _mm_adds_epi16(x7[28], x7[29]);
+ x8[29] = _mm_subs_epi16(x7[28], x7[29]);
+ x8[30] = _mm_subs_epi16(x7[31], x7[30]);
+ x8[31] = _mm_adds_epi16(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]);
+ btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]);
+ btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]);
+ x9[32] = _mm_adds_epi16(x8[32], x8[33]);
+ x9[33] = _mm_subs_epi16(x8[32], x8[33]);
+ x9[34] = _mm_subs_epi16(x8[35], x8[34]);
+ x9[35] = _mm_adds_epi16(x8[35], x8[34]);
+ x9[36] = _mm_adds_epi16(x8[36], x8[37]);
+ x9[37] = _mm_subs_epi16(x8[36], x8[37]);
+ x9[38] = _mm_subs_epi16(x8[39], x8[38]);
+ x9[39] = _mm_adds_epi16(x8[39], x8[38]);
+ x9[40] = _mm_adds_epi16(x8[40], x8[41]);
+ x9[41] = _mm_subs_epi16(x8[40], x8[41]);
+ x9[42] = _mm_subs_epi16(x8[43], x8[42]);
+ x9[43] = _mm_adds_epi16(x8[43], x8[42]);
+ x9[44] = _mm_adds_epi16(x8[44], x8[45]);
+ x9[45] = _mm_subs_epi16(x8[44], x8[45]);
+ x9[46] = _mm_subs_epi16(x8[47], x8[46]);
+ x9[47] = _mm_adds_epi16(x8[47], x8[46]);
+ x9[48] = _mm_adds_epi16(x8[48], x8[49]);
+ x9[49] = _mm_subs_epi16(x8[48], x8[49]);
+ x9[50] = _mm_subs_epi16(x8[51], x8[50]);
+ x9[51] = _mm_adds_epi16(x8[51], x8[50]);
+ x9[52] = _mm_adds_epi16(x8[52], x8[53]);
+ x9[53] = _mm_subs_epi16(x8[52], x8[53]);
+ x9[54] = _mm_subs_epi16(x8[55], x8[54]);
+ x9[55] = _mm_adds_epi16(x8[55], x8[54]);
+ x9[56] = _mm_adds_epi16(x8[56], x8[57]);
+ x9[57] = _mm_subs_epi16(x8[56], x8[57]);
+ x9[58] = _mm_subs_epi16(x8[59], x8[58]);
+ x9[59] = _mm_adds_epi16(x8[59], x8[58]);
+ x9[60] = _mm_adds_epi16(x8[60], x8[61]);
+ x9[61] = _mm_subs_epi16(x8[60], x8[61]);
+ x9[62] = _mm_subs_epi16(x8[63], x8[62]);
+ x9[63] = _mm_adds_epi16(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]);
+ btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]);
+ btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]);
+ btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]);
+ btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]);
+ btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]);
+ btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]);
+ btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]);
+ btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]);
+ btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]);
+ btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]);
+ btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]);
+ btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]);
+ btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]);
+ btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]);
+ btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]);
+
+ // stage 11
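+  // The 64 results are written in bit-reversed index order:
+  // output[k] = x10[bitrev6(k)]; e.g. output[4] = x10[8] because the
+  // 6-bit reversal of 4 (000100) is 001000 = 8.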
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
+
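+// 4-point forward ADST on four columns; only the low four 16-bit lanes of
+// each register carry data. The sinpi[] table from sinpi_arr() holds the
+// fixed-point sin(i * pi / 9) terms of the AV1 ADST4 basis (with the basis
+// normalization folded into the constants).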
+static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *sinpi = sinpi_arr(cos_bit);
+ const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+ const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+ const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+ const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ const __m128i in7 = _mm_add_epi16(input[0], input[1]);
+ __m128i u[8], v[8];
+
+ u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u[1] = _mm_unpacklo_epi16(input[2], input[3]);
+ u[2] = _mm_unpacklo_epi16(in7, __zero);
+ u[3] = _mm_unpacklo_epi16(input[2], __zero);
+ u[4] = _mm_unpacklo_epi16(input[3], __zero);
+
+ v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03);
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[6]);
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_sub_epi32(u[4], v[5]);
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ v[0] = _mm_add_epi32(u[0], __rounding);
+ v[1] = _mm_add_epi32(u[1], __rounding);
+ v[2] = _mm_add_epi32(u[2], __rounding);
+ v[3] = _mm_add_epi32(u[6], __rounding);
+
+ u[0] = _mm_srai_epi32(v[0], cos_bit);
+ u[1] = _mm_srai_epi32(v[1], cos_bit);
+ u[2] = _mm_srai_epi32(v[2], cos_bit);
+ u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u[0], u[2]);
+ output[1] = _mm_packs_epi32(u[1], u[3]);
+ output[2] = _mm_srli_si128(output[0], 8);
+ output[3] = _mm_srli_si128(output[1], 8);
+}
+
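+// 8-point forward ADST on 4-wide columns: sign flips in stage 1, cospi
+// rotations through btf_16_w4_sse2 (the 4-lane variant of btf_16_sse2),
+// and the ADST output permutation in stage 7.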
+static void fadst4x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2],
+ &x1[3], &x2[2], &x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6],
+ &x1[7], &x2[6], &x2[7]);
+
+ // stage 3
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4],
+ &x3[5], &x4[4], &x4[5]);
+ btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6],
+ &x3[7], &x4[6], &x4[7]);
+
+ // stage 5
+ __m128i x5[8];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+
+ // stage 6
+ __m128i x6[8];
+ btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0],
+ &x5[1], &x6[0], &x6[1]);
+ btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2],
+ &x5[3], &x6[2], &x6[3]);
+ btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4],
+ &x5[5], &x6[4], &x6[5]);
+ btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6],
+ &x5[7], &x6[6], &x6[7]);
+
+ // stage 7
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
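+// Same ADST4 arithmetic as fadst4x4_new_sse2, but across all eight
+// columns: each product is formed twice, once for the low and once for
+// the high half of every register (the _lo/_hi pairs).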
+static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *sinpi = sinpi_arr(cos_bit);
+ const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+ const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+ const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+ const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ const __m128i in7 = _mm_add_epi16(input[0], input[1]);
+ __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8];
+
+ u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]);
+ u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]);
+ u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]);
+ u_lo[2] = _mm_unpacklo_epi16(in7, __zero);
+ u_hi[2] = _mm_unpackhi_epi16(in7, __zero);
+ u_lo[3] = _mm_unpacklo_epi16(input[2], __zero);
+ u_hi[3] = _mm_unpackhi_epi16(input[2], __zero);
+ u_lo[4] = _mm_unpacklo_epi16(input[3], __zero);
+ u_hi[4] = _mm_unpackhi_epi16(input[3], __zero);
+
+ v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2
+ v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2
+ v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5
+ v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5
+ v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1
+ v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1
+ v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3
+ v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3
+ v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6
+ v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6
+ v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4
+ v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4
+ v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03);
+ v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03);
+
+ u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]);
+ u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]);
+ u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]);
+ u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]);
+ u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]);
+ u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]);
+ u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]);
+ u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]);
+ u_lo[4] = _mm_slli_epi32(v_lo[5], 2);
+ u_hi[4] = _mm_slli_epi32(v_hi[5], 2);
+ u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]);
+ u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]);
+ u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]);
+ u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]);
+
+ v_lo[0] = _mm_add_epi32(u_lo[0], __rounding);
+ v_hi[0] = _mm_add_epi32(u_hi[0], __rounding);
+ v_lo[1] = _mm_add_epi32(u_lo[1], __rounding);
+ v_hi[1] = _mm_add_epi32(u_hi[1], __rounding);
+ v_lo[2] = _mm_add_epi32(u_lo[2], __rounding);
+ v_hi[2] = _mm_add_epi32(u_hi[2], __rounding);
+ v_lo[3] = _mm_add_epi32(u_lo[6], __rounding);
+ v_hi[3] = _mm_add_epi32(u_hi[6], __rounding);
+
+ u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit);
+ u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit);
+ u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit);
+ u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit);
+ u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit);
+ u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit);
+ u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit);
+ u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]);
+ output[1] = _mm_packs_epi32(u_lo[1], u_hi[1]);
+ output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]);
+ output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]);
+}
+
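+// 16-point forward ADST on 8 columns: sign flips in stage 1, then
+// alternating rotation and butterfly stages, ending with the ADST output
+// permutation in stage 9.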
+static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m128i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[15]);
+ x1[2] = _mm_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm_subs_epi16(__zero, input[11]);
+ x1[8] = _mm_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm_subs_epi16(__zero, input[13]);
+ x1[14] = _mm_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2
+ __m128i x2[16];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]);
+ x2[12] = x1[12];
+ x2[13] = x1[13];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]);
+
+ // stage 3
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+ x3[8] = _mm_adds_epi16(x2[8], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[8], x2[10]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[9], x2[11]);
+ x3[12] = _mm_adds_epi16(x2[12], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[12], x2[14]);
+ x3[13] = _mm_adds_epi16(x2[13], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[13], x2[15]);
+
+ // stage 4
+ __m128i x4[16];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ x4[10] = x3[10];
+ x4[11] = x3[11];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]);
+
+ // stage 5
+ __m128i x5[16];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[12]);
+ x5[12] = _mm_subs_epi16(x4[8], x4[12]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[9], x4[13]);
+ x5[10] = _mm_adds_epi16(x4[10], x4[14]);
+ x5[14] = _mm_subs_epi16(x4[10], x4[14]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[15]);
+ x5[15] = _mm_subs_epi16(x4[11], x4[15]);
+
+ // stage 6
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]);
+ btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]);
+ btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]);
+ btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]);
+
+ // stage 7
+ __m128i x7[16];
+ x7[0] = _mm_adds_epi16(x6[0], x6[8]);
+ x7[8] = _mm_subs_epi16(x6[0], x6[8]);
+ x7[1] = _mm_adds_epi16(x6[1], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[1], x6[9]);
+ x7[2] = _mm_adds_epi16(x6[2], x6[10]);
+ x7[10] = _mm_subs_epi16(x6[2], x6[10]);
+ x7[3] = _mm_adds_epi16(x6[3], x6[11]);
+ x7[11] = _mm_subs_epi16(x6[3], x6[11]);
+ x7[4] = _mm_adds_epi16(x6[4], x6[12]);
+ x7[12] = _mm_subs_epi16(x6[4], x6[12]);
+ x7[5] = _mm_adds_epi16(x6[5], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[5], x6[13]);
+ x7[6] = _mm_adds_epi16(x6[6], x6[14]);
+ x7[14] = _mm_subs_epi16(x6[6], x6[14]);
+ x7[7] = _mm_adds_epi16(x6[7], x6[15]);
+ x7[15] = _mm_subs_epi16(x6[7], x6[15]);
+
+ // stage 8
+ __m128i x8[16];
+ btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]);
+ btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]);
+ btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]);
+ btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]);
+ btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]);
+ btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]);
+ btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]);
+ btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]);
+
+ // stage 9
+ output[0] = x8[1];
+ output[1] = x8[14];
+ output[2] = x8[3];
+ output[3] = x8[12];
+ output[4] = x8[5];
+ output[5] = x8[10];
+ output[6] = x8[7];
+ output[7] = x8[8];
+ output[8] = x8[9];
+ output[9] = x8[6];
+ output[10] = x8[11];
+ output[11] = x8[4];
+ output[12] = x8[13];
+ output[13] = x8[2];
+ output[14] = x8[15];
+ output[15] = x8[0];
+}
+
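+// Per-TX_TYPE dispatch tables for the 1-D column and row kernels. FLIPADST
+// reuses the plain ADST kernel because the vertical or horizontal flip is
+// applied to the buffers before the kernel runs (see get_flip_cfg in the
+// 2-D drivers below), and the IDTX/V_*/H_* types substitute an identity
+// kernel on the axis that is not transformed.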
+static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_new_sse2, // DCT_DCT
+ fadst4x4_new_sse2, // ADST_DCT
+ fdct4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fadst4x4_new_sse2, // FLIPADST_DCT
+ fdct4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fdct4x4_new_sse2, // V_DCT
+ fidentity4x4_new_sse2, // H_DCT
+ fadst4x4_new_sse2, // V_ADST
+ fidentity4x4_new_sse2, // H_ADST
+ fadst4x4_new_sse2, // V_FLIPADST
+ fidentity4x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_new_sse2, // DCT_DCT
+ fdct4x4_new_sse2, // ADST_DCT
+ fadst4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fdct4x4_new_sse2, // FLIPADST_DCT
+ fadst4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fidentity4x4_new_sse2, // V_DCT
+ fdct4x4_new_sse2, // H_DCT
+ fidentity4x4_new_sse2, // V_ADST
+ fadst4x4_new_sse2, // H_ADST
+ fidentity4x4_new_sse2, // V_FLIPADST
+ fadst4x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_new_sse2, // DCT_DCT
+ fadst4x8_new_sse2, // ADST_DCT
+ fdct4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fadst4x8_new_sse2, // FLIPADST_DCT
+ fdct4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct4x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst4x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst4x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_new_sse2, // DCT_DCT
+ fdct8x4_new_sse2, // ADST_DCT
+ fadst8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fdct8x4_new_sse2, // FLIPADST_DCT
+ fadst8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fidentity8x4_new_sse2, // V_DCT
+ fdct8x4_new_sse2, // H_DCT
+ fidentity8x4_new_sse2, // V_ADST
+ fadst8x4_new_sse2, // H_ADST
+ fidentity8x4_new_sse2, // V_FLIPADST
+ fadst8x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_new_sse2, // DCT_DCT
+ fadst8x4_new_sse2, // ADST_DCT
+ fdct8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fadst8x4_new_sse2, // FLIPADST_DCT
+ fdct8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fdct8x4_new_sse2, // V_DCT
+ fidentity8x4_new_sse2, // H_DCT
+ fadst8x4_new_sse2, // V_ADST
+ fidentity8x4_new_sse2, // H_ADST
+ fadst8x4_new_sse2, // V_FLIPADST
+ fidentity8x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_new_sse2, // DCT_DCT
+ fdct4x8_new_sse2, // ADST_DCT
+ fadst4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fdct4x8_new_sse2, // FLIPADST_DCT
+ fadst4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct4x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst4x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst4x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fadst8x8_new_sse2, // ADST_DCT
+ fdct8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fadst8x8_new_sse2, // FLIPADST_DCT
+ fdct8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct8x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst8x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst8x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2, // H_FLIPADST
+  fidentity8x8_new_sse2  // H_FLIPADST
+
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fdct8x8_new_sse2, // ADST_DCT
+ fadst8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fdct8x8_new_sse2, // FLIPADST_DCT
+ fadst8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct8x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst8x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst8x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_sse2, // DCT_DCT
+ fadst8x16_new_sse2, // ADST_DCT
+ fdct8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fadst8x16_new_sse2, // FLIPADST_DCT
+ fdct8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fdct8x16_new_sse2, // V_DCT
+ fidentity8x16_new_sse2, // H_DCT
+ fadst8x16_new_sse2, // V_ADST
+ fidentity8x16_new_sse2, // H_ADST
+ fadst8x16_new_sse2, // V_FLIPADST
+ fidentity8x16_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_sse2, // DCT_DCT
+ fdct8x16_new_sse2, // ADST_DCT
+ fadst8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fdct8x16_new_sse2, // FLIPADST_DCT
+ fadst8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fidentity8x16_new_sse2, // V_DCT
+ fdct8x16_new_sse2, // H_DCT
+ fidentity8x16_new_sse2, // V_ADST
+ fadst8x16_new_sse2, // H_ADST
+ fidentity8x16_new_sse2, // V_FLIPADST
+ fadst8x16_new_sse2 // H_FLIPADST
+};
+
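+// AV1 defines no 32-point ADST, so only DCT and identity are reachable at
+// this length; the remaining TX_TYPE slots stay NULL and are never selected.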
+static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ fidentity8x32_new_sse2, // V_DCT
+ av1_fdct8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
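+// The 2-D drivers below share one pipeline: load the 16-bit residual
+// (vertically flipped when the column type is FLIPADST), round-shift by
+// shift[0], run the column kernel, shift by shift[1], transpose, optionally
+// flip left/right, run the row kernel, shift by shift[2], and widen into
+// the int32_t coefficient output. A hypothetical call (buffer names
+// illustrative only), for an 8-bit block of residuals at src_diff with row
+// stride diff_stride:
+//   av1_lowbd_fwd_txfm2d_4x4_sse2(src_diff, coeff, diff_stride, DCT_DCT, 8);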
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[4], buf1[4], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+ const int txw_idx = get_txw_idx(TX_4X4);
+ const int txh_idx = get_txh_idx(TX_4X4);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x4(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w4(buf, output, height, width);
+}
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
+ const int txw_idx = get_txw_idx(TX_4X8);
+ const int txh_idx = get_txh_idx(TX_4X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
+}
+
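+// 16-high blocks transpose in two 8-column batches and run the 4-point row
+// kernel once per batch. 4x16 is a 4:1 shape, so the plain store_buffer_*
+// helper is used rather than the rectangular-scaled store_rect_* one.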
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
+ const int txw_idx = get_txw_idx(TX_4X16);
+ const int txh_idx = get_txh_idx(TX_4X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x8(buf0, buf1);
+ transpose_16bit_4x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + 8 * i, buf, width);
+ } else {
+ buf = buf1 + 8 * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
+ const int txw_idx = get_txw_idx(TX_8X4);
+ const int txh_idx = get_txh_idx(TX_8X4);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip)
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ else
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w4(buf, output, height, width);
+}
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip)
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ else
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output, height, width);
+}
+
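+// 2:1 rectangular shapes (4x8, 8x4, 8x16, 16x8, ...) store through
+// store_rect_*, which appears to fold the NewSqrt2 (~sqrt(2)) rectangular
+// scaling into the widening store.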
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
+ const int txw_idx = get_txw_idx(TX_8X32);
+ const int txh_idx = get_txh_idx(TX_8X32);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+ transpose_16bit_8x8(buf0 + 16, buf1 + 16);
+ transpose_16bit_8x8(buf0 + 24, buf1 + 24);
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
+ const int txw_idx = get_txw_idx(TX_16X4);
+ const int txh_idx = get_txh_idx(TX_16X4);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x4(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w4(buf, output, height, width);
+}
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
+}
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
+ const int txw_idx = get_txw_idx(TX_16X16);
+ const int txh_idx = get_txh_idx(TX_16X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
+ const int txw_idx = get_txw_idx(TX_16X32);
+ const int txh_idx = get_txh_idx(TX_16X32);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+ } else {
+ av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
+ const int txw_idx = get_txw_idx(TX_32X8);
+ const int txh_idx = get_txh_idx(TX_32X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ }
+
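+    // height == 8, so a single 8-row pass covers the whole row transform.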
+ for (int i = 0; i < 1; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+ } else {
+    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+ } else {
+ av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[128];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32];
+ const int txw_idx = get_txw_idx(TX_32X32);
+ const int txh_idx = get_txh_idx(TX_32X32);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+ } else {
+ av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+ }
+}
+
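+// AV1 keeps only the 32 lowest-frequency coefficients along a 64-point
+// dimension, so the 64x16 and 16x64 kernels below run the 64-point 1-D
+// transform but store a 32-coefficient extent, and only DCT_DCT is legal
+// at these sizes, hence the asserts.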
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X16;
+ __m128i buf0[64], buf1[128];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x16_new_sse2;
+ const transform_1d_sse2 row_txfm = av1_fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < height_div8; ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < height_div8; i++) {
+ __m128i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 32);
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_16X64;
+ __m128i buf0[64], buf1[128];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
+ const transform_1d_sse2 row_txfm = fdct8x16_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < height_div8; ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 16);
+ }
+}
+
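+// Per-size dispatch table. NULL entries (64x64, 32x64 and 64x32) have no
+// lowbd SSE2 kernel and are routed to the C implementation by
+// av1_lowbd_fwd_txfm_sse2() below.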
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ NULL, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
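+// Lossless 4x4 blocks use the Walsh-Hadamard transform rather than the DCT
+// family implemented here, so that case also takes the C path.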
+void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+
+ if ((fwd_txfm2d_func == NULL) ||
+ (txfm_param->lossless && txfm_param->tx_size == TX_4X4))
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ else
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
new file mode 100644
index 0000000000..3cb869a8fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
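+// The identity (IDTX) 1-D transforms only rescale their input: the N-point
+// identity scales by sqrt(N/2), i.e. sqrt(2), 2, 2*sqrt(2) and 4 for
+// N = 4, 8, 16, 32. The irrational factors are applied with the Q12
+// fixed-point constant NewSqrt2 (~= sqrt(2)) via scale_round_sse2(); the
+// power-of-two factors use plain adds/shifts.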
+static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
+ __m128i *const output,
+ const int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i a = _mm_unpacklo_epi16(input[i], one);
+ const __m128i b = scale_round_sse2(a, NewSqrt2);
+ output[i] = _mm_packs_epi32(b, b);
+ }
+}
+
+static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
+ __m128i *const output,
+ const int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+ const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+ output[i] = _mm_packs_epi32(b_lo, b_hi);
+ }
+}
+
+static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+
+ output[0] = _mm_adds_epi16(input[0], input[0]);
+ output[1] = _mm_adds_epi16(input[1], input[1]);
+ output[2] = _mm_adds_epi16(input[2], input[2]);
+ output[3] = _mm_adds_epi16(input[3], input[3]);
+ output[4] = _mm_adds_epi16(input[4], input[4]);
+ output[5] = _mm_adds_epi16(input[5], input[5]);
+ output[6] = _mm_adds_epi16(input[6], input[6]);
+ output[7] = _mm_adds_epi16(input[7], input[7]);
+}
+
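+// 8-point DCT as a staged butterfly network. With w0 = pair_set_epi16(a, b)
+// and w1 = pair_set_epi16(c, d), the btf_16_sse2() macro computes, roughly,
+//   out0[i] = round(a * in0[i] + b * in1[i], cos_bit)
+//   out1[i] = round(c * in0[i] + d * in1[i], cos_bit)
+// and it reads the enclosing __rounding and cos_bit values, which is why
+// those appear unused in the function bodies here.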
+static INLINE void fdct8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3
+ __m128i x3[8];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4 and 5
+ output[0] = x3[0];
+ output[4] = x3[1];
+ output[2] = x3[2];
+ output[6] = x3[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], output[1], output[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], output[5], output[3]);
+}
+
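+// 8-point ADST built from the same btf_16_sse2() rotations: stage 1 only
+// reorders inputs and flips signs, and stages 5-7 are fused so the closing
+// rotations are applied in place on the interleaved outputs.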
+static INLINE void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+
+ // stage 3
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+
+ // stage 5, 6 and 7
+ output[7] = _mm_adds_epi16(x4[0], x4[4]);
+ output[3] = _mm_subs_epi16(x4[0], x4[4]);
+ output[0] = _mm_adds_epi16(x4[1], x4[5]);
+ output[4] = _mm_subs_epi16(x4[1], x4[5]);
+ output[5] = _mm_adds_epi16(x4[2], x4[6]);
+ output[1] = _mm_subs_epi16(x4[2], x4[6]);
+ output[2] = _mm_adds_epi16(x4[3], x4[7]);
+ output[6] = _mm_subs_epi16(x4[3], x4[7]);
+
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, output[7], output[0], output[7],
+ output[0]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, output[5], output[2], output[5],
+ output[2]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, output[3], output[4], output[3],
+ output[4]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, output[1], output[6], output[1],
+ output[6]);
+}
+
+static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 16; ++i) {
+ const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+ const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+ const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2);
+ output[i] = _mm_packs_epi32(b_lo, b_hi);
+ }
+}
+
+static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; ++i) {
+ output[i] = _mm_slli_epi16(input[i], 2);
+ }
+}
+
+static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ av1_fdct8x32_new_sse2, // V_DCT
+ fidentity8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
new file mode 100644
index 0000000000..b58911fcb2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc = _mm_unpacklo_epi16(*p, zero);
+ const __m128i ac = _mm_unpackhi_epi16(*p, zero);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+ qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+ qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+ qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr, int log_scale,
+ __m256i *qp) {
+ __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ if (log_scale) {
+ const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale));
+ round = _mm_mulhrs_epi16(round, round_scale);
+ }
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+ init_one_qp(&round, &qp[0]);
+ init_one_qp(&quant, &qp[1]);
+ init_one_qp(&dequant, &qp[2]);
+}
+
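+// Quantize 8 32-bit coefficients per call. A per-lane scalar sketch, for
+// reference only:
+//   q  = ((int64_t)(abs(c) + round) * quant) >> (16 - log_scale);
+//   if ((abs(c) << (1 + log_scale)) < dequant) q = 0;  // below threshold
+//   dq = (q * dequant) >> log_scale;
+// with the sign of c restored on both q and dq. The 32-bit multiply is
+// split across even/odd 64-bit lanes because AVX2 only provides the
+// even-lane widening multiply _mm256_mul_epi32().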
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+ const int16_t *iscan_ptr, int log_scale,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi32(*c);
+ __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
+
+ __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+ __m256i q_hi = _mm256_srli_epi64(q, 32);
+ const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+ q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+ q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+ q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+ q_hi = _mm256_slli_epi64(q_hi, 32);
+ q = _mm256_or_si256(q_lo, q_hi);
+ const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
+ const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+ q = _mm256_andnot_si256(mask, q);
+
+ __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+ dq = _mm256_srai_epi32(dq, log_scale);
+ q = _mm256_sign_epi32(q, *c);
+ dq = _mm256_sign_epi32(dq, *c);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+ const __m128i zr = _mm_setzero_si128();
+ const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+ const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+ const __m256i iscan =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+ const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+ __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+ cur_eob = _mm256_and_si256(cur_eob, nz);
+ *eob = _mm256_max_epi32(cur_eob, *eob);
+}
+
+void av1_highbd_quantize_fp_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ (void)scan;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 8;
+ __m256i qp[3], coeff;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+ while (n_coeffs > 0) {
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+ {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+ _mm256_extractf128_si256(eob, 1));
+ *eob_ptr = _mm_extract_epi16(final_eob, 0);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
new file mode 100644
index 0000000000..40b3b460b6
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Coefficient quantization phase 1
+// param[0-3] : round/quant/dequant/dequant-threshold constants
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+ const int shift, const int scale,
+ __m128i *qcoeff, __m128i *dquan,
+ __m128i *sign) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+
+ *sign = _mm_cmplt_epi32(*coeff, zero);
+ *sign = _mm_or_si128(*sign, one);
+ *coeff = _mm_abs_epi32(*coeff);
+
+ qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+ qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+ qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+
+ qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+ qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+ dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+ dquan[0] = _mm_srli_epi64(dquan[0], scale);
+ const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+ qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
+}
+
+// Coefficient quantization phase 2
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+ const __m128i *sign,
+ const __m128i *param, const int shift,
+ const int scale, tran_low_t *qAddr,
+ tran_low_t *dqAddr) {
+ __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+ __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+
+ qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+ qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+ dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+ dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+ // combine L&H
+ qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+ qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+
+ qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+ qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+ dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+ dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+ dquan[0] = _mm_and_si128(dquan[0], mask0H);
+ dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+ qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+ dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+
+ qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+ dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+ qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+ dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
+ _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+ _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+
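+// eob tracking: lanes holding a nonzero quantized coefficient contribute
+// iscan + 1 (subtracting the all-ones compare mask adds 1), a running
+// per-lane maximum is kept, and get_accumulated_eob() reduces it to a
+// scalar at the end.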
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+ __m128i *eob) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, iscanIdx;
+ const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+ const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+ __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
+ __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
+
+ nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
+ nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
+
+ mask = _mm_packs_epi32(nz_flag0, nz_flag1);
+ iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
+ iscanIdx = _mm_sub_epi16(iscanIdx, mask);
+ iscanIdx = _mm_and_si128(iscanIdx, mask);
+ *eob = _mm_max_epi16(*eob, iscanIdx);
+}
+
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+ __m128i eob_shuffled;
+ uint16_t eobValue;
+ eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eobValue = _mm_extract_epi16(*eob, 0);
+ return eobValue;
+}
+
+void av1_highbd_quantize_fp_sse4_1(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
+ __m128i eob = _mm_setzero_si128();
+ const tran_low_t *src = coeff_ptr;
+ tran_low_t *quanAddr = qcoeff_ptr;
+ tran_low_t *dquanAddr = dqcoeff_ptr;
+ const int shift = 16 - log_scale;
+ const int coeff_stride = 4;
+ const int quan_stride = coeff_stride;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ memset(quanAddr, 0, count * sizeof(quanAddr[0]));
+ memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+
+ qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
+ qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]);
+ qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]);
+ qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
+ dequant_ptr[0]);
+
+ // DC and first 3 AC
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+
+ // update round/quan/dquan for AC
+ qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+ qparam[1] = xx_set1_64_from_32i(quant_ptr[1]);
+ qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]);
+ qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr, dquanAddr);
+
+ // next 4 AC
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr + quan_stride, dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+
+ // loop for the rest of AC
+ while (count > 0) {
+ src += coeff_stride << 1;
+ quanAddr += quan_stride << 1;
+ dquanAddr += quan_stride << 1;
+ iscan += quan_stride << 1;
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+ }
+ *eob_ptr = get_accumulated_eob(&eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c b/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c
new file mode 100644
index 0000000000..52ddc66437
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h> // AVX2
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static int64_t k_means_horizontal_sum_avx2(__m256i a) {
+ const __m128i low = _mm256_castsi256_si128(a);
+ const __m128i high = _mm256_extracti128_si256(a, 1);
+ const __m128i sum = _mm_add_epi64(low, high);
+ const __m128i sum_high = _mm_unpackhi_epi64(sum, sum);
+ int64_t res;
+ _mm_storel_epi64((__m128i *)&res, _mm_add_epi64(sum, sum_high));
+ return res;
+}
+
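+// Assign each 1-D sample to its nearest centroid, 16 samples per iteration
+// (callers are expected to pad n accordingly). A running per-lane minimum
+// of |x - c| and the index of its centroid are maintained with a
+// compare-and-blend (andnot/and/or); the distance is squared with madd
+// only when the caller requests the total distortion.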
+void av1_calc_indices_dim1_avx2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ const __m256i v_zero = _mm256_setzero_si256();
+ __m256i sum = _mm256_setzero_si256();
+ __m256i cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ cents[j] = _mm256_set1_epi16(centroids[j]);
+ }
+
+ for (int i = 0; i < n; i += 16) {
+ const __m256i in = _mm256_loadu_si256((__m256i *)data);
+ __m256i ind = _mm256_setzero_si256();
+ // Compute the distance to the first centroid.
+ __m256i d1 = _mm256_sub_epi16(in, cents[0]);
+ __m256i dist_min = _mm256_abs_epi16(d1);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ d1 = _mm256_sub_epi16(in, cents[j]);
+ const __m256i dist = _mm256_abs_epi16(d1);
+ // Compare to the minimal one.
+ const __m256i cmp = _mm256_cmpgt_epi16(dist_min, dist);
+ dist_min = _mm256_min_epi16(dist_min, dist);
+ const __m256i ind1 = _mm256_set1_epi16(j);
+ ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind),
+ _mm256_and_si256(cmp, ind1));
+ }
+
+ const __m256i p1 = _mm256_packus_epi16(ind, v_zero);
+ const __m256i px = _mm256_permute4x64_epi64(p1, 0x58);
+ const __m128i d2 = _mm256_extracti128_si256(px, 0);
+
+ _mm_storeu_si128((__m128i *)indices, d2);
+
+ if (total_dist) {
+ // Square, convert to 32 bit and add together.
+ dist_min = _mm256_madd_epi16(dist_min, dist_min);
+ // Convert to 64 bit and add to sum.
+ const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero);
+ const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero);
+ sum = _mm256_add_epi64(sum, dist1);
+ sum = _mm256_add_epi64(sum, dist2);
+ }
+
+ indices += 16;
+ data += 16;
+ }
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_avx2(sum);
+ }
+}
+
+void av1_calc_indices_dim2_avx2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ const __m256i v_zero = _mm256_setzero_si256();
+ const __m256i permute = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+ __m256i sum = _mm256_setzero_si256();
+ __m256i ind[2];
+ __m256i cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+ cents[j] = _mm256_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx, cy, cx, cy, cx,
+ cy, cx, cy, cx);
+ }
+
+ for (int i = 0; i < n; i += 16) {
+ for (int l = 0; l < 2; ++l) {
+ const __m256i in = _mm256_loadu_si256((__m256i *)data);
+ ind[l] = _mm256_setzero_si256();
+ // Compute the distance to the first centroid.
+ __m256i d1 = _mm256_sub_epi16(in, cents[0]);
+ __m256i dist_min = _mm256_madd_epi16(d1, d1);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ d1 = _mm256_sub_epi16(in, cents[j]);
+ const __m256i dist = _mm256_madd_epi16(d1, d1);
+ // Compare to the minimal one.
+ const __m256i cmp = _mm256_cmpgt_epi32(dist_min, dist);
+ dist_min = _mm256_min_epi32(dist_min, dist);
+ const __m256i ind1 = _mm256_set1_epi32(j);
+ ind[l] = _mm256_or_si256(_mm256_andnot_si256(cmp, ind[l]),
+ _mm256_and_si256(cmp, ind1));
+ }
+ if (total_dist) {
+ // Convert to 64 bit and add to sum.
+ const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero);
+ const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero);
+ sum = _mm256_add_epi64(sum, dist1);
+ sum = _mm256_add_epi64(sum, dist2);
+ }
+ data += 16;
+ }
+ // Cast to 8 bit and store.
+ const __m256i d2 = _mm256_packus_epi32(ind[0], ind[1]);
+ const __m256i d3 = _mm256_packus_epi16(d2, v_zero);
+ const __m256i d4 = _mm256_permutevar8x32_epi32(d3, permute);
+ const __m128i d5 = _mm256_extracti128_si256(d4, 0);
+ _mm_storeu_si128((__m128i *)indices, d5);
+ indices += 16;
+ }
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_avx2(sum);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c b/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c
new file mode 100644
index 0000000000..6c75822350
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static int64_t k_means_horizontal_sum_sse2(__m128i a) {
+ const __m128i sum1 = _mm_unpackhi_epi64(a, a);
+ const __m128i sum2 = _mm_add_epi64(a, sum1);
+ int64_t res;
+ _mm_storel_epi64((__m128i *)&res, sum2);
+ return res;
+}
+
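+// SSE2 variants of the assignment kernels, 8 samples per iteration.
+// _mm_abs_epi16() requires SSSE3, so |in - c| is computed here as
+// max(in - c, c - in) instead.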
+void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ const __m128i v_zero = _mm_setzero_si128();
+ __m128i sum = _mm_setzero_si128();
+ __m128i cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ cents[j] = _mm_set1_epi16(centroids[j]);
+ }
+
+ for (int i = 0; i < n; i += 8) {
+ const __m128i in = _mm_loadu_si128((__m128i *)data);
+ __m128i ind = _mm_setzero_si128();
+ // Compute the distance to the first centroid.
+ __m128i d1 = _mm_sub_epi16(in, cents[0]);
+ __m128i d2 = _mm_sub_epi16(cents[0], in);
+ __m128i dist_min = _mm_max_epi16(d1, d2);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ d1 = _mm_sub_epi16(in, cents[j]);
+ d2 = _mm_sub_epi16(cents[j], in);
+ const __m128i dist = _mm_max_epi16(d1, d2);
+ // Compare to the minimal one.
+ const __m128i cmp = _mm_cmpgt_epi16(dist_min, dist);
+ dist_min = _mm_min_epi16(dist_min, dist);
+ const __m128i ind1 = _mm_set1_epi16(j);
+ ind = _mm_or_si128(_mm_andnot_si128(cmp, ind), _mm_and_si128(cmp, ind1));
+ }
+ if (total_dist) {
+ // Square, convert to 32 bit and add together.
+ dist_min = _mm_madd_epi16(dist_min, dist_min);
+ // Convert to 64 bit and add to sum.
+ const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero);
+ const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero);
+ sum = _mm_add_epi64(sum, dist1);
+ sum = _mm_add_epi64(sum, dist2);
+ }
+ __m128i p2 = _mm_packus_epi16(ind, v_zero);
+ _mm_storel_epi64((__m128i *)indices, p2);
+ indices += 8;
+ data += 8;
+ }
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_sse2(sum);
+ }
+}
+
+void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ const __m128i v_zero = _mm_setzero_si128();
+ __m128i sum = _mm_setzero_si128();
+ __m128i ind[2];
+ __m128i cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+ cents[j] = _mm_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx);
+ }
+
+ for (int i = 0; i < n; i += 8) {
+ for (int l = 0; l < 2; ++l) {
+ const __m128i in = _mm_loadu_si128((__m128i *)data);
+ ind[l] = _mm_setzero_si128();
+ // Compute the distance to the first centroid.
+ __m128i d1 = _mm_sub_epi16(in, cents[0]);
+ __m128i dist_min = _mm_madd_epi16(d1, d1);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ d1 = _mm_sub_epi16(in, cents[j]);
+ const __m128i dist = _mm_madd_epi16(d1, d1);
+ // Compare to the minimal one.
+ const __m128i cmp = _mm_cmpgt_epi32(dist_min, dist);
+ const __m128i dist1 = _mm_andnot_si128(cmp, dist_min);
+ const __m128i dist2 = _mm_and_si128(cmp, dist);
+ dist_min = _mm_or_si128(dist1, dist2);
+ const __m128i ind1 = _mm_set1_epi32(j);
+ ind[l] = _mm_or_si128(_mm_andnot_si128(cmp, ind[l]),
+ _mm_and_si128(cmp, ind1));
+ }
+ if (total_dist) {
+ // Convert to 64 bit and add to sum.
+ const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero);
+ const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero);
+ sum = _mm_add_epi64(sum, dist1);
+ sum = _mm_add_epi64(sum, dist2);
+ }
+ data += 8;
+ }
+ // Cast to 8 bit and store.
+ const __m128i d2 = _mm_packus_epi16(ind[0], ind[1]);
+ const __m128i d3 = _mm_packus_epi16(d2, v_zero);
+ _mm_storel_epi64((__m128i *)indices, d3);
+ indices += 8;
+ }
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_sse2(sum);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
new file mode 100644
index 0000000000..75c5172f85
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void write_zero(tran_low_t *qcoeff) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+}
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i ac = _mm_unpackhi_epi64(*p, *p);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr, int log_scale,
+ __m256i *thr, __m256i *qp) {
+ __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+ if (log_scale > 0) {
+ const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+ round = _mm_add_epi16(round, rnd);
+ round = _mm_srai_epi16(round, log_scale);
+ }
+
+ init_one_qp(&round, &qp[0]);
+ init_one_qp(&quant, &qp[1]);
+
+ if (log_scale == 1) {
+ qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+ }
+
+ init_one_qp(&dequant, &qp[2]);
+ *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask.
+ *thr = _mm256_sub_epi16(*thr, _mm256_set1_epi16(1));
+}
+
+static INLINE void update_qp(__m256i *thr, __m256i *qp) {
+ qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+ qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+ qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+ *thr = _mm256_permute2x128_si256(*thr, *thr, 0x11);
+}
+
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+ const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr);
+ const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ return _mm256_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_store_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+ _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+}
+
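+// Horizontal maximum of the 16 per-lane eob candidates: _mm_minpos_epu16()
+// only finds a minimum, so the maximum is recovered as INT16_MAX minus the
+// minimum of the complemented values.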
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+ __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
+ eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
+ eob_s = _mm_minpos_epu16(eob_s);
+ return INT16_MAX - _mm_extract_epi16(eob_s, 0);
+}
+
+static INLINE int16_t accumulate_eob256(__m256i eob256) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+ __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+ __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
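+// The low-precision ("lp") path quantizes int16_t coefficients entirely in
+// 16-bit lanes, 16 at a time. A per-lane scalar sketch, for reference only:
+//   abs_q   = (int16_t)(((abs(c) + round) * quant) >> 16);
+//   qcoeff  = c < 0 ? -abs_q : abs_q;
+//   dqcoeff = qcoeff * dequant;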
+static AOM_FORCE_INLINE void quantize_lp_16_first(
+ const int16_t *coeff_ptr, const int16_t *iscan_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256,
+ __m256i *dequant256, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, qcoeff);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dqcoeff);
+
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask);
+ const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask);
+ *eob = _mm256_max_epi16(*eob, nz_iscan);
+}
+
+static AOM_FORCE_INLINE void quantize_lp_16(
+ const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256,
+ __m256i *quant256, __m256i *dequant256, __m256i *eob) {
+ const __m256i coeff =
+ _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs));
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), dqcoeff);
+
+ const __m256i iscan =
+ _mm256_loadu_si256((const __m256i *)(iscan_ptr + n_coeffs));
+ const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask);
+ const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask);
+ *eob = _mm256_max_epi16(*eob, nz_iscan);
+}
+
+void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ __m256i eob256 = _mm256_setzero_si256();
+
+ // Setup global values.
+ __m256i round256 =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ __m256i quant256 =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ __m256i dequant256 =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+
+ // Populate upper AC values.
+ round256 = _mm256_permute4x64_epi64(round256, 0x54);
+ quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
+ dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
+
+ // Process DC and the first 15 AC coeffs.
+ quantize_lp_16_first(coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+ &quant256, &dequant256, &eob256);
+
+ if (n_coeffs > 16) {
+    // Overwrite the DC constants with the AC constants.
+ dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
+ quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
+ round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+
+ // AC only loop.
+ for (int idx = 16; idx < n_coeffs; idx += 16) {
+ quantize_lp_16(coeff_ptr, idx, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+ &quant256, &dequant256, &eob256);
+ }
+ }
+
+ *eob_ptr = accumulate_eob256(eob256);
+}
+
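+// load_coefficients_avx2() packs the two 128-bit halves, leaving the 16
+// coefficients in lane order 0-3, 8-11, 4-7, 12-15; the 0xD8 permute below
+// applies the same reordering to iscan so the lanes stay matched.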
+static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+ __m256i v_eobmax,
+ __m256i v_mask) {
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+ const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8);
+ const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask);
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
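+// If no |coeff| in the group of 16 exceeds the threshold precomputed in
+// init_qp(), every coefficient quantizes to zero, so the multiplies and the
+// eob update are skipped and zeros are stored directly.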
+static AOM_FORCE_INLINE void quantize_fp_16(
+ const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ __m256i *eob) {
+ const __m256i coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]);
+ const __m256i abs_q = _mm256_mulhi_epi16(tmp_rnd, qp[1]);
+ const __m256i q = _mm256_sign_epi16(abs_q, coeff);
+ const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+ const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256());
+
+ store_coefficients_avx2(q, qcoeff_ptr);
+ store_coefficients_avx2(dq, dqcoeff_ptr);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ } else {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ }
+}
+
+void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ const int log_scale = 0;
+ const int step = 16;
+ __m256i qp[3], thr;
+ __m256i eob = _mm256_setzero_si256();
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+ quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(&thr, qp);
+
+ while (n_coeffs > 0) {
+ quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ *eob_ptr = quant_gather_eob(eob);
+}
+
+static AOM_FORCE_INLINE void quantize_fp_32x32(
+ const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ __m256i *eob) {
+ const __m256i coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]);
+ const __m256i abs_q = _mm256_mulhi_epu16(tmp_rnd, qp[1]);
+ const __m256i q = _mm256_sign_epi16(abs_q, coeff);
+ const __m256i abs_dq =
+ _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 1);
+ const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256());
+ const __m256i dq = _mm256_sign_epi16(abs_dq, coeff);
+
+ store_coefficients_avx2(q, qcoeff_ptr);
+ store_coefficients_avx2(dq, dqcoeff_ptr);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ } else {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ }
+}
+
+void av1_quantize_fp_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ const int log_scale = 1;
+ const unsigned int step = 16;
+ __m256i qp[3], thr;
+ __m256i eob = _mm256_setzero_si256();
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+ quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(&thr, qp);
+
+ while (n_coeffs > 0) {
+ quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ *eob_ptr = quant_gather_eob(eob);
+}
+
+static INLINE void quantize_fp_64x64(const __m256i *thr, const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *eob) {
+ const __m256i coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(abs_coeff, qp[0]), mask);
+ const __m256i qh = _mm256_slli_epi16(_mm256_mulhi_epi16(tmp_rnd, qp[1]), 2);
+ const __m256i ql =
+ _mm256_srli_epi16(_mm256_mullo_epi16(tmp_rnd, qp[1]), 14);
+ const __m256i abs_q = _mm256_or_si256(qh, ql);
+ const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(abs_q, qp[2]), 14);
+ const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 2);
+ const __m256i abs_dq = _mm256_or_si256(dqh, dql);
+ const __m256i q = _mm256_sign_epi16(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi16(abs_dq, coeff);
+ // Check the signed q/dq value here instead of the absolute value. When
+ // dequant equals 4, the dequant threshold (*thr) becomes 0 after being
+ // scaled down by (1 + log_scale). See init_qp(). When *thr is 0 and the
+ // abs_coeff is 0, the nzflag will be set. As a result, the eob will be
+ // incorrectly calculated. The psign instruction corrects the error by
+ // zeroing out q/dq if coeff is zero.
+ const __m256i z_mask = _mm256_cmpeq_epi16(dq, _mm256_setzero_si256());
+ const __m256i nz_mask = _mm256_cmpeq_epi16(z_mask, _mm256_setzero_si256());
+
+ store_coefficients_avx2(q, qcoeff_ptr);
+ store_coefficients_avx2(dq, dqcoeff_ptr);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ } else {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ }
+}
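
The qh/ql pairs above assemble shifts of a full 32-bit product from its 16-bit halves: for log_scale == 2 the quantized value is (tmp_rnd * quant) >> 14 and the dequantized magnitude is (abs_q * dequant) >> 2. A minimal scalar sketch of that recombination (illustrative only; operands are assumed to be in [0, 1 << 15) so signed and unsigned high halves agree):

#include <assert.h>
#include <stdint.h>

// (a * b) >> shift, rebuilt from the high and low 16-bit product halves,
// mirroring the slli/srli/or pattern in quantize_fp_64x64 above.
static uint16_t mul_shift_from_halves(uint16_t a, uint16_t b, int shift) {
  const uint32_t prod = (uint32_t)a * b;
  const uint16_t hi = (uint16_t)(prod >> 16);  // _mm256_mulhi_epi16
  const uint16_t lo = (uint16_t)prod;          // _mm256_mullo_epi16
  return (uint16_t)(((uint32_t)hi << (16 - shift)) | (lo >> shift));
}

static void check_mul_shift(void) {
  // e.g. (300 * 5000) >> 14 == 91, rebuilt from the two halves.
  assert(mul_shift_from_halves(300, 5000, 14) == ((300u * 5000u) >> 14));
}

Rebuilding the shift this way avoids widening to 32-bit lanes, so all 16 coefficients stay in a single 256-bit register.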
+
+void av1_quantize_fp_64x64_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ const int log_scale = 2;
+ const unsigned int step = 16;
+ __m256i qp[3], thr;
+ __m256i eob = _mm256_setzero_si256();
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+ quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(&thr, qp);
+
+ while (n_coeffs > 0) {
+ quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ *eob_ptr = quant_gather_eob(eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
new file mode 100644
index 0000000000..b533894015
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+ __m128i *c0, __m128i *c1) {
+ const tran_low_t *addr = coeff + offset;
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i x0 = _mm_load_si128((const __m128i *)addr);
+ const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
+ const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
+ const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
+ *c0 = _mm_packs_epi32(x0, x1);
+ *c1 = _mm_packs_epi32(x2, x3);
+ } else {
+ *c0 = _mm_load_si128((const __m128i *)addr);
+ *c1 = _mm_load_si128((const __m128i *)addr + 1);
+ }
+}
+
+static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
+ tran_low_t *qcoeff, intptr_t offset) {
+ tran_low_t *addr = qcoeff + offset;
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
+ __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
+ __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
+ _mm_store_si128((__m128i *)addr, y0);
+ _mm_store_si128((__m128i *)addr + 1, y1);
+
+ sign_bits = _mm_cmplt_epi16(*qc1, zero);
+ y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
+ y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
+ _mm_store_si128((__m128i *)addr + 2, y0);
+ _mm_store_si128((__m128i *)addr + 3, y1);
+ } else {
+ _mm_store_si128((__m128i *)addr, *qc0);
+ _mm_store_si128((__m128i *)addr + 1, *qc1);
+ }
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
+ const __m128i zero = _mm_setzero_si128();
+ tran_low_t *addr = qcoeff + offset;
+ if (sizeof(tran_low_t) == 4) {
+ _mm_store_si128((__m128i *)addr, zero);
+ _mm_store_si128((__m128i *)addr + 1, zero);
+ _mm_store_si128((__m128i *)addr + 2, zero);
+ _mm_store_si128((__m128i *)addr + 3, zero);
+ } else {
+ _mm_store_si128((__m128i *)addr, zero);
+ _mm_store_si128((__m128i *)addr + 1, zero);
+ }
+}
+
+static INLINE void quantize(const int16_t *iscan_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const __m128i *round0, const __m128i *round1,
+ const __m128i *quant0, const __m128i *quant1,
+ const __m128i *dequant0, const __m128i *dequant1,
+ const __m128i *thr0, const __m128i *thr1,
+ __m128i *eob) {
+ __m128i coeff0, coeff1;
+ // Do DC and first 15 AC
+ read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
+
+ // Poor man's sign extract
+ const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0),
+ _mm_cmpeq_epi16(qcoeff0, *thr0));
+ const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1),
+ _mm_cmpeq_epi16(qcoeff1, *thr1));
+ const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+ const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+ const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+ coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+ write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
+
+ const __m128i zero = _mm_setzero_si128();
+ // Scan for eob
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ const __m128i iscan0 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ const __m128i iscan1 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+ const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+ const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+ const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+ const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+ *eob = _mm_max_epi16(*eob, eob2);
+ } else {
+ write_zero(qcoeff_ptr, n_coeffs);
+ write_zero(dqcoeff_ptr, n_coeffs);
+ }
+}
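
The "poor man's sign extract" above is the branch-free absolute-value idiom: with s = x >> 15 (arithmetic shift, so s is 0 or -1), abs(x) = (x ^ s) - s, and the same xor/sub pair restores the sign afterwards. A scalar sketch (illustrative only; assumes arithmetic right shift of negative values, which the SIMD psraw guarantees):

#include <stdint.h>

// Branch-free abs and sign restore, as done lane-wise in quantize() above.
static int16_t abs16(int16_t x) {
  const int16_t s = (int16_t)(x >> 15);  // 0 if x >= 0, -1 if x < 0
  return (int16_t)((x ^ s) - s);         // flip bits and add 1 when negative
}

static int16_t apply_sign16(int16_t mag, int16_t s /* 0 or -1 */) {
  return (int16_t)((mag ^ s) - s);       // negates mag when s == -1
}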
+
+void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ coeff_ptr += n_coeffs;
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+ const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+ const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+ const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+ const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+ const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+ const __m128i thr0 = _mm_srai_epi16(dequant0, 1);
+ const __m128i thr1 = _mm_srai_epi16(dequant1, 1);
+ __m128i eob = _mm_setzero_si128();
+
+ quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+ &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob);
+
+ n_coeffs += 8 * 2;
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+ &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1,
+ &eob);
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+}
+
+static INLINE void quantize_lp(const int16_t *iscan_ptr,
+ const int16_t *coeff_ptr, intptr_t n_coeffs,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const __m128i *round0, const __m128i *round1,
+ const __m128i *quant0, const __m128i *quant1,
+ const __m128i *dequant0, const __m128i *dequant1,
+ __m128i *eob) {
+ const int16_t *read = coeff_ptr + n_coeffs;
+ __m128i coeff0 = _mm_load_si128((const __m128i *)read);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1);
+
+ // Poor man's sign extract
+ const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+ const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+ const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ int16_t *addr = qcoeff_ptr + n_coeffs;
+ _mm_store_si128((__m128i *)addr, qcoeff0);
+ _mm_store_si128((__m128i *)addr + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+ coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+ addr = dqcoeff_ptr + n_coeffs;
+ _mm_store_si128((__m128i *)addr, coeff0);
+ _mm_store_si128((__m128i *)addr + 1, coeff1);
+
+ const __m128i zero = _mm_setzero_si128();
+ // Scan for eob
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+
+ const __m128i iscan0 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ const __m128i iscan1 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+
+ // Add one to convert from indices to counts
+ const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+ const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+ const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+ const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+ const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+ *eob = _mm_max_epi16(*eob, eob2);
+}
+
+void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+ const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+ const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+ const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+ const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+ const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+ __m128i eob = _mm_setzero_si128();
+
+ // DC and first 15 AC
+ quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+ &round1, &quant0, &quant1, &dequant0, &dequant1, &eob);
+ n_coeffs += 8 * 2;
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+ &round1, &quant1, &quant1, &dequant1, &dequant1, &eob);
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB
+ *eob_ptr = accumulate_eob(eob);
+}
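
Both SSE2 kernels in this file use the same counted-pointer idiom: every pointer is advanced one-past-the-end, n_coeffs is negated, and loads index with the negative offset, so a single increment both steps the loop and provides the termination test. A minimal sketch of the pattern (illustrative only):

#include <stddef.h>

// Negative-index loop: one counter is both the array offset and the loop
// condition, as in av1_quantize_fp_sse2/av1_quantize_lp_sse2 above.
static void copy_all(const short *src, short *dst, ptrdiff_t n) {
  src += n;  // point one-past-the-end
  dst += n;
  n = -n;    // counts up from -count toward 0
  while (n < 0) {
    dst[n] = src[n];  // src[n] addresses base[count + n]
    n += 1;           // the SIMD versions step by 16 (two 8-lane vectors)
  }
}

The payoff is one fewer live register in the hot loop: the same add that advances the offset produces the flags the loop branch consumes.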
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..ad4ae274e2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
@@ -0,0 +1,204 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FP 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+ cmp dword skipm, 0
+ jne .blank
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+%ifidn %1, fp_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m1, m5
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [r2q] ; m3 = dequant
+ mov r3, qcoeffmp
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, fp_32x32
+ psllw m2, 1
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+
+ lea coeffq, [ coeffq+ncoeffq*2]
+ lea r5q, [ r5q+ncoeffq*2]
+ lea r3q, [ r3q+ncoeffq*2]
+ lea r4q, [r4q+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpeqw m7, m7
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m8
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; r4[i] = r3[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+ psrlw m0, m3, 2
+%else
+ psrlw m0, m3, 1
+%endif
+ mova [r4q+ncoeffq*2+ 0], m8
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m7 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+
+ pcmpgtw m7, m6, m0
+ pcmpgtw m12, m11, m0
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+
+ or r6, r2
+ jz .skip_iter
+
+ pcmpeqw m7, m7
+
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m14
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; r4[i] = r3[i] * q
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+ mova [r4q+ncoeffq*2+ 0], m14
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m7 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+ jmp .accumulate_eob
+.skip_iter:
+ mova [r3q+ncoeffq*2+ 0], m5
+ mova [r3q+ncoeffq*2+16], m5
+ mova [r4q+ncoeffq*2+ 0], m5
+ mova [r4q+ncoeffq*2+16], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0
+ mov [r2], r6
+ RET
+
+ ; skip-block, i.e. just write all zeroes
+.blank:
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+
+ lea r0q, [r0q+ncoeffq*2]
+ lea r2q, [r2q+ncoeffq*2]
+ neg ncoeffq
+ pxor m7, m7
+.blank_loop:
+ mova [r0q+ncoeffq*2+ 0], m7
+ mova [r0q+ncoeffq*2+16], m7
+ mova [r2q+ncoeffq*2+ 0], m7
+ mova [r2q+ncoeffq*2+16], m7
+ add ncoeffq, mmsize
+ jl .blank_loop
+ mov word [r3q], 0
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7
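
For the fp_32x32 variant the macro prescales the constants instead of shifting every coefficient: round becomes (round + 1) >> 1 and quant is doubled (psllw m2, 1), which together implement the log_scale == 1 quantizer, and the dequantized magnitude is halved with psrlw before the sign is restored. A scalar sketch of one coefficient under those prescaled constants (illustrative only; the SIMD adds saturate and the shifts truncate to 16 bits):

#include <stdint.h>

// log_scale == 1 quantize step with prescaled constants, mirroring the
// QUANTIZE_FP fp_32x32 macro above. Nonnegative abs_coeff assumed.
static int16_t quantize_fp_32x32_sketch(int16_t abs_coeff, int16_t round,
                                        int16_t quant, int16_t dequant,
                                        int16_t *abs_dq) {
  const int round_s = (round + 1) >> 1;  // pre-halved round
  const int quant_s = quant << 1;        // pre-doubled quant (psllw m2, 1),
                                         // assumed to still fit in int16
  const int16_t abs_q =
      (int16_t)(((abs_coeff + round_s) * quant_s) >> 16);  // pmulhw
  *abs_dq = (int16_t)((uint16_t)(abs_q * dequant) >> 1);   // pmullw + psrlw
  return abs_q;
}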
diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
new file mode 100644
index 0000000000..618758105a
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void av1_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parameter passing through a structure. We probably don't need the
+; pxors (the calling app will initialize to 0), could easily fit everything in
+; SSE2 without too much hassle, and can probably do better estimates with
+; psadw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the params needed for 16x16 SSIM so we can play with dssim
+; as distortion in the mode selection code.
+globalsym(av1_ssim_parms_16x16_sse2)
+sym(av1_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void av1_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parameter passing through a structure. We probably don't need the
+; pxors (the calling app will initialize to 0), could easily fit everything in
+; SSE2 without too much hassle, and can probably do better estimates with
+; psadw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the params needed for 8x8 SSIM so we can play with dssim
+; as distortion in the mode selection code.
+globalsym(av1_ssim_parms_8x8_sse2)
+sym(av1_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
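
Both routines above accumulate the same five sums over the block. A scalar reference for the 8x8 case (illustrative only, following the prototype in the comments):

// Scalar reference for av1_ssim_parms_8x8: the five sums SSIM needs.
static void ssim_parms_8x8_sketch(const unsigned char *s, int sp,
                                  const unsigned char *r, int rp,
                                  unsigned long *sum_s, unsigned long *sum_r,
                                  unsigned long *sum_sq_s,
                                  unsigned long *sum_sq_r,
                                  unsigned long *sum_sxr) {
  *sum_s = *sum_r = *sum_sq_s = *sum_sq_r = *sum_sxr = 0;
  for (int i = 0; i < 8; ++i, s += sp, r += rp) {
    for (int j = 0; j < 8; ++j) {
      *sum_s += s[j];
      *sum_r += r[j];
      *sum_sq_s += (unsigned long)s[j] * s[j];
      *sum_sq_r += (unsigned long)r[j] * r[j];
      *sum_sxr += (unsigned long)s[j] * r[j];
    }
  }
}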
diff --git a/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c b/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c
new file mode 100644
index 0000000000..830f40ecb0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+ const __m128i k_1 = _mm_set1_epi16(1);
+ const __m128i acc_diff_lo =
+ _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+ const __m128i acc_diff_hi =
+ _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+ const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+ const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+ const __m128i hgfe_dcba =
+ _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
+ const __m128i hgfedcba =
+ _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
+ return _mm_cvtsi128_si32(hgfedcba);
+}
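
The unpack-with-self plus arithmetic-shift pair above sign-extends each byte to 16 bits before the horizontal reduction; the whole function is equivalent to this scalar sketch (illustrative only):

// Scalar equivalent of sum_diff_16x1: sum of 16 signed byte lanes.
static int sum_diff_16x1_sketch(const signed char acc[16]) {
  int sum = 0;
  for (int i = 0; i < 16; ++i) sum += acc[i];  // each lane is in [-128, 127]
  return sum;
}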
+
+// Denoise a 16x1 vector.
+static INLINE __m128i av1_denoiser_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
+ const __m128i *k_16, const __m128i *l3, const __m128i *l32,
+ const __m128i *l21, __m128i acc_diff) {
+ // Calculate differences
+ const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+ const __m128i v_mc_running_avg_y =
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+ __m128i v_running_avg_y;
+ const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+ const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+ // Obtain the sign. FF if diff is negative.
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
+  // Clamp the absolute difference to 16 so it can be used to build the
+  // masks. Doing this allows us to use _mm_cmpgt_epi8, which operates on
+  // signed bytes.
+ const __m128i clamped_absdiff =
+ _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
+ // Get masks for l2 l1 and l0 adjustments.
+ const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
+ const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
+ const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
+ // Get adjustments for l2, l1, and l0.
+ __m128i adj2 = _mm_and_si128(mask2, *l32);
+ const __m128i adj1 = _mm_and_si128(mask1, *l21);
+ const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+ __m128i adj, padj, nadj;
+
+ // Combine the adjustments and get absolute adjustments.
+ adj2 = _mm_add_epi8(adj2, adj1);
+ adj = _mm_sub_epi8(*l3, adj2);
+ adj = _mm_andnot_si128(mask0, adj);
+ adj = _mm_or_si128(adj, adj0);
+
+ // Restore the sign and get positive and negative adjustments.
+ padj = _mm_andnot_si128(diff_sign, adj);
+ nadj = _mm_and_si128(diff_sign, adj);
+
+ // Calculate filtered value.
+ v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+ _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+ // Adjustments <=7, and each element in acc_diff can fit in signed
+ // char.
+ acc_diff = _mm_adds_epi8(acc_diff, padj);
+ acc_diff = _mm_subs_epi8(acc_diff, nadj);
+ return acc_diff;
+}
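
Per pixel, the vector code above picks an adjustment from the clamped |diff| using three thresholds; a scalar sketch of the decision (illustrative only; l3/l32/l21 are the level constants set up by the callers below, and the first threshold is 4 + shift_inc in the SIMD version):

#include <stdlib.h>

// Scalar sketch of one lane of av1_denoiser_16x1_sse2.
static unsigned char denoise_pixel_sketch(unsigned char sig,
                                          unsigned char mc_avg, int l3,
                                          int l32, int l21) {
  const int diff = (int)mc_avg - (int)sig;
  const int a = abs(diff) > 16 ? 16 : abs(diff);  // clamped |diff|
  int adj;
  if (a < 4 /* + shift_inc */) {
    adj = a;               // level 0: follow the difference exactly
  } else if (a < 8) {
    adj = l3 - l32 - l21;  // level 1
  } else if (a < 16) {
    adj = l3 - l32;        // level 2
  } else {
    adj = l3;              // level 3
  }
  const int out = diff >= 0 ? sig + adj : sig - adj;  // saturating in SIMD
  return (unsigned char)(out < 0 ? 0 : (out > 255 ? 255 : out));
}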
+
+// Denoise a 16x1 vector with a weaker filter.
+static INLINE __m128i av1_denoiser_adj_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
+ __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+ // Calculate differences.
+ const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+ const __m128i v_mc_running_avg_y =
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+ const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+ const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+ // Obtain the sign. FF if diff is negative.
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+ // Clamp absolute difference to delta to get the adjustment.
+ const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+ // Restore the sign and get positive and negative adjustments.
+ __m128i padj, nadj;
+ padj = _mm_andnot_si128(diff_sign, adj);
+ nadj = _mm_and_si128(diff_sign, adj);
+ // Calculate filtered value.
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+ v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+ _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+ // Accumulate the adjustments.
+ acc_diff = _mm_subs_epi8(acc_diff, padj);
+ acc_diff = _mm_adds_epi8(acc_diff, nadj);
+ return acc_diff;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int av1_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude, int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+ __m128i acc_diff = _mm_setzero_si128();
+ const __m128i k_0 = _mm_setzero_si128();
+ const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+ const __m128i k_8 = _mm_set1_epi8(8);
+ const __m128i k_16 = _mm_set1_epi8(16);
+ // Modify each level's adjustment according to motion_magnitude.
+ const __m128i l3 = _mm_set1_epi8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+ // Difference between level 3 and level 2 is 2.
+ const __m128i l32 = _mm_set1_epi8(2);
+ // Difference between level 2 and level 1 is 1.
+ const __m128i l21 = _mm_set1_epi8(1);
+ const int b_height = block_size_high[bs] >> 1;
+
+ for (r = 0; r < b_height; ++r) {
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+ width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ acc_diff = av1_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r],
+ running_buffer[r], &k_0, &k_4, &k_8,
+ &k_16, &l3, &l32, &l21, acc_diff);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+ // Update pointers for next iteration.
+ sig += (sig_stride << 1);
+ mc_running_avg_y += (mc_avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << 1);
+ }
+
+ {
+ sum_diff = sum_diff_16x1(acc_diff);
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, which would otherwise not be denoised at all. The
+      // simplest approach is to apply an additional adjustment to
+      // running_avg_y to bring it closer to sig. The adjustment is capped
+      // by a maximum delta, and chosen such that in most cases the
+      // resulting sum_diff will be within the acceptable range given by
+      // sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const __m128i k_delta = _mm_set1_epi8(delta);
+ running_avg_y -= avg_y_stride * (b_height << 1);
+ for (r = 0; r < b_height; ++r) {
+ acc_diff = av1_denoiser_adj_16x1_sse2(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0,
+ k_delta, acc_diff);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width,
+ width);
+ // Update pointers for next iteration.
+ running_avg_y += (avg_y_stride << 1);
+ }
+ sum_diff = sum_diff_16x1(acc_diff);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
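
The delta above scales the overshoot down by the block's pixel count; a worked example under an assumed threshold (the real value comes from total_adj_strong_thresh()):

// Worked example (threshold value assumed for illustration):
// 8x16 block -> 128 pixels, num_pels_log2_lookup[bs] == 7.
// sum_diff = 900, sum_diff_thresh = 640:
//   delta = ((900 - 640) >> 7) + 1 = (260 >> 7) + 1 = 2 + 1 = 3
// delta < 4, so the weaker av1_denoiser_adj_16x1_sse2() pass still runs
// with k_delta = 3 instead of copying the block outright.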
+
+// Denoise 16x16 to 128x128 blocks.
+static int av1_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+ int sum_diff_thresh, r, c, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ __m128i acc_diff[8][8];
+ const __m128i k_0 = _mm_setzero_si128();
+ const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+ const __m128i k_8 = _mm_set1_epi8(8);
+ const __m128i k_16 = _mm_set1_epi8(16);
+ // Modify each level's adjustment according to motion_magnitude.
+ const __m128i l3 = _mm_set1_epi8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+ // Difference between level 3 and level 2 is 2.
+ const __m128i l32 = _mm_set1_epi8(2);
+ // Difference between level 2 and level 1 is 1.
+ const __m128i l21 = _mm_set1_epi8(1);
+ const int b_width = block_size_wide[bs];
+ const int b_height = block_size_high[bs];
+ const int b_width_shift4 = b_width >> 4;
+
+ for (r = 0; r < 8; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r] = _mm_setzero_si128();
+ }
+ }
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r >> 4] = av1_denoiser_16x1_sse2(
+ sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3,
+ &l32, &l21, acc_diff[c][r >> 4]);
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+ }
+ }
+
+ // Update pointers for next iteration.
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ {
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const __m128i k_delta = _mm_set1_epi8(delta);
+ sig -= sig_stride * b_height;
+ mc_running_avg_y -= mc_avg_y_stride * b_height;
+ running_avg_y -= avg_y_stride * b_height;
+ sum_diff = 0;
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r >> 4] =
+ av1_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y,
+ k_0, k_delta, acc_diff[c][r >> 4]);
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+ }
+ }
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+int av1_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+  // Block sizes are checked in order of frequency so that the most common
+  // cases terminate early.
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+ bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 ||
+ bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+ bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+ return av1_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
+ avg, avg_stride, increase_denoising, bs,
+ motion_magnitude);
+ } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+ return av1_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+ avg, avg_stride, increase_denoising, bs,
+ motion_magnitude, 8);
+ } else {
+ return COPY_BLOCK;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
new file mode 100644
index 0000000000..7a0f32898b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int stride);
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+ const int instride, const int outstride);
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int col_num);
+
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+ __m128i *output) {
+ __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+ __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// The entire input block can be represented by a grid of 4x4 blocks, and
+// each 4x4 block can be represented by 4 vertical __m128i registers.
+// We first transpose each 4x4 block internally,
+// then transpose the grid.
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+ __m128i *output) {
+ const int num_per_128 = 4;
+ const int row_size = txfm_size;
+ const int col_size = txfm_size / num_per_128;
+ int r, c;
+
+ // transpose each 4x4 block internally
+ for (r = 0; r < row_size; r += 4) {
+ for (c = 0; c < col_size; c++) {
+ transpose_32_4x4(col_size, &input[r * col_size + c],
+ &output[c * 4 * col_size + r / 4]);
+ }
+ }
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m128i ww0 = _mm_set1_epi32(w0); \
+ const __m128i ww1 = _mm_set1_epi32(w1); \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = av1_round_shift_32_sse4_1(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = av1_round_shift_32_sse4_1(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = _mm_add_epi32(out0, r); \
+ out0 = _mm_srai_epi32(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = _mm_add_epi32(out1, r); \
+ out1 = _mm_srai_epi32(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \
+ } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
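
The btf macros above are the standard rounded rotation butterfly; in scalar form the type0 variant is (illustrative only):

#include <stdint.h>

// Scalar form of btf_32_sse4_1_type0: a rounded planar rotation.
static void btf_32_type0_sketch(int32_t w0, int32_t w1, int32_t in0,
                                int32_t in1, int32_t *out0, int32_t *out1,
                                int bit) {
  const int64_t r = (int64_t)1 << (bit - 1);  // rounding offset
  *out0 = (int32_t)(((int64_t)in0 * w0 + (int64_t)in1 * w1 + r) >> bit);
  *out1 = (int32_t)(((int64_t)in0 * w1 - (int64_t)in1 * w0 + r) >> bit);
}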
diff --git a/third_party/aom/av1/encoder/x86/cnn_avx2.c b/third_party/aom/av1/encoder/x86/cnn_avx2.c
new file mode 100644
index 0000000000..ee93b3d5a0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/cnn_avx2.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
+// This mask rearranges source pixels in the order shown below.
+// shuffle_src_layer0[0][8]: applied on source pixels 0 to 7.
+// shuffle_src_layer0[1][8]: applied on source pixels 7 to 14.
+// This shuffling is needed to process 3 5x5 blocks which need
+// source pixels in the following order.
+// 1st 5x5 block: source pixels needed are 0 to 4,
+// 2nd 5x5 block: source pixels needed are 4 to 8,
+// 3rd 5x5 block: source pixels needed are 8 to 12.
+// Source pixels are loaded as shown below.
+// load_src0 : 0, 1, 2, 3, 4, 5, 6, 7
+// load_src1 : 7, 8, 9, 10, 11, 12, 13, 14
+// After applying the masks, the source pixels will be in the order:
+// load_src0 : 0, 1, 2, 3, 4, 4, 5, 6
+//   consists of the 5 pixels needed for the 1st 5x5 block and the
+//   first 3 pixels needed for the 2nd 5x5 block.
+// load_src1 : 7, 8, 8, 9, 10, 11, 12, x
+//   consists of the last 2 pixels needed for the 2nd 5x5 block and the
+//   5 pixels needed for the 3rd 5x5 block.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_src_layer0[2][8]) = { { 0, 1, 2, 3, 4, 4, 5, 6 },
+ { 0, 1, 1, 2, 3, 4, 5, 0 } };
+
+// This mask rearranges the weights to match the shuffled source pixel order.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_weight_layer0[2][8]) = { { 0, 1, 2, 3, 4, 0, 1, 2 },
+ { 3, 4, 0, 1, 2, 3, 4, 0 } };
+
+// Shuffle mask used to rearrange weights corresponding to layer 1 and layer 2.
+// For layer 1 and layer 2, convolution happens at 2x2 as filter_width and
+// filter_height are equal to 2, so the weights are rearranged in the
+// order shown below to match the source pixels. This mask effectively
+// replicates the weights across a width of 2.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_weight_layer_1_and_2[2][8]) = {
+ { 0, 1, 0, 1, 0, 1, 0, 1 }, { 2, 3, 2, 3, 2, 3, 2, 3 }
+};
+
+// After the stages of multiplication and accumulation, the output values
+// in the register will be out of order. To store the register into the
+// output buffer in the proper order, the following mask is applied to the
+// output register.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_output_layer_1_and_2[8]) = { 0, 1, 4, 5, 2, 3, 6, 7 };
+
+// Load weights needed for layer 0 (for 5x5 block processing),
+// and fill the registers appropriately to match source pixel mapping.
+static INLINE void prepare_weights_for_5x5_convolve(
+ const float *layer_config_weights, int off, float weight[5][8],
+ const int cstep, __m256 *shuffle_weight, const __m256i weight_mask_0,
+ const __m256i weight_mask_1) {
+ for (int row = 0; row < 5; ++row) {
+ for (int col = 0; col < 5; ++col) {
+ weight[row][col] = layer_config_weights[off];
+ off += cstep;
+ }
+ }
+ shuffle_weight[0] = _mm256_loadu_ps(weight[0]);
+ shuffle_weight[1] = _mm256_loadu_ps(weight[1]);
+ shuffle_weight[2] = _mm256_loadu_ps(weight[2]);
+ shuffle_weight[3] = _mm256_loadu_ps(weight[3]);
+ shuffle_weight[4] = _mm256_loadu_ps(weight[4]);
+
+ shuffle_weight[0] =
+ _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_0);
+ shuffle_weight[1] =
+ _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_0);
+ shuffle_weight[2] =
+ _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_0);
+ shuffle_weight[3] =
+ _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_0);
+ shuffle_weight[4] =
+ _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_0);
+ shuffle_weight[5] =
+ _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_1);
+ shuffle_weight[6] =
+ _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_1);
+ shuffle_weight[7] =
+ _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_1);
+ shuffle_weight[8] =
+ _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_1);
+ shuffle_weight[9] =
+ _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_1);
+}
+
+// For each row, loads source pixels 0 to 7 (load_src_0) and 7 to 14
+// (load_src_1), then arranges them appropriately to process 3 blocks.
+#define PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS() \
+ do { \
+ for (int row = 0; row < 5; row++) { \
+ load_src_0 = _mm256_loadu_ps(input_ptr); \
+ load_src_1 = _mm256_loadu_ps(input_ptr + 7); \
+ load_src_0 = _mm256_permutevar8x32_ps(load_src_0, block0_1); \
+ load_src_1 = _mm256_permutevar8x32_ps(load_src_1, block1_2); \
+ load_src_0 = _mm256_mul_ps(load_src_0, shuffle_weight[0 + row]); \
+ load_src_1 = _mm256_mul_ps(load_src_1, shuffle_weight[5 + row]); \
+ accum_src_0 = _mm256_add_ps(load_src_0, accum_src_0); \
+ accum_src_1 = _mm256_add_ps(load_src_1, accum_src_1); \
+ input_ptr += in_stride; \
+ } \
+ } while (0)
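
In scalar terms the macro accumulates three overlapping 5x5 dot products, one per output block, with the blocks' left edges skip_width == 4 pixels apart; a sketch of the equivalent loop (illustrative only):

// Scalar equivalent of one PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS invocation:
// three 5x5 blocks whose left edges are 4 pixels apart.
static void convolve_3_5x5_blocks_sketch(const float *in, int in_stride,
                                         const float weight[5][8],
                                         float out[3]) {
  for (int b = 0; b < 3; ++b) {
    float sum = 0.0f;
    for (int r = 0; r < 5; ++r) {
      for (int c = 0; c < 5; ++c) {
        sum += in[r * in_stride + b * 4 + c] * weight[r][c];
      }
    }
    out[b] += sum;
  }
}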
+
+// Load masks needed for shuffling of output and weights.
+static INLINE void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask,
+ __m256i *weight_mask) {
+ // Load shuffle buffer needed to sort the output.
+ *output_mask =
+ _mm256_load_si256((const __m256i *)shuffle_output_layer_1_and_2);
+
+ // Load shuffle buffers needed for weight.
+ weight_mask[0] =
+ _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[0]);
+ weight_mask[1] =
+ _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[1]);
+}
+
+// Load weights needed for layer 1 and 2 (for 2x2 block processing),
+// and fill the registers appropriately to match source pixel mapping.
+static INLINE void prepare_weights_for_2x2_convolve(
+ const float *layer_config_weights, int off, const int cstep,
+ __m256 *shuffle_weight, __m256i *weight_mask) {
+ // Weights needed for 2x2 block.
+ float weight[4] = { 0 };
+ for (int i = 0; i < 4; ++i) {
+ weight[i] = layer_config_weights[off];
+ off += cstep;
+ }
+
+ const __m256 weight_vec = _mm256_castps128_ps256(_mm_loadu_ps(weight));
+ shuffle_weight[0] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[0]);
+ shuffle_weight[1] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[1]);
+}
+
+// Do convolution of one 5x5 block.
+#define PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(w, accum0, in_stride) \
+ do { \
+ __m128 load_src[5]; \
+ load_src[0] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[0][4]; \
+ input_ptr += in_stride; \
+ load_src[1] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[1][4]; \
+ input_ptr += in_stride; \
+ load_src[2] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[2][4]; \
+ input_ptr += in_stride; \
+ load_src[3] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[3][4]; \
+ input_ptr += in_stride; \
+ load_src[4] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[4][4]; \
+ \
+ load_src[0] = _mm_mul_ps(load_src[0], _mm256_castps256_ps128(w[0])); \
+ load_src[1] = _mm_mul_ps(load_src[1], _mm256_castps256_ps128(w[1])); \
+ load_src[2] = _mm_mul_ps(load_src[2], _mm256_castps256_ps128(w[2])); \
+ load_src[3] = _mm_mul_ps(load_src[3], _mm256_castps256_ps128(w[3])); \
+ load_src[4] = _mm_mul_ps(load_src[4], _mm256_castps256_ps128(w[4])); \
+ \
+ accum0 = _mm_add_ps(load_src[0], accum0); \
+ load_src[1] = _mm_add_ps(load_src[1], load_src[2]); \
+ load_src[3] = _mm_add_ps(load_src[3], load_src[4]); \
+ load_src[1] = _mm_add_ps(load_src[1], load_src[3]); \
+ accum0 = _mm_add_ps(accum0, load_src[1]); \
+ } while (0)
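
Because an __m128 holds only 4 floats, the macro above multiplies columns 0 to 3 with vector ops and folds column 4 into last_column_sum in scalar code; per row the split looks like this sketch (illustrative only):

// One row of a 5-wide dot product: 4 lanes vectorized, 1 scalar tail.
static float row5_dot_sketch(const float in[5], const float w[5]) {
  float vec = 0.0f;
  for (int c = 0; c < 4; ++c) vec += in[c] * w[c];  // the _mm_mul_ps part
  return vec + in[4] * w[4];                        // the last_column_sum part
}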
+
+// Do convolution on 8 horizontal 2x2 blocks.
+static INLINE void perform_convolve_for_8h_2x2_blocks(
+ const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum,
+ __m256i shuffle_output_mask) {
+ __m256 load_src[4];
+ // Load input into source registers.
+ load_src[0] = _mm256_loadu_ps(input_ptr);
+ load_src[1] = _mm256_loadu_ps(input_ptr + 8);
+ load_src[2] = _mm256_loadu_ps(input_ptr + in_stride);
+ load_src[3] = _mm256_loadu_ps(input_ptr + in_stride + 8);
+
+ // Multiply the loaded input with corresponding weights.
+ load_src[0] = _mm256_mul_ps(load_src[0], weight[0]);
+ load_src[1] = _mm256_mul_ps(load_src[1], weight[0]);
+ load_src[2] = _mm256_mul_ps(load_src[2], weight[1]);
+ load_src[3] = _mm256_mul_ps(load_src[3], weight[1]);
+
+ // Accumulate across 2x2 blocks.
+ load_src[0] = _mm256_add_ps(load_src[0], load_src[2]);
+ load_src[1] = _mm256_add_ps(load_src[1], load_src[3]);
+ load_src[0] = _mm256_hadd_ps(load_src[0], load_src[1]);
+
+ // Sort the output in order to store into output buffer.
+ load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
+ *out_accum = _mm256_add_ps(*out_accum, load_src[0]);
+}
+
+// Do convolution on 8 (4 horizontal x 2 vertical) 2x2 blocks.
+static INLINE void perform_convolve_for_4hx2v_2x2_blocks(
+ const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum,
+ __m256i shuffle_output_mask) {
+ __m256 load_src[4];
+ // Load input into source registers.
+ load_src[0] = _mm256_loadu_ps(input_ptr);
+ load_src[1] = _mm256_loadu_ps(input_ptr + in_stride);
+ load_src[2] = _mm256_loadu_ps(input_ptr + (in_stride * 2));
+ load_src[3] = _mm256_loadu_ps(input_ptr + (in_stride * 3));
+
+ // Multiply the loaded input with corresponding weights.
+ load_src[0] = _mm256_mul_ps(load_src[0], weight[0]);
+ load_src[1] = _mm256_mul_ps(load_src[1], weight[1]);
+ load_src[2] = _mm256_mul_ps(load_src[2], weight[0]);
+ load_src[3] = _mm256_mul_ps(load_src[3], weight[1]);
+
+ // Accumulate across 2x2 blocks.
+ load_src[0] = _mm256_add_ps(load_src[0], load_src[1]);
+ load_src[2] = _mm256_add_ps(load_src[2], load_src[3]);
+ load_src[0] = _mm256_hadd_ps(load_src[0], load_src[2]);
+
+ // Sort the output in order to store into output buffer.
+ load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
+ *out_accum = _mm256_add_ps(*out_accum, load_src[0]);
+}
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c() for the case
+// where filter_width and filter_height are equal to 5.
+// The CNN convolve configuration is taken from
+// av1_intra_mode_cnn_partition_cnn_config; with the parameters set for each
+// layer, the current encoder always takes the no_maxpool_padding_valid path.
+// For layer 0, filter_width and filter_height are set to 5, so convolution
+// is performed on 5x5 blocks.
+static void cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int channel_step) {
+ const int kFilterWidth = 5;
+ const int kFilterHeight = 5;
+ const int kSkipWidth = 4;
+ const int kSkipHeight = 4;
+ assert(layer_config->filter_width == kFilterWidth &&
+ layer_config->filter_height == kFilterHeight);
+ assert(layer_config->skip_width == kSkipWidth &&
+ layer_config->skip_height == kSkipHeight);
+
+ // Load shuffle buffers needed for source.
+ const __m256i block0_1 =
+ _mm256_load_si256((const __m256i *)shuffle_src_layer0[0]);
+ const __m256i block1_2 =
+ _mm256_load_si256((const __m256i *)shuffle_src_layer0[1]);
+
+ // Load shuffle buffers needed for weight.
+ const __m256i weight_mask_0 =
+ _mm256_load_si256((const __m256i *)shuffle_weight_layer0[0]);
+ const __m256i weight_mask_1 =
+ _mm256_load_si256((const __m256i *)shuffle_weight_layer0[1]);
+
+  // Width by which the input pointer must advance for the next iteration of
+  // processing 3 5x5 blocks.
+ const int kSkipWidthForNextIter = kSkipWidth * 3;
+
+  // Minimum width required to process 3 5x5 blocks at a time:
+  // min width = 2 * skip_width + filter_width (= 2 * 4 + 5 = 13 here).
+  // skip_width is the horizontal distance between consecutive block
+  // convolutions, and filter_width is the number of pixels the filter spans.
+ const int kMinWidthFor3_5x5Blocks = (kSkipWidth * 2) + kFilterWidth;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ const float out_ch_bias = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ __m256 shuffle_weight[10];
+
+      // The weights needed are 5x5; the array is padded to 5x8 for SIMD
+      // loads.
+ float weight[5][8] = { { 0 } };
+ int off = k * layer_config->out_channels + i;
+
+      // In layer 0, the convolution is performed on 5x5 blocks.
+      // The weights for a 5x5 block are the same for every block position
+      // within an in-channel, so they are loaded only once per in-channel.
+ prepare_weights_for_5x5_convolve(layer_config->weights, off, weight,
+ cstep, shuffle_weight, weight_mask_0,
+ weight_mask_1);
+
+ for (int h = 0, u = 0; h < in_height - kFilterHeight + 1;
+ h += kSkipHeight, ++u) {
+ const int out_h = u * out_stride;
+ int v = 0;
+ int w = 0;
+ int rem_width = in_width;
+ // Processing 3 5x5 blocks at a time, if sufficient width is present.
+ while (rem_width >= kMinWidthFor3_5x5Blocks) {
+ __m256 load_src_0, load_src_1;
+ __m256 accum_src_0 = _mm256_setzero_ps();
+ __m256 accum_src_1 = _mm256_setzero_ps();
+ const float *input_ptr = &input[k][h * in_stride + w];
+ PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS();
+
+ // Accumulate across column.
+ __m256 accum = _mm256_hadd_ps(accum_src_0, accum_src_1);
+ __m128 tmp_reg_0 = _mm256_extractf128_ps(accum_src_0, 1);
+ __m128 tmp_reg_1 = _mm256_extractf128_ps(accum_src_1, 1);
+
+ __m128 accum_l = _mm256_castps256_ps128(accum);
+ __m128 accum_h = _mm256_extractf128_ps(accum, 1);
+
+ __m128 tmp_reg_2 = _mm_add_ps(accum_l, tmp_reg_0);
+ __m128 tmp_reg_3 = _mm_add_ps(tmp_reg_0, accum_h);
+ __m128 tmp_reg_4 = _mm_add_ps(tmp_reg_1, accum_h);
+
+ // 1st 5x5 block output.
+ output[i][out_h + v] =
+ out_ch_bias + _mm_cvtss_f32(tmp_reg_2) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 1));
+
+ // 2nd 5x5 block output.
+ output[i][out_h + v + 1] =
+ out_ch_bias +
+ _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_3, tmp_reg_3, 1)) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 2));
+
+ // 3rd 5x5 block output.
+ output[i][out_h + v + 2] =
+ out_ch_bias +
+ _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_4, tmp_reg_4, 2)) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 3));
+
+ v += 3;
+ w += kSkipWidthForNextIter;
+ rem_width -= kSkipWidthForNextIter;
+ }
+
+          // Process the remaining blocks one 5x5 block at a time.
+ while (rem_width >= kFilterWidth) {
+ float last_column_sum = 0;
+ __m128 accum = _mm_setzero_ps();
+ const float *input_ptr = &input[k][h * in_stride + w];
+ PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(shuffle_weight, accum, in_stride);
+
+ // Accumulate across column.
+ accum = _mm_hadd_ps(accum, accum);
+ output[i][out_h + v] = out_ch_bias + last_column_sum +
+ _mm_cvtss_f32(accum) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum, accum, 1));
+
+ v += 1;
+ w += kSkipWidth;
+ rem_width -= kSkipWidth;
+ }
+ }
+ }
+ }
+}
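+
+// For reference, a scalar sketch of what the kernel above computes for one
+// output point (u, v) (illustrative pseudo-C mirroring
+// av1_cnn_convolve_no_maxpool_padding_valid_c, assuming row-major tap order
+// and cstep == in_channels * out_channels; weights/bias stand for
+// layer_config->weights/bias):
+//   float sum = bias[i];
+//   for (int l = 0; l < 5; ++l)
+//     for (int m = 0; m < 5; ++m)
+//       sum += input[k][(h + l) * in_stride + w + m] *
+//              weights[k * out_channels + i + (l * 5 + m) * cstep];
+//   output[i][u * out_stride + v] = sum;
+// For layer 0 in_channels is 1, so the 'k' loop runs exactly once per output.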
+
+// AVX2 implementation for layer 1.
+static INLINE void cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
+ const float **input, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int channel_step) {
+ __m256i weight_mask[2];
+ __m256i shuffle_output_mask;
+ load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
+
+ const int kInHeight = 16;
+ const int kFilterHeight = 2;
+ const int kSkipHeight = 2;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+    // The out_accum registers store the 2x2 convolve outputs (computed over
+    // the input block), accumulated across the in-channels. Each iteration of
+    // the 'h' loop processes 8 horizontal 2x2 blocks and stores them in the
+    // corresponding out_accum register: the input is 16x16, so there are
+    // 64 2x2 blocks in total and 8 out_accum registers suffice to hold the
+    // outputs. Hence the 'j' and 'h' loops below run over the number of
+    // out_accum registers.
+ __m256 out_accum[8];
+ for (int j = 0; j < 8; ++j) out_accum[j] = bias_reg;
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ __m256 shuffle_weight[2];
+ int off = k * layer_config->out_channels + i;
+      // In layer 1, the convolution is performed on 2x2 blocks.
+      // The weights for a 2x2 block are the same for every block position
+      // within an in-channel, so they are loaded only once per in-channel.
+ prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep,
+ shuffle_weight, weight_mask);
+
+ for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
+ h += kSkipHeight, ++u) {
+ const float *input_ptr = &input[k][h * in_stride];
+ perform_convolve_for_8h_2x2_blocks(input_ptr, in_stride, shuffle_weight,
+ &out_accum[u], shuffle_output_mask);
+ }
+ }
+ // Store output of layer 1.
+ for (int j = 0; j < 8; ++j) {
+ _mm256_storeu_ps(&output[i][j * out_stride], out_accum[j]);
+ }
+ }
+}
+
+// AVX2 implementation for layer 2.
+static INLINE void cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
+ const float **input, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int channel_step) {
+ __m256i weight_mask[2];
+ __m256i shuffle_output_mask;
+ load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
+
+ const int kInHeight = 8;
+ const int kFilterHeight = 2;
+ const int kSkipHeight = 2;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+    // The out_accum registers store the 2x2 convolve outputs (computed over
+    // the input block), accumulated across the in-channels. Each iteration of
+    // the 'h' loop processes 8 (4 horizontal x 2 vertical) 2x2 blocks and
+    // stores them in the corresponding out_accum register: the input is 8x8,
+    // so there are 16 2x2 blocks in total and 2 out_accum registers suffice
+    // to hold the outputs. Hence the 'j' and 'h' loops below run over the
+    // number of out_accum registers.
+ __m256 out_accum[2];
+
+    // Height by which the input pointer must advance for the next iteration
+    // when processing 2 2x2 blocks vertically.
+ const int kSkipHeightForNextIter = kSkipHeight * 2;
+ for (int j = 0; j < 2; ++j) out_accum[j] = bias_reg;
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ __m256 shuffle_weight[2];
+ int off = k * layer_config->out_channels + i;
+      // In layer 2, the convolution is performed on 2x2 blocks.
+      // The weights for a 2x2 block are the same for every block position
+      // within an in-channel, so they are loaded only once per in-channel.
+ prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep,
+ shuffle_weight, weight_mask);
+
+ for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
+ h += kSkipHeightForNextIter, ++u) {
+ const float *input_ptr = &input[k][h * in_stride];
+ perform_convolve_for_4hx2v_2x2_blocks(input_ptr, in_stride,
+ shuffle_weight, &out_accum[u],
+ shuffle_output_mask);
+ }
+ }
+ // Store output of layer 2.
+ for (int j = 0; j < 2; ++j) {
+ _mm256_storeu_ps(&output[i][j * out_stride * 2], out_accum[j]);
+ }
+ }
+}
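+
+// Note: each out_accum register above holds two 4-wide output rows, so each
+// store writes rows 2j and 2j+1 with a single 8-float store; this assumes
+// out_stride is 4 (the 4x4 output of layer 2).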
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c() for the case
+// where filter_width and filter_height are equal to 2.
+// In the layer config set by av1_intra_mode_cnn_partition_cnn_config,
+// filter_width and filter_height are 2 for every layer >= 1, so those layers
+// convolve on 2x2 blocks.
+void cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int channel_step) {
+ assert(layer_config->filter_width == 2 && layer_config->filter_height == 2);
+ assert(layer_config->skip_width == 2 && layer_config->skip_height == 2);
+
+ if (in_width == 16 && in_height == 16) {
+ // This case of in_width and in_height equal to 16 corresponds to layer 1.
+ // The output size of this layer is 8x8.
+ cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
+ input, in_stride, layer_config, output, out_stride, start_idx, cstep,
+ channel_step);
+ } else if (in_width == 8 && in_height == 8) {
+ // This case of in_width and in_height equal to 8 corresponds to layer 2.
+ // The output size of this layer is 4x4.
+ cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
+ input, in_stride, layer_config, output, out_stride, start_idx, cstep,
+ channel_step);
+ } else {
+    // For layers 3 and 4, the input sizes are 4x4 and 2x2 respectively.
+    // SIMD is unlikely to pay off at these sizes, which is why the C path is
+    // used for layer >= 3.
+ av1_cnn_convolve_no_maxpool_padding_valid_c(
+ input, in_width, in_height, in_stride, layer_config, output, out_stride,
+ start_idx, cstep, channel_step);
+ }
+}
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c().
+// In the current encoder, av1_cnn_convolve() is called for a block size of
+// 64x64 and uses the layer config values set by
+// av1_intra_mode_cnn_partition_cnn_config. The table below summarizes each
+// layer's config parameters.
+// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht
+// 0 64x64 16x16 5 5 4 4
+// 1 16x16 8x8 2 2 2 2
+// 2 8x8 4x4 2 2 2 2
+// 3 4x4 2x2 2 2 2 2
+// 4 2x2 1x1 2 2 2 2
+// Here,
+// filter_wd = filter_width and filter_ht = filter_height,
+// skip_wd = skip_width and skip_ht = skip_height.
+void av1_cnn_convolve_no_maxpool_padding_valid_avx2(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+ int start_idx, int cstep, int channel_step) {
+ if (layer_config->filter_width == 5 && layer_config->filter_height == 5 &&
+ layer_config->skip_width == 4 && layer_config->skip_height == 4) {
+ cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
+ input, in_width, in_height, in_stride, layer_config, output, out_stride,
+ start_idx, cstep, channel_step);
+ } else if (layer_config->filter_width == 2 &&
+ layer_config->filter_height == 2 &&
+ layer_config->skip_width == 2 && layer_config->skip_height == 2) {
+ cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
+ input, in_width, in_height, in_stride, layer_config, output, out_stride,
+ start_idx, cstep, channel_step);
+ } else {
+ av1_cnn_convolve_no_maxpool_padding_valid_c(
+ input, in_width, in_height, in_stride, layer_config, output, out_stride,
+ start_idx, cstep, channel_step);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm
new file mode 100644
index 0000000000..b185548184
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm
@@ -0,0 +1,82 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+ paddw m0, m1
+ movq m4, m0
+ psubw m3, m2
+ psubw m4, m3
+ psraw m4, 1
+ movq m5, m4
+ psubw m5, m1 ;b1
+ psubw m4, m2 ;c1
+ psubw m0, m4
+ paddw m3, m5
+ ; m0 a0
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ ; 00 01 02 03
+ ; 10 11 12 13
+ ; 20 21 22 23
+ ; 30 31 32 33
+ punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
+ punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
+ mova m1, m0
+ punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
+ punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
+%endmacro
+
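+; fwht4x4 is the 4x4 forward Walsh-Hadamard transform. Note that the output
+; type is tran_low_t (32 bits), which is why the 16-bit results are
+; sign-extended to doublewords before the final stores.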
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+ lea r3q, [inputq + strideq*4]
+ movq m0, [inputq] ;a1
+ movq m1, [inputq + strideq*2] ;b1
+ movq m2, [r3q] ;c1
+ movq m3, [r3q + strideq*2] ;d1
+
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ SWAP 1, 2
+ psrldq m1, m0, 8
+ psrldq m3, m2, 8
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+
+ psllw m0, 2
+ psllw m1, 2
+
+ ; sign extension
+ mova m2, m0
+ mova m3, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+ punpckhwd m2, m2
+ punpckhwd m3, m3
+ psrad m0, 16
+ psrad m1, 16
+ psrad m2, 16
+ psrad m3, 16
+ mova [outputq], m0
+ mova [outputq + 16], m2
+ mova [outputq + 32], m1
+ mova [outputq + 48], m3
+
+ RET
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
new file mode 100644
index 0000000000..9627f75930
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+#include <immintrin.h> /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = height + TX_PAD_HOR;
+ const __m256i y_zeros = _mm256_setzero_si256();
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf_end = levels + (width + TX_PAD_BOTTOM) * stride;
+ uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
+
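+  // Rounding bottom_len up to a multiple of 32 may start the zeroing inside
+  // the last real column; this is harmless because the level-filling loops
+  // below rewrite every real column.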
+ do {
+ yy_storeu_256(bottom_buf, y_zeros);
+ bottom_buf += 32;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (height == 4) {
+ do {
+ const __m256i c0 = yy_loadu_256(cf);
+ const __m256i c1 = yy_loadu_256(cf + 8);
+ const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+ const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
+ const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+ const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ ls += 32;
+ cf += 16;
+ i += 4;
+ } while (i < width);
+ } else if (height == 8) {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ const __m128i res0 = _mm256_castsi256_si128(res);
+ const __m128i res1 = _mm256_extracti128_si256(res, 1);
+ xx_storel_64(ls, res0);
+ *(int32_t *)(ls + height) = 0;
+ xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+ *(int32_t *)(ls + height + stride) = 0;
+ xx_storel_64(ls + stride * 2, res1);
+ *(int32_t *)(ls + height + stride * 2) = 0;
+ xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+ *(int32_t *)(ls + height + stride * 3) = 0;
+ cf += 32;
+ ls += stride << 2;
+ i += 4;
+ } while (i < width);
+ } else if (height == 16) {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ xx_storeu_128(ls, _mm256_castsi256_si128(res));
+ xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+ cf += 32;
+ *(int32_t *)(ls + height) = 0;
+ *(int32_t *)(ls + stride + height) = 0;
+ ls += stride << 1;
+ i += 2;
+ } while (i < width);
+ } else {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ cf += 32;
+ *(int32_t *)(ls + height) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < width);
+ }
+}
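+
+// Layout note (a worked example, assuming TX_PAD_HOR == 4 as defined in
+// txb_common.h): for height == 16 the stride is 20, column c of the
+// transform block starts at levels[c * 20], and the 4 horizontal pad bytes
+// after each column are the ones cleared by the
+// `*(int32_t *)(ls + height) = 0` stores above.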
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 0000000000..d23a688747
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
+ level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
+ level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
+ level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
+ level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = _mm_loadu_si128((__m128i *)(src + 1));
+ level[1] = _mm_loadu_si128((__m128i *)(src + stride));
+ level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
+ level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
+ level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
+}
+
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
+ const __m128i const_3 = _mm_set1_epi8(3);
+ const __m128i const_4 = _mm_set1_epi8(4);
+ __m128i count;
+
+ count = _mm_min_epu8(level[0], const_3);
+ level[1] = _mm_min_epu8(level[1], const_3);
+ level[2] = _mm_min_epu8(level[2], const_3);
+ level[3] = _mm_min_epu8(level[3], const_3);
+ level[4] = _mm_min_epu8(level[4], const_3);
+ count = _mm_add_epi8(count, level[1]);
+ count = _mm_add_epi8(count, level[2]);
+ count = _mm_add_epi8(count, level[3]);
+ count = _mm_add_epi8(count, level[4]);
+ count = _mm_avg_epu8(count, _mm_setzero_si128());
+ count = _mm_min_epu8(count, const_4);
+ return count;
+}
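+
+// The kernel above computes, per coefficient,
+//   min((sum of the 5 clamped neighbor levels + 1) >> 1, 4),
+// since _mm_avg_epu8(x, 0) is (x + 1) >> 1. For example, neighbor levels
+// {3, 2, 1, 0, 5} clamp to {3, 2, 1, 0, 3}, sum to 9, and give
+// min((9 + 1) >> 1, 4) = min(5, 4) = 4.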
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *const coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(21);
+ __m128i pos_to_offset =
+ (width == 4)
+ ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
+ : _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21,
+ 21, 21);
+ __m128i count;
+ __m128i level[5];
+ int8_t *cc = coeff_contexts;
+ int col = width;
+
+ assert(!(width % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ cc += 16;
+ col -= 4;
+ } while (col);
+
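+  // The DC coefficient (position 0) always uses context 0, so overwrite
+  // whatever the vectorized pass stored there.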
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int col = width;
+
+ assert(!(width % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ col -= 4;
+ } while (col);
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int col = width;
+
+ assert(!(width % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ col -= 4;
+ } while (col);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int col = width;
+ __m128i count;
+ __m128i level[5];
+ __m128i pos_to_offset[3];
+
+ assert(!(width % 2));
+
+ if (width == 8) {
+ pos_to_offset[0] =
+ _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ } else if (width < 8) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21,
+ 21, 21, 21, 21, 21);
+ } else {
+ pos_to_offset[0] = _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ }
+ pos_to_offset[2] = _mm_set1_epi8(21);
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += 2 * stride;
+ cc += 16;
+ col -= 2;
+ } while (col);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int col = width;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(width % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ col -= 2;
+ } while (col);
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
+ int col = width;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(width % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ col -= 2;
+ } while (col);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+ const int real_width,
+ const int real_height,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int col = width;
+ __m128i pos_to_offset[5];
+ __m128i pos_to_offset_large[3];
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(height % 16));
+
+ pos_to_offset_large[2] = _mm_set1_epi8(21);
+ if (real_width == real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+ pos_to_offset_large[2];
+ } else if (real_width < real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
+ 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
+ pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+ } else { // real_width > real_height
+ pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[2];
+ pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(16);
+ }
+
+ do {
+ int h = height;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ levels += 16;
+ cc += 16;
+ h -= 16;
+ pos_to_offset[0] = pos_to_offset_large[0];
+ } while (h);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ pos_to_offset[2] = pos_to_offset[3];
+ pos_to_offset[3] = pos_to_offset[4];
+ pos_to_offset_large[0] = pos_to_offset_large[1];
+ pos_to_offset_large[1] = pos_to_offset_large[2];
+ levels += TX_PAD_HOR;
+ } while (--col);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+ const __m128i pos_to_offset_large =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int col = width;
+
+ assert(!(height % 16));
+
+ do {
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int h = height;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 16;
+ coeff_contexts += 16;
+ h -= 16;
+ } while (h);
+
+ levels += TX_PAD_HOR;
+ } while (--col);
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+ __m128i pos_to_offset[3];
+ __m128i count;
+ __m128i level[5];
+ int col = width;
+
+ assert(!(height % 16));
+
+ pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
+ pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
+ pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+
+ do {
+ int h = height;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 16;
+ coeff_contexts += 16;
+ h -= 16;
+ } while (h);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += TX_PAD_HOR;
+ } while (--col);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_sse2(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size,
+ const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int last_idx = eob - 1;
+ if (!last_idx) {
+ coeff_contexts[0] = 0;
+ return;
+ }
+
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const int stride = height + TX_PAD_HOR;
+ ptrdiff_t offsets[3];
+
+ /* coeff_contexts must be 16 byte aligned. */
+ assert(!((intptr_t)coeff_contexts & 0xf));
+
+ if (tx_class == TX_CLASS_2D) {
+ offsets[0] = 0 * stride + 2;
+ offsets[1] = 1 * stride + 1;
+ offsets[2] = 2 * stride + 0;
+
+    if (height == 4) {
+      get_4_nz_map_contexts_2d(levels, width, offsets, coeff_contexts);
+    } else if (height == 8) {
+      get_8_coeff_contexts_2d(levels, width, offsets, coeff_contexts);
+    } else {  // height >= 16
+      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+                                offsets, coeff_contexts);
+    }
+ } else if (tx_class == TX_CLASS_HORIZ) {
+ offsets[0] = 2 * stride;
+ offsets[1] = 3 * stride;
+ offsets[2] = 4 * stride;
+ if (height == 4) {
+ get_4_nz_map_contexts_hor(levels, width, offsets, coeff_contexts);
+ } else if (height == 8) {
+ get_8_coeff_contexts_hor(levels, width, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_hor(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ } else { // TX_CLASS_VERT
+ offsets[0] = 2;
+ offsets[1] = 3;
+ offsets[2] = 4;
+ if (height == 4) {
+ get_4_nz_map_contexts_ver(levels, width, offsets, coeff_contexts);
+ } else if (height == 8) {
+ get_8_coeff_contexts_ver(levels, width, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_ver(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ }
+
+ const int bhl = get_txb_bhl(tx_size);
+ const int pos = scan[last_idx];
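+  // (width << bhl) equals width * height, the number of coefficients in the
+  // block, so the last significant coefficient is classified by whether it
+  // lies in the first 1/8, the first 1/4, or the remainder of the scan
+  // (e.g. for a 16x16 block: last_idx <= 32 -> 1, <= 64 -> 2, else 3).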
+ if (last_idx <= (width << bhl) / 8)
+ coeff_contexts[pos] = 1;
+ else if (last_idx <= (width << bhl) / 4)
+ coeff_contexts[pos] = 2;
+ else
+ coeff_contexts[pos] = 3;
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
new file mode 100644
index 0000000000..72bd8e3411
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = height + TX_PAD_HOR;
+ const __m128i zeros = _mm_setzero_si128();
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf = levels + stride * width;
+ uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+ do {
+ _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+ bottom_buf += 16;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (height == 4) {
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+ const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+ xx_storeu_128(ls, lsAB);
+ ls += (stride << 1);
+ cf += (height << 1);
+ i += 2;
+ } while (i < width);
+ } else if (height == 8) {
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+ xx_storeu_128(ls, absAB8);
+ ls += stride;
+ cf += height;
+ i += 1;
+ } while (i < width);
+ } else {
+ do {
+ int j = 0;
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffC = xx_loadu_128(cf + 8);
+ const __m128i coeffD = xx_loadu_128(cf + 12);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absCD = _mm_abs_epi16(coeffCD);
+ const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+ xx_storeu_128(ls + j, absABCD);
+ j += 16;
+ cf += 16;
+ } while (j < height);
+ *(int32_t *)(ls + height) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < width);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 0000000000..57725d1795
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+ __m256i *c) {
+ const tran_low_t *addr = coeff + offset;
+
+ if (sizeof(tran_low_t) == 4) {
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+ const __m256i y = _mm256_packs_epi32(x0, x1);
+ *c = _mm256_permute4x64_epi64(y, 0xD8);
+ } else {
+ *c = _mm256_loadu_si256((const __m256i *)addr);
+ }
+}
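+
+// Note on the 32-bit branch above: _mm256_packs_epi32(x0, x1) packs per
+// 128-bit lane, producing 64-bit groups in the order {x0.lo, x1.lo, x0.hi,
+// x1.hi}, and _mm256_permute4x64_epi64(y, 0xD8) (lane order {0, 2, 1, 3})
+// restores the original coefficient order.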
+
+static INLINE void av1_block_error_num_coeff16_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256) {
+ const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error = _mm256_madd_epi16(diff, diff);
+ // r0+r1 r2+r3 | r0+r1 r2+r3 | r4+r5 r6+r7 | r4+r5 r6+r7
+ const __m256i error_hi = _mm256_hadd_epi32(error, error);
+ // r0+r1 | r2+r3 | r4+r5 | r6+r7
+ *sse_256 = _mm256_unpacklo_epi32(error_hi, _mm256_setzero_si256());
+}
+
+static INLINE void av1_block_error_num_coeff32_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16));
+ const __m256i _dqcoeff_1 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 16));
+
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0);
+ const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1);
+
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0);
+ const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1);
+ const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1);
+
+ // For extreme input values, the accumulation needs to happen in 64 bit
+ // precision to avoid any overflow.
+ const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero);
+ const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero);
+ const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo);
+ *sse_256 = _mm256_add_epi64(*sse_256, sum_temp_0);
+}
+
+static INLINE void av1_block_error_num_coeff64_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256,
+ intptr_t num_coeff) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int i = 0; i < num_coeff; i += 64) {
+ // Load 64 elements for coeff and dqcoeff.
+ const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16));
+ const __m256i _dqcoeff_1 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 16));
+ const __m256i _coeff_2 = _mm256_loadu_si256((const __m256i *)(coeff + 32));
+ const __m256i _dqcoeff_2 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 32));
+ const __m256i _coeff_3 = _mm256_loadu_si256((const __m256i *)(coeff + 48));
+ const __m256i _dqcoeff_3 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 48));
+
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0);
+ const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1);
+ const __m256i diff_2 = _mm256_sub_epi16(_dqcoeff_2, _coeff_2);
+ const __m256i diff_3 = _mm256_sub_epi16(_dqcoeff_3, _coeff_3);
+
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0);
+ const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1);
+ const __m256i error_2 = _mm256_madd_epi16(diff_2, diff_2);
+ const __m256i error_3 = _mm256_madd_epi16(diff_3, diff_3);
+ // r00 r01 r02 r03 r04 r05 r06 r07
+ const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1);
+ // r10 r11 r12 r13 r14 r15 r16 r17
+ const __m256i err_final_1 = _mm256_add_epi32(error_2, error_3);
+
+ // For extreme input values, the accumulation needs to happen in 64 bit
+ // precision to avoid any overflow. r00 r01 r04 r05
+ const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero);
+ // r02 r03 r06 r07
+ const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero);
+ // r10 r11 r14 r15
+ const __m256i exp1_error_lo = _mm256_unpacklo_epi32(err_final_1, zero);
+ // r12 r13 r16 r17
+ const __m256i exp1_error_hi = _mm256_unpackhi_epi32(err_final_1, zero);
+
+ const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo);
+ const __m256i sum_temp_1 = _mm256_add_epi64(exp1_error_hi, exp1_error_lo);
+ const __m256i sse_256_temp = _mm256_add_epi64(sum_temp_1, sum_temp_0);
+ *sse_256 = _mm256_add_epi64(*sse_256, sse_256_temp);
+ coeff += 64;
+ dqcoeff += 64;
+ }
+}
+
+int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+ intptr_t num_coeff) {
+ assert(num_coeff % 16 == 0);
+ __m256i sse_256 = _mm256_setzero_si256();
+ int64_t sse;
+
+ if (num_coeff == 16)
+ av1_block_error_num_coeff16_avx2(coeff, dqcoeff, &sse_256);
+ else if (num_coeff == 32)
+ av1_block_error_num_coeff32_avx2(coeff, dqcoeff, &sse_256);
+ else
+ av1_block_error_num_coeff64_avx2(coeff, dqcoeff, &sse_256, num_coeff);
+
+ // Save the higher 64 bit of each 128 bit lane.
+ const __m256i sse_hi = _mm256_srli_si256(sse_256, 8);
+ // Add the higher 64 bit to the low 64 bit.
+ sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+ // Accumulate the sse_256 register to get final sse
+ const __m128i sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+ _mm256_extractf128_si256(sse_256, 1));
+
+ // Store the results.
+ _mm_storel_epi64((__m128i *)&sse, sse_128);
+ return sse;
+}
+
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
+ __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+ __m256i sse_reg_64hi, ssz_reg_64hi;
+ __m128i sse_reg128, ssz_reg128;
+ int64_t sse;
+ int i;
+ const __m256i zero_reg = _mm256_setzero_si256();
+
+  // Initialize the sse and ssz accumulator registers to zero.
+ sse_reg = _mm256_setzero_si256();
+ ssz_reg = _mm256_setzero_si256();
+
+ for (i = 0; i < block_size; i += 16) {
+ // load 32 bytes from coeff and dqcoeff
+ read_coeff(coeff, i, &coeff_reg);
+ read_coeff(dqcoeff, i, &dqcoeff_reg);
+ // dqcoeff - coeff
+ dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
+ // madd (dqcoeff - coeff)
+ dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
+ // madd coeff
+ coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
+ // expand each double word of madd (dqcoeff - coeff) to quad word
+ exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
+ exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
+ // expand each double word of madd (coeff) to quad word
+ exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
+ exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
+ // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+ }
+ // save the higher 64 bit of each 128 bit lane
+ sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
+ ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
+ // add the higher 64 bit to the low 64 bit
+ sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
+
+ // add each 64 bit from each of the 128 bit lane of the 256 bit
+ sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
+ _mm256_extractf128_si256(sse_reg, 1));
+
+ ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
+ _mm256_extractf128_si256(ssz_reg, 1));
+
+ // store the results
+ _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
+
+ _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
+ _mm256_zeroupper();
+ return sse;
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/error_intrin_sse2.c
new file mode 100644
index 0000000000..61f65c623f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_sse2.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static AOM_INLINE __m128i reduce_sum_epi64(__m128i reg) {
+ __m128i reg_hi = _mm_srli_si128(reg, 8);
+ reg = _mm_add_epi64(reg, reg_hi);
+
+ return reg;
+}
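+
+// reduce_sum_epi64() folds [lo, hi] into [lo + hi, hi]; callers read the
+// final sum from the low 64 bits of the returned register.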
+
+int64_t av1_block_error_lp_sse2(const int16_t *coeff, const int16_t *dqcoeff,
+ intptr_t block_size) {
+ assert(block_size % 16 == 0);
+ assert(block_size >= 16);
+
+ const __m128i zero = _mm_setzero_si128();
+ __m128i accum_0 = zero;
+ __m128i accum_1 = zero;
+
+ for (int i = 0; i < block_size; i += 16) {
+    // Load 16 elements each for coeff and dqcoeff.
+ const __m128i _coeff_0 = _mm_loadu_si128((const __m128i *)coeff);
+ const __m128i _coeff_1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+ const __m128i _dqcoeff_0 = _mm_loadu_si128((const __m128i *)dqcoeff);
+ const __m128i _dqcoeff_1 = _mm_loadu_si128((const __m128i *)(dqcoeff + 8));
+ // Compute the diff
+ const __m128i diff_0 = _mm_sub_epi16(_dqcoeff_0, _coeff_0);
+ const __m128i diff_1 = _mm_sub_epi16(_dqcoeff_1, _coeff_1);
+ // Compute the error
+ const __m128i error_0 = _mm_madd_epi16(diff_0, diff_0);
+ const __m128i error_1 = _mm_madd_epi16(diff_1, diff_1);
+
+ const __m128i error_lo_0 = _mm_unpacklo_epi32(error_0, zero);
+ const __m128i error_lo_1 = _mm_unpacklo_epi32(error_1, zero);
+ const __m128i error_hi_0 = _mm_unpackhi_epi32(error_0, zero);
+ const __m128i error_hi_1 = _mm_unpackhi_epi32(error_1, zero);
+
+ // Accumulate
+ accum_0 = _mm_add_epi64(accum_0, error_lo_0);
+ accum_1 = _mm_add_epi64(accum_1, error_lo_1);
+ accum_0 = _mm_add_epi64(accum_0, error_hi_0);
+ accum_1 = _mm_add_epi64(accum_1, error_hi_1);
+
+ // Advance
+ coeff += 16;
+ dqcoeff += 16;
+ }
+
+ __m128i accum = _mm_add_epi64(accum_0, accum_1);
+ // Reduce sum the register
+ accum = reduce_sum_epi64(accum);
+
+ // Store the results.
+#if AOM_ARCH_X86_64
+ return _mm_cvtsi128_si64(accum);
+#else
+ int64_t result;
+ _mm_storel_epi64((__m128i *)&result, accum);
+ return result;
+#endif // AOM_ARCH_X86_64
+}
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
new file mode 100644
index 0000000000..6407c106ab
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -0,0 +1,88 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+; Increment %1 by sizeof() tran_low_t * %2.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+ lea %1, [%1 + %2 * 4]
+%endmacro
+
+; Load %2 + %3 into m%1.
+; %3 is the offset in elements, not bytes.
+; tran_low_t is 32 bits in AV1, so load two 128-bit chunks of 32-bit values
+; and pack them down to 16 bits.
+%macro LOAD_TRAN_LOW 3
+ mova m%1, [%2 + (%3) * 4]
+ packssdw m%1, [%2 + (%3) * 4 + 16]
+%endmacro
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+; int64_t *ssz)
+
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
+ pxor m5, m5 ; dedicated zero register
+.loop:
+ LOAD_TRAN_LOW 2, uqcq, 0
+ LOAD_TRAN_LOW 0, dqcq, 0
+ LOAD_TRAN_LOW 3, uqcq, 8
+ LOAD_TRAN_LOW 1, dqcq, 8
+ INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+ INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+ sub sizeq, 16
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
+ paddd m2, m3
+ ; accumulate in 64bit
+ punpckldq m7, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
+ paddq m4, m0
+ punpckhdq m2, m5
+ paddq m6, m7
+ paddq m6, m2
+ jg .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ movhlps m7, m6
+ paddq m4, m5
+ paddq m6, m7
+%if AOM_ARCH_X86_64
+ movq rax, m4
+ movq [sszq], m6
+%else
+ mov eax, sszm
+ pshufd m5, m4, 0x1
+ movq [eax], m6
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 0000000000..ebe75310e9
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+#define CALC_CRC(op, crc, type, buf, len) \
+ while ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
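+
+// For example, with len == 13 on x86-64: CALC_CRC consumes one uint64_t
+// (8 bytes), then one uint32_t (4 bytes), then one uint8_t, leaving len == 0.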
+
+/**
+ * Calculates the 32-bit CRC (CRC-32C) for the input buffer.
+ * The polynomial is 0x11EDC6F41.
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+ size_t len) {
+ (void)crc_calculator;
+ const uint8_t *buf = p;
+ uint32_t crc = 0xFFFFFFFF;
+
+ // Align the input to the word boundary
+ for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) {
+ crc = _mm_crc32_u8(crc, *buf);
+ }
+
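+  // The pointer is now 8-byte aligned: fold 8 bytes at a time on x86-64,
+  // then mop up the tail with 4-, 2- and 1-byte steps.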
+#ifdef __x86_64__
+ uint64_t crc64 = crc;
+ CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len)
+ crc = (uint32_t)crc64;
+#endif
+ CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len)
+ CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len)
+ CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len)
+ return (crc ^ 0xFFFFFFFF);
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
new file mode 100644
index 0000000000..340307cb3e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include "aom/aom_integer.h"
+#include "av1/common/common.h"
+#include "config/av1_rtcd.h"
+
+int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i;
+ int64_t temp1[8];
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
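+  // Coefficients at bit depth bps carry a 2^(bps - 8) scale relative to
+  // 8-bit, so the squared sums are rescaled at the end by a rounded right
+  // shift of 2 * (bps - 8) bits (e.g. bps == 10: shift == 4, rounding == 8).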
+
+ for (i = 0; i < block_size; i += 16) {
+ __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i));
+ __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8));
+ __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i));
+ __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8));
+
+ __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff);
+ __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2);
+ __m256i diff1h = _mm256_srli_epi64(diff1, 32);
+ __m256i diff2h = _mm256_srli_epi64(diff2, 32);
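+    // _mm256_mul_epi32 multiplies only the even 32-bit lanes, so the 32-bit
+    // shifts above move the odd lanes into even position; the two products
+    // per register square all eight elements with 64-bit results.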
+ __m256i res = _mm256_mul_epi32(diff1, diff1);
+ __m256i res1 = _mm256_mul_epi32(diff1h, diff1h);
+ __m256i res2 = _mm256_mul_epi32(diff2, diff2);
+ __m256i res3 = _mm256_mul_epi32(diff2h, diff2h);
+ __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+ _mm256_add_epi64(res2, res3));
+ __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32);
+ __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32);
+ res = _mm256_mul_epi32(mm256_coeff, mm256_coeff);
+ res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh);
+ res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2);
+ res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2);
+ __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+ _mm256_add_epi64(res2, res3));
+ _mm256_storeu_si256((__m256i *)temp1, res_diff);
+ _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff);
+
+ error += temp1[0] + temp1[1] + temp1[2] + temp1[3];
+ sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000000..b0b2757568
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "av1/common/common.h"
+#include "config/av1_rtcd.h"
+
+int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i, j, test;
+ uint32_t temp[4];
+ __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ for (i = 0; i < block_size; i += 8) {
+ // Load the data into xmm registers
+ __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
+ __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
+ __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
+ __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
+    // Check if any values require more than 15 bits
+ max = _mm_set1_epi32(0x3fff);
+ min = _mm_set1_epi32((int)0xffffc000);
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+ _mm_cmplt_epi32(mm_coeff, min));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+ _mm_cmplt_epi32(mm_coeff2, min));
+ cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+ _mm_cmplt_epi32(mm_dqcoeff, min));
+ cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+ _mm_cmplt_epi32(mm_dqcoeff2, min));
+ test = _mm_movemask_epi8(
+ _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));
+
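+    // If every value fits in [-0x4000, 0x3fff], the 16-bit pack/madd path
+    // below is exact; otherwise fall back to the scalar 64-bit loop.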
+ if (!test) {
+ __m128i mm_diff, error_sse2, sqcoeff_sse2;
+ mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+ mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+ mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+ error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+ sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+ _mm_storeu_si128((__m128i *)temp, error_sse2);
+ error = error + temp[0] + temp[1] + temp[2] + temp[3];
+ _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
+ sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+ } else {
+ for (j = 0; j < 8; j++) {
+ const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+ }
+ }
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
new file mode 100644
index 0000000000..9cdf21fc7c
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -0,0 +1,3132 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h> /*AVX2*/
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i out1[8];
+ if (!flipud) {
+ out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ } else {
+ out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ }
+ if (!fliplr) {
+ out[0] = _mm256_cvtepi16_epi32(out1[0]);
+ out[1] = _mm256_cvtepi16_epi32(out1[1]);
+ out[2] = _mm256_cvtepi16_epi32(out1[2]);
+ out[3] = _mm256_cvtepi16_epi32(out1[3]);
+ out[4] = _mm256_cvtepi16_epi32(out1[4]);
+ out[5] = _mm256_cvtepi16_epi32(out1[5]);
+ out[6] = _mm256_cvtepi16_epi32(out1[6]);
+ out[7] = _mm256_cvtepi16_epi32(out1[7]);
+
+ } else {
+ out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0]));
+ out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1]));
+ out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2]));
+ out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3]));
+ out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4]));
+ out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5]));
+ out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6]));
+ out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7]));
+ }
+ out[0] = _mm256_slli_epi32(out[0], shift);
+ out[1] = _mm256_slli_epi32(out[1], shift);
+ out[2] = _mm256_slli_epi32(out[2], shift);
+ out[3] = _mm256_slli_epi32(out[3], shift);
+ out[4] = _mm256_slli_epi32(out[4], shift);
+ out[5] = _mm256_slli_epi32(out[5], shift);
+ out[6] = _mm256_slli_epi32(out[6], shift);
+ out[7] = _mm256_slli_epi32(out[7], shift);
+}
+static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm256_add_epi32(in[0], rounding);
+ in[1] = _mm256_add_epi32(in[1], rounding);
+ in[2] = _mm256_add_epi32(in[2], rounding);
+ in[3] = _mm256_add_epi32(in[3], rounding);
+ in[4] = _mm256_add_epi32(in[4], rounding);
+ in[5] = _mm256_add_epi32(in[5], rounding);
+ in[6] = _mm256_add_epi32(in[6], rounding);
+ in[7] = _mm256_add_epi32(in[7], rounding);
+
+ in[0] = _mm256_srai_epi32(in[0], shift);
+ in[1] = _mm256_srai_epi32(in[1], shift);
+ in[2] = _mm256_srai_epi32(in[2], shift);
+ in[3] = _mm256_srai_epi32(in[3], shift);
+ in[4] = _mm256_srai_epi32(in[4], shift);
+ in[5] = _mm256_srai_epi32(in[5], shift);
+ in[6] = _mm256_srai_epi32(in[6], shift);
+ in[7] = _mm256_srai_epi32(in[7], shift);
+}
+static INLINE void load_buffer_8x16_avx2(const int16_t *input, __m256i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 8 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+ load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift);
+}
+static INLINE void load_buffer_16xn_avx2(const int16_t *input, __m256i *out,
+ int stride, int height, int outstride,
+ int flipud, int fliplr) {
+ __m256i out1[64];
+ if (!flipud) {
+ for (int i = 0; i < height; i++) {
+ out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride));
+ }
+ } else {
+ for (int i = 0; i < height; i++) {
+ out1[(height - 1) - i] =
+ _mm256_loadu_si256((const __m256i *)(input + i * stride));
+ }
+ }
+ if (!fliplr) {
+ for (int i = 0; i < height; i++) {
+ out[i * outstride] =
+ _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i]));
+ out[i * outstride + 1] =
+ _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1));
+ }
+ } else {
+ for (int i = 0; i < height; i++) {
+ out[i * outstride + 1] = _mm256_cvtepi16_epi32(
+ mm_reverse_epi16(_mm256_castsi256_si128(out1[i])));
+ out[i * outstride + 0] = _mm256_cvtepi16_epi32(
+ mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1)));
+ }
+ }
+}
+
+static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out,
+ const int instride,
+ const int outstride) {
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i x0, x1;
+
+ u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]);
+ u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]);
+
+ u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]);
+ u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]);
+
+ u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]);
+ u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]);
+
+ u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]);
+ u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]);
+
+ x0 = _mm256_unpacklo_epi64(u0, u2);
+ x1 = _mm256_unpacklo_epi64(u4, u6);
+ out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u0, u2);
+ x1 = _mm256_unpackhi_epi64(u4, u6);
+ out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpacklo_epi64(u1, u3);
+ x1 = _mm256_unpacklo_epi64(u5, u7);
+ out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u1, u3);
+ x1 = _mm256_unpackhi_epi64(u5, u7);
+ out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
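+// Rounded shift helper: a negative `bit` applies a rounding right shift by
+// -bit (adding 1 << (-bit - 1) first); a positive `bit` is a plain left
+// shift. `stride` selects every stride-th register in `in`.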
+static INLINE void round_shift_32_8xn_avx2(__m256i *in, int size, int bit,
+ int stride) {
+ if (bit < 0) {
+ bit = -bit;
+ __m256i round = _mm256_set1_epi32(1 << (bit - 1));
+ for (int i = 0; i < size; ++i) {
+ in[stride * i] = _mm256_add_epi32(in[stride * i], round);
+ in[stride * i] = _mm256_srai_epi32(in[stride * i], bit);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[stride * i] = _mm256_slli_epi32(in[stride * i], bit);
+ }
+ }
+}
+static INLINE void store_buffer_avx2(const __m256i *const in, int32_t *out,
+ const int stride, const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm256_store_si256((__m256i *)(out), in[i]);
+ out += stride;
+ }
+}
+static INLINE void fwd_txfm_transpose_16x16_avx2(const __m256i *in,
+ __m256i *out) {
+ fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2);
+ fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2);
+ fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2);
+ fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2);
+}
+
+static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *w1, const __m256i *n1,
+ const __m256i *rounding, int bit) {
+ __m256i x, y;
+
+ x = _mm256_mullo_epi32(*w0, *n0);
+ y = _mm256_mullo_epi32(*w1, *n1);
+ x = _mm256_add_epi32(x, y);
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
+}
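+// av1_half_btf_avx2 computes one half-butterfly per 32-bit lane:
+//   (*w0 * *n0 + *w1 * *n1 + (1 << (bit - 1))) >> bit
+// i.e. a rounded fixed-point rotation step shared by the transforms below.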
+#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m256i ww0 = _mm256_set1_epi32(w0); \
+ const __m256i ww1 = _mm256_set1_epi32(w1); \
+ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
+ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
+ out0 = _mm256_add_epi32(in0_w0, in1_w1); \
+ round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \
+ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
+ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
+ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
+ round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \
+ } while (0)
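+
+// btf_32_avx2_type0 is a full butterfly: out0 = w0*in0 + w1*in1 and
+// out1 = w1*in0 - w0*in1, each rounded and right-shifted by `bit` (hence
+// the -bit passed to round_shift_32_8xn_avx2).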
+
+#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
+ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
+ out0 = _mm256_add_epi32(in0_w0, in1_w1); \
+ out0 = _mm256_add_epi32(out0, r); \
+ out0 = _mm256_srai_epi32(out0, bit); \
+ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
+ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
+ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
+ out1 = _mm256_add_epi32(out1, r); \
+ out1 = _mm256_srai_epi32(out1, bit); \
+ } while (0)
+
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out,
+ const int8_t cos_bit, int instride,
+ int outstride);
+static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int col_num, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ __m256i u[8], v[8];
+ for (int col = 0; col < col_num; ++col) {
+ u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]);
+ v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]);
+ u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]);
+ u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]);
+ u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]);
+ u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]);
+ u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]);
+ v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]);
+ v[0] = _mm256_add_epi32(u[0], u[3]);
+ v[3] = _mm256_sub_epi32(u[0], u[3]);
+ v[1] = _mm256_add_epi32(u[1], u[2]);
+ v[2] = _mm256_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm256_mullo_epi32(u[5], cospim32);
+ v[6] = _mm256_mullo_epi32(u[6], cospi32);
+ v[5] = _mm256_add_epi32(v[5], v[6]);
+ v[5] = _mm256_add_epi32(v[5], rnding);
+ v[5] = _mm256_srai_epi32(v[5], bit);
+
+ u[0] = _mm256_mullo_epi32(u[5], cospi32);
+ v[6] = _mm256_mullo_epi32(u[6], cospim32);
+ v[6] = _mm256_sub_epi32(u[0], v[6]);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm256_mullo_epi32(v[0], cospi32);
+ v[1] = _mm256_mullo_epi32(v[1], cospi32);
+ u[0] = _mm256_add_epi32(v[0], v[1]);
+ u[0] = _mm256_add_epi32(u[0], rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+
+ u[1] = _mm256_sub_epi32(v[0], v[1]);
+ u[1] = _mm256_add_epi32(u[1], rnding);
+ u[1] = _mm256_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm256_mullo_epi32(v[2], cospi48);
+ v[1] = _mm256_mullo_epi32(v[3], cospi16);
+ u[2] = _mm256_add_epi32(v[0], v[1]);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ v[0] = _mm256_mullo_epi32(v[2], cospi16);
+ v[1] = _mm256_mullo_epi32(v[3], cospi48);
+ u[3] = _mm256_sub_epi32(v[1], v[0]);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ u[4] = _mm256_add_epi32(v[4], v[5]);
+ u[5] = _mm256_sub_epi32(v[4], v[5]);
+ u[6] = _mm256_sub_epi32(v[7], v[6]);
+ u[7] = _mm256_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm256_mullo_epi32(u[4], cospi56);
+ v[1] = _mm256_mullo_epi32(u[7], cospi8);
+ v[0] = _mm256_add_epi32(v[0], v[1]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm256_mullo_epi32(u[4], cospi8);
+ v[1] = _mm256_mullo_epi32(u[7], cospi56);
+ v[0] = _mm256_sub_epi32(v[1], v[0]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm256_mullo_epi32(u[5], cospi24);
+ v[1] = _mm256_mullo_epi32(u[6], cospi40);
+ v[0] = _mm256_add_epi32(v[0], v[1]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm256_mullo_epi32(u[5], cospi40);
+ v[1] = _mm256_mullo_epi32(u[6], cospi24);
+ v[0] = _mm256_sub_epi32(v[1], v[0]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0 * outstride + col] = u[0]; // buf0[0]
+ out[4 * outstride + col] = u[1]; // buf0[1]
+ out[2 * outstride + col] = u[2]; // buf0[2]
+ out[6 * outstride + col] = u[3]; // buf0[3]
+ }
+}
+static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                        const int col_num, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m256i x, y;
+ for (int col = 0; col < col_num; ++col) {
+ u0 = in[0 * col_num + col];
+ u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]);
+ u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]);
+ u3 = in[4 * col_num + col];
+ u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]);
+ u5 = in[6 * col_num + col];
+ u6 = in[2 * col_num + col];
+ u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]);
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm256_mullo_epi32(u2, cospi32);
+ y = _mm256_mullo_epi32(u3, cospi32);
+ v2 = _mm256_add_epi32(x, y);
+ v2 = _mm256_add_epi32(v2, rnding);
+ v2 = _mm256_srai_epi32(v2, bit);
+
+ v3 = _mm256_sub_epi32(x, y);
+ v3 = _mm256_add_epi32(v3, rnding);
+ v3 = _mm256_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm256_mullo_epi32(u6, cospi32);
+ y = _mm256_mullo_epi32(u7, cospi32);
+ v6 = _mm256_add_epi32(x, y);
+ v6 = _mm256_add_epi32(v6, rnding);
+ v6 = _mm256_srai_epi32(v6, bit);
+
+ v7 = _mm256_sub_epi32(x, y);
+ v7 = _mm256_add_epi32(v7, rnding);
+ v7 = _mm256_srai_epi32(v7, bit);
+
+ // stage 3
+ u0 = _mm256_add_epi32(v0, v2);
+ u1 = _mm256_add_epi32(v1, v3);
+ u2 = _mm256_sub_epi32(v0, v2);
+ u3 = _mm256_sub_epi32(v1, v3);
+ u4 = _mm256_add_epi32(v4, v6);
+ u5 = _mm256_add_epi32(v5, v7);
+ u6 = _mm256_sub_epi32(v4, v6);
+ u7 = _mm256_sub_epi32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm256_mullo_epi32(u4, cospi16);
+ y = _mm256_mullo_epi32(u5, cospi48);
+ v4 = _mm256_add_epi32(x, y);
+ v4 = _mm256_add_epi32(v4, rnding);
+ v4 = _mm256_srai_epi32(v4, bit);
+
+ x = _mm256_mullo_epi32(u4, cospi48);
+ y = _mm256_mullo_epi32(u5, cospim16);
+ v5 = _mm256_add_epi32(x, y);
+ v5 = _mm256_add_epi32(v5, rnding);
+ v5 = _mm256_srai_epi32(v5, bit);
+
+ x = _mm256_mullo_epi32(u6, cospim48);
+ y = _mm256_mullo_epi32(u7, cospi16);
+ v6 = _mm256_add_epi32(x, y);
+ v6 = _mm256_add_epi32(v6, rnding);
+ v6 = _mm256_srai_epi32(v6, bit);
+
+ x = _mm256_mullo_epi32(u6, cospi16);
+ y = _mm256_mullo_epi32(u7, cospi48);
+ v7 = _mm256_add_epi32(x, y);
+ v7 = _mm256_add_epi32(v7, rnding);
+ v7 = _mm256_srai_epi32(v7, bit);
+
+ // stage 5
+ u0 = _mm256_add_epi32(v0, v4);
+ u1 = _mm256_add_epi32(v1, v5);
+ u2 = _mm256_add_epi32(v2, v6);
+ u3 = _mm256_add_epi32(v3, v7);
+ u4 = _mm256_sub_epi32(v0, v4);
+ u5 = _mm256_sub_epi32(v1, v5);
+ u6 = _mm256_sub_epi32(v2, v6);
+ u7 = _mm256_sub_epi32(v3, v7);
+
+ // stage 6
+ x = _mm256_mullo_epi32(u0, cospi4);
+ y = _mm256_mullo_epi32(u1, cospi60);
+ v0 = _mm256_add_epi32(x, y);
+ v0 = _mm256_add_epi32(v0, rnding);
+ v0 = _mm256_srai_epi32(v0, bit);
+
+ x = _mm256_mullo_epi32(u0, cospi60);
+ y = _mm256_mullo_epi32(u1, cospim4);
+ v1 = _mm256_add_epi32(x, y);
+ v1 = _mm256_add_epi32(v1, rnding);
+ v1 = _mm256_srai_epi32(v1, bit);
+
+ x = _mm256_mullo_epi32(u2, cospi20);
+ y = _mm256_mullo_epi32(u3, cospi44);
+ v2 = _mm256_add_epi32(x, y);
+ v2 = _mm256_add_epi32(v2, rnding);
+ v2 = _mm256_srai_epi32(v2, bit);
+
+ x = _mm256_mullo_epi32(u2, cospi44);
+ y = _mm256_mullo_epi32(u3, cospim20);
+ v3 = _mm256_add_epi32(x, y);
+ v3 = _mm256_add_epi32(v3, rnding);
+ v3 = _mm256_srai_epi32(v3, bit);
+
+ x = _mm256_mullo_epi32(u4, cospi36);
+ y = _mm256_mullo_epi32(u5, cospi28);
+ v4 = _mm256_add_epi32(x, y);
+ v4 = _mm256_add_epi32(v4, rnding);
+ v4 = _mm256_srai_epi32(v4, bit);
+
+ x = _mm256_mullo_epi32(u4, cospi28);
+ y = _mm256_mullo_epi32(u5, cospim36);
+ v5 = _mm256_add_epi32(x, y);
+ v5 = _mm256_add_epi32(v5, rnding);
+ v5 = _mm256_srai_epi32(v5, bit);
+
+ x = _mm256_mullo_epi32(u6, cospi52);
+ y = _mm256_mullo_epi32(u7, cospi12);
+ v6 = _mm256_add_epi32(x, y);
+ v6 = _mm256_add_epi32(v6, rnding);
+ v6 = _mm256_srai_epi32(v6, bit);
+
+ x = _mm256_mullo_epi32(u6, cospi12);
+ y = _mm256_mullo_epi32(u7, cospim52);
+ v7 = _mm256_add_epi32(x, y);
+ v7 = _mm256_add_epi32(v7, rnding);
+ v7 = _mm256_srai_epi32(v7, bit);
+
+ // stage 7
+    out[0 * outstride + col] = v1;
+    out[1 * outstride + col] = v6;
+    out[2 * outstride + col] = v3;
+    out[3 * outstride + col] = v4;
+    out[4 * outstride + col] = v5;
+    out[5 * outstride + col] = v2;
+    out[6 * outstride + col] = v7;
+    out[7 * outstride + col] = v0;
+ }
+}
+static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num,
+ int outstride) {
+ (void)bit;
+ (void)outstride;
+ int num_iters = 8 * col_num;
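+  // The 8-point identity transform scales by exactly 2; x + x avoids a
+  // multiply.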
+ for (int i = 0; i < num_iters; i += 8) {
+ out[i] = _mm256_add_epi32(in[i], in[i]);
+ out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]);
+ out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]);
+ out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]);
+ out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]);
+ out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]);
+ out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]);
+ out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]);
+ }
+}
+void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[8], out[8];
+ const TX_SIZE tx_size = TX_8X8;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int width = tx_size_wide[tx_size];
+ const int width_div8 = (width >> 3);
+
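+  // 2-D pipeline for every tx_type below: load (applying shift[0]), column
+  // transform, rounding by -shift[1], transpose, row transform, store.
+  // shift[2] is zero for TX_8X8, so no final rounding pass appears here.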
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case ADST_DCT:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case ADST_ADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case IDTX:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case V_DCT:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case H_DCT:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case V_ADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case H_ADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int col_num, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ __m256i u[16], v[16], x;
+ int col;
+
+  // Transform col_num register-wide columns (eight 32-bit lanes each)
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2
+ v[0] = _mm256_add_epi32(u[0], u[7]);
+ v[7] = _mm256_sub_epi32(u[0], u[7]);
+ v[1] = _mm256_add_epi32(u[1], u[6]);
+ v[6] = _mm256_sub_epi32(u[1], u[6]);
+ v[2] = _mm256_add_epi32(u[2], u[5]);
+ v[5] = _mm256_sub_epi32(u[2], u[5]);
+ v[3] = _mm256_add_epi32(u[3], u[4]);
+ v[4] = _mm256_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm256_mullo_epi32(u[10], cospim32);
+ x = _mm256_mullo_epi32(u[13], cospi32);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospi32);
+ x = _mm256_mullo_epi32(u[13], cospim32);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = _mm256_mullo_epi32(u[11], cospim32);
+ x = _mm256_mullo_epi32(u[12], cospi32);
+ v[11] = _mm256_add_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(u[11], cospi32);
+ x = _mm256_mullo_epi32(u[12], cospim32);
+ v[12] = _mm256_sub_epi32(v[12], x);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ u[0] = _mm256_add_epi32(v[0], v[3]);
+ u[3] = _mm256_sub_epi32(v[0], v[3]);
+ u[1] = _mm256_add_epi32(v[1], v[2]);
+ u[2] = _mm256_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm256_mullo_epi32(v[5], cospim32);
+ x = _mm256_mullo_epi32(v[6], cospi32);
+ u[5] = _mm256_add_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(v[5], cospi32);
+ x = _mm256_mullo_epi32(v[6], cospim32);
+ u[6] = _mm256_sub_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm256_add_epi32(v[8], v[11]);
+ u[11] = _mm256_sub_epi32(v[8], v[11]);
+ u[9] = _mm256_add_epi32(v[9], v[10]);
+ u[10] = _mm256_sub_epi32(v[9], v[10]);
+ u[12] = _mm256_sub_epi32(v[15], v[12]);
+ u[15] = _mm256_add_epi32(v[15], v[12]);
+ u[13] = _mm256_sub_epi32(v[14], v[13]);
+ u[14] = _mm256_add_epi32(v[14], v[13]);
+
+ // stage 4
+ u[0] = _mm256_mullo_epi32(u[0], cospi32);
+ u[1] = _mm256_mullo_epi32(u[1], cospi32);
+ v[0] = _mm256_add_epi32(u[0], u[1]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ v[1] = _mm256_sub_epi32(u[0], u[1]);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ v[2] = _mm256_mullo_epi32(u[2], cospi48);
+ x = _mm256_mullo_epi32(u[3], cospi16);
+ v[2] = _mm256_add_epi32(v[2], x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_mullo_epi32(u[2], cospi16);
+ x = _mm256_mullo_epi32(u[3], cospi48);
+ v[3] = _mm256_sub_epi32(x, v[3]);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = _mm256_add_epi32(u[4], u[5]);
+ v[5] = _mm256_sub_epi32(u[4], u[5]);
+ v[6] = _mm256_sub_epi32(u[7], u[6]);
+ v[7] = _mm256_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm256_mullo_epi32(u[9], cospim16);
+ x = _mm256_mullo_epi32(u[14], cospi48);
+ v[9] = _mm256_add_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[14] = _mm256_mullo_epi32(u[9], cospi48);
+ x = _mm256_mullo_epi32(u[14], cospim16);
+ v[14] = _mm256_sub_epi32(v[14], x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[10] = _mm256_mullo_epi32(u[10], cospim48);
+ x = _mm256_mullo_epi32(u[13], cospim16);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospim16);
+ x = _mm256_mullo_epi32(u[13], cospim48);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm256_mullo_epi32(v[4], cospi56);
+ x = _mm256_mullo_epi32(v[7], cospi8);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ u[7] = _mm256_mullo_epi32(v[4], cospi8);
+ x = _mm256_mullo_epi32(v[7], cospi56);
+ u[7] = _mm256_sub_epi32(x, u[7]);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ u[5] = _mm256_mullo_epi32(v[5], cospi24);
+ x = _mm256_mullo_epi32(v[6], cospi40);
+ u[5] = _mm256_add_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(v[5], cospi40);
+ x = _mm256_mullo_epi32(v[6], cospi24);
+ u[6] = _mm256_sub_epi32(x, u[6]);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[8] = _mm256_add_epi32(v[8], v[9]);
+ u[9] = _mm256_sub_epi32(v[8], v[9]);
+ u[10] = _mm256_sub_epi32(v[11], v[10]);
+ u[11] = _mm256_add_epi32(v[11], v[10]);
+ u[12] = _mm256_add_epi32(v[12], v[13]);
+ u[13] = _mm256_sub_epi32(v[12], v[13]);
+ u[14] = _mm256_sub_epi32(v[15], v[14]);
+ u[15] = _mm256_add_epi32(v[15], v[14]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm256_mullo_epi32(u[8], cospi60);
+ x = _mm256_mullo_epi32(u[15], cospi4);
+ v[8] = _mm256_add_epi32(v[8], x);
+ v[8] = _mm256_add_epi32(v[8], rnding);
+ v[8] = _mm256_srai_epi32(v[8], bit);
+
+ v[15] = _mm256_mullo_epi32(u[8], cospi4);
+ x = _mm256_mullo_epi32(u[15], cospi60);
+ v[15] = _mm256_sub_epi32(x, v[15]);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ v[9] = _mm256_mullo_epi32(u[9], cospi28);
+ x = _mm256_mullo_epi32(u[14], cospi36);
+ v[9] = _mm256_add_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[14] = _mm256_mullo_epi32(u[9], cospi36);
+ x = _mm256_mullo_epi32(u[14], cospi28);
+ v[14] = _mm256_sub_epi32(x, v[14]);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[10] = _mm256_mullo_epi32(u[10], cospi44);
+ x = _mm256_mullo_epi32(u[13], cospi20);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospi20);
+ x = _mm256_mullo_epi32(u[13], cospi44);
+ v[13] = _mm256_sub_epi32(x, v[13]);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = _mm256_mullo_epi32(u[11], cospi12);
+ x = _mm256_mullo_epi32(u[12], cospi52);
+ v[11] = _mm256_add_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(u[11], cospi52);
+ x = _mm256_mullo_epi32(u[12], cospi12);
+ v[12] = _mm256_sub_epi32(x, v[12]);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ out[0 * outstride + col] = v[0];
+ out[1 * outstride + col] = v[8];
+ out[2 * outstride + col] = v[4];
+ out[3 * outstride + col] = v[12];
+ out[4 * outstride + col] = v[2];
+ out[5 * outstride + col] = v[10];
+ out[6 * outstride + col] = v[6];
+ out[7 * outstride + col] = v[14];
+ out[8 * outstride + col] = v[1];
+ out[9 * outstride + col] = v[9];
+ out[10 * outstride + col] = v[5];
+ out[11 * outstride + col] = v[13];
+ out[12 * outstride + col] = v[3];
+ out[13 * outstride + col] = v[11];
+ out[14 * outstride + col] = v[7];
+ out[15 * outstride + col] = v[15];
+ }
+}
+static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int num_cols, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i zero = _mm256_setzero_si256();
+
+ __m256i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < num_cols; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm256_mullo_epi32(u[2], cospi32);
+ y = _mm256_mullo_epi32(u[3], cospi32);
+ v[2] = _mm256_add_epi32(x, y);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_sub_epi32(x, y);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm256_mullo_epi32(u[6], cospi32);
+ y = _mm256_mullo_epi32(u[7], cospi32);
+ v[6] = _mm256_add_epi32(x, y);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_sub_epi32(x, y);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm256_mullo_epi32(u[10], cospi32);
+ y = _mm256_mullo_epi32(u[11], cospi32);
+ v[10] = _mm256_add_epi32(x, y);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_sub_epi32(x, y);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm256_mullo_epi32(u[14], cospi32);
+ y = _mm256_mullo_epi32(u[15], cospi32);
+ v[14] = _mm256_add_epi32(x, y);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_sub_epi32(x, y);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm256_add_epi32(v[0], v[2]);
+ u[1] = _mm256_add_epi32(v[1], v[3]);
+ u[2] = _mm256_sub_epi32(v[0], v[2]);
+ u[3] = _mm256_sub_epi32(v[1], v[3]);
+ u[4] = _mm256_add_epi32(v[4], v[6]);
+ u[5] = _mm256_add_epi32(v[5], v[7]);
+ u[6] = _mm256_sub_epi32(v[4], v[6]);
+ u[7] = _mm256_sub_epi32(v[5], v[7]);
+ u[8] = _mm256_add_epi32(v[8], v[10]);
+ u[9] = _mm256_add_epi32(v[9], v[11]);
+ u[10] = _mm256_sub_epi32(v[8], v[10]);
+ u[11] = _mm256_sub_epi32(v[9], v[11]);
+ u[12] = _mm256_add_epi32(v[12], v[14]);
+ u[13] = _mm256_add_epi32(v[13], v[15]);
+ u[14] = _mm256_sub_epi32(v[12], v[14]);
+ u[15] = _mm256_sub_epi32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] =
+ av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] =
+ av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+
+ // stage 5
+ u[0] = _mm256_add_epi32(v[0], v[4]);
+ u[1] = _mm256_add_epi32(v[1], v[5]);
+ u[2] = _mm256_add_epi32(v[2], v[6]);
+ u[3] = _mm256_add_epi32(v[3], v[7]);
+ u[4] = _mm256_sub_epi32(v[0], v[4]);
+ u[5] = _mm256_sub_epi32(v[1], v[5]);
+ u[6] = _mm256_sub_epi32(v[2], v[6]);
+ u[7] = _mm256_sub_epi32(v[3], v[7]);
+ u[8] = _mm256_add_epi32(v[8], v[12]);
+ u[9] = _mm256_add_epi32(v[9], v[13]);
+ u[10] = _mm256_add_epi32(v[10], v[14]);
+ u[11] = _mm256_add_epi32(v[11], v[15]);
+ u[12] = _mm256_sub_epi32(v[8], v[12]);
+ u[13] = _mm256_sub_epi32(v[9], v[13]);
+ u[14] = _mm256_sub_epi32(v[10], v[14]);
+ u[15] = _mm256_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] =
+ av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] =
+ av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+ // stage 7
+ u[0] = _mm256_add_epi32(v[0], v[8]);
+ u[1] = _mm256_add_epi32(v[1], v[9]);
+ u[2] = _mm256_add_epi32(v[2], v[10]);
+ u[3] = _mm256_add_epi32(v[3], v[11]);
+ u[4] = _mm256_add_epi32(v[4], v[12]);
+ u[5] = _mm256_add_epi32(v[5], v[13]);
+ u[6] = _mm256_add_epi32(v[6], v[14]);
+ u[7] = _mm256_add_epi32(v[7], v[15]);
+ u[8] = _mm256_sub_epi32(v[0], v[8]);
+ u[9] = _mm256_sub_epi32(v[1], v[9]);
+ u[10] = _mm256_sub_epi32(v[2], v[10]);
+ u[11] = _mm256_sub_epi32(v[3], v[11]);
+ u[12] = _mm256_sub_epi32(v[4], v[12]);
+ u[13] = _mm256_sub_epi32(v[5], v[13]);
+ u[14] = _mm256_sub_epi32(v[6], v[14]);
+ u[15] = _mm256_sub_epi32(v[7], v[15]);
+
+ // stage 8
+ v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] =
+ av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] =
+ av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+
+ // stage 9
+ out[0 * outstride + col] = v[1];
+ out[1 * outstride + col] = v[14];
+ out[2 * outstride + col] = v[3];
+ out[3 * outstride + col] = v[12];
+ out[4 * outstride + col] = v[5];
+ out[5 * outstride + col] = v[10];
+ out[6 * outstride + col] = v[7];
+ out[7 * outstride + col] = v[8];
+ out[8 * outstride + col] = v[9];
+ out[9 * outstride + col] = v[6];
+ out[10 * outstride + col] = v[11];
+ out[11 * outstride + col] = v[4];
+ out[12 * outstride + col] = v[13];
+ out[13 * outstride + col] = v[2];
+ out[14 * outstride + col] = v[15];
+ out[15 * outstride + col] = v[0];
+ }
+}
+static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ int col_num, const int outstride) {
+ (void)bit;
+ (void)outstride;
+ __m256i fact = _mm256_set1_epi32(2 * NewSqrt2);
+ __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m256i a_low;
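+  // The 16-point identity transform scales by 2 * sqrt(2), applied in
+  // Q(NewSqrt2Bits) fixed point: (x * 2 * NewSqrt2 + offset) >> NewSqrt2Bits.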
+
+ int num_iters = 16 * col_num;
+ for (int i = 0; i < num_iters; i++) {
+ a_low = _mm256_mullo_epi32(in[i], fact);
+ a_low = _mm256_add_epi32(a_low, offset);
+ out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits);
+ }
+}
+static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16_avx2, // DCT_DCT
+ fadst16_avx2, // ADST_DCT
+ fdct16_avx2, // DCT_ADST
+ fadst16_avx2, // ADST_ADST
+ fadst16_avx2, // FLIPADST_DCT
+ fdct16_avx2, // DCT_FLIPADST
+ fadst16_avx2, // FLIPADST_FLIPADST
+ fadst16_avx2, // ADST_FLIPADST
+ fadst16_avx2, // FLIPADST_ADST
+ idtx16_avx2, // IDTX
+ fdct16_avx2, // V_DCT
+ idtx16_avx2, // H_DCT
+ fadst16_avx2, // V_ADST
+ idtx16_avx2, // H_ADST
+ fadst16_avx2, // V_FLIPADST
+ idtx16_avx2 // H_FLIPADST
+};
+static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8_avx2, // DCT_DCT
+ fdct8_avx2, // ADST_DCT
+ fadst8_avx2, // DCT_ADST
+ fadst8_avx2, // ADST_ADST
+ fdct8_avx2, // FLIPADST_DCT
+ fadst8_avx2, // DCT_FLIPADST
+ fadst8_avx2, // FLIPADST_FLIPADST
+ fadst8_avx2, // ADST_FLIPADST
+ fadst8_avx2, // FLIPADST_ADST
+ idtx8_avx2, // IDTX
+ idtx8_avx2, // V_DCT
+ fdct8_avx2, // H_DCT
+ idtx8_avx2, // V_ADST
+ fadst8_avx2, // H_ADST
+ idtx8_avx2, // V_FLIPADST
+ fadst8_avx2 // H_FLIPADST
+};
+void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[16], out[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+ const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
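+  // 8x16 is rectangular: after the row transform,
+  // round_shift_rect_array_32_avx2 folds the NewSqrt2 (sqrt(2) in Q12 fixed
+  // point) rectangle-scaling factor into the final -shift[2] rounding.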
+ load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, out, bit, 1, 1);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ col_txfm_8x8_rounding(&out[8], -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
+ fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
+ row_txfm(in, out, bit, 2, 2);
+ round_shift_rect_array_32_avx2(out, in, 16, -shift[2], NewSqrt2);
+ store_buffer_avx2(in, coeff, 8, 16);
+ (void)bd;
+}
+static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8_avx2, // DCT_DCT
+ fadst8_avx2, // ADST_DCT
+ fdct8_avx2, // DCT_ADST
+ fadst8_avx2, // ADST_ADST
+ fadst8_avx2, // FLIPADST_DCT
+ fdct8_avx2, // DCT_FLIPADST
+ fadst8_avx2, // FLIPADST_FLIPADST
+ fadst8_avx2, // ADST_FLIPADST
+ fadst8_avx2, // FLIPADST_ADST
+ idtx8_avx2, // IDTX
+ fdct8_avx2, // V_DCT
+ idtx8_avx2, // H_DCT
+ fadst8_avx2, // V_ADST
+ idtx8_avx2, // H_ADST
+ fadst8_avx2, // V_FLIPADST
+ idtx8_avx2 // H_FLIPADST
+};
+static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16_avx2, // DCT_DCT
+ fdct16_avx2, // ADST_DCT
+ fadst16_avx2, // DCT_ADST
+ fadst16_avx2, // ADST_ADST
+ fdct16_avx2, // FLIPADST_DCT
+ fadst16_avx2, // DCT_FLIPADST
+ fadst16_avx2, // FLIPADST_FLIPADST
+ fadst16_avx2, // ADST_FLIPADST
+ fadst16_avx2, // FLIPADST_ADST
+ idtx16_avx2, // IDTX
+ idtx16_avx2, // V_DCT
+ fdct16_avx2, // H_DCT
+ idtx16_avx2, // V_ADST
+ fadst16_avx2, // H_ADST
+ idtx16_avx2, // V_FLIPADST
+ fadst16_avx2 // H_FLIPADST
+};
+void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[16], out[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+ const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip);
+ round_shift_32_8xn_avx2(in, 16, shift[0], 1);
+ col_txfm(in, out, bit, 2, 2);
+ round_shift_32_8xn_avx2(out, 16, shift[1], 1);
+ fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
+ fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
+ row_txfm(in, out, bit, 1, 1);
+ round_shift_rect_array_32_avx2(out, out, 16, -shift[2], NewSqrt2);
+ store_buffer_avx2(out, coeff, 8, 16);
+ (void)bd;
+}
+void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[32], out[32];
+ const TX_SIZE tx_size = TX_16X16;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const int width_div8 = (width >> 3);
+ const int width_div16 = (width >> 4);
+ const int size = (height << 1);
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case ADST_DCT:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case DCT_ADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case ADST_ADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case IDTX:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case V_DCT:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case H_DCT:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case V_ADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case H_ADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case V_FLIPADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case H_FLIPADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
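+// 32-point forward DCT as a nine-stage butterfly network. instride and
+// outstride let the same kernel walk either column-major or row-major
+// data, so it serves both passes of the 2-D transform.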
+static INLINE void fdct32_avx2(__m256i *input, __m256i *output,
+ const int8_t cos_bit, const int instride,
+ const int outstride) {
+ __m256i buf0[32];
+ __m256i buf1[32];
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int startidx = 0 * instride;
+ int endidx = 31 * instride;
+ // stage 0 (no-op)
+ // stage 1
+ buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3],
+ cos_bit);
+ buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
+ cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14],
+ cos_bit);
+ btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31],
+ cos_bit);
+ btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24],
+ cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 31 * outstride;
+ // stage 9
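+ // The butterfly results are emitted in bit-reversed index order:
+ // output[k] = buf0[bitrev5(k)].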
+ output[startidx] = buf0[0];
+ output[endidx] = buf0[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[16];
+ output[endidx] = buf0[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[8];
+ output[endidx] = buf0[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[24];
+ output[endidx] = buf0[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[4];
+ output[endidx] = buf0[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[20];
+ output[endidx] = buf0[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[12];
+ output[endidx] = buf0[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[28];
+ output[endidx] = buf0[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[2];
+ output[endidx] = buf0[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[18];
+ output[endidx] = buf0[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[10];
+ output[endidx] = buf0[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[26];
+ output[endidx] = buf0[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[6];
+ output[endidx] = buf0[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[22];
+ output[endidx] = buf0[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[14];
+ output[endidx] = buf0[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[30];
+ output[endidx] = buf0[1];
+}
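+// Identity transform for 32x32: the 32-point identity scale factor is
+// exactly 4, so it reduces to a left shift by 2 with no rounding needed.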
+static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output,
+ const int8_t cos_bit, int instride,
+ int outstride) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; i += 8) {
+ output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2);
+ output[(i + 1) * outstride] =
+ _mm256_slli_epi32(input[(i + 1) * instride], 2);
+ output[(i + 2) * outstride] =
+ _mm256_slli_epi32(input[(i + 2) * instride], 2);
+ output[(i + 3) * outstride] =
+ _mm256_slli_epi32(input[(i + 3) * instride], 2);
+ output[(i + 4) * outstride] =
+ _mm256_slli_epi32(input[(i + 4) * instride], 2);
+ output[(i + 5) * outstride] =
+ _mm256_slli_epi32(input[(i + 5) * instride], 2);
+ output[(i + 6) * outstride] =
+ _mm256_slli_epi32(input[(i + 6) * instride], 2);
+ output[(i + 7) * outstride] =
+ _mm256_slli_epi32(input[(i + 7) * instride], 2);
+ }
+}
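+// Only DCT_DCT and IDTX are reachable at 32x32; every other TX_TYPE entry
+// is NULL.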
+static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = {
+ fdct32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x32_avx2, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = {
+ fdct32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x32_avx2, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m256i buf0[128], buf1[128];
+ const TX_SIZE tx_size = TX_32X32;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type];
+ const int width_div16 = (width >> 4);
+ const int width_div8 = (width >> 3);
+
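+ // Column pass: process the block in 16-sample-wide strips (two 8-lane
+ // vectors per row), applying the pre- and post-transform shifts in place.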
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height,
+ width_div8, 0, 0);
+ round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8);
+ round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8);
+ col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8,
+ width_div8);
+ col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8);
+ round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8);
+ }
+
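+ // Transpose 8x8 tiles from buf0 into buf1 so the row transform operates
+ // on contiguous vectors.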
+ for (int r = 0; r < height; r += 8) {
+ for (int c = 0; c < width_div8; c++) {
+ fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+ &buf1[c * 8 * width_div8 + (r >> 3)],
+ width_div8, width_div8);
+ }
+ }
+
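+ // Row pass over the transposed strips, followed by the final shift[2].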
+ for (int i = 0; i < width_div16; i++) {
+ row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8,
+ width_div8);
+ row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8);
+ round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8);
+ }
+
+ store_buffer_avx2(buf1, output, 8, 128);
+}
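+// The 64-point forward DCT below is split into one helper per butterfly
+// stage: each helper consumes the previous stage's x array and produces
+// the next, with the shared broadcast cosine constants passed in from the
+// caller.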
+static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
+ __m256i *cospi_m32, __m256i *cospi_p32,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x2[0] = _mm256_add_epi32(x1[0], x1[31]);
+ x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
+ x2[1] = _mm256_add_epi32(x1[1], x1[30]);
+ x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
+ x2[2] = _mm256_add_epi32(x1[2], x1[29]);
+ x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
+ x2[3] = _mm256_add_epi32(x1[3], x1[28]);
+ x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
+ x2[4] = _mm256_add_epi32(x1[4], x1[27]);
+ x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
+ x2[5] = _mm256_add_epi32(x1[5], x1[26]);
+ x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
+ x2[6] = _mm256_add_epi32(x1[6], x1[25]);
+ x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
+ x2[7] = _mm256_add_epi32(x1[7], x1[24]);
+ x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
+ x2[8] = _mm256_add_epi32(x1[8], x1[23]);
+ x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
+ x2[9] = _mm256_add_epi32(x1[9], x1[22]);
+ x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
+ x2[10] = _mm256_add_epi32(x1[10], x1[21]);
+ x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
+ x2[11] = _mm256_add_epi32(x1[11], x1[20]);
+ x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
+ x2[12] = _mm256_add_epi32(x1[12], x1[19]);
+ x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
+ x2[13] = _mm256_add_epi32(x1[13], x1[18]);
+ x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
+ x2[14] = _mm256_add_epi32(x1[14], x1[17]);
+ x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
+ x2[15] = _mm256_add_epi32(x1[15], x1[16]);
+ x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48],
+ *__rounding, cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+}
+static INLINE void fdct64_stage3_avx2(__m256i *x2, __m256i *x3,
+ __m256i *cospi_m32, __m256i *cospi_p32,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x3[0] = _mm256_add_epi32(x2[0], x2[15]);
+ x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
+ x3[1] = _mm256_add_epi32(x2[1], x2[14]);
+ x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
+ x3[2] = _mm256_add_epi32(x2[2], x2[13]);
+ x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
+ x3[3] = _mm256_add_epi32(x2[3], x2[12]);
+ x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
+ x3[4] = _mm256_add_epi32(x2[4], x2[11]);
+ x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
+ x3[5] = _mm256_add_epi32(x2[5], x2[10]);
+ x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
+ x3[6] = _mm256_add_epi32(x2[6], x2[9]);
+ x3[9] = _mm256_sub_epi32(x2[6], x2[9]);
+ x3[7] = _mm256_add_epi32(x2[7], x2[8]);
+ x3[8] = _mm256_sub_epi32(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24],
+ *__rounding, cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm256_add_epi32(x2[32], x2[47]);
+ x3[47] = _mm256_sub_epi32(x2[32], x2[47]);
+ x3[33] = _mm256_add_epi32(x2[33], x2[46]);
+ x3[46] = _mm256_sub_epi32(x2[33], x2[46]);
+ x3[34] = _mm256_add_epi32(x2[34], x2[45]);
+ x3[45] = _mm256_sub_epi32(x2[34], x2[45]);
+ x3[35] = _mm256_add_epi32(x2[35], x2[44]);
+ x3[44] = _mm256_sub_epi32(x2[35], x2[44]);
+ x3[36] = _mm256_add_epi32(x2[36], x2[43]);
+ x3[43] = _mm256_sub_epi32(x2[36], x2[43]);
+ x3[37] = _mm256_add_epi32(x2[37], x2[42]);
+ x3[42] = _mm256_sub_epi32(x2[37], x2[42]);
+ x3[38] = _mm256_add_epi32(x2[38], x2[41]);
+ x3[41] = _mm256_sub_epi32(x2[38], x2[41]);
+ x3[39] = _mm256_add_epi32(x2[39], x2[40]);
+ x3[40] = _mm256_sub_epi32(x2[39], x2[40]);
+ x3[48] = _mm256_sub_epi32(x2[63], x2[48]);
+ x3[63] = _mm256_add_epi32(x2[63], x2[48]);
+ x3[49] = _mm256_sub_epi32(x2[62], x2[49]);
+ x3[62] = _mm256_add_epi32(x2[62], x2[49]);
+ x3[50] = _mm256_sub_epi32(x2[61], x2[50]);
+ x3[61] = _mm256_add_epi32(x2[61], x2[50]);
+ x3[51] = _mm256_sub_epi32(x2[60], x2[51]);
+ x3[60] = _mm256_add_epi32(x2[60], x2[51]);
+ x3[52] = _mm256_sub_epi32(x2[59], x2[52]);
+ x3[59] = _mm256_add_epi32(x2[59], x2[52]);
+ x3[53] = _mm256_sub_epi32(x2[58], x2[53]);
+ x3[58] = _mm256_add_epi32(x2[58], x2[53]);
+ x3[54] = _mm256_sub_epi32(x2[57], x2[54]);
+ x3[57] = _mm256_add_epi32(x2[57], x2[54]);
+ x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
+ x3[56] = _mm256_add_epi32(x2[56], x2[55]);
+}
+static INLINE void fdct64_stage4_avx2(__m256i *x3, __m256i *x4,
+ __m256i *cospi_m32, __m256i *cospi_p32,
+ __m256i *cospi_m16, __m256i *cospi_p48,
+ __m256i *cospi_m48,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x4[0] = _mm256_add_epi32(x3[0], x3[7]);
+ x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
+ x4[1] = _mm256_add_epi32(x3[1], x3[6]);
+ x4[6] = _mm256_sub_epi32(x3[1], x3[6]);
+ x4[2] = _mm256_add_epi32(x3[2], x3[5]);
+ x4[5] = _mm256_sub_epi32(x3[2], x3[5]);
+ x4[3] = _mm256_add_epi32(x3[3], x3[4]);
+ x4[4] = _mm256_sub_epi32(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12],
+ *__rounding, cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm256_add_epi32(x3[16], x3[23]);
+ x4[23] = _mm256_sub_epi32(x3[16], x3[23]);
+ x4[17] = _mm256_add_epi32(x3[17], x3[22]);
+ x4[22] = _mm256_sub_epi32(x3[17], x3[22]);
+ x4[18] = _mm256_add_epi32(x3[18], x3[21]);
+ x4[21] = _mm256_sub_epi32(x3[18], x3[21]);
+ x4[19] = _mm256_add_epi32(x3[19], x3[20]);
+ x4[20] = _mm256_sub_epi32(x3[19], x3[20]);
+ x4[24] = _mm256_sub_epi32(x3[31], x3[24]);
+ x4[31] = _mm256_add_epi32(x3[31], x3[24]);
+ x4[25] = _mm256_sub_epi32(x3[30], x3[25]);
+ x4[30] = _mm256_add_epi32(x3[30], x3[25]);
+ x4[26] = _mm256_sub_epi32(x3[29], x3[26]);
+ x4[29] = _mm256_add_epi32(x3[29], x3[26]);
+ x4[27] = _mm256_sub_epi32(x3[28], x3[27]);
+ x4[28] = _mm256_add_epi32(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52],
+ *__rounding, cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+}
+static INLINE void fdct64_stage5_avx2(__m256i *x4, __m256i *x5,
+ __m256i *cospi_m32, __m256i *cospi_p32,
+ __m256i *cospi_m16, __m256i *cospi_p48,
+ __m256i *cospi_m48,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x5[0] = _mm256_add_epi32(x4[0], x4[3]);
+ x5[3] = _mm256_sub_epi32(x4[0], x4[3]);
+ x5[1] = _mm256_add_epi32(x4[1], x4[2]);
+ x5[2] = _mm256_sub_epi32(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6],
+ *__rounding, cos_bit);
+ x5[7] = x4[7];
+ x5[8] = _mm256_add_epi32(x4[8], x4[11]);
+ x5[11] = _mm256_sub_epi32(x4[8], x4[11]);
+ x5[9] = _mm256_add_epi32(x4[9], x4[10]);
+ x5[10] = _mm256_sub_epi32(x4[9], x4[10]);
+ x5[12] = _mm256_sub_epi32(x4[15], x4[12]);
+ x5[15] = _mm256_add_epi32(x4[15], x4[12]);
+ x5[13] = _mm256_sub_epi32(x4[14], x4[13]);
+ x5[14] = _mm256_add_epi32(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26],
+ *__rounding, cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm256_add_epi32(x4[32], x4[39]);
+ x5[39] = _mm256_sub_epi32(x4[32], x4[39]);
+ x5[33] = _mm256_add_epi32(x4[33], x4[38]);
+ x5[38] = _mm256_sub_epi32(x4[33], x4[38]);
+ x5[34] = _mm256_add_epi32(x4[34], x4[37]);
+ x5[37] = _mm256_sub_epi32(x4[34], x4[37]);
+ x5[35] = _mm256_add_epi32(x4[35], x4[36]);
+ x5[36] = _mm256_sub_epi32(x4[35], x4[36]);
+ x5[40] = _mm256_sub_epi32(x4[47], x4[40]);
+ x5[47] = _mm256_add_epi32(x4[47], x4[40]);
+ x5[41] = _mm256_sub_epi32(x4[46], x4[41]);
+ x5[46] = _mm256_add_epi32(x4[46], x4[41]);
+ x5[42] = _mm256_sub_epi32(x4[45], x4[42]);
+ x5[45] = _mm256_add_epi32(x4[45], x4[42]);
+ x5[43] = _mm256_sub_epi32(x4[44], x4[43]);
+ x5[44] = _mm256_add_epi32(x4[44], x4[43]);
+ x5[48] = _mm256_add_epi32(x4[48], x4[55]);
+ x5[55] = _mm256_sub_epi32(x4[48], x4[55]);
+ x5[49] = _mm256_add_epi32(x4[49], x4[54]);
+ x5[54] = _mm256_sub_epi32(x4[49], x4[54]);
+ x5[50] = _mm256_add_epi32(x4[50], x4[53]);
+ x5[53] = _mm256_sub_epi32(x4[50], x4[53]);
+ x5[51] = _mm256_add_epi32(x4[51], x4[52]);
+ x5[52] = _mm256_sub_epi32(x4[51], x4[52]);
+ x5[56] = _mm256_sub_epi32(x4[63], x4[56]);
+ x5[63] = _mm256_add_epi32(x4[63], x4[56]);
+ x5[57] = _mm256_sub_epi32(x4[62], x4[57]);
+ x5[62] = _mm256_add_epi32(x4[62], x4[57]);
+ x5[58] = _mm256_sub_epi32(x4[61], x4[58]);
+ x5[61] = _mm256_add_epi32(x4[61], x4[58]);
+ x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
+ x5[60] = _mm256_add_epi32(x4[60], x4[59]);
+}
+static INLINE void fdct64_stage6_avx2(
+ __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32,
+ __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
+ __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56,
+ __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24,
+ const __m256i *__rounding, int8_t cos_bit) {
+ btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3],
+ *__rounding, cos_bit);
+ x6[4] = _mm256_add_epi32(x5[4], x5[5]);
+ x6[5] = _mm256_sub_epi32(x5[4], x5[5]);
+ x6[6] = _mm256_sub_epi32(x5[7], x5[6]);
+ x6[7] = _mm256_add_epi32(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13],
+ *__rounding, cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm256_add_epi32(x5[16], x5[19]);
+ x6[19] = _mm256_sub_epi32(x5[16], x5[19]);
+ x6[17] = _mm256_add_epi32(x5[17], x5[18]);
+ x6[18] = _mm256_sub_epi32(x5[17], x5[18]);
+ x6[20] = _mm256_sub_epi32(x5[23], x5[20]);
+ x6[23] = _mm256_add_epi32(x5[23], x5[20]);
+ x6[21] = _mm256_sub_epi32(x5[22], x5[21]);
+ x6[22] = _mm256_add_epi32(x5[22], x5[21]);
+ x6[24] = _mm256_add_epi32(x5[24], x5[27]);
+ x6[27] = _mm256_sub_epi32(x5[24], x5[27]);
+ x6[25] = _mm256_add_epi32(x5[25], x5[26]);
+ x6[26] = _mm256_sub_epi32(x5[25], x5[26]);
+ x6[28] = _mm256_sub_epi32(x5[31], x5[28]);
+ x6[31] = _mm256_add_epi32(x5[31], x5[28]);
+ x6[29] = _mm256_sub_epi32(x5[30], x5[29]);
+ x6[30] = _mm256_add_epi32(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58],
+ *__rounding, cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50],
+ *__rounding, cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+}
+static INLINE void fdct64_stage7_avx2(__m256i *x6, __m256i *x7,
+ __m256i *cospi_p08, __m256i *cospi_p56,
+ __m256i *cospi_p40, __m256i *cospi_p24,
+ __m256i *cospi_m08, __m256i *cospi_m56,
+ __m256i *cospi_m40, __m256i *cospi_m24,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6],
+ *__rounding, cos_bit);
+ x7[8] = _mm256_add_epi32(x6[8], x6[9]);
+ x7[9] = _mm256_sub_epi32(x6[8], x6[9]);
+ x7[10] = _mm256_sub_epi32(x6[11], x6[10]);
+ x7[11] = _mm256_add_epi32(x6[11], x6[10]);
+ x7[12] = _mm256_add_epi32(x6[12], x6[13]);
+ x7[13] = _mm256_sub_epi32(x6[12], x6[13]);
+ x7[14] = _mm256_sub_epi32(x6[15], x6[14]);
+ x7[15] = _mm256_add_epi32(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29],
+ *__rounding, cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25],
+ *__rounding, cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm256_add_epi32(x6[32], x6[35]);
+ x7[35] = _mm256_sub_epi32(x6[32], x6[35]);
+ x7[33] = _mm256_add_epi32(x6[33], x6[34]);
+ x7[34] = _mm256_sub_epi32(x6[33], x6[34]);
+ x7[36] = _mm256_sub_epi32(x6[39], x6[36]);
+ x7[39] = _mm256_add_epi32(x6[39], x6[36]);
+ x7[37] = _mm256_sub_epi32(x6[38], x6[37]);
+ x7[38] = _mm256_add_epi32(x6[38], x6[37]);
+ x7[40] = _mm256_add_epi32(x6[40], x6[43]);
+ x7[43] = _mm256_sub_epi32(x6[40], x6[43]);
+ x7[41] = _mm256_add_epi32(x6[41], x6[42]);
+ x7[42] = _mm256_sub_epi32(x6[41], x6[42]);
+ x7[44] = _mm256_sub_epi32(x6[47], x6[44]);
+ x7[47] = _mm256_add_epi32(x6[47], x6[44]);
+ x7[45] = _mm256_sub_epi32(x6[46], x6[45]);
+ x7[46] = _mm256_add_epi32(x6[46], x6[45]);
+ x7[48] = _mm256_add_epi32(x6[48], x6[51]);
+ x7[51] = _mm256_sub_epi32(x6[48], x6[51]);
+ x7[49] = _mm256_add_epi32(x6[49], x6[50]);
+ x7[50] = _mm256_sub_epi32(x6[49], x6[50]);
+ x7[52] = _mm256_sub_epi32(x6[55], x6[52]);
+ x7[55] = _mm256_add_epi32(x6[55], x6[52]);
+ x7[53] = _mm256_sub_epi32(x6[54], x6[53]);
+ x7[54] = _mm256_add_epi32(x6[54], x6[53]);
+ x7[56] = _mm256_add_epi32(x6[56], x6[59]);
+ x7[59] = _mm256_sub_epi32(x6[56], x6[59]);
+ x7[57] = _mm256_add_epi32(x6[57], x6[58]);
+ x7[58] = _mm256_sub_epi32(x6[57], x6[58]);
+ x7[60] = _mm256_sub_epi32(x6[63], x6[60]);
+ x7[63] = _mm256_add_epi32(x6[63], x6[60]);
+ x7[61] = _mm256_sub_epi32(x6[62], x6[61]);
+ x7[62] = _mm256_add_epi32(x6[62], x6[61]);
+}
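+// Unlike stages 2-7, which receive their broadcast cosine constants from
+// the caller, stages 8-10 build their own wider set from the cospi table.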
+static INLINE void fdct64_stage8_avx2(__m256i *x7, __m256i *x8,
+ const int32_t *cospi,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+ __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+ __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+ __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+ __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+ __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+ __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+ __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+ __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+ __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+ __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+ __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+ __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+ __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+ __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+ __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+
+ btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12],
+ *__rounding, cos_bit);
+ x8[16] = _mm256_add_epi32(x7[16], x7[17]);
+ x8[17] = _mm256_sub_epi32(x7[16], x7[17]);
+ x8[18] = _mm256_sub_epi32(x7[19], x7[18]);
+ x8[19] = _mm256_add_epi32(x7[19], x7[18]);
+ x8[20] = _mm256_add_epi32(x7[20], x7[21]);
+ x8[21] = _mm256_sub_epi32(x7[20], x7[21]);
+ x8[22] = _mm256_sub_epi32(x7[23], x7[22]);
+ x8[23] = _mm256_add_epi32(x7[23], x7[22]);
+ x8[24] = _mm256_add_epi32(x7[24], x7[25]);
+ x8[25] = _mm256_sub_epi32(x7[24], x7[25]);
+ x8[26] = _mm256_sub_epi32(x7[27], x7[26]);
+ x8[27] = _mm256_add_epi32(x7[27], x7[26]);
+ x8[28] = _mm256_add_epi32(x7[28], x7[29]);
+ x8[29] = _mm256_sub_epi32(x7[28], x7[29]);
+ x8[30] = _mm256_sub_epi32(x7[31], x7[30]);
+ x8[31] = _mm256_add_epi32(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+ *__rounding, cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+ *__rounding, cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+ *__rounding, cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+ *__rounding, cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+}
+static INLINE void fdct64_stage9_avx2(__m256i *x8, __m256i *x9,
+ const int32_t *cospi,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+ __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+ __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+ __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+ __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+ __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+ __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+ __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+ __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+ __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+ __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+ __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+ __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+ __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+ __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+ __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24],
+ *__rounding, cos_bit);
+ x9[32] = _mm256_add_epi32(x8[32], x8[33]);
+ x9[33] = _mm256_sub_epi32(x8[32], x8[33]);
+ x9[34] = _mm256_sub_epi32(x8[35], x8[34]);
+ x9[35] = _mm256_add_epi32(x8[35], x8[34]);
+ x9[36] = _mm256_add_epi32(x8[36], x8[37]);
+ x9[37] = _mm256_sub_epi32(x8[36], x8[37]);
+ x9[38] = _mm256_sub_epi32(x8[39], x8[38]);
+ x9[39] = _mm256_add_epi32(x8[39], x8[38]);
+ x9[40] = _mm256_add_epi32(x8[40], x8[41]);
+ x9[41] = _mm256_sub_epi32(x8[40], x8[41]);
+ x9[42] = _mm256_sub_epi32(x8[43], x8[42]);
+ x9[43] = _mm256_add_epi32(x8[43], x8[42]);
+ x9[44] = _mm256_add_epi32(x8[44], x8[45]);
+ x9[45] = _mm256_sub_epi32(x8[44], x8[45]);
+ x9[46] = _mm256_sub_epi32(x8[47], x8[46]);
+ x9[47] = _mm256_add_epi32(x8[47], x8[46]);
+ x9[48] = _mm256_add_epi32(x8[48], x8[49]);
+ x9[49] = _mm256_sub_epi32(x8[48], x8[49]);
+ x9[50] = _mm256_sub_epi32(x8[51], x8[50]);
+ x9[51] = _mm256_add_epi32(x8[51], x8[50]);
+ x9[52] = _mm256_add_epi32(x8[52], x8[53]);
+ x9[53] = _mm256_sub_epi32(x8[52], x8[53]);
+ x9[54] = _mm256_sub_epi32(x8[55], x8[54]);
+ x9[55] = _mm256_add_epi32(x8[55], x8[54]);
+ x9[56] = _mm256_add_epi32(x8[56], x8[57]);
+ x9[57] = _mm256_sub_epi32(x8[56], x8[57]);
+ x9[58] = _mm256_sub_epi32(x8[59], x8[58]);
+ x9[59] = _mm256_add_epi32(x8[59], x8[58]);
+ x9[60] = _mm256_add_epi32(x8[60], x8[61]);
+ x9[61] = _mm256_sub_epi32(x8[60], x8[61]);
+ x9[62] = _mm256_sub_epi32(x8[63], x8[62]);
+ x9[63] = _mm256_add_epi32(x8[63], x8[62]);
+}
+static INLINE void fdct64_stage10_avx2(__m256i *x9, __m256i *x10,
+ const int32_t *cospi,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+ __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+ __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+ __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+ __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+ __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+ __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+ __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+ __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+ __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+ __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+ __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+ __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+ __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+ __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+ __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+ __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+ __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+ __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+ __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+ __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+ __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+ __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+ __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+ __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+ __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+ __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+ __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+ __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+ __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+ __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+ __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48],
+ *__rounding, cos_bit);
+}
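+// Top-level 64-point forward DCT. Stage 1 folds input[i] against
+// input[63 - i] using running start/end indices; the remaining stages are
+// delegated to the helpers above.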
+static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit,
+ const int instride, const int outstride) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+ __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+ __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+ __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+ __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+ __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+ __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+ __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+ __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+ __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+ __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+ __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+ __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+ __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+ __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+
+ int startidx = 0 * instride;
+ int endidx = 63 * instride;
+ // stage 1
+ __m256i x1[64];
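+ // The unrolled statements below are equivalent to:
+ //   for (int i = 0; i < 32; ++i) {
+ //     x1[i]      = _mm256_add_epi32(input[i * instride], input[(63 - i) * instride]);
+ //     x1[63 - i] = _mm256_sub_epi32(input[i * instride], input[(63 - i) * instride]);
+ //   }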
+ x1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[63] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[49] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[16] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[17] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[18] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[19] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[20] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[21] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[42] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[22] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[23] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[24] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[25] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[26] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[27] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[28] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[29] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[30] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[31] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ __m256i x2[64];
+ fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+ // stage 3
+ fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+ // stage 4
+ fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+ &cospi_m48, &__rounding, cos_bit);
+ // stage 5
+ fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+ &cospi_m48, &__rounding, cos_bit);
+ // stage 6
+ fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48,
+ &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40,
+ &cospi_p24, &cospi_m24, &__rounding, cos_bit);
+ // stage 7
+ fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24,
+ &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24,
+ &__rounding, cos_bit);
+ // stage 8
+ fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit);
+ // stage 9
+ fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit);
+ // stage 10
+ fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 63 * outstride;
+
+ // stage 11
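+ // Coefficients are emitted in bit-reversed index order:
+ // output[r * outstride] = x2[bitrev6(r)], e.g. row 1 <- x2[32], row 2 <- x2[16].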
+ output[startidx] = x2[0];
+ output[endidx] = x2[63];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[32];
+ output[endidx] = x2[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[16];
+ output[endidx] = x2[47];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[48];
+ output[endidx] = x2[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[8];
+ output[endidx] = x2[55];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[40];
+ output[endidx] = x2[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[24];
+ output[endidx] = x2[39];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[56];
+ output[endidx] = x2[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[4];
+ output[endidx] = x2[59];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[36];
+ output[endidx] = x2[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[20];
+ output[endidx] = x2[43];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[52];
+ output[endidx] = x2[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[12];
+ output[endidx] = x2[51];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[44];
+ output[endidx] = x2[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[28];
+ output[endidx] = x2[35];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[60];
+ output[endidx] = x2[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[2];
+ output[endidx] = x2[61];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[34];
+ output[endidx] = x2[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[18];
+ output[endidx] = x2[45];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[50];
+ output[endidx] = x2[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[10];
+ output[endidx] = x2[53];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[42];
+ output[endidx] = x2[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[26];
+ output[endidx] = x2[37];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[58];
+ output[endidx] = x2[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[6];
+ output[endidx] = x2[57];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[38];
+ output[endidx] = x2[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[22];
+ output[endidx] = x2[41];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[54];
+ output[endidx] = x2[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[14];
+ output[endidx] = x2[49];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[46];
+ output[endidx] = x2[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[30];
+ output[endidx] = x2[33];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[62];
+ output[endidx] = x2[1];
+}
+void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m256i buf0[512], buf1[512];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct64_avx2;
+ const transform_1d_avx2 row_txfm = fdct64_avx2;
+ const int width_div16 = (width >> 4);
+ const int width_div8 = (width >> 3);
+ int r, c;
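+ // Column pass: each iteration covers one 16-column slab as two 8-lane
+ // register groups: load, pre-scale by shift[0], 64-point column DCT, then
+ // scale by shift[1].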
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height,
+ width_div8, 0, 0);
+ round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8);
+ round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8);
+ col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8);
+ col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8);
+ round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8);
+ }
+
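+ // Transpose in 8x8 tiles so the row pass can run the same 1-D kernel down
+ // the columns of buf1.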
+ for (r = 0; r < height; r += 8) {
+ for (c = 0; c < width_div8; c++) {
+ fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+ &buf1[c * 8 * width_div8 + (r >> 3)],
+ width_div8, width_div8);
+ }
+ }
+
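+ // Row pass over the first 32 rows only (i < 2 covers 4 of the 8 register
+ // columns): AV1 keeps just the top-left 32x32 coefficients of a 64x64
+ // transform, and store_buffer_avx2() below writes exactly those 128
+ // registers.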
+ for (int i = 0; i < 2; i++) {
+ row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8,
+ width_div16);
+ row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8,
+ width_div16);
+ round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2],
+ width_div16);
+ round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2],
+ width_div16);
+ }
+
+ store_buffer_avx2(buf0, output, 8, 128);
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
new file mode 100644
index 0000000000..158b4ae439
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -0,0 +1,2629 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+static INLINE void store_output_w4(int32_t *const out, const __m128i *const in,
+ const int stride, const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm_store_si128((__m128i *)(out + i * stride), in[i]);
+ }
+}
+
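+// 4x4 Walsh-Hadamard transform, used for lossless blocks: two identical 1-D
+// passes with a transpose in between, then a left shift by UNIT_QUANT_SHIFT
+// to scale the result for the quantizer.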
+void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in[4];
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+ // Convert to int32_t.
+ __m128i op[4];
+ op[0] = _mm_cvtepi16_epi32(in[0]);
+ op[1] = _mm_cvtepi16_epi32(in[1]);
+ op[2] = _mm_cvtepi16_epi32(in[2]);
+ op[3] = _mm_cvtepi16_epi32(in[3]);
+
+ for (int i = 0; i < 2; ++i) {
+ __m128i a1 = op[0];
+ __m128i b1 = op[1];
+ __m128i c1 = op[2];
+ __m128i d1 = op[3];
+ __m128i e1;
+
+ a1 = _mm_add_epi32(a1, b1); // a1 += b1
+ d1 = _mm_sub_epi32(d1, c1); // d1 = d1 - c1
+ e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1
+ e1 = _mm_srai_epi32(e1, 1);
+ b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1
+ c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1
+ a1 = _mm_sub_epi32(a1, c1); // a1 -= c1
+ d1 = _mm_add_epi32(d1, b1); // d1 += b1
+
+ op[0] = a1;
+ op[1] = c1;
+ op[2] = d1;
+ op[3] = b1;
+
+ if (i == 0) {
+ transpose_32bit_4x4(op, op);
+ }
+ }
+
+ op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT);
+ op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT);
+ op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT);
+ op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT);
+
+ _mm_storeu_si128((__m128i *)(output + 0), op[0]);
+ _mm_storeu_si128((__m128i *)(output + 4), op[1]);
+ _mm_storeu_si128((__m128i *)(output + 8), op[2]);
+ _mm_storeu_si128((__m128i *)(output + 12), op[3]);
+}
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ if (!flipud) {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ } else {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ }
+
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[2]);
+ in[3] = _mm_cvtepi16_epi32(in[3]);
+
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+}
+
+// Only the stage-2 cos bit is consumed here; the shifts are applied
+// elsewhere:
+// shift[0] is applied in load_buffer_4x4(),
+// shift[1] is applied after the column transform,
+// shift[2] is applied after the row transform.
+static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_col) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i s0, s1, s2, s3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ int endidx = 3 * num_col;
+ s0 = _mm_add_epi32(in[0], in[endidx]);
+ s3 = _mm_sub_epi32(in[0], in[endidx]);
+ endidx -= num_col;
+ s1 = _mm_add_epi32(in[num_col], in[endidx]);
+ s2 = _mm_sub_epi32(in[num_col], in[endidx]);
+
+ // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
+ u0 = _mm_mullo_epi32(s0, cospi32);
+ u1 = _mm_mullo_epi32(s1, cospi32);
+ u2 = _mm_add_epi32(u0, u1);
+ v0 = _mm_sub_epi32(u0, u1);
+
+ u3 = _mm_add_epi32(u2, rnding);
+ v1 = _mm_add_epi32(v0, rnding);
+
+ u0 = _mm_srai_epi32(u3, bit);
+ u2 = _mm_srai_epi32(v1, bit);
+
+ // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
+ v0 = _mm_mullo_epi32(s2, cospi48);
+ v1 = _mm_mullo_epi32(s3, cospi16);
+ v2 = _mm_add_epi32(v0, v1);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u1 = _mm_srai_epi32(v3, bit);
+
+ v0 = _mm_mullo_epi32(s2, cospi16);
+ v1 = _mm_mullo_epi32(s3, cospi48);
+ v2 = _mm_sub_epi32(v1, v0);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u3 = _mm_srai_epi32(v3, bit);
+
+ // Note: shift[1] and shift[2] are zero for TX_4X4, so no extra rounding is
+ // needed here.
+
+ out[0] = u0;
+ out[1] = u1;
+ out[2] = u2;
+ out[3] = u3;
+}
+
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
+ _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+}
+
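+// 4-point forward ADST: computed directly from the sinpi_arr() constants
+// (the products s0..s7 are folded into x0..x3) rather than as butterfly
+// stages.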
+static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_col) {
+ const int32_t *sinpi = sinpi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+ const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+ const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+ const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+ __m128i t;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i x0, x1, x2, x3;
+ __m128i u0, u1, u2, u3;
+
+ int idx = 0 * num_col;
+ s0 = _mm_mullo_epi32(in[idx], sinpi1);
+ s1 = _mm_mullo_epi32(in[idx], sinpi4);
+ t = _mm_add_epi32(in[idx], in[idx + num_col]);
+ idx += num_col;
+ s2 = _mm_mullo_epi32(in[idx], sinpi2);
+ s3 = _mm_mullo_epi32(in[idx], sinpi1);
+ idx += num_col;
+ s4 = _mm_mullo_epi32(in[idx], sinpi3);
+ idx += num_col;
+ s5 = _mm_mullo_epi32(in[idx], sinpi4);
+ s6 = _mm_mullo_epi32(in[idx], sinpi2);
+ s7 = _mm_sub_epi32(t, in[idx]);
+
+ t = _mm_add_epi32(s0, s2);
+ x0 = _mm_add_epi32(t, s5);
+ x1 = _mm_mullo_epi32(s7, sinpi3);
+ t = _mm_sub_epi32(s1, s3);
+ x2 = _mm_add_epi32(t, s6);
+ x3 = s4;
+
+ s0 = _mm_add_epi32(x0, x3);
+ s1 = x1;
+ s2 = _mm_sub_epi32(x2, x3);
+ t = _mm_sub_epi32(x2, x0);
+ s3 = _mm_add_epi32(t, x3);
+
+ u0 = _mm_add_epi32(s0, rnding);
+ u0 = _mm_srai_epi32(u0, bit);
+
+ u1 = _mm_add_epi32(s1, rnding);
+ u1 = _mm_srai_epi32(u1, bit);
+
+ u2 = _mm_add_epi32(s2, rnding);
+ u2 = _mm_srai_epi32(u2, bit);
+
+ u3 = _mm_add_epi32(s3, rnding);
+ u3 = _mm_srai_epi32(u3, bit);
+
+ out[0] = u0;
+ out[1] = u1;
+ out[2] = u2;
+ out[3] = u3;
+}
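+
+// 4-point identity transform: scale by sqrt(2) in fixed point. Per lane this
+// is out = (in * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits, with
+// the usual AOM constants NewSqrt2 = 5793 and NewSqrt2Bits = 12.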
+static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+ (void)bit;
+ __m128i fact = _mm_set1_epi32(NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a_low;
+
+ for (int i = 0; i < 4; i++) {
+ a_low = _mm_mullo_epi32(in[i * col_num], fact);
+ a_low = _mm_add_epi32(a_low, offset);
+ out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
+ }
+}
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+ int input_stride, TX_TYPE tx_type, int bd) {
+ __m128i in[4];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+ const int txw_idx = get_txw_idx(TX_4X4);
+ const int txh_idx = get_txh_idx(TX_4X4);
+
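+ // All cases share one separable pattern: load (with vertical/horizontal
+ // flips for the FLIPADST variants), column transform, 4x4 transpose, row
+ // transform, store. Only the kernels and flip flags differ per case.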
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_DCT:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_ADST:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case IDTX:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case V_DCT:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case H_DCT:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case V_ADST:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case H_ADST:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i u;
+ if (!flipud) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+ }
+
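+ // Widen the 16-bit rows to 32 bits, two __m128i per row. Rows 4-7 are
+ // expanded first into in[8..15], then rows 3..0 in descending order, so no
+ // source register is overwritten before it has been read.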
+ u = _mm_unpackhi_epi64(in[4], in[4]);
+ in[8] = _mm_cvtepi16_epi32(in[4]);
+ in[9] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[5], in[5]);
+ in[10] = _mm_cvtepi16_epi32(in[5]);
+ in[11] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[6], in[6]);
+ in[12] = _mm_cvtepi16_epi32(in[6]);
+ in[13] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[7], in[7]);
+ in[14] = _mm_cvtepi16_epi32(in[7]);
+ in[15] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[3], in[3]);
+ in[6] = _mm_cvtepi16_epi32(in[3]);
+ in[7] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[2], in[2]);
+ in[4] = _mm_cvtepi16_epi32(in[2]);
+ in[5] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[1], in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[1]);
+ in[3] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[0], in[0]);
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(u);
+
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+ in[4] = _mm_slli_epi32(in[4], shift);
+ in[5] = _mm_slli_epi32(in[5], shift);
+ in[6] = _mm_slli_epi32(in[6], shift);
+ in[7] = _mm_slli_epi32(in[7], shift);
+
+ in[8] = _mm_slli_epi32(in[8], shift);
+ in[9] = _mm_slli_epi32(in[9], shift);
+ in[10] = _mm_slli_epi32(in[10], shift);
+ in[11] = _mm_slli_epi32(in[11], shift);
+ in[12] = _mm_slli_epi32(in[12], shift);
+ in[13] = _mm_slli_epi32(in[13], shift);
+ in[14] = _mm_slli_epi32(in[14], shift);
+ in[15] = _mm_slli_epi32(in[15], shift);
+}
+
+static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rounding);
+ in[1] = _mm_add_epi32(in[1], rounding);
+ in[2] = _mm_add_epi32(in[2], rounding);
+ in[3] = _mm_add_epi32(in[3], rounding);
+ in[4] = _mm_add_epi32(in[4], rounding);
+ in[5] = _mm_add_epi32(in[5], rounding);
+ in[6] = _mm_add_epi32(in[6], rounding);
+ in[7] = _mm_add_epi32(in[7], rounding);
+ in[8] = _mm_add_epi32(in[8], rounding);
+ in[9] = _mm_add_epi32(in[9], rounding);
+ in[10] = _mm_add_epi32(in[10], rounding);
+ in[11] = _mm_add_epi32(in[11], rounding);
+ in[12] = _mm_add_epi32(in[12], rounding);
+ in[13] = _mm_add_epi32(in[13], rounding);
+ in[14] = _mm_add_epi32(in[14], rounding);
+ in[15] = _mm_add_epi32(in[15], rounding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+ in[4] = _mm_srai_epi32(in[4], shift);
+ in[5] = _mm_srai_epi32(in[5], shift);
+ in[6] = _mm_srai_epi32(in[6], shift);
+ in[7] = _mm_srai_epi32(in[7], shift);
+ in[8] = _mm_srai_epi32(in[8], shift);
+ in[9] = _mm_srai_epi32(in[9], shift);
+ in[10] = _mm_srai_epi32(in[10], shift);
+ in[11] = _mm_srai_epi32(in[11], shift);
+ in[12] = _mm_srai_epi32(in[12], shift);
+ in[13] = _mm_srai_epi32(in[13], shift);
+ in[14] = _mm_srai_epi32(in[14], shift);
+ in[15] = _mm_srai_epi32(in[15], shift);
+}
+
+static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) {
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rounding);
+ in[1] = _mm_add_epi32(in[1], rounding);
+ in[2] = _mm_add_epi32(in[2], rounding);
+ in[3] = _mm_add_epi32(in[3], rounding);
+ in[4] = _mm_add_epi32(in[4], rounding);
+ in[5] = _mm_add_epi32(in[5], rounding);
+ in[6] = _mm_add_epi32(in[6], rounding);
+ in[7] = _mm_add_epi32(in[7], rounding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+ in[4] = _mm_srai_epi32(in[4], shift);
+ in[5] = _mm_srai_epi32(in[5], shift);
+ in[6] = _mm_srai_epi32(in[6], shift);
+ in[7] = _mm_srai_epi32(in[7], shift);
+}
+
+static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
+ _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+
+ _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
+ _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
+ _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
+ _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);
+
+ _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
+ _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
+ _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
+ _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);
+
+ _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
+ _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
+ _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
+ _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
+}
+
+static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output,
+ const int stride) {
+ _mm_storeu_si128((__m128i *)(output), res[0]);
+ _mm_storeu_si128((__m128i *)(output + 4), res[1]);
+ _mm_storeu_si128((__m128i *)(output + stride), res[2]);
+ _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]);
+}
+
+static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[8], v[8];
+
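+ // One 8-point DCT over four lanes; outputs land directly in bit-reversed
+ // positions, i.e. out[r * col_num] = buf0[bitrev3(r)] (see the buf0[]
+ // comments below).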
+ int startidx = 0 * col_num;
+ int endidx = 7 * col_num;
+ // Indices step by col_num; with col_num == 2 this visits the even
+ // registers 0, 2, ..., 14.
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[startidx], in[endidx]);
+ v[7] = _mm_sub_epi32(in[startidx], in[endidx]); // v[7]
+ startidx += col_num;
+ endidx -= col_num;
+ u[1] = _mm_add_epi32(in[startidx], in[endidx]);
+ u[6] = _mm_sub_epi32(in[startidx], in[endidx]);
+ startidx += col_num;
+ endidx -= col_num;
+ u[2] = _mm_add_epi32(in[startidx], in[endidx]);
+ u[5] = _mm_sub_epi32(in[startidx], in[endidx]);
+ startidx += col_num;
+ endidx -= col_num;
+ u[3] = _mm_add_epi32(in[startidx], in[endidx]);
+ v[4] = _mm_sub_epi32(in[startidx], in[endidx]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[1 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[7 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[5 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[3 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0 * col_num] = u[0]; // buf0[0]
+ out[4 * col_num] = u[1]; // buf0[1]
+ out[2 * col_num] = u[2]; // buf0[2]
+ out[6 * col_num] = u[3]; // buf0[3]
+}
+
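+// fdct8x8 = two interleaved fdct4x8 passes: load_buffer_8x8() puts lanes 0-3
+// of each row in the even registers and lanes 4-7 in the odd ones.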
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ fdct4x8_sse4_1(in, out, bit, col_num);
+ fdct4x8_sse4_1(in + 1, out + 1, bit, col_num);
+}
+
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+ // Note:
+ // Even columns: 0, 2, ..., 14
+ // Odd columns: 1, 3, ..., 15
+ // One even column and one odd column together form one row (8 coeffs);
+ // in total there are 8 rows (8x8).
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u0 = in[col_num * 0 + col];
+ u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
+ u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
+ u3 = in[col_num * 4 + col];
+ u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
+ u5 = in[col_num * 6 + col];
+ u6 = in[col_num * 2 + col];
+ u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm_mullo_epi32(u2, cospi32);
+ y = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm_mullo_epi32(u6, cospi32);
+ y = _mm_mullo_epi32(u7, cospi32);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ v7 = _mm_sub_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+ u4 = _mm_add_epi32(v4, v6);
+ u5 = _mm_add_epi32(v5, v7);
+ u6 = _mm_sub_epi32(v4, v6);
+ u7 = _mm_sub_epi32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm_mullo_epi32(u4, cospi16);
+ y = _mm_mullo_epi32(u5, cospi48);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi48);
+ y = _mm_mullo_epi32(u5, cospim16);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospim48);
+ y = _mm_mullo_epi32(u7, cospi16);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi16);
+ y = _mm_mullo_epi32(u7, cospi48);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 5
+ u0 = _mm_add_epi32(v0, v4);
+ u1 = _mm_add_epi32(v1, v5);
+ u2 = _mm_add_epi32(v2, v6);
+ u3 = _mm_add_epi32(v3, v7);
+ u4 = _mm_sub_epi32(v0, v4);
+ u5 = _mm_sub_epi32(v1, v5);
+ u6 = _mm_sub_epi32(v2, v6);
+ u7 = _mm_sub_epi32(v3, v7);
+
+ // stage 6
+ x = _mm_mullo_epi32(u0, cospi4);
+ y = _mm_mullo_epi32(u1, cospi60);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ x = _mm_mullo_epi32(u0, cospi60);
+ y = _mm_mullo_epi32(u1, cospim4);
+ v1 = _mm_add_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi20);
+ y = _mm_mullo_epi32(u3, cospi44);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi44);
+ y = _mm_mullo_epi32(u3, cospim20);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ x = _mm_mullo_epi32(u4, cospi36);
+ y = _mm_mullo_epi32(u5, cospi28);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi28);
+ y = _mm_mullo_epi32(u5, cospim36);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospi52);
+ y = _mm_mullo_epi32(u7, cospi12);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi12);
+ y = _mm_mullo_epi32(u7, cospim52);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 7
+ out[col_num * 0 + col] = v1;
+ out[col_num * 1 + col] = v6;
+ out[col_num * 2 + col] = v3;
+ out[col_num * 3 + col] = v4;
+ out[col_num * 4 + col] = v5;
+ out[col_num * 5 + col] = v2;
+ out[col_num * 6 + col] = v7;
+ out[col_num * 7 + col] = v0;
+ }
+}
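+
+// 8-point identity transform: the required gain at this size is exactly 2,
+// so each value is simply doubled (x + x) with no rounding needed.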
+static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+ (void)bit;
+
+ for (int i = 0; i < col_num; i += 1) {
+ out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]);
+ out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]);
+ out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]);
+ out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]);
+ out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]);
+ out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]);
+ out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]);
+ out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]);
+ }
+}
+#if !CONFIG_REALTIME_ONLY
+static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+ (void)bit;
+ (void)col_num;
+ for (int j = 0; j < 2; j++) {
+ out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]);
+ out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]);
+ out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]);
+ out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]);
+ out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]);
+ out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]);
+ out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]);
+ out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]);
+ }
+}
+#endif
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m128i in[16], out[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+
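+ // Same separable pattern as the 4x4 version above, plus an explicit
+ // col_txfm_8x8_rounding() step: shift[1] is nonzero for TX_8X8, so the
+ // intermediate must be rounded between the two passes.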
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case ADST_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case ADST_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case IDTX:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case V_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case H_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case V_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case H_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+// Hybrid Transform 16x16
+
+static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
+ int row_index = 0;
+ int dst_index = 0;
+ int src_index = 0;
+
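+ // Each 16-wide output row takes two registers from the left 8x8 block
+ // (in[src], in[src + 1]) and two from the right (in[src + 16],
+ // in[src + 17]). After row 7, src skips 16 to step over the top-right block
+ // into the bottom-left one.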
+ // row 0, 1, ..., 7
+ do {
+ out[dst_index] = in[src_index];
+ out[dst_index + 1] = in[src_index + 1];
+ out[dst_index + 2] = in[src_index + 16];
+ out[dst_index + 3] = in[src_index + 17];
+ dst_index += 4;
+ src_index += 2;
+ row_index += 1;
+ } while (row_index < 8);
+
+ // row 8, 9, ..., 15
+ src_index += 16;
+ do {
+ out[dst_index] = in[src_index];
+ out[dst_index + 1] = in[src_index + 1];
+ out[dst_index + 2] = in[src_index + 16];
+ out[dst_index + 3] = in[src_index + 17];
+ dst_index += 4;
+ src_index += 2;
+ row_index += 1;
+ } while (row_index < 16);
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i in[64];
+ // Load 4 8x8 blocks
+ const int16_t *topL = input;
+ const int16_t *topR = input + 8;
+ const int16_t *botL = input + 8 * stride;
+ const int16_t *botR = input + 8 * stride + 8;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ // Swap left columns
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ // Swap right columns
+ tmp = topR;
+ topR = botR;
+ botR = tmp;
+ }
+
+ if (fliplr) {
+ // Swap top rows
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ // Swap bottom rows
+ tmp = botL;
+ botL = botR;
+ botR = tmp;
+ }
+
+ // load first 8 columns
+ load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
+ load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
+
+ // load second 8 columns
+ load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
+ load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
+
+ convert_8x8_to_16x16(in, out);
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 8 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+
+ load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
+}
+
+static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *topR = input + 4;
+
+ const int16_t *tmp;
+
+ if (fliplr) {
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ }
+
+ load_buffer_4x4(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift);
+}
+
+static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *topR = input + 8;
+
+ const int16_t *tmp;
+
+ if (fliplr) {
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ }
+
+ load_buffer_8x4(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift);
+}
+
+static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 4 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+
+ load_buffer_4x4(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out,
+ const int stride, const int flipud,
+ const int fliplr, const int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 8 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+ load_buffer_4x8(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift);
+}
+#endif
+
+static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift, const int height) {
+ const int16_t *in = input;
+ __m128i *output = out;
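+ // Despite the loop variable's name, each iteration of `col` handles one
+ // 32-sample row: stride 4 makes each load_buffer_4x4() call read 16
+ // contiguous samples, so two calls fill the row's eight 4-lane registers.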
+ for (int col = 0; col < height; col++) {
+ in = input + col * stride;
+ output = out + col * 8;
+ load_buffer_4x4(in, output, 4, flipud, fliplr, shift);
+ load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift);
+ }
+}
+
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[16], v[16], x;
+ int col;
+
+ // Process columns 0, 1, 2, 3; each iteration transforms one 4-lane column
+ // group.
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm_mullo_epi32(u[10], cospim32);
+ x = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[13], cospim32);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospim32);
+ x = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi32);
+ x = _mm_mullo_epi32(u[12], cospim32);
+ v[12] = _mm_sub_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm_mullo_epi32(v[5], cospim32);
+ x = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi32);
+ x = _mm_mullo_epi32(v[6], cospim32);
+ u[6] = _mm_sub_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[15] = _mm_add_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[14], v[13]);
+
+ // stage 4
+ u[0] = _mm_mullo_epi32(u[0], cospi32);
+ u[1] = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(u[0], u[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(u[0], u[1]);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(u[2], cospi48);
+ x = _mm_mullo_epi32(u[3], cospi16);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(u[2], cospi16);
+ x = _mm_mullo_epi32(u[3], cospi48);
+ v[3] = _mm_sub_epi32(x, v[3]);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm_mullo_epi32(u[9], cospim16);
+ x = _mm_mullo_epi32(u[14], cospi48);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi48);
+ x = _mm_mullo_epi32(u[14], cospim16);
+ v[14] = _mm_sub_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospim48);
+ x = _mm_mullo_epi32(u[13], cospim16);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospim16);
+ x = _mm_mullo_epi32(u[13], cospim48);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi56);
+ x = _mm_mullo_epi32(v[7], cospi8);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[7] = _mm_mullo_epi32(v[4], cospi8);
+ x = _mm_mullo_epi32(v[7], cospi56);
+ u[7] = _mm_sub_epi32(x, u[7]);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[5] = _mm_mullo_epi32(v[5], cospi24);
+ x = _mm_mullo_epi32(v[6], cospi40);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi40);
+ x = _mm_mullo_epi32(v[6], cospi24);
+ u[6] = _mm_sub_epi32(x, u[6]);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[11], v[10]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[15], v[14]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi60);
+ x = _mm_mullo_epi32(u[15], cospi4);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[15] = _mm_mullo_epi32(u[8], cospi4);
+ x = _mm_mullo_epi32(u[15], cospi60);
+ v[15] = _mm_sub_epi32(x, v[15]);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ v[9] = _mm_mullo_epi32(u[9], cospi28);
+ x = _mm_mullo_epi32(u[14], cospi36);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi36);
+ x = _mm_mullo_epi32(u[14], cospi28);
+ v[14] = _mm_sub_epi32(x, v[14]);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi44);
+ x = _mm_mullo_epi32(u[13], cospi20);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi20);
+ x = _mm_mullo_epi32(u[13], cospi44);
+ v[13] = _mm_sub_epi32(x, v[13]);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospi12);
+ x = _mm_mullo_epi32(u[12], cospi52);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi52);
+ x = _mm_mullo_epi32(u[12], cospi12);
+ v[12] = _mm_sub_epi32(x, v[12]);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
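+ // Store the results in the bit-reversed index order produced by the
+ // DCT-16 butterfly network (0, 8, 4, 12, 2, 10, 6, 14, ...).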
+ out[0 * col_num + col] = v[0];
+ out[1 * col_num + col] = v[8];
+ out[2 * col_num + col] = v[4];
+ out[3 * col_num + col] = v[12];
+ out[4 * col_num + col] = v[2];
+ out[5 * col_num + col] = v[10];
+ out[6 * col_num + col] = v[6];
+ out[7 * col_num + col] = v[14];
+ out[8 * col_num + col] = v[1];
+ out[9 * col_num + col] = v[9];
+ out[10 * col_num + col] = v[5];
+ out[11 * col_num + col] = v[13];
+ out[12 * col_num + col] = v[3];
+ out[13 * col_num + col] = v[11];
+ out[14 * col_num + col] = v[7];
+ out[15 * col_num + col] = v[15];
+ }
+}
+
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_cols) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < num_cols; ++col) {
+ // stage 0
+ // stage 1
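+ // Stage 1 reorders the inputs and flips signs per the ADST-16 flow
+ // graph before the butterfly stages below.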
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm_mullo_epi32(u[2], cospi32);
+ y = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(x, y);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(x, y);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm_mullo_epi32(u[6], cospi32);
+ y = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(x, y);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(x, y);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(x, y);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(x, y);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm_mullo_epi32(u[14], cospi32);
+ y = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(x, y);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(x, y);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+ // stage 7
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 8
+ v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+
+ // stage 9
+ out[0 * num_cols + col] = v[1];
+ out[1 * num_cols + col] = v[14];
+ out[2 * num_cols + col] = v[3];
+ out[3 * num_cols + col] = v[12];
+ out[4 * num_cols + col] = v[5];
+ out[5 * num_cols + col] = v[10];
+ out[6 * num_cols + col] = v[7];
+ out[7 * num_cols + col] = v[8];
+ out[8 * num_cols + col] = v[9];
+ out[9 * num_cols + col] = v[6];
+ out[10 * num_cols + col] = v[11];
+ out[11 * num_cols + col] = v[4];
+ out[12 * num_cols + col] = v[13];
+ out[13 * num_cols + col] = v[2];
+ out[14 * num_cols + col] = v[15];
+ out[15 * num_cols + col] = v[0];
+ }
+}
+
+static void col_txfm_16x16_rounding(__m128i *in, int shift) {
+ // Note:
+ // Split the 16x16 rounding into four 8x8 sections rather than
+ // processing four columns.
+ col_txfm_8x8_rounding(&in[0], shift);
+ col_txfm_8x8_rounding(&in[16], shift);
+ col_txfm_8x8_rounding(&in[32], shift);
+ col_txfm_8x8_rounding(&in[48], shift);
+}
+
+static void col_txfm_8x16_rounding(__m128i *in, int shift) {
+ col_txfm_8x8_rounding(&in[0], shift);
+ col_txfm_8x8_rounding(&in[16], shift);
+}
+
+static void write_buffer_16x16(const __m128i *in, int32_t *output) {
+ const int size_8x8 = 16 * 4;
+ write_buffer_8x8(&in[0], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[16], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[32], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[48], output);
+}
+static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+ (void)bit;
+ __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a_low;
+
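+ // The 16-point identity transform scales each coefficient by 2*sqrt(2),
+ // applied in fixed point: NewSqrt2 is sqrt(2) in Q(NewSqrt2Bits) format.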
+ int num_iters = 16 * col_num;
+ for (int i = 0; i < num_iters; i++) {
+ a_low = _mm_mullo_epi32(in[i], fact);
+ a_low = _mm_add_epi32(a_low, offset);
+ out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
+ }
+}
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[64], out[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
+ const int txw_idx = get_txw_idx(TX_16X16);
+ const int txh_idx = get_txh_idx(TX_16X16);
+ const int col_num = 4;
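+ // Each __m128i holds four 32-bit coefficients, so one 16-wide row spans
+ // col_num = 4 vectors and the 16x16 block occupies 64 registers.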
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case ADST_DCT:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case ADST_ADST:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case IDTX:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case V_DCT:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
+
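+// Reverses the row order of the transposed buffer for the LR-flipped path.
+// Note: the "30 - i" term assumes size == 32 (the 16x8 call site below),
+// with even- and odd-indexed registers reversed separately.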
+static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
+ for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
+ for (int i = 1; i < size; i += 2) in[size - i] = out[i];
+}
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fadst8x8_sse4_1, // ADST_DCT
+ fdct8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fadst8x8_sse4_1, // FLIPADST_DCT
+ fdct8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ fdct8x8_sse4_1, // V_DCT
+ idtx8x8_sse4_1, // H_DCT
+ fadst8x8_sse4_1, // V_ADST
+ idtx8x8_sse4_1, // H_ADST
+ fadst8x8_sse4_1, // V_FLIPADST
+ idtx8x8_sse4_1 // H_FLIPADST
+};
+#if !CONFIG_REALTIME_ONLY
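+// NULL entries are transform types that are never selected for this block
+// size, so no 1-D implementation is needed.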
+static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x8_sse4_1, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL, // H_FLIPADST
+};
+#endif
+static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_sse4_1, // DCT_DCT
+ fadst8x8_sse4_1, // ADST_DCT
+ fdct4x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fadst8x8_sse4_1, // FLIPADST_DCT
+ fdct4x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ fdct4x8_sse4_1, // V_DCT
+ idtx8x8_sse4_1, // H_DCT
+ fadst8x8_sse4_1, // V_ADST
+ idtx8x8_sse4_1, // H_ADST
+ fadst8x8_sse4_1, // V_FLIPADST
+ idtx8x8_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fdct16x16_sse4_1, // ADST_DCT
+ fadst16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fdct16x16_sse4_1, // FLIPADST_DCT
+ fadst16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ idtx16x16_sse4_1, // IDTX
+ idtx16x16_sse4_1, // V_DCT
+ fdct16x16_sse4_1, // H_DCT
+ idtx16x16_sse4_1, // V_ADST
+ fadst16x16_sse4_1, // H_ADST
+ idtx16x16_sse4_1, // V_FLIPADST
+ fadst16x16_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fadst16x16_sse4_1, // ADST_DCT
+ fdct16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fadst16x16_sse4_1, // FLIPADST_DCT
+ fdct16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ idtx16x16_sse4_1, // IDTX
+ fdct16x16_sse4_1, // V_DCT
+ idtx16x16_sse4_1, // H_DCT
+ fadst16x16_sse4_1, // V_ADST
+ idtx16x16_sse4_1, // H_ADST
+ fadst16x16_sse4_1, // V_FLIPADST
+ idtx16x16_sse4_1 // H_FLIPADST
+};
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fdct8x8_sse4_1, // ADST_DCT
+ fadst8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fdct8x8_sse4_1, // FLIPADST_DCT
+ fadst8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ idtx8x8_sse4_1, // V_DCT
+ fdct8x8_sse4_1, // H_DCT
+ idtx8x8_sse4_1, // V_ADST
+ fadst8x8_sse4_1, // H_ADST
+ idtx8x8_sse4_1, // V_FLIPADST
+ fadst8x8_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_sse4_1, // DCT_DCT
+ fdct4x8_sse4_1, // ADST_DCT
+ fadst8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fdct4x8_sse4_1, // FLIPADST_DCT
+ fadst8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ idtx8x8_sse4_1, // V_DCT
+ fdct4x8_sse4_1, // H_DCT
+ idtx8x8_sse4_1, // V_ADST
+ fadst8x8_sse4_1, // H_ADST
+ idtx8x8_sse4_1, // V_FLIPADST
+ fadst8x8_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_sse4_1, // DCT_DCT
+ fdct4x4_sse4_1, // ADST_DCT
+ fadst4x4_sse4_1, // DCT_ADST
+ fadst4x4_sse4_1, // ADST_ADST
+ fdct4x4_sse4_1, // FLIPADST_DCT
+ fadst4x4_sse4_1, // DCT_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_FLIPADST
+ fadst4x4_sse4_1, // ADST_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_ADST
+ idtx4x4_sse4_1, // IDTX
+ idtx4x4_sse4_1, // V_DCT
+ fdct4x4_sse4_1, // H_DCT
+ idtx4x4_sse4_1, // V_ADST
+ fadst4x4_sse4_1, // H_ADST
+ idtx4x4_sse4_1, // V_FLIPADST
+ fadst4x4_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_sse4_1, // DCT_DCT
+ fadst4x4_sse4_1, // ADST_DCT
+ fdct4x4_sse4_1, // DCT_ADST
+ fadst4x4_sse4_1, // ADST_ADST
+ fadst4x4_sse4_1, // FLIPADST_DCT
+ fdct4x4_sse4_1, // DCT_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_FLIPADST
+ fadst4x4_sse4_1, // ADST_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_ADST
+ idtx4x4_sse4_1, // IDTX
+ fdct4x4_sse4_1, // V_DCT
+ idtx4x4_sse4_1, // H_DCT
+ fadst4x4_sse4_1, // V_ADST
+ idtx4x4_sse4_1, // H_ADST
+ fadst4x4_sse4_1, // V_FLIPADST
+ idtx4x4_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct32_sse4_1, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ av1_idtx32_sse4_1, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx16x16_sse4_1, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[32], out[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+ int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
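+ // Process the 16x8 block as two 8x8 column halves: transform, round and
+ // transpose each half into out[] before the shared row pass.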
+ for (int i = 0; i < 2; i++) {
+ load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
+ col_txfm(in, in, bit, 2);
+ col_txfm_8x8_rounding(in, -shift[1]);
+ transpose_8x8(in, out + i * 16);
+ }
+
+ if (lr_flip) {
+ flip_buf_sse4_1(in, out, 32);
+ row_txfm(in, out, bit, 2);
+ } else {
+ row_txfm(out, out, bit, 2);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ av1_round_shift_rect_array_32_sse4_1(out + i * 16, in, 16, -shift[2],
+ NewSqrt2);
+ write_buffer_8x8(in, coeff + i * 64);
+ }
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[32], out[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+ int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, in, bit, 2);
+ col_txfm_8x16_rounding(in, -shift[1]);
+ transpose_8x8(in, out);
+ transpose_8x8(in + 16, out + 16);
+
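+ // Rectangular (2:1) transforms need an extra sqrt(2)-based rescale to
+ // preserve the overall norm; av1_round_shift_rect_array_32_sse4_1 applies
+ // it (multiply by NewSqrt2, shift by NewSqrt2Bits) along with -shift[2].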
+ for (int i = 0; i < 2; i++) {
+ row_txfm(out + i * 16, out, bit, 2);
+ av1_round_shift_rect_array_32_sse4_1(out, out, 16, -shift[2], NewSqrt2);
+ write_buffer_16x8(out, coeff + i * 8, 16);
+ }
+ (void)bd;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[16];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
+ const int txw_idx = get_txw_idx(TX_4X16);
+ const int txh_idx = get_txh_idx(TX_4X16);
+ const int txfm_size_col = tx_size_wide[TX_4X16];
+ const int txfm_size_row = tx_size_high[TX_4X16];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // col transform
+ load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, outcoeff128, bitcol, 1);
+ col_txfm_8x8_rounding(outcoeff128, -shift[1]);
+ transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < 4; i++) {
+ __m128i tmp[4];
+ row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2);
+ store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col);
+ }
+ (void)bd;
+}
+#endif
+
+void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[16];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
+ const int txw_idx = get_txw_idx(TX_16X4);
+ const int txh_idx = get_txh_idx(TX_16X4);
+ const int txfm_size_col = tx_size_wide[TX_16X4];
+ const int txfm_size_row = tx_size_high[TX_16X4];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // col transform
+ load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]);
+
+ for (int i = 0; i < (txfm_size_col >> 2); i++) {
+ __m128i *cur_in = &in[i * txfm_size_row];
+ col_txfm(cur_in, cur_in, bitcol, 1);
+ transpose_32bit_4x4(cur_in, cur_in);
+ }
+ col_txfm_8x8_rounding(in, -shift[1]);
+
+ // row transform
+ row_txfm(in, outcoeff128, bitrow, 1);
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[128];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
+ const int txw_idx = get_txw_idx(TX_16X32);
+ const int txh_idx = get_txh_idx(TX_16X32);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+ // column transform
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]);
+
+ for (int i = 0; i < 4; i++) {
+ col_txfm((in + i), (in + i), bitcol, 4);
+ }
+ col_txfm_16x16_rounding(&in[0], -shift[1]);
+ col_txfm_16x16_rounding(&in[64], -shift[1]);
+ transpose_8nx8n(in, outcoef128, 16, 32);
+
+ // row transform
+ row_txfm(outcoef128, in, bitrow, 8);
+ av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2],
+ NewSqrt2);
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)tx_type;
+ __m128i in[512];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
+ const int txw_idx = get_txw_idx(TX_32X64);
+ const int txh_idx = get_txh_idx(TX_32X64);
+ const int txfm_size_col = tx_size_wide[TX_32X64];
+ const int txfm_size_row = tx_size_high[TX_32X64];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int num_row = txfm_size_row >> 2;
+ const int num_col = txfm_size_col >> 2;
+
+ // column transform
+ load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
+ for (int i = 0; i < num_col; i++) {
+ av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col);
+ }
+ for (int i = 0; i < num_col; i++) {
+ col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
+ }
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < num_row; i++) {
+ av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row);
+ }
+ for (int i = 0; i < txfm_size_col; i++) {
+ av1_round_shift_rect_array_32_sse4_1(in + i * 16, outcoef128 + i * 8, 8,
+ -shift[2], NewSqrt2);
+ }
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)tx_type;
+ __m128i in[512];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
+ const int txw_idx = get_txw_idx(TX_64X32);
+ const int txh_idx = get_txh_idx(TX_64X32);
+ const int txfm_size_col = tx_size_wide[TX_64X32];
+ const int txfm_size_row = tx_size_high[TX_64X32];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int num_row = txfm_size_row >> 2;
+ const int num_col = txfm_size_col >> 2;
+
+ // column transform
+ for (int i = 0; i < 32; i++) {
+ load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]);
+ load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0,
+ shift[0]);
+ load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0,
+ shift[0]);
+ load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0,
+ shift[0]);
+ }
+
+ for (int i = 0; i < num_col; i++) {
+ av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col);
+ }
+
+ for (int i = 0; i < num_row; i++) {
+ col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]);
+ }
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < num_row; i++) {
+ av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row);
+ }
+ av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 512, -shift[2],
+ NewSqrt2);
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[128];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+ // column transform
+ load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16);
+ col_txfm(in, in, bitcol, 8);
+ col_txfm_16x16_rounding(&in[0], -shift[1]);
+ col_txfm_16x16_rounding(&in[64], -shift[1]);
+ transpose_8nx8n(in, outcoef128, 32, 16);
+
+ // row transform
+ for (int i = 0; i < 4; i++) {
+ row_txfm((outcoef128 + i), (in + i), bitrow, 4);
+ }
+ av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2],
+ NewSqrt2);
+ (void)bd;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[64];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
+ const int txw_idx = get_txw_idx(TX_8X32);
+ const int txh_idx = get_txh_idx(TX_8X32);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+ const int txfm_size_col = tx_size_wide[TX_8X32];
+ const int txfm_size_row = tx_size_high[TX_8X32];
+ const int num_col = txfm_size_col >> 2;
+
+ // column transform
+ load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
+ load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
+ stride, 0, 0, shift[0]);
+
+ for (int i = 0; i < num_col; i++) {
+ col_txfm((in + i), (in + i), bitcol, num_col);
+ }
+ col_txfm_16x16_rounding(in, -shift[1]);
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < txfm_size_col; i += 2) {
+ row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col);
+ }
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[64];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
+ const int txw_idx = get_txw_idx(TX_32X8);
+ const int txh_idx = get_txh_idx(TX_32X8);
+ const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+ const int txfm_size_col = tx_size_wide[TX_32X8];
+ const int txfm_size_row = tx_size_high[TX_32X8];
+ const int num_col = txfm_size_row >> 2;
+
+ // column transform
+ load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
+ for (int i = 0; i < txfm_size_row; i += 2) {
+ col_txfm((in + i), (in + i), bitcol, txfm_size_row);
+ }
+
+ col_txfm_16x16_rounding(&in[0], -shift[1]);
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < num_col; i++) {
+ row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col);
+ }
+ (void)bd;
+}
+#endif
+
+void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m128i in[8];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
+ const int txw_idx = get_txw_idx(TX_4X8);
+ const int txh_idx = get_txh_idx(TX_4X8);
+ const int txfm_size_col = tx_size_wide[TX_4X8];
+ const int txfm_size_row = tx_size_high[TX_4X8];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, in, bitcol, 1);
+ col_txfm_4x8_rounding(in, -shift[1]);
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *cur_in = &in[i * 4];
+ transpose_32bit_4x4(cur_in, cur_in);
+ row_txfm(cur_in, cur_in, bitrow, 1);
+ av1_round_shift_rect_array_32_sse4_1(cur_in, cur_in, txfm_size_col,
+ -shift[2], NewSqrt2);
+ store_output_w4(coeff + i * 4, cur_in, txfm_size_row, 4);
+ }
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m128i in[8];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
+ const int txw_idx = get_txw_idx(TX_8X4);
+ const int txh_idx = get_txh_idx(TX_8X4);
+ const int txfm_size_col = tx_size_wide[TX_8X4];
+ const int txfm_size_row = tx_size_high[TX_8X4];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // col transform
+ load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]);
+ for (int i = 0; i < 2; i++) {
+ __m128i *cur_in = &in[i * txfm_size_row];
+ col_txfm(cur_in, cur_in, bitcol, 1);
+ transpose_32bit_4x4(cur_in, cur_in);
+ }
+ col_txfm_4x8_rounding(in, -shift[1]);
+
+ // row transform
+ row_txfm(in, outcoeff128, bitrow, 1);
+ av1_round_shift_rect_array_32_sse4_1(outcoeff128, outcoeff128, txfm_size_col,
+ -shift[2], NewSqrt2);
+ (void)bd;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[256];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
+ const int txw_idx = get_txw_idx(TX_16X64);
+ const int txh_idx = get_txh_idx(TX_16X64);
+ const int txfm_size_col = tx_size_wide[TX_16X64];
+ const int txfm_size_row = tx_size_high[TX_16X64];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int num_col = txfm_size_col >> 2;
+ // col transform
+ for (int i = 0; i < txfm_size_row; i += num_col) {
+ load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ }
+
+ for (int i = 0; i < num_col; i++) {
+ av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
+ }
+
+ col_txfm_16x16_rounding(outcoeff128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
+
+ transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
+ fdct16x16_sse4_1(in, outcoeff128, bitrow, 8);
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[256];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
+ const int txw_idx = get_txw_idx(TX_64X16);
+ const int txh_idx = get_txh_idx(TX_64X16);
+ const int txfm_size_col = tx_size_wide[TX_64X16];
+ const int txfm_size_row = tx_size_high[TX_64X16];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // col transform
+ for (int i = 0; i < txfm_size_row; i++) {
+ load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ }
+
+ fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row);
+ col_txfm_16x16_rounding(outcoeff128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
+
+ transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+ for (int i = 0; i < 4; i++) {
+ av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitrow, 4, 4);
+ }
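+ // Only the first 32 columns of the 64-point row output are kept; zero the
+ // remainder of the coefficient buffer.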
+ memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff));
+ (void)bd;
+}
+#endif
diff --git a/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c
new file mode 100644
index 0000000000..ca448ca37b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define SSE_STRIDE (BW + 4)
+
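+// Each mask row selects the five consecutive 32-bit lanes of the 5-tap
+// horizontal window, at lane offsets 0..3 within an 8-lane __m256i.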
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+ { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+ { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+ { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
+
+static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint32_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint16_t *src1 = frame1;
+ const uint16_t *src2 = frame2;
+ uint32_t *dst = frame_sse + 2;
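+ // Offset the destination by 2 so the 5x5 window later has two padding
+ // columns available on each side (SSE_STRIDE == BW + 4).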
+ for (int i = 0; i < block_height; i++) {
+ __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+ __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+ __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
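+ // unpacklo/unpackhi interleave within each 128-bit lane, so swap the
+ // middle 128-bit halves to restore sequential column order before storing.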
+ __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ __m256i diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ __m256i diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)dst, diff_lo);
+ dst += 8;
+ _mm256_storeu_si256((__m256i *)dst, diff_hi);
+
+ src1 += stride;
+ src2 += stride2;
+ dst += sse_stride - 8;
+ }
+}
+
+static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint32_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint16_t *src1 = frame1;
+ const uint16_t *src2 = frame2;
+ uint32_t *dst = frame_sse + 2;
+ for (int i = 0; i < block_height; i++) {
+ __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+ __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+ __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+ __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ __m256i diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ __m256i diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)dst, diff_lo);
+ _mm256_storeu_si256((__m256i *)(dst + 8), diff_hi);
+
+ v_src1 = _mm256_loadu_si256((__m256i *)(src1 + 16));
+ v_src2 = _mm256_loadu_si256((__m256i *)(src2 + 16));
+ v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+ v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)(dst + 16), diff_lo);
+ _mm256_storeu_si256((__m256i *)(dst + 24), diff_hi);
+
+ src1 += stride;
+ src2 += stride2;
+ dst += sse_stride;
+ }
+}
+
+static AOM_FORCE_INLINE void xx_load_and_pad_left(uint32_t *src,
+ __m256i *v256tmp) {
+ *v256tmp = _mm256_loadu_si256((__m256i *)src);
+ // For the first column, replicate the first element twice to the left
+ __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0xEA);
+ *v256tmp = _mm256_inserti128_si256(*v256tmp,
+ _mm256_extracti128_si256(v256tmp1, 0), 0);
+}
+
+static AOM_FORCE_INLINE void xx_load_and_pad_right(uint32_t *src,
+ __m256i *v256tmp) {
+ *v256tmp = _mm256_loadu_si256((__m256i *)src);
+ // For the last column, replicate the last element twice to the right
+ __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0x54);
+ *v256tmp = _mm256_inserti128_si256(*v256tmp,
+ _mm256_extracti128_si256(v256tmp1, 1), 1);
+}
+
+static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
+ // Mask the required 5 values inside the vector
+ __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);
+ __m128i v128a, v128b;
+ // Extract 256b as two 128b registers A and B
+ v128a = _mm256_castsi256_si128(vtmp);
+ v128b = _mm256_extracti128_si256(vtmp, 1);
+ // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+ v128a = _mm_add_epi32(v128a, v128b);
+ // B = [A2+B2, A3+B3, 0, 0]
+ v128b = _mm_srli_si128(v128a, 8);
+ // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+ v128a = _mm_add_epi32(v128a, v128b);
+ // B = [A1+B1+A3+B3, 0, 0, 0]
+ v128b = _mm_srli_si128(v128a, 4);
+ // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+ v128a = _mm_add_epi32(v128a, v128b);
+ return _mm_extract_epi32(v128a, 0);
+}
+
+static void highbd_apply_temporal_filter(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
+ const double inv_num_ref_pixels, const double decay_factor,
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_sse[BH][BW];
+
+ if (block_width == 32) {
+ get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
+ } else {
+ get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
+ }
+
+ __m256i vsrc[5];
+
+ // Traverse 4 columns at a time
+ // First and last columns will require padding
+ int col;
+ uint32_t *src = frame_sse;
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad_left(src, &vsrc[i]);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ xx_load_and_pad_left(src, &vsrc[4]);
+ src += SSE_STRIDE;
+
+ acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);
+ }
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);
+ }
+ for (col = 4; col < block_width - 4; col += 4) {
+ src = frame_sse + col;
+
+ // Load 3 rows from the top (middle columns need no padding)
+ for (int i = 2; i < 5; i++) {
+ vsrc[i] = _mm256_loadu_si256((__m256i *)src);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ vsrc[4] = _mm256_loadu_si256((__m256i *)src);
+
+ src += SSE_STRIDE;
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+ }
+
+ src = frame_sse + col;
+
+ // Load and pad (for the last column) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad_right(src, &vsrc[i]);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ xx_load_and_pad_right(src, &vsrc[4]);
+ src += SSE_STRIDE;
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+
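+ // The per-pixel filter weight follows the non-local-means form
+ // w = exp(-min(e, 7)) * TF_WEIGHT_SCALE, where e combines the 5x5 window
+ // SSE, the subblock MSE, and the motion-distance decay factors.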
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ }
+}
+
+void av1_highbd_apply_temporal_filter_avx2(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint32_t frame_sse[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+ uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred);
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint16_t *ref =
+ CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter the U and V planes using Y-plane information, since motion search
+ // is performed only on the Y plane and its results are therefore more
+ // accurate. The luma SSE sum is reused for both chroma planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++, k++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
+ }
+ }
+ }
+ }
+
+ highbd_apply_temporal_filter(
+ ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+ luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+ plane_offset += plane_h * plane_w;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c
new file mode 100644
index 0000000000..2032847083
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding of 4 samples: the 5x5 window
+// sums read up to 2 samples on either side of each column
+#define SSE_STRIDE (BW + 4)
+
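+// Byte masks used by xx_mask_and_hadd to select, for each of the 4 output
+// columns, the 5 consecutive 32-bit sums of a 5-wide window spanning the
+// two 128-bit vectors.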
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+ { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
+};
+
+static void get_squared_error(const uint16_t *frame1, const unsigned int stride,
+ const uint16_t *frame2,
+ const unsigned int stride2, const int block_width,
+ const int block_height, uint32_t *frame_sse,
+ const unsigned int dst_stride) {
+ const uint16_t *src1 = frame1;
+ const uint16_t *src2 = frame2;
+ uint32_t *dst = frame_sse;
+
+ for (int i = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j += 8) {
+ __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
+ __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j));
+
+ __m128i vdiff = _mm_sub_epi16(vsrc1, vsrc2);
+ __m128i vmullo = _mm_mullo_epi16(vdiff, vdiff);
+ __m128i vmullh = _mm_mulhi_epi16(vdiff, vdiff);
+
+ __m128i vres1 = _mm_unpacklo_epi16(vmullo, vmullh);
+ __m128i vres2 = _mm_unpackhi_epi16(vmullo, vmullh);
+
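+      // Store 2 samples in from the start of the row so the data sits
+      // centered within the padded SSE_STRIDE-wide buffer.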
+ _mm_storeu_si128((__m128i *)(dst + j + 2), vres1);
+ _mm_storeu_si128((__m128i *)(dst + j + 6), vres2);
+ }
+
+ src1 += stride;
+ src2 += stride2;
+ dst += dst_stride;
+ }
+}
+
+static void xx_load_and_pad(uint32_t *src, __m128i *dstvec, int col,
+ int block_width) {
+ __m128i vtmp1 = _mm_loadu_si128((__m128i *)src);
+ __m128i vtmp2 = _mm_loadu_si128((__m128i *)(src + 4));
+ // For the first column, replicate the first element twice to the left
+ dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA);
+ // For the last column, replicate the last element twice to the right
+ dstvec[1] = (col < block_width - 4) ? vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54);
+}
+
+static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
+ __m128i veca, vecb;
+ // Mask and obtain the required 5 values inside the vector
+ veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]);
+ vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]);
+ // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+ veca = _mm_add_epi32(veca, vecb);
+ // B = [A2+B2, A3+B3, 0, 0]
+ vecb = _mm_srli_si128(veca, 8);
+ // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+ veca = _mm_add_epi32(veca, vecb);
+ // B = [A1+B1+A3+B3, 0, 0, 0]
+ vecb = _mm_srli_si128(veca, 4);
+ // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+ veca = _mm_add_epi32(veca, vecb);
+ return _mm_cvtsi128_si32(veca);
+}
+
+static void highbd_apply_temporal_filter(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
+ const double inv_num_ref_pixels, const double decay_factor,
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_sse[BH][BW];
+
+ get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
+ frame_sse, SSE_STRIDE);
+
+ __m128i vsrc[5][2];
+
+ // Traverse 4 columns at a time
+ // First and last columns will require padding
+ for (int col = 0; col < block_width; col += 4) {
+ uint32_t *src = frame_sse + col;
+
+ // Load and pad (for first and last col) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad(src, vsrc[i], col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Padding for top 2 rows
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+ __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+ __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+ __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+ __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+ __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+ __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+ __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+ vsrc[0][0] = vsrc[1][0];
+ vsrc[0][1] = vsrc[1][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+ vsrc[2][0] = vsrc[3][0];
+ vsrc[2][1] = vsrc[3][1];
+ vsrc[3][0] = vsrc[4][0];
+ vsrc[3][1] = vsrc[4][1];
+
+ // Load next row
+ xx_load_and_pad(src, vsrc[4], col, block_width);
+ src += SSE_STRIDE;
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+ }
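+ // Bottom 3 rows: no new rows are loaded, so vsrc[4] keeps the last row
+ // and acts as bottom padding.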
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+ __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+ __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+ __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+ __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+ __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+ __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+ __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+ vsrc[0][0] = vsrc[1][0];
+ vsrc[0][1] = vsrc[1][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+ vsrc[2][0] = vsrc[3][0];
+ vsrc[2][1] = vsrc[3][1];
+ vsrc[3][0] = vsrc[4][0];
+ vsrc[3][1] = vsrc[4][1];
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+ }
+ }
+
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ }
+}
+
+void av1_highbd_apply_temporal_filter_sse2(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint32_t frame_sse[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+ uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred);
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint16_t *ref =
+ CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter the U and V planes using Y-plane information, since motion search
+ // is performed only on the Y plane and its results are therefore more
+ // accurate. The luma SSE sum is reused for both chroma planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++, k++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
+ }
+ }
+ }
+ }
+
+ highbd_apply_temporal_filter(
+ ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+ luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+ plane_offset += plane_h * plane_w;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_avx2.c b/third_party/aom/av1/encoder/x86/ml_avx2.c
new file mode 100644
index 0000000000..6432708416
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_avx2.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
+
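+// Multiplies one 8-wide input vector with two consecutive weight rows and
+// horizontally adds adjacent products. Note that _mm256_hadd_ps operates
+// within 128-bit lanes, so the callers undo the resulting interleaving with
+// permute/extract before accumulating.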
+#define CALC_OUTPUT_FOR_2ROWS \
+ const int index = weight_idx + (2 * i * tot_num_inputs); \
+ const __m256 weight0 = _mm256_loadu_ps(&weights[index]); \
+ const __m256 weight1 = _mm256_loadu_ps(&weights[index + tot_num_inputs]); \
+ const __m256 mul0 = _mm256_mul_ps(inputs256, weight0); \
+ const __m256 mul1 = _mm256_mul_ps(inputs256, weight1); \
+ hadd[i] = _mm256_hadd_ps(mul0, mul1);
+
+static INLINE void nn_propagate_8to1(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ // Process one output row at a time.
+ for (int out = 0; out < num_outputs; out++) {
+ __m256 in_result = _mm256_setzero_ps();
+ float bias_val = bias[out];
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ const __m256 weight0 = _mm256_loadu_ps(&weights[weight_idx]);
+ const __m256 mul0 = _mm256_mul_ps(inputs256, weight0);
+ in_result = _mm256_add_ps(in_result, mul0);
+ }
+ const __m128 low_128 = _mm256_castps256_ps128(in_result);
+ const __m128 high_128 = _mm256_extractf128_ps(in_result, 1);
+ const __m128 sum_par_0 = _mm_add_ps(low_128, high_128);
+ const __m128 sum_par_1 = _mm_hadd_ps(sum_par_0, sum_par_0);
+ const __m128 sum_tot =
+ _mm_add_ps(_mm_shuffle_ps(sum_par_1, sum_par_1, 0x99), sum_par_1);
+
+ bias_val += _mm_cvtss_f32(sum_tot);
+ if (is_clip_required) bias_val = AOMMAX(bias_val, 0);
+ output_nodes[out] = bias_val;
+ }
+}
+
+static INLINE void nn_propagate_8to4(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ __m256 hadd[2];
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 bias_reg = _mm_loadu_ps(&bias[out]);
+ __m128 in_result = _mm_setzero_ps();
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ // Process two output rows at a time.
+ for (int i = 0; i < 2; i++) {
+ CALC_OUTPUT_FOR_2ROWS
+ }
+
+ const __m256 sum_par = _mm256_hadd_ps(hadd[0], hadd[1]);
+ const __m128 low_128 = _mm256_castps256_ps128(sum_par);
+ const __m128 high_128 = _mm256_extractf128_ps(sum_par, 1);
+ const __m128 result = _mm_add_ps(low_128, high_128);
+
+ in_result = _mm_add_ps(in_result, result);
+ }
+
+ in_result = _mm_add_ps(in_result, bias_reg);
+ if (is_clip_required) in_result = _mm_max_ps(in_result, _mm_setzero_ps());
+ _mm_storeu_ps(&output_nodes[out], in_result);
+ }
+}
+
+static INLINE void nn_propagate_8to8(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ __m256 hadd[4];
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m256 bias_reg = _mm256_loadu_ps(&bias[out]);
+ __m256 in_result = _mm256_setzero_ps();
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ // Process two output rows at a time.
+ for (int i = 0; i < 4; i++) {
+ CALC_OUTPUT_FOR_2ROWS
+ }
+ const __m256 hh0 = _mm256_hadd_ps(hadd[0], hadd[1]);
+ const __m256 hh1 = _mm256_hadd_ps(hadd[2], hadd[3]);
+
+ __m256 ht_0 = _mm256_permute2f128_ps(hh0, hh1, 0x20);
+ __m256 ht_1 = _mm256_permute2f128_ps(hh0, hh1, 0x31);
+
+ __m256 result = _mm256_add_ps(ht_0, ht_1);
+ in_result = _mm256_add_ps(in_result, result);
+ }
+ in_result = _mm256_add_ps(in_result, bias_reg);
+ if (is_clip_required)
+ in_result = _mm256_max_ps(in_result, _mm256_setzero_ps());
+ _mm256_storeu_ps(&output_nodes[out], in_result);
+ }
+}
+
+static INLINE void nn_propagate_input_multiple_of_8(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ bool is_output_layer, int num_outputs, float *const output_nodes) {
+ // Clip the output (ReLU) only for hidden layers, and only once all of the
+ // layer's inputs have been accumulated (i.e. not during partial passes).
+ const int is_clip_required =
+ !is_output_layer && num_inputs_to_process == tot_num_inputs;
+ if (num_outputs % 8 == 0) {
+ nn_propagate_8to8(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ } else if (num_outputs % 4 == 0) {
+ nn_propagate_8to4(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ } else {
+ nn_propagate_8to1(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ }
+}
+
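+// Forward pass through the fully connected network described by nn_config.
+// A minimal usage sketch (`features` and `cfg` are hypothetical names):
+//   float scores[NN_MAX_NODES_PER_LAYER];
+//   av1_nn_predict_avx2(features, &cfg, /*reduce_prec=*/1, scores);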
+void av1_nn_predict_avx2(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ int buf_index = 0;
+ int num_inputs = nn_config->num_inputs;
+ assert(num_inputs > 0 && num_inputs <= NN_MAX_NODES_PER_LAYER);
+
+ for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ bool is_output_layer = layer == nn_config->num_hidden_layers;
+ float *const output_nodes = is_output_layer ? output : &buf[buf_index][0];
+ const int num_outputs = is_output_layer
+ ? nn_config->num_outputs
+ : nn_config->num_hidden_nodes[layer];
+ assert(num_outputs > 0 && num_outputs <= NN_MAX_NODES_PER_LAYER);
+
+ // Process inputs in multiples of 8 using AVX2 intrinsics.
+ if (num_inputs % 8 == 0) {
+ nn_propagate_input_multiple_of_8(input_nodes, layer_weights, layer_bias,
+ num_inputs, num_inputs, is_output_layer,
+ num_outputs, output_nodes);
+ } else {
+ // When the number of inputs is not a multiple of 8, use a hybrid of the
+ // AVX2 and SSE3 paths as needed.
+ const int in_mul_8 = num_inputs / 8;
+ const int num_inputs_to_process = in_mul_8 * 8;
+ int bias_is_considered = 0;
+ if (in_mul_8) {
+ nn_propagate_input_multiple_of_8(
+ input_nodes, layer_weights, layer_bias, num_inputs_to_process,
+ num_inputs, is_output_layer, num_outputs, output_nodes);
+ bias_is_considered = 1;
+ }
+
+ const float *out_temp = bias_is_considered ? output_nodes : layer_bias;
+ const int input_remaining = num_inputs % 8;
+ if (input_remaining % 4 == 0 && num_outputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m128 out_h = _mm_loadu_ps(&out_temp[out + 4]);
+ __m128 out_l = _mm_loadu_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to8_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &out_h, &out_l, num_inputs);
+ }
+ if (!is_output_layer) {
+ const __m128 zero = _mm_setzero_ps();
+ out_h = _mm_max_ps(out_h, zero);
+ out_l = _mm_max_ps(out_l, zero);
+ }
+ _mm_storeu_ps(&output_nodes[out + 4], out_h);
+ _mm_storeu_ps(&output_nodes[out], out_l);
+ }
+ } else if (input_remaining % 4 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to4_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs, num_inputs);
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ } else if (input_remaining % 4 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 outputs = _mm_load1_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to1_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs);
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ output_nodes[out] = _mm_cvtss_f32(outputs);
+ }
+ } else {
+ // Use SSE instructions for scalar operations to avoid the latency
+ // of swapping between SIMD and FPU modes.
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 outputs = _mm_load1_ps(&out_temp[out]);
+ for (int in_node = in_mul_8 * 8; in_node < num_inputs; in_node++) {
+ __m128 input = _mm_load1_ps(&input_nodes[in_node]);
+ __m128 weight =
+ _mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
+ outputs = _mm_add_ps(outputs, _mm_mul_ps(input, weight));
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ output_nodes[out] = _mm_cvtss_f32(outputs);
+ }
+ }
+ }
+ // Before processing the next layer, treat the output of current layer as
+ // input to next layer.
+ input_nodes = output_nodes;
+ num_inputs = num_outputs;
+ buf_index = 1 - buf_index;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_sse3.c b/third_party/aom/av1/encoder/x86/ml_sse3.c
new file mode 100644
index 0000000000..4748a68d38
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_sse3.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
+
+// To avoid the high latency of swapping between FPU and SIMD operations, we
+// keep the result in a 128-bit register even though we only care about a
+// single value.
+static void nn_propagate_8to1(const float *const inputs,
+ const float *const weights,
+ __m128 *const output) {
+ const __m128 inputs_h = _mm_loadu_ps(&inputs[4]);
+ const __m128 inputs_l = _mm_loadu_ps(inputs);
+
+ const __m128 weights_h = _mm_loadu_ps(&weights[4]);
+ const __m128 weights_l = _mm_loadu_ps(weights);
+
+ const __m128 mul_h = _mm_mul_ps(inputs_h, weights_h);
+ const __m128 mul_l = _mm_mul_ps(inputs_l, weights_l);
+ // [7 6 5 4] [3 2 1 0] (weight and input indices)
+
+ const __m128 vadd = _mm_add_ps(mul_l, mul_h);
+ // [7+3 6+2 5+1 4+0]
+ const __m128 hadd1 = _mm_hadd_ps(vadd, vadd);
+ // [7+6+3+2 5+4+1+0 7+6+3+2 5+4+1+0]
+ const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1);
+ // [7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0]
+ *output = _mm_add_ps(*output, hadd2);
+}
+
+void av1_nn_propagate_4to1_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const output) {
+ const __m128 inputs128 = _mm_loadu_ps(inputs);
+
+ const __m128 weights128 = _mm_loadu_ps(weights);
+
+ const __m128 mul = _mm_mul_ps(inputs128, weights128);
+ // [3 2 1 0] (weight and input indices)
+
+ const __m128 hadd1 = _mm_hadd_ps(mul, mul);
+ // [3+2 1+0 3+2 1+0]
+ const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1);
+ // [3+2+1+0 3+2+1+0 3+2+1+0 3+2+1+0]
+ *output = _mm_add_ps(*output, hadd2);
+}
+
+void av1_nn_propagate_4to4_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const outputs, const int num_inputs) {
+ const __m128 inputs128 = _mm_loadu_ps(inputs);
+
+ __m128 hadd[2];
+ for (int i = 0; i < 2; i++) { // For each pair of outputs
+ const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]);
+ const __m128 mul0 = _mm_mul_ps(weight0, inputs128);
+ const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]);
+ const __m128 mul1 = _mm_mul_ps(weight1, inputs128);
+ hadd[i] = _mm_hadd_ps(mul0, mul1);
+ }
+ // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices)
+ // hadd[1] = [15+14 13+12 11+10 9+8]
+
+ const __m128 hh = _mm_hadd_ps(hadd[0], hadd[1]);
+ // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0]
+
+ *outputs = _mm_add_ps(*outputs, hh);
+}
+
+void av1_nn_propagate_4to8_sse3(const float *const inputs,
+ const float *const weights, __m128 *const out_h,
+ __m128 *const out_l, const int num_inputs) {
+ const __m128 inputs128 = _mm_loadu_ps(inputs);
+
+ __m128 hadd[4];
+ for (int i = 0; i < 4; i++) { // For each pair of outputs
+ const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]);
+ const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]);
+ const __m128 mul0 = _mm_mul_ps(inputs128, weight0);
+ const __m128 mul1 = _mm_mul_ps(inputs128, weight1);
+ hadd[i] = _mm_hadd_ps(mul0, mul1);
+ }
+ // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices)
+ // hadd[1] = [15+14 13+12 11+10 9+8]
+ // hadd[2] = [23+22 21+20 19+18 17+16]
+ // hadd[3] = [31+30 29+28 27+26 25+24]
+
+ const __m128 hh0 = _mm_hadd_ps(hadd[0], hadd[1]);
+ // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0]
+ const __m128 hh1 = _mm_hadd_ps(hadd[2], hadd[3]);
+ // [31+30+29+28 27+26+25+24 23+22+21+20 19+18+17+16]
+
+ *out_h = _mm_add_ps(*out_h, hh1);
+ *out_l = _mm_add_ps(*out_l, hh0);
+}
+
+static void nn_propagate_8to4(const float *const inputs,
+ const float *const weights, __m128 *const outputs,
+ const int num_inputs) {
+ const __m128 inputs_h = _mm_loadu_ps(inputs + 4);
+ const __m128 inputs_l = _mm_loadu_ps(inputs);
+ // [7 6 5 4] [3 2 1 0] (input indices)
+
+ __m128 add[4];
+ for (int i = 0; i < 4; i++) { // For each output:
+ const __m128 weight_h = _mm_loadu_ps(&weights[i * num_inputs + 4]);
+ const __m128 weight_l = _mm_loadu_ps(&weights[i * num_inputs]);
+ const __m128 mul_h = _mm_mul_ps(inputs_h, weight_h);
+ const __m128 mul_l = _mm_mul_ps(inputs_l, weight_l);
+ add[i] = _mm_add_ps(mul_l, mul_h);
+ }
+ // add[0] = [7+3 6+2 5+1 4+0]
+ // add[1] = [15+11 14+10 13+9 12+8]
+ // add[2] = [23+19 22+18 21+17 20+16]
+ // add[3] = [31+27 30+26 29+25 28+24]
+
+ const __m128 hadd_h = _mm_hadd_ps(add[2], add[3]);
+ // [31+30+27+26 29+28+25+24 23+22+19+18 21+20+17+16]
+ const __m128 hadd_l = _mm_hadd_ps(add[0], add[1]);
+ // [15+14+11+10 13+12+9+8 7+6+3+2 5+4+1+0]
+
+ const __m128 haddhadd = _mm_hadd_ps(hadd_l, hadd_h);
+ // [31+30+29+28+27+26+25+24 23+22+21+20+19+18+17+16
+ // 15+14+13+12+11+10+9+8 7+6+5+4+3+2+1+0]
+
+ *outputs = _mm_add_ps(*outputs, haddhadd);
+}
+
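+// ReLU activations: clamp negative node values to zero.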
+static void nn_activate8(__m128 *out_h, __m128 *out_l) {
+ const __m128 zero = _mm_setzero_ps();
+ *out_h = _mm_max_ps(*out_h, zero);
+ *out_l = _mm_max_ps(*out_l, zero);
+}
+
+static void nn_activate4(__m128 *x) { *x = _mm_max_ps(*x, _mm_setzero_ps()); }
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_sse3(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ int buf_index = 0;
+ int num_inputs = nn_config->num_inputs;
+
+ // Iterate over the hidden layers; the final iteration is the output layer.
+ for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ bool output_layer = (layer == nn_config->num_hidden_layers);
+ float *const output_nodes = output_layer ? output : &buf[buf_index][0];
+ const int num_outputs = output_layer ? nn_config->num_outputs
+ : nn_config->num_hidden_nodes[layer];
+
+ if (num_inputs % 4 == 0 && num_outputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]);
+ __m128 out_l = _mm_loadu_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to8_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &out_h, &out_l, num_inputs);
+ }
+ if (!output_layer) nn_activate8(&out_h, &out_l);
+ _mm_storeu_ps(&output_nodes[out + 4], out_h);
+ _mm_storeu_ps(&output_nodes[out], out_l);
+ }
+ } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 8) {
+ nn_propagate_8to4(&input_nodes[in],
+ &layer_weights[out * num_inputs + in], &outputs,
+ num_inputs);
+ }
+ if (!output_layer) nn_activate4(&outputs);
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to4_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs, num_inputs);
+ }
+ if (!output_layer) nn_activate4(&outputs);
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ } else if (num_inputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 total = _mm_load1_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 8) {
+ nn_propagate_8to1(&input_nodes[in],
+ &layer_weights[out * num_inputs + in], &total);
+ }
+ if (!output_layer) nn_activate4(&total);
+ output_nodes[out] = _mm_cvtss_f32(total);
+ }
+ } else if (num_inputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 total = _mm_load1_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to1_sse3(
+ &input_nodes[in], &layer_weights[out * num_inputs + in], &total);
+ }
+ if (!output_layer) nn_activate4(&total);
+ output_nodes[out] = _mm_cvtss_f32(total);
+ }
+ } else {
+ // Use SSE instructions for scalar operations to avoid the latency of
+ // swapping between SIMD and FPU modes.
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 total = _mm_load1_ps(&layer_bias[out]);
+ for (int in_node = 0; in_node < num_inputs; in_node++) {
+ __m128 input = _mm_load1_ps(&input_nodes[in_node]);
+ __m128 weight =
+ _mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
+ total = _mm_add_ps(total, _mm_mul_ps(input, weight));
+ }
+ if (!output_layer) nn_activate4(&total);
+ output_nodes[out] = _mm_cvtss_f32(total);
+ }
+ }
+ input_nodes = output_nodes;
+ num_inputs = num_outputs;
+ buf_index = 1 - buf_index;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
+
+// Based on N. N. Schraudolph. A Fast, Compact Approximation of the Exponential
+// Function. Neural Computation, 11(4):853–862, 1999.
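+// The trick: for an IEEE single-precision float, reinterpreting the integer
+// i = (2^23 / ln 2) * x + 127 * 2^23 - C as a float yields approximately
+// e^x, because the exponent field encodes powers of two and the mantissa
+// linearly interpolates between them; C tunes the approximation error.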
+static AOM_INLINE __m128 approx_exp(__m128 y) {
+#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2)
+#define B \
+ 127 // Offset for the exponent according to IEEE floating point standard.
+#define C 60801  // Magic number that controls the accuracy of approximation
+ const __m128 multiplier = _mm_set1_ps(A);
+ const __m128i offset = _mm_set1_epi32(B * (1 << 23) - C);
+
+ y = _mm_mul_ps(y, multiplier);
+ y = _mm_castsi128_ps(_mm_add_epi32(_mm_cvtps_epi32(y), offset));
+ return y;
+#undef A
+#undef B
+#undef C
+}
+
+static AOM_INLINE __m128 reduce_max(__m128 reg) {
+ __m128 tmp_reg;
+
+ tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10
+ reg = _mm_max_ps(reg, tmp_reg);
+
+ tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1); // 10 11 00 01
+ reg = _mm_max_ps(reg, tmp_reg);
+
+ return reg;
+}
+
+static AOM_INLINE __m128 reduce_sum(__m128 reg) {
+ __m128 tmp_reg;
+
+ tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10
+ reg = _mm_add_ps(reg, tmp_reg);
+
+ tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1); // 10 11 00 01
+ reg = _mm_add_ps(reg, tmp_reg);
+
+ return reg;
+}
+
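+// Numerically stable softmax over 16 values:
+//   p[i] = exp(x[i] - m) / sum_j exp(x[j] - m), with m = max_j x[j],
+// using approx_exp for the exponentials.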
+void av1_nn_fast_softmax_16_sse3(const float *input, float *output) {
+ // Clips at -10 to avoid underflow
+ const __m128 clipper = _mm_set1_ps(-10.0f);
+
+ // Load in 16 values
+ __m128 in_0 = _mm_loadu_ps(&input[0]);
+ __m128 in_1 = _mm_loadu_ps(&input[4]);
+ __m128 in_2 = _mm_loadu_ps(&input[8]);
+ __m128 in_3 = _mm_loadu_ps(&input[12]);
+
+ // Get the max
+ __m128 max_0 = _mm_max_ps(in_0, in_1);
+ __m128 max_1 = _mm_max_ps(in_2, in_3);
+
+ max_0 = _mm_max_ps(max_0, max_1);
+ max_0 = reduce_max(max_0);
+
+ // Subtract the max off and clip
+ in_0 = _mm_sub_ps(in_0, max_0);
+ in_1 = _mm_sub_ps(in_1, max_0);
+ in_2 = _mm_sub_ps(in_2, max_0);
+ in_3 = _mm_sub_ps(in_3, max_0);
+
+ in_0 = _mm_max_ps(in_0, clipper);
+ in_1 = _mm_max_ps(in_1, clipper);
+ in_2 = _mm_max_ps(in_2, clipper);
+ in_3 = _mm_max_ps(in_3, clipper);
+
+ // Exponentiate and compute the denominator
+ __m128 sum = in_0 = approx_exp(in_0);
+ in_1 = approx_exp(in_1);
+ sum = _mm_add_ps(sum, in_1);
+ in_2 = approx_exp(in_2);
+ sum = _mm_add_ps(sum, in_2);
+ in_3 = approx_exp(in_3);
+ sum = _mm_add_ps(sum, in_3);
+ sum = reduce_sum(sum);
+
+ // Divide to get the probability
+ in_0 = _mm_div_ps(in_0, sum);
+ in_1 = _mm_div_ps(in_1, sum);
+ in_2 = _mm_div_ps(in_2, sum);
+ in_3 = _mm_div_ps(in_3, sum);
+
+ _mm_storeu_ps(&output[0], in_0);
+ _mm_storeu_ps(&output[4], in_1);
+ _mm_storeu_ps(&output[8], in_2);
+ _mm_storeu_ps(&output[12], in_3);
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_sse3.h b/third_party/aom/av1/encoder/x86/ml_sse3.h
new file mode 100644
index 0000000000..f41a2474af
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_sse3.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_ML_SSE3_H_
+#define AOM_AV1_ENCODER_X86_ML_SSE3_H_
+
+#include <pmmintrin.h>
+
+void av1_nn_propagate_4to1_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const output);
+
+void av1_nn_propagate_4to4_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const outputs, const int num_inputs);
+
+void av1_nn_propagate_4to8_sse3(const float *const inputs,
+ const float *const weights, __m128 *const out_h,
+ __m128 *const out_l, const int num_inputs);
+
+#endif // AOM_AV1_ENCODER_X86_ML_SSE3_H_
diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
new file mode 100644
index 0000000000..6658ed39a8
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
@@ -0,0 +1,2348 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
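+// These routines accumulate the statistics used to pick Wiener restoration
+// filters: M collects the cross-correlation between the degraded frame (dgd)
+// and the source, and H the auto-correlation of the degraded frame; the
+// filter taps are later solved from these elsewhere in the pickrst code.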
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd,
+ const __m256i *shuffle,
+ const __m256i *dgd_ijkl) {
+ // Load two 128-bit chunks from dgd
+ const __m256i s0 = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)dgd)),
+ _mm_loadu_si128((__m128i *)(dgd + 4)), 1);
+ // s0 = [11 10 9 8 7 6 5 4] [7 6 5 4 3 2 1 0] as u16 (values are dgd indices)
+ // The weird order is so the shuffle stays within 128-bit lanes
+
+ // Shuffle 16x u16 values within lanes according to the mask:
+ // [0 1 1 2 2 3 3 4] [0 1 1 2 2 3 3 4]
+ // (Actually we shuffle u8 values as there's no 16-bit shuffle)
+ const __m256i s1 = _mm256_shuffle_epi8(s0, *shuffle);
+ // s1 = [8 7 7 6 6 5 5 4] [4 3 3 2 2 1 1 0] as u16 (values are dgd indices)
+
+ // Multiply 16x 16-bit integers in dgd_ijkl and s1, resulting in 16x 32-bit
+ // integers then horizontally add pairs of these integers resulting in 8x
+ // 32-bit integers
+ const __m256i d0 = _mm256_madd_epi16(*dgd_ijkl, s1);
+ // d0 = [a b c d] [e f g h] as u32
+
+ // Take the lower-half of d0, extend to u64, add it on to dst (H)
+ const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0));
+ // d0l = [a b] [c d] as u64
+ const __m256i dst0 = yy_load_256(dst);
+ yy_store_256(dst, _mm256_add_epi64(d0l, dst0));
+
+ // Take the upper-half of d0, extend to u64, add it on to dst (H)
+ const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1));
+ // d0h = [e f] [g h] as u64
+ const __m256i dst1 = yy_load_256(dst + 4);
+ yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1));
+}
+
+static INLINE void acc_stat_highbd_win7_one_line_avx2(
+ const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+ int dgd_stride, const __m256i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN],
+ int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint16_t X1 = src[j];
+ const uint16_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ const uint16_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ // Load two u16 values from dgd_ijkl combined as a u32,
+ // then broadcast to 8x u32 slots of a 256
+ const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l));
+ // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
+
+ acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint16_t X1 = src[j];
+ *sumX += X1;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_highbd_avx2` function wants its input to have
+ // interleaved copies of two pixels, but we only have one. However, the
+ // pixels are (effectively) used as inputs to a multiply-accumulate. So
+ // if we set the extra pixel slot to 0, then it is effectively ignored.
+ const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1);
+
+ acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_highbd_win7_opt_avx2(
+ const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+ int64_t *H, aom_bit_depth_t bit_depth) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_highbd_win7_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
+ }
+ }
+
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = (M_int[k][l] +
+ (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+ bit_depth_divider;
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] =
+ (H_int_[n * 8 + m] +
+ (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+ bit_depth_divider;
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_highbd_win5_one_line_avx2(
+ const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+ int dgd_stride, const __m256i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint16_t X1 = src[j];
+ const uint16_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ const uint16_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ // Load two u16 values from dgd_ijkl combined as a u32,
+ // then broadcast to 8x u32 slots of a 256
+ const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l));
+ // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
+
+ acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint16_t X1 = src[j];
+ *sumX += X1;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_highbd_avx2` function wants its input to have
+ // interleaved copies of two pixels, but we only have one. However, the
+ // pixels are (effectively) used as inputs to a multiply-accumulate. So
+ // if we set the extra pixel slot to 0, then it is effectively ignored.
+ const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1);
+
+ acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_highbd_win5_opt_avx2(
+ const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+ int64_t *H, aom_bit_depth_t bit_depth) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ DECLARE_ALIGNED(
+ 32, int64_t,
+ H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_highbd_win5_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int64, H_int64);
+ }
+ }
+
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = (M_int64[k][l] +
+ (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+ bit_depth_divider;
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] =
+ (H_int_[n * 8 + m] +
+ (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+ bit_depth_divider;
+ }
+ }
+ }
+ }
+}
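+
+// The normalization above is the usual mean-removal identity (a sketch of the
+// algebra, using the names from the function above and writing N for
+// pixel_count, before the bit-depth scaling):
+//   sum((D - avg) * (X - avg))
+//     = sum(D * X) - avg * (sum(D) + sum(X)) + N * avg * avg
+//     = M_int64[k][l] - avg * (sumY[k][l] + sumX) + avg_square_sum
+// Each H entry is normalized analogously, with two dgd taps in place of
+// (D, X).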
+
+void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8,
+ const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H,
+ bit_depth);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H,
+ bit_depth);
+ } else {
+ av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H, bit_depth);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void madd_and_accum_avx2(__m256i src, __m256i dgd, __m256i *sum) {
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(src, dgd));
+}
+
+static INLINE __m256i convert_and_add_avx2(__m256i src) {
+ const __m256i s0 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src));
+ const __m256i s1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src, 1));
+ return _mm256_add_epi64(s0, s1);
+}
+
+static INLINE __m256i hadd_four_32_to_64_avx2(__m256i src0, __m256i src1,
+ __m256i *src2, __m256i *src3) {
+ // 00 01 10 11 02 03 12 13
+ const __m256i s_0 = _mm256_hadd_epi32(src0, src1);
+ // 20 21 30 31 22 23 32 33
+ const __m256i s_1 = _mm256_hadd_epi32(*src2, *src3);
+ // 00+01 10+11 20+21 30+31 02+03 12+13 22+23 32+33
+ const __m256i s_2 = _mm256_hadd_epi32(s_0, s_1);
+ return convert_and_add_avx2(s_2);
+}
+
+static INLINE __m128i add_64bit_lvl_avx2(__m256i src0, __m256i src1) {
+ // 00 10 02 12
+ const __m256i t0 = _mm256_unpacklo_epi64(src0, src1);
+ // 01 11 03 13
+ const __m256i t1 = _mm256_unpackhi_epi64(src0, src1);
+ // 00+01 10+11 02+03 12+13
+ const __m256i sum = _mm256_add_epi64(t0, t1);
+ // 00+01 10+11
+ const __m128i sum0 = _mm256_castsi256_si128(sum);
+ // 02+03 12+13
+ const __m128i sum1 = _mm256_extracti128_si256(sum, 1);
+ // 00+01+02+03 10+11+12+13
+ return _mm_add_epi64(sum0, sum1);
+}
+
+static INLINE __m128i convert_32_to_64_add_avx2(__m256i src0, __m256i src1) {
+ // 00 01 02 03
+ const __m256i s0 = convert_and_add_avx2(src0);
+ // 10 11 12 13
+ const __m256i s1 = convert_and_add_avx2(src1);
+ return add_64bit_lvl_avx2(s0, s1);
+}
+
+static INLINE int32_t calc_sum_of_register(__m256i src) {
+ const __m128i src_l = _mm256_castsi256_si128(src);
+ const __m128i src_h = _mm256_extracti128_si256(src, 1);
+ const __m128i sum = _mm_add_epi32(src_l, src_h);
+ const __m128i dst0 = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ const __m128i dst1 = _mm_add_epi32(dst0, _mm_srli_si128(dst0, 4));
+ return _mm_cvtsi128_si32(dst1);
+}
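+
+// An illustrative trace (not library code): with eight 32-bit lanes
+// [a b c d | e f g h], adding the two 128-bit halves gives [a+e b+f c+g d+h];
+// the 8-byte shift-add folds this to [a+e+c+g b+f+d+h . .]; and the final
+// 4-byte shift-add leaves the full sum a+b+c+d+e+f+g+h in lane 0, which is
+// what _mm_cvtsi128_si32() extracts.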
+
+static INLINE void transpose_64bit_4x4_avx2(const __m256i *const src,
+ __m256i *const dst) {
+ // Unpack 64 bit elements. Goes from:
+ // src[0]: 00 01 02 03
+ // src[1]: 10 11 12 13
+ // src[2]: 20 21 22 23
+ // src[3]: 30 31 32 33
+ // to:
+ // reg0: 00 10 02 12
+ // reg1: 20 30 22 32
+ // reg2: 01 11 03 13
+ // reg3: 21 31 23 33
+ const __m256i reg0 = _mm256_unpacklo_epi64(src[0], src[1]);
+ const __m256i reg1 = _mm256_unpacklo_epi64(src[2], src[3]);
+ const __m256i reg2 = _mm256_unpackhi_epi64(src[0], src[1]);
+ const __m256i reg3 = _mm256_unpackhi_epi64(src[2], src[3]);
+
+ // Unpack 64 bit elements resulting in:
+ // dst[0]: 00 10 20 30
+ // dst[1]: 01 11 21 31
+ // dst[2]: 02 12 22 32
+ // dst[3]: 03 13 23 33
+ dst[0] = _mm256_inserti128_si256(reg0, _mm256_castsi256_si128(reg1), 1);
+ dst[1] = _mm256_inserti128_si256(reg2, _mm256_castsi256_si128(reg3), 1);
+ dst[2] = _mm256_inserti128_si256(reg1, _mm256_extracti128_si256(reg0, 1), 0);
+ dst[3] = _mm256_inserti128_si256(reg3, _mm256_extracti128_si256(reg2, 1), 0);
+}
+
+// When we load 32 values of int8_t type but need fewer than 32 values for
+// processing, the below mask is used to zero out the extra values.
+static const int8_t mask_8bit[32] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 16 bytes
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 bytes
+};
+
+// When we load 16 values of int16_t type but need fewer than 16 values for
+// processing, the below mask is used to zero out the extra values.
+static const int16_t mask_16bit[32] = {
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 16 elements
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,   // 16 elements
+};
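+
+// A worked example (illustrative): for width = 40 in calc_dgd_buf_avg_avx2()
+// below, wd_beyond_mul32 = 8, so mask_low is loaded from &mask_8bit[16 - 8]
+// and picks up 8 bytes of -1 followed by 8 bytes of 0 -- exactly the 8 valid
+// lanes enabled and the extra lanes zeroed.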
+
+static INLINE uint8_t calc_dgd_buf_avg_avx2(const uint8_t *src, int32_t h_start,
+ int32_t h_end, int32_t v_start,
+ int32_t v_end, int32_t stride) {
+ const uint8_t *src_temp = src + v_start * stride + h_start;
+ const __m256i zero = _mm256_setzero_si256();
+ const int32_t width = h_end - h_start;
+ const int32_t height = v_end - v_start;
+ const int32_t wd_beyond_mul32 = width & 31;
+ const int32_t wd_mul32 = width - wd_beyond_mul32;
+ __m128i mask_low, mask_high;
+ __m256i ss = zero;
+
+  // When the width is not a multiple of 32, the loads still fetch 32 bytes;
+  // the mask below zeroes out the extra (beyond-required) data.
+ if (wd_beyond_mul32 >= 16) {
+ mask_low = _mm_set1_epi8(-1);
+ mask_high = _mm_loadu_si128((__m128i *)(&mask_8bit[32 - wd_beyond_mul32]));
+ } else {
+ mask_low = _mm_loadu_si128((__m128i *)(&mask_8bit[16 - wd_beyond_mul32]));
+ mask_high = _mm_setzero_si128();
+ }
+ const __m256i mask =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(mask_low), mask_high, 1);
+
+ int32_t proc_ht = 0;
+ do {
+ // Process width in multiple of 32.
+ int32_t proc_wd = 0;
+ while (proc_wd < wd_mul32) {
+ const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd));
+ const __m256i sad_0 = _mm256_sad_epu8(s_0, zero);
+ ss = _mm256_add_epi32(ss, sad_0);
+ proc_wd += 32;
+ }
+
+ // Process the remaining width.
+ if (wd_beyond_mul32) {
+ const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd));
+ const __m256i s_m_0 = _mm256_and_si256(s_0, mask);
+ const __m256i sad_0 = _mm256_sad_epu8(s_m_0, zero);
+ ss = _mm256_add_epi32(ss, sad_0);
+ }
+ src_temp += stride;
+ proc_ht++;
+ } while (proc_ht < height);
+
+ const uint32_t sum = calc_sum_of_register(ss);
+ const uint8_t avg = sum / (width * height);
+ return avg;
+}
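+
+#if 0
+// Scalar reference for calc_dgd_buf_avg_avx2() above (an illustrative sketch,
+// deliberately disabled; it is not part of the library). The SIMD path
+// computes the same plain mean of the window, via _mm256_sad_epu8 partial
+// sums.
+static uint8_t calc_dgd_buf_avg_c_sketch(const uint8_t *src, int32_t h_start,
+                                         int32_t h_end, int32_t v_start,
+                                         int32_t v_end, int32_t stride) {
+  uint32_t sum = 0;
+  for (int32_t r = v_start; r < v_end; ++r)
+    for (int32_t c = h_start; c < h_end; ++c) sum += src[r * stride + c];
+  return (uint8_t)(sum / (uint32_t)((h_end - h_start) * (v_end - v_start)));
+}
+#endif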
+
+// Fill the (src-avg) or (dgd-avg) buffer. Note that when n = (width % 16) is
+// not 0, this writes (16 - n) more values than required.
+static INLINE void sub_avg_block_avx2(const uint8_t *src, int32_t src_stride,
+ uint8_t avg, int32_t width,
+ int32_t height, int16_t *dst,
+ int32_t dst_stride,
+ int use_downsampled_wiener_stats) {
+ const __m256i avg_reg = _mm256_set1_epi16(avg);
+
+ int32_t proc_ht = 0;
+ do {
+ int ds_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ if (use_downsampled_wiener_stats &&
+ (height - proc_ht < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+ ds_factor = height - proc_ht;
+ }
+
+ int32_t proc_wd = 0;
+ while (proc_wd < width) {
+ const __m128i s = _mm_loadu_si128((__m128i *)(src + proc_wd));
+ const __m256i ss = _mm256_cvtepu8_epi16(s);
+ const __m256i d = _mm256_sub_epi16(ss, avg_reg);
+ _mm256_storeu_si256((__m256i *)(dst + proc_wd), d);
+ proc_wd += 16;
+ }
+
+ src += ds_factor * src_stride;
+ dst += ds_factor * dst_stride;
+ proc_ht += ds_factor;
+ } while (proc_ht < height);
+}
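+
+#if 0
+// Scalar reference for sub_avg_block_avx2() above (an illustrative sketch,
+// deliberately disabled). With downsampling enabled, only one row in every
+// WIENER_STATS_DOWNSAMPLE_FACTOR rows is visited; the rows in between are
+// left untouched and are skipped again by the stats loops that follow.
+static void sub_avg_block_c_sketch(const uint8_t *src, int32_t src_stride,
+                                   uint8_t avg, int32_t width, int32_t height,
+                                   int16_t *dst, int32_t dst_stride,
+                                   int use_downsampled_wiener_stats) {
+  const int step =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+  for (int32_t r = 0; r < height; r += step)
+    for (int32_t c = 0; c < width; ++c)
+      dst[r * dst_stride + c] = (int16_t)(src[r * src_stride + c] - avg);
+}
+#endif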
+
+// Fills the lower-triangular elements of the H buffer from the
+// upper-triangular elements of the same buffer.
+static INLINE void fill_lower_triag_elements_avx2(const int32_t wiener_win2,
+ int64_t *const H) {
+ for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
+ __m256i in[4], out[4];
+
+ in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + i + 1));
+ in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + i + 1));
+ in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + i + 1));
+ in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i + 1));
+
+ transpose_64bit_4x4_avx2(in, out);
+
+ _mm_storel_epi64((__m128i *)(H + (i + 1) * wiener_win2 + i),
+ _mm256_castsi256_si128(out[0]));
+ _mm_storeu_si128((__m128i *)(H + (i + 2) * wiener_win2 + i),
+ _mm256_castsi256_si128(out[1]));
+ _mm256_storeu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i), out[2]);
+ _mm256_storeu_si256((__m256i *)(H + (i + 4) * wiener_win2 + i), out[3]);
+
+ for (int32_t j = i + 5; j < wiener_win2; j += 4) {
+ in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + j));
+ in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + j));
+ in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + j));
+ in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + j));
+
+ transpose_64bit_4x4_avx2(in, out);
+
+ _mm256_storeu_si256((__m256i *)(H + (j + 0) * wiener_win2 + i), out[0]);
+ _mm256_storeu_si256((__m256i *)(H + (j + 1) * wiener_win2 + i), out[1]);
+ _mm256_storeu_si256((__m256i *)(H + (j + 2) * wiener_win2 + i), out[2]);
+ _mm256_storeu_si256((__m256i *)(H + (j + 3) * wiener_win2 + i), out[3]);
+ }
+ }
+}
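+
+#if 0
+// Scalar reference for fill_lower_triag_elements_avx2() above (an
+// illustrative sketch, deliberately disabled): mirror the upper triangle of
+// the wiener_win2 x wiener_win2 matrix H across its diagonal.
+static void fill_lower_triag_c_sketch(int32_t wiener_win2, int64_t *H) {
+  for (int32_t i = 0; i < wiener_win2; ++i)
+    for (int32_t j = i + 1; j < wiener_win2; ++j)
+      H[j * wiener_win2 + i] = H[i * wiener_win2 + j];
+}
+#endif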
+
+// Fill H buffer based on loop_count.
+#define INIT_H_VALUES(d, loop_count) \
+ for (int g = 0; g < (loop_count); g++) { \
+ const __m256i dgd0 = \
+ _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \
+ madd_and_accum_avx2(dgd_mul_df, dgd0, &sum_h[g]); \
+ }
+
+// Fill M & H buffer.
+#define INIT_MH_VALUES(d) \
+ for (int g = 0; g < wiener_win; g++) { \
+ const __m256i dgds_0 = \
+ _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \
+ madd_and_accum_avx2(src_mul_df, dgds_0, &sum_m[g]); \
+ madd_and_accum_avx2(dgd_mul_df, dgds_0, &sum_h[g]); \
+ }
+
+// Derive the starting 'j' index, set up the dgd pointers and the initial
+// downsample factor, and zero the sum_h accumulators.
+#define INITIALIZATION(wiener_window_sz) \
+ j = i / (wiener_window_sz); \
+ const int16_t *d_window = d + j; \
+ const int16_t *d_current_row = \
+ d + j + ((i % (wiener_window_sz)) * d_stride); \
+ int proc_ht = v_start; \
+ downsample_factor = \
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \
+ __m256i sum_h[wiener_window_sz]; \
+ memset(sum_h, 0, sizeof(sum_h));
+
+// Update the downsample factor appropriately.
+#define UPDATE_DOWNSAMPLE_FACTOR \
+ int proc_wd = 0; \
+ if (use_downsampled_wiener_stats && \
+ ((v_end - proc_ht) < WIENER_STATS_DOWNSAMPLE_FACTOR)) { \
+ downsample_factor = v_end - proc_ht; \
+ } \
+ const __m256i df_reg = _mm256_set1_epi16(downsample_factor);
+
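+// Fill the remaining 'j' blocks (one block of wiener_win H values per
+// iteration) of row 'i' of the H buffer, for the 5-tap window.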
+#define CALCULATE_REMAINING_H_WIN5 \
+ while (j < wiener_win) { \
+ d_window = d; \
+ d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \
+ const __m256i zero = _mm256_setzero_si256(); \
+ sum_h[0] = zero; \
+ sum_h[1] = zero; \
+ sum_h[2] = zero; \
+ sum_h[3] = zero; \
+ sum_h[4] = zero; \
+ \
+ proc_ht = v_start; \
+ downsample_factor = \
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \
+ do { \
+ UPDATE_DOWNSAMPLE_FACTOR; \
+ \
+ /* Process the amount of width multiple of 16.*/ \
+ while (proc_wd < wd_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 5) \
+ \
+ proc_wd += 16; \
+ }; \
+ \
+ /* Process the remaining width here. */ \
+ if (wd_beyond_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 5) \
+ } \
+ proc_ht += downsample_factor; \
+ d_window += downsample_factor * d_stride; \
+ d_current_row += downsample_factor * d_stride; \
+ } while (proc_ht < v_end); \
+ const __m256i s_h0 = \
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \
+ s_h0); \
+ const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); \
+ const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); \
+ _mm_storel_epi64( \
+ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0); \
+ j++; \
+ }
+
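+// As above, but fills the remaining 'j' blocks of row 'i' of the H buffer for
+// the 7-tap window.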
+#define CALCULATE_REMAINING_H_WIN7 \
+ while (j < wiener_win) { \
+ d_window = d; \
+ d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \
+ const __m256i zero = _mm256_setzero_si256(); \
+ sum_h[0] = zero; \
+ sum_h[1] = zero; \
+ sum_h[2] = zero; \
+ sum_h[3] = zero; \
+ sum_h[4] = zero; \
+ sum_h[5] = zero; \
+ sum_h[6] = zero; \
+ \
+ proc_ht = v_start; \
+ downsample_factor = \
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \
+ do { \
+ UPDATE_DOWNSAMPLE_FACTOR; \
+ \
+ /* Process the amount of width multiple of 16.*/ \
+ while (proc_wd < wd_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 7) \
+ \
+ proc_wd += 16; \
+ }; \
+ \
+ /* Process the remaining width here. */ \
+ if (wd_beyond_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 7) \
+ } \
+ proc_ht += downsample_factor; \
+ d_window += downsample_factor * d_stride; \
+ d_current_row += downsample_factor * d_stride; \
+ } while (proc_ht < v_end); \
+ const __m256i s_h1 = \
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \
+ s_h1); \
+ const __m256i s_h2 = \
+ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); \
+ _mm256_storeu_si256( \
+ (__m256i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_h2); \
+ j++; \
+ }
+
+// The buffers H (auto-covariance) and M (cross-correlation) are used to
+// estimate the filter tap values required for wiener filtering. Here, the
+// buffer H is of size ((wiener_window_size^2) * (wiener_window_size^2)) and M
+// is of size (wiener_window_size * wiener_window_size). H is a symmetric
+// matrix where the values above the diagonal (upper triangle) are equal to
+// the values below the diagonal (lower triangle). The calculation of the
+// elements/stats of H (upper triangle) and M is done in steps as described
+// below, where each step fills specific values of H and M.
+// Once the upper-triangular elements of the H matrix are derived, they are
+// copied to the lower triangle using the function
+// fill_lower_triag_elements_avx2().
+// Example: Wiener window size = WIENER_WIN_CHROMA (5)
+// M buffer = [M0 M1 M2 ---- M23 M24]
+// H buffer = Hxy (x-row, y-column)
+// [H00 H01 H02 ---- H023 H024]
+// [H10 H11 H12 ---- H123 H124]
+// [H20 H21 H22 ---- H223 H224]
+// [H30 H31 H32 ---- H323 H324]
+// [H40 H41 H42 ---- H423 H424]
+// [H50 H51 H52 ---- H523 H524]
+// [H60 H61 H62 ---- H623 H624]
+// ||
+// ||
+// [H230 H231 H232 ---- H2323 H2324]
+// [H240 H241 H242 ---- H2423 H2424]
+// In Step 1, the whole M buffer (i.e., M0 to M24) and the first row of H
+// (i.e., H00 to H024) are filled. The remaining rows of the H buffer are
+// filled through steps 2 to 6.
+static void compute_stats_win5_avx2(const int16_t *const d, int32_t d_stride,
+ const int16_t *const s, int32_t s_stride,
+ int32_t width, int v_start, int v_end,
+ int64_t *const M, int64_t *const H,
+ int use_downsampled_wiener_stats) {
+ const int32_t wiener_win = WIENER_WIN_CHROMA;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+  // Amount of width which is beyond a multiple of 16. This remainder is
+  // handled separately so that only the required width is processed at the
+  // end of each row.
+ const int32_t wd_mul16 = width & ~15;
+ const int32_t wd_beyond_mul16 = width - wd_mul16;
+ const __m256i mask =
+ _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16]));
+ int downsample_factor;
+
+  // Step 1: The full M (i.e., M0 to M24) and the first row of H (i.e., H00 to
+  // H024) are filled here. The loop over 'j' is executed for values 0 to 4
+  // (wiener_win - 1). Each iteration of 'j' fills 5 values of M and H as shown
+  // below:
+  // j=0: M0-M4 and H00-H04 are filled, j=1: M5-M9 and H05-H09 are filled, etc.
+ int j = 0;
+ do {
+ const int16_t *s_t = s;
+ const int16_t *d_t = d;
+ __m256i sum_m[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
+ __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ int proc_ht = v_start;
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mask = _mm256_and_si256(src, mask);
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+ }
+ proc_ht += downsample_factor;
+ s_t += downsample_factor * s_stride;
+ d_t += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i s_m =
+ hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]);
+ const __m128i s_m_h = convert_32_to_64_add_avx2(sum_m[4], sum_h[4]);
+ _mm256_storeu_si256((__m256i *)(M + wiener_win * j), s_m);
+ _mm_storel_epi64((__m128i *)&M[wiener_win * j + 4], s_m_h);
+
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + wiener_win * j), s_h);
+ _mm_storeh_epi64((__m128i *)&H[wiener_win * j + 4], s_m_h);
+ } while (++j < wiener_win);
+
+  // The below steps fill the remaining rows of the H buffer. The aim is to
+  // fill only the upper-triangle elements corresponding to each row; the
+  // lower-triangle elements are copied from the upper-triangle elements.
+  // Also, as mentioned in Step 1, the core function is designed to fill 5
+  // elements/stats/values of the H buffer per call.
+ //
+  // Step 2: Here, rows 1, 6, 11, 16 and 21 are filled. As only upper-triangle
+  // elements are needed, H10 from row1, H60-H65 from row6, etc. need not be
+  // filled. As the core function processes 5 values, the first iteration of
+  // 'j' fills only 4 values, i.e., H11-H14 from row1, H66-H69 from row6, etc.
+ for (int i = 1; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+  // Step 3: Here, rows 2, 7, 12, 17 and 22 are filled. As only upper-triangle
+  // elements are needed, H20-H21 from row2, H70-H76 from row7, etc. need not
+  // be filled. As the core function processes 5 values, the first iteration of
+  // 'j' fills only 3 values, i.e., H22-H24 from row2, H77-H79 from row7, etc.
+ for (int i = 2; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+  // Step 4: Here, rows 3, 8, 13, 18 and 23 are filled. As only upper-triangle
+  // elements are needed, H30-H32 from row3, H80-H87 from row8, etc. need not
+  // be filled. As the core function processes 5 values, the first iteration of
+  // 'j' fills only 2 values, i.e., H33-H34 from row3, H88-H89 from row8, etc.
+ for (int i = 3; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]);
+ _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+  // Step 5: Here, rows 4, 9, 14, 19 and 24 are filled. As only upper-triangle
+  // elements are needed, H40-H43 from row4, H90-H98 from row9, etc. need not
+  // be filled. As the core function processes 5 values, the first iteration of
+  // 'j' fills only 1 value, i.e., H44 from row4, H99 from row9, etc.
+ for (int i = 4; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]);
+ _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+  // Step 6: Here, rows 5, 10, 15 and 20 are filled. As only upper-triangle
+  // elements are needed, H50-H54 from row5, H100-H109 from row10, etc. need
+  // not be filled. The first iteration of 'j' fills H55-H59 from row5,
+  // H1010-H1014 from row10, etc.
+ for (int i = 5; i < wiener_win2; i += wiener_win) {
+ // Derive j'th iteration from where the H buffer filling needs to be
+ // started.
+ j = i / wiener_win;
+ int shift = 0;
+ do {
+ // Update the dgd pointers appropriately.
+ int proc_ht = v_start;
+ const int16_t *d_window = d + (i / wiener_win);
+ const int16_t *d_current_row =
+ d + (i / wiener_win) + ((i % wiener_win) * d_stride);
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 5)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 5)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)),
+ s_h);
+ const __m256i s_m_h = convert_and_add_avx2(sum_h[4]);
+ const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h);
+ _mm_storel_epi64(
+ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0);
+ shift++;
+ } while (++j < wiener_win);
+ }
+
+ fill_lower_triag_elements_avx2(wiener_win2, H);
+}
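+
+#if 0
+// Scalar reference (an illustrative sketch, deliberately disabled; the exact
+// tap ordering is simplified relative to the library layout) of the
+// statistics the steps above accumulate. With the mean-removed buffers
+// d = (dgd - avg) and s = (src - avg) prepared by sub_avg_block_avx2(), every
+// M entry is a cross-correlation of one window tap against the source and
+// every H entry is an auto-correlation of two window taps, which is why H is
+// symmetric and only its upper triangle is computed directly. M and H are
+// assumed to arrive zero-initialized.
+static void compute_stats_c_sketch(const int16_t *d, int32_t d_stride,
+                                   const int16_t *s, int32_t s_stride,
+                                   int32_t width, int32_t height,
+                                   int32_t wiener_win, int64_t *M,
+                                   int64_t *H) {
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  for (int32_t y = 0; y < height; ++y) {
+    for (int32_t x = 0; x < width; ++x) {
+      for (int32_t a = 0; a < wiener_win2; ++a) {
+        const int16_t da =
+            d[(y + a / wiener_win) * d_stride + x + a % wiener_win];
+        M[a] += (int64_t)da * s[y * s_stride + x];
+        for (int32_t b = 0; b < wiener_win2; ++b) {
+          const int16_t db =
+              d[(y + b / wiener_win) * d_stride + x + b % wiener_win];
+          H[a * wiener_win2 + b] += (int64_t)da * db;
+        }
+      }
+    }
+  }
+}
+#endif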
+
+// The buffers H (auto-covariance) and M (cross-correlation) are used to
+// estimate the filter tap values required for wiener filtering. Here, the
+// buffer H is of size ((wiener_window_size^2) * (wiener_window_size^2)) and M
+// is of size (wiener_window_size * wiener_window_size). H is a symmetric
+// matrix where the values above the diagonal (upper triangle) are equal to
+// the values below the diagonal (lower triangle). The calculation of the
+// elements/stats of H (upper triangle) and M is done in steps as described
+// below, where each step fills specific values of H and M.
+// Example:
+// Wiener window size = WIENER_WIN (7)
+// M buffer = [M0 M1 M2 ---- M47 M48]
+// H buffer = Hxy (x-row, y-column)
+// [H00 H01 H02 ---- H047 H048]
+// [H10 H11 H12 ---- H147 H148]
+// [H20 H21 H22 ---- H247 H248]
+// [H30 H31 H32 ---- H347 H348]
+// [H40 H41 H42 ---- H447 H448]
+// [H50 H51 H52 ---- H547 H548]
+// [H60 H61 H62 ---- H647 H648]
+// ||
+// ||
+// [H470 H471 H472 ---- H4747 H4748]
+// [H480 H481 H482 ---- H4847 H4848]
+// In Step 1, the whole M buffer (i.e., M0 to M48) and the first row of H
+// (i.e., H00 to H048) are filled. The remaining rows of the H buffer are
+// filled through steps 2 to 8.
+static void compute_stats_win7_avx2(const int16_t *const d, int32_t d_stride,
+ const int16_t *const s, int32_t s_stride,
+ int32_t width, int v_start, int v_end,
+ int64_t *const M, int64_t *const H,
+ int use_downsampled_wiener_stats) {
+ const int32_t wiener_win = WIENER_WIN;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+  // Amount of width which is beyond a multiple of 16. This remainder is
+  // handled separately so that only the required width is processed at the
+  // end of each row.
+ const int32_t wd_mul16 = width & ~15;
+ const int32_t wd_beyond_mul16 = width - wd_mul16;
+ const __m256i mask =
+ _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16]));
+ int downsample_factor;
+
+  // Step 1: The full M (i.e., M0 to M48) and the first row of H (i.e., H00 to
+  // H048) are filled here. The loop over 'j' is executed for values 0 to 6
+  // (wiener_win - 1). Each iteration of 'j' fills 7 values of M and H as shown
+  // below:
+  // j=0: M0-M6 and H00-H06 are filled, j=1: M7-M13 and H07-H013 are filled,
+  // etc.
+ int j = 0;
+ do {
+ const int16_t *s_t = s;
+ const int16_t *d_t = d;
+ __m256i sum_m[WIENER_WIN] = { _mm256_setzero_si256() };
+ __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() };
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ int proc_ht = v_start;
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+
+ proc_wd += 16;
+ }
+
+ if (wd_beyond_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mask = _mm256_and_si256(src, mask);
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+ }
+ proc_ht += downsample_factor;
+ s_t += downsample_factor * s_stride;
+ d_t += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i s_m0 =
+ hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]);
+ const __m256i s_m1 =
+ hadd_four_32_to_64_avx2(sum_m[4], sum_m[5], &sum_m[6], &sum_m[6]);
+ _mm256_storeu_si256((__m256i *)(M + wiener_win * j + 0), s_m0);
+ _mm_storeu_si128((__m128i *)(M + wiener_win * j + 4),
+ _mm256_castsi256_si128(s_m1));
+ _mm_storel_epi64((__m128i *)&M[wiener_win * j + 6],
+ _mm256_extracti128_si256(s_m1, 1));
+
+ const __m256i sh_0 =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ const __m256i sh_1 =
+ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]);
+ _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 0), sh_0);
+ _mm_storeu_si128((__m128i *)(H + wiener_win * j + 4),
+ _mm256_castsi256_si128(sh_1));
+ _mm_storel_epi64((__m128i *)&H[wiener_win * j + 6],
+ _mm256_extracti128_si256(sh_1, 1));
+ } while (++j < wiener_win);
+
+  // The below steps fill the remaining rows of the H buffer. The aim is to
+  // fill only the upper-triangle elements corresponding to each row; the
+  // lower-triangle elements are copied from the upper-triangle elements.
+  // Also, as mentioned in Step 1, the core function is designed to fill 7
+  // elements/stats/values of the H buffer per call.
+ //
+  // Step 2: Here, rows 1, 8, 15, 22, 29, 36 and 43 are filled. As only
+  // upper-triangle elements are needed, H10 from row1, H80-H87 from row8,
+  // etc. need not be filled. As the core function processes 7 values, the
+  // first iteration of 'j' fills only 6 values, i.e., H11-H16 from row1,
+  // H88-H813 from row8, etc.
+ for (int i = 1; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+ const __m128i s_h0 = convert_32_to_64_add_avx2(sum_h[4], sum_h[5]);
+ _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i + 4), s_h0);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 3: Here, rows 2, 9, 16, 23, 30, 37 and 44 are filled. As only
+  // upper-triangle elements are needed, H20-H21 from row2, H90-H98 from row9,
+  // etc. need not be filled. As the core function processes 7 values, the
+  // first iteration of 'j' fills only 5 values, i.e., H22-H26 from row2,
+  // H99-H913 from row9, etc.
+ for (int i = 2; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+ const __m256i s_m_h = convert_and_add_avx2(sum_h[4]);
+ const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h);
+ _mm_storel_epi64((__m128i *)(H + (i * wiener_win2) + i + 4), s_m_h0);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 4: Here, rows 3, 10, 17, 24, 31, 38 and 45 are filled. As only
+  // upper-triangle elements are needed, H30-H32 from row3, H100-H109 from
+  // row10, etc. need not be filled. As the core function processes 7 values,
+  // the first iteration of 'j' fills only 4 values, i.e., H33-H36 from row3,
+  // H1010-H1013 from row10, etc.
+ for (int i = 3; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 5: Here, rows 4, 11, 18, 25, 32, 39 and 46 are filled. As only
+  // upper-triangle elements are needed, H40-H43 from row4, H110-H1110 from
+  // row11, etc. need not be filled. As the core function processes 7 values,
+  // the first iteration of 'j' fills only 3 values, i.e., H44-H46 from row4,
+  // H1111-H1113 from row11, etc.
+ for (int i = 4; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 6: Here, rows 5, 12, 19, 26, 33, 40 and 47 are filled. As only
+  // upper-triangle elements are needed, H50-H54 from row5, H120-H1211 from
+  // row12, etc. need not be filled. As the core function processes 7 values,
+  // the first iteration of 'j' fills only 2 values, i.e., H55-H56 from row5,
+  // H1212-H1213 from row12, etc.
+ for (int i = 5; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 7: Here, rows 6, 13, 20, 27, 34, 41 and 48 are filled. As only
+  // upper-triangle elements are needed, H60-H65 from row6, H130-H1312 from
+  // row13, etc. need not be filled. As the core function processes 7 values,
+  // the first iteration of 'j' fills only 1 value, i.e., H66 from row6, H1313
+  // from row13, etc.
+ for (int i = 6; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ xx_storel_64(&H[(i * wiener_win2) + i], _mm256_castsi256_si128(s_h));
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 8: Here, rows 7, 14, 21, 28, 35 and 42 are filled. As only
+  // upper-triangle elements are needed, H70-H76 from row7, H140-H1413 from
+  // row14, etc. need not be filled. The first iteration of 'j' fills H77-H713
+  // from row7, H1414-H1420 from row14, etc.
+ for (int i = 7; i < wiener_win2; i += wiener_win) {
+ // Derive j'th iteration from where the H buffer filling needs to be
+ // started.
+ j = i / wiener_win;
+ int shift = 0;
+ do {
+ // Update the dgd pointers appropriately.
+ int proc_ht = v_start;
+ const int16_t *d_window = d + (i / WIENER_WIN);
+ const int16_t *d_current_row =
+ d + (i / WIENER_WIN) + ((i % WIENER_WIN) * d_stride);
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() };
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 7)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 7)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i sh_0 =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ const __m256i sh_1 =
+ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)),
+ sh_0);
+ _mm_storeu_si128(
+ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4),
+ _mm256_castsi256_si128(sh_1));
+ _mm_storel_epi64((__m128i *)&H[(i * wiener_win2) + (wiener_win * j) + 6],
+ _mm256_extracti128_si256(sh_1, 1));
+ shift++;
+ } while (++j < wiener_win);
+ }
+
+ fill_lower_triag_elements_avx2(wiener_win2, H);
+}
+
+void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ if (wiener_win != WIENER_WIN && wiener_win != WIENER_WIN_CHROMA) {
+    // Currently, libaom supports Wiener filter processing only for the window
+    // sizes WIENER_WIN_CHROMA (5) and WIENER_WIN (7). There is no SIMD support
+    // for any other window size, so fall back to the C function.
+ av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H,
+ use_downsampled_wiener_stats);
+ return;
+ }
+
+ const int32_t wiener_halfwin = wiener_win >> 1;
+ const uint8_t avg =
+ calc_dgd_buf_avg_avx2(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+ const int32_t width = h_end - h_start;
+ const int32_t height = v_end - v_start;
+ const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15;
+ const int32_t s_stride = (width + 15) & ~15;
+
+  // Compute the (src-avg) buffer here, which is used to fill the M buffer.
+  // Based on the speed feature 'use_downsampled_wiener_stats', it is computed
+  // either for one row in every WIENER_STATS_DOWNSAMPLE_FACTOR rows or for
+  // every row.
+ sub_avg_block_avx2(src + v_start * src_stride + h_start, src_stride, avg,
+ width, height, src_avg, s_stride,
+ use_downsampled_wiener_stats);
+
+  // Compute the (dgd-avg) buffer here, which is used to fill the H buffer.
+ sub_avg_block_avx2(
+ dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin,
+ dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin,
+ dgd_avg, d_stride, 0);
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_avx2(dgd_avg, d_stride, src_avg, s_stride, width,
+ v_start, v_end, M, H, use_downsampled_wiener_stats);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_avx2(dgd_avg, d_stride, src_avg, s_stride, width,
+ v_start, v_end, M, H, use_downsampled_wiener_stats);
+ }
+}
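+
+// A worked example of the stride rounding above (illustrative): with
+// width = 100 and wiener_win = 7 (wiener_halfwin = 3),
+//   d_stride = (100 + 6 + 15) & ~15 = 112
+//   s_stride = (100 + 15) & ~15 = 112
+// i.e. both scratch buffers are padded to a multiple of 16 int16_t values so
+// that the 16-wide loops may safely over-read and over-write past the valid
+// width.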
+
+static INLINE __m256i pair_set_epi16(int a, int b) {
+ return _mm256_set1_epi32(
+ (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
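+
+// For example (illustrative): pair_set_epi16(xq[0], xq[1]) places xq[0] in
+// the low and xq[1] in the high 16 bits of every 32-bit lane, so that
+// _mm256_madd_epi16 against interleaved data (a0, b0, a1, b1, ...) produces
+// xq[0] * a + xq[1] * b per 32-bit lane -- the projection dot product used
+// below.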
+
+int64_t av1_lowbd_pixel_proj_error_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+ __m256i sum64 = _mm256_setzero_si256();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt0_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt0 + j),
+ yy_loadu_256(flt0 + j + 8)),
+ 0xd8);
+ const __m256i flt1_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+ yy_loadu_256(flt1 + j + 8)),
+ 0xd8);
+ const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
+ const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
+ const __m256i v0 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i v1 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+ const __m256i xq_coeff =
+ pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt + j),
+ yy_loadu_256(flt + j + 8)),
+ 0xd8);
+ const __m256i v0 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0));
+ const __m256i v1 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_active * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i diff0 = _mm256_sub_epi16(d0, s0);
+ const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64_0, sum64_1);
+ }
+ int64_t sum[4];
+ yy_storeu_256(sum, sum64);
+ err += sum[0] + sum[1] + sum[2] + sum[3];
+ return err;
+}
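+
+// In scalar form (a sketch that mirrors the remainder loops above), the
+// per-pixel error accumulated by av1_lowbd_pixel_proj_error_avx2() is
+//   u = dat[k] << SGRPROJ_RST_BITS
+//   v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u)  // terms drop if r == 0
+//   e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]
+// and the return value is the sum of e * e over the width x height block.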
+
+// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of
+// C and H need to be computed.
+static AOM_INLINE void calc_proj_params_r0_r1_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m256i h00, h01, h11, c0, c1;
+ const __m256i zero = _mm256_setzero_si256();
+ h01 = h11 = c0 = c1 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+ __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f1 = _mm256_sub_epi32(f1, d);
+ f2 = _mm256_sub_epi32(f2, d);
+
+ const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+ const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f1, 32));
+ h00 = _mm256_add_epi64(h00, h00_even);
+ h00 = _mm256_add_epi64(h00, h00_odd);
+
+ const __m256i h01_even = _mm256_mul_epi32(f1, f2);
+ const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f2, 32));
+ h01 = _mm256_add_epi64(h01, h01_even);
+ h01 = _mm256_add_epi64(h01, h01_odd);
+
+ const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+ const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+ _mm256_srli_epi64(f2, 32));
+ h11 = _mm256_add_epi64(h11, h11_even);
+ h11 = _mm256_add_epi64(h11, h11_odd);
+
+ const __m256i c0_even = _mm256_mul_epi32(f1, s);
+ const __m256i c0_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+ c0 = _mm256_add_epi64(c0, c0_even);
+ c0 = _mm256_add_epi64(c0, c0_odd);
+
+ const __m256i c1_even = _mm256_mul_epi32(f2, s);
+ const __m256i c1_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+ c1 = _mm256_add_epi64(c1, c1_even);
+ c1 = _mm256_add_epi64(c1, c1_odd);
+ }
+ }
+
+ __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+ const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+ c_low = _mm256_add_epi64(c_low, c_high);
+ const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+ _mm256_castsi256_si128(c_low));
+
+ __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+ const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+ h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+ const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+ _mm256_castsi256_si128(h0x_low));
+
+ // Using the symmetric properties of H, calculations of H[1][0] are not
+ // needed.
+ __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+ const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+ h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+ const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+ _mm256_castsi256_si128(h1x_low));
+
+ xx_storeu_128(C, c_128bit);
+ xx_storeu_128(H[0], h0x_128bit);
+ xx_storeu_128(H[1], h1x_128bit);
+
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+
+ // Since H is a symmetric matrix
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
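+
+#if 0
+// Scalar reference for calc_proj_params_r0_r1_avx2() above (an illustrative
+// sketch, deliberately disabled). With u = dat << SGRPROJ_RST_BITS,
+// s = (src << SGRPROJ_RST_BITS) - u and f0/f1 = flt0/flt1 - u, the function
+// builds the normal equations H[i][j] = sum(f_i * f_j) / size and
+// C[i] = sum(f_i * s) / size. H and C are assumed to arrive zero-initialized.
+static void calc_proj_params_r0_r1_c_sketch(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const int32_t u = (int32_t)(dat8[i * dat_stride + j] << SGRPROJ_RST_BITS);
+      const int32_t s =
+          (int32_t)(src8[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+      const int32_t f0 = flt0[i * flt0_stride + j] - u;
+      const int32_t f1 = flt1[i * flt1_stride + j] - u;
+      H[0][0] += (int64_t)f0 * f0;
+      H[0][1] += (int64_t)f0 * f1;
+      H[1][1] += (int64_t)f1 * f1;
+      C[0] += (int64_t)f0 * s;
+      C[1] += (int64_t)f1 * s;
+    }
+  }
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+  H[1][0] = H[0][1];  // H is symmetric.
+  C[0] /= size;
+  C[1] /= size;
+}
+#endif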
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m256i h00, c0;
+ const __m256i zero = _mm256_setzero_si256();
+ c0 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f1 = _mm256_sub_epi32(f1, d);
+
+ const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+ const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f1, 32));
+ h00 = _mm256_add_epi64(h00, h00_even);
+ h00 = _mm256_add_epi64(h00, h00_odd);
+
+ const __m256i c0_even = _mm256_mul_epi32(f1, s);
+ const __m256i c0_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+ c0 = _mm256_add_epi64(c0, c0_even);
+ c0 = _mm256_add_epi64(c0, c0_odd);
+ }
+ }
+ const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
+ _mm256_castsi256_si128(h00));
+ const __m128i h00_val =
+ _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
+
+ const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
+ _mm256_castsi256_si128(c0));
+ const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+ const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[0], h0x);
+
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+// Handles the case where only params->r[1] > 0, so only H[1][1] and C[1]
+// are non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_avx2(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8,
+ int dat_stride, int32_t *flt1,
+ int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m256i h11, c1;
+ const __m256i zero = _mm256_setzero_si256();
+ c1 = h11 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f2 = _mm256_sub_epi32(f2, d);
+
+ const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+ const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+ _mm256_srli_epi64(f2, 32));
+ h11 = _mm256_add_epi64(h11, h11_even);
+ h11 = _mm256_add_epi64(h11, h11_odd);
+
+ const __m256i c1_even = _mm256_mul_epi32(f2, s);
+ const __m256i c1_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+ c1 = _mm256_add_epi64(c1, c1_even);
+ c1 = _mm256_add_epi64(c1, c1_odd);
+ }
+ }
+
+ const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
+ _mm256_castsi256_si128(h11));
+ const __m128i h11_val =
+ _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+
+ const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
+ _mm256_castsi256_si128(c1));
+ const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+ const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[1], h1x);
+
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// AVX2 variant of av1_calc_proj_params_c.
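+// It fills H and C for the least-squares system H * xq = C from which the
+// projection coefficients xq are solved.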
+void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2],
+ int64_t C[2], const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride,
+ flt1, flt1_stride, H, C);
+ }
+}
+
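+// High bit depth analogue of calc_proj_params_r0_r1_avx2; only the source
+// and degraded-frame loads differ (16-bit instead of 8-bit samples).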
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m256i h00, h01, h11, c0, c1;
+ const __m256i zero = _mm256_setzero_si256();
+ h01 = h11 = c0 = c1 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+ __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+ __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f1 = _mm256_sub_epi32(f1, d);
+ f2 = _mm256_sub_epi32(f2, d);
+
+ const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+ const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f1, 32));
+ h00 = _mm256_add_epi64(h00, h00_even);
+ h00 = _mm256_add_epi64(h00, h00_odd);
+
+ const __m256i h01_even = _mm256_mul_epi32(f1, f2);
+ const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f2, 32));
+ h01 = _mm256_add_epi64(h01, h01_even);
+ h01 = _mm256_add_epi64(h01, h01_odd);
+
+ const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+ const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+ _mm256_srli_epi64(f2, 32));
+ h11 = _mm256_add_epi64(h11, h11_even);
+ h11 = _mm256_add_epi64(h11, h11_odd);
+
+ const __m256i c0_even = _mm256_mul_epi32(f1, s);
+ const __m256i c0_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+ c0 = _mm256_add_epi64(c0, c0_even);
+ c0 = _mm256_add_epi64(c0, c0_odd);
+
+ const __m256i c1_even = _mm256_mul_epi32(f2, s);
+ const __m256i c1_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+ c1 = _mm256_add_epi64(c1, c1_even);
+ c1 = _mm256_add_epi64(c1, c1_odd);
+ }
+ }
+
+ __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+ const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+ c_low = _mm256_add_epi64(c_low, c_high);
+ const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+ _mm256_castsi256_si128(c_low));
+
+ __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+ const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+ h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+ const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+ _mm256_castsi256_si128(h0x_low));
+
+  // Because H is symmetric, H[1][0] does not need to be computed here.
+ __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+ const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+ h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+ const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+ _mm256_castsi256_si128(h1x_low));
+
+ xx_storeu_128(C, c_128bit);
+ xx_storeu_128(H[0], h0x_128bit);
+ xx_storeu_128(H[1], h1x_128bit);
+
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+
+ // Since H is a symmetric matrix
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_high_bd_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m256i h00, c0;
+ const __m256i zero = _mm256_setzero_si256();
+ c0 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+ __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f1 = _mm256_sub_epi32(f1, d);
+
+ const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+ const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f1, 32));
+ h00 = _mm256_add_epi64(h00, h00_even);
+ h00 = _mm256_add_epi64(h00, h00_odd);
+
+ const __m256i c0_even = _mm256_mul_epi32(f1, s);
+ const __m256i c0_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+ c0 = _mm256_add_epi64(c0, c0_even);
+ c0 = _mm256_add_epi64(c0, c0_odd);
+ }
+ }
+ const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
+ _mm256_castsi256_si128(h00));
+ const __m128i h00_val =
+ _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
+
+ const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
+ _mm256_castsi256_si128(c0));
+ const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+ const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[0], h0x);
+
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_high_bd_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m256i h11, c1;
+ const __m256i zero = _mm256_setzero_si256();
+ c1 = h11 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+ __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f2 = _mm256_sub_epi32(f2, d);
+
+ const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+ const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+ _mm256_srli_epi64(f2, 32));
+ h11 = _mm256_add_epi64(h11, h11_even);
+ h11 = _mm256_add_epi64(h11, h11_odd);
+
+ const __m256i c1_even = _mm256_mul_epi32(f2, s);
+ const __m256i c1_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+ c1 = _mm256_add_epi64(c1, c1_even);
+ c1 = _mm256_add_epi64(c1, c1_odd);
+ }
+ }
+
+ const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
+ _mm256_castsi256_si128(h11));
+ const __m128i h11_val =
+ _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+
+ const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
+ _mm256_castsi256_si128(c1));
+ const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+ const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[1], h1x);
+
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// AVX2 variant of av1_calc_proj_params_high_bd_c.
+void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
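+// AVX2 variant of av1_highbd_pixel_proj_error_c.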
+int64_t av1_highbd_pixel_proj_error_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+ __m256i sum64 = _mm256_setzero_si256();
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled
+ const __m256i xq0 = _mm256_set1_epi32(xq[0]);
+ const __m256i xq1 = _mm256_set1_epi32(xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) { // Process 16 pixels at a time
+ // Load 16 pixels each from source image and corrupted image
+ const __m256i s0 = yy_loadu_256(src + j);
+ const __m256i d0 = yy_loadu_256(dat + j);
+ // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 (indices)
+
+ // Shift-up each pixel to match filtered image scaling
+ const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+
+ // Split u0 into two halves and pad each from u16 to i32
+ const __m256i u0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(u0));
+ const __m256i u0h =
+ _mm256_cvtepu16_epi32(_mm256_extracti128_si256(u0, 1));
+ // u0h, u0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32
+
+ // Load 16 pixels from each filtered image
+ const __m256i flt0l = yy_loadu_256(flt0 + j);
+ const __m256i flt0h = yy_loadu_256(flt0 + j + 8);
+ const __m256i flt1l = yy_loadu_256(flt1 + j);
+ const __m256i flt1h = yy_loadu_256(flt1 + j + 8);
+ // flt?l, flt?h = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32
+
+ // Subtract shifted corrupt image from each filtered image
+ const __m256i flt0l_subu = _mm256_sub_epi32(flt0l, u0l);
+ const __m256i flt0h_subu = _mm256_sub_epi32(flt0h, u0h);
+ const __m256i flt1l_subu = _mm256_sub_epi32(flt1l, u0l);
+ const __m256i flt1h_subu = _mm256_sub_epi32(flt1h, u0h);
+
+ // Multiply basis vectors by appropriate coefficients
+ const __m256i v0l = _mm256_mullo_epi32(flt0l_subu, xq0);
+ const __m256i v0h = _mm256_mullo_epi32(flt0h_subu, xq0);
+ const __m256i v1l = _mm256_mullo_epi32(flt1l_subu, xq1);
+ const __m256i v1h = _mm256_mullo_epi32(flt1h_subu, xq1);
+
+ // Add together the contributions from the two basis vectors
+ const __m256i vl = _mm256_add_epi32(v0l, v1l);
+ const __m256i vh = _mm256_add_epi32(v0h, v1h);
+
+ // Right-shift v with appropriate rounding
+ const __m256i vrl =
+ _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift);
+ const __m256i vrh =
+ _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift);
+ // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0]
+
+ // Saturate each i32 to an i16 then combine both halves
+        // The permute (control=[3 1 2 0]) restores linear pixel order after
+        // the in-lane pack interleaves the two 128-bit halves
+ const __m256i vr =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8);
+ // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0]
+ // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0]
+
+ // Add twin-subspace-sgr-filter to corrupt image then subtract source
+ const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+
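+      // Widen this row's i32 error sums to i64 before adding them to the
+      // running total so the accumulator cannot overflow.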
+ const __m256i sum32l =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+ sum64 = _mm256_add_epi64(sum64, sum32l);
+ const __m256i sum32h =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 16)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled
+ const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1];
+ const __m256i xq_active = _mm256_set1_epi32(xq_on);
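+    // The shift of dat is folded into its coefficient:
+    // xq * (flt - (dat << SGRPROJ_RST_BITS)) is computed as
+    // xq * flt + (-xq << SGRPROJ_RST_BITS) * dat.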
+ const __m256i xq_inactive =
+ _mm256_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ // Load 16 pixels from source image
+ const __m256i s0 = yy_loadu_256(src + j);
+ // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
+
+ // Load 16 pixels from corrupted image and pad each u16 to i32
+ const __m256i d0 = yy_loadu_256(dat + j);
+ const __m256i d0h =
+ _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d0, 1));
+ const __m256i d0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d0));
+ // d0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
+ // d0h, d0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
+
+ // Load 16 pixels from the filtered image
+ const __m256i flth = yy_loadu_256(flt + j + 8);
+ const __m256i fltl = yy_loadu_256(flt + j);
+ // flth, fltl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
+
+ const __m256i flth_xq = _mm256_mullo_epi32(flth, xq_active);
+ const __m256i fltl_xq = _mm256_mullo_epi32(fltl, xq_active);
+ const __m256i d0h_xq = _mm256_mullo_epi32(d0h, xq_inactive);
+ const __m256i d0l_xq = _mm256_mullo_epi32(d0l, xq_inactive);
+
+ const __m256i vh = _mm256_add_epi32(flth_xq, d0h_xq);
+ const __m256i vl = _mm256_add_epi32(fltl_xq, d0l_xq);
+
+ // Shift this down with appropriate rounding
+ const __m256i vrh =
+ _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift);
+ const __m256i vrl =
+ _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift);
+ // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
+
+ // Saturate each i32 to an i16 then combine both halves
+        // The permute (control=[3 1 2 0]) restores linear pixel order after
+        // the in-lane pack interleaves the two 128-bit halves
+ const __m256i vr =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8);
+ // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] as u16
+ // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
+
+        // Add twin-subspace-sgr filter output to the corrupt image, then
+        // subtract the source to get the error
+ const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+
+ const __m256i sum32l =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+ sum64 = _mm256_add_epi64(sum64, sum32l);
+ const __m256i sum32h =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 16)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_on * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ }
+ } else { // Neither filter is enabled
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 32; j += 32) {
+ // Load 2x16 u16 from source image
+ const __m256i s0l = yy_loadu_256(src + j);
+ const __m256i s0h = yy_loadu_256(src + j + 16);
+
+ // Load 2x16 u16 from corrupted image
+ const __m256i d0l = yy_loadu_256(dat + j);
+ const __m256i d0h = yy_loadu_256(dat + j + 16);
+
+        // Subtract source image from corrupted image
+ const __m256i diffl = _mm256_sub_epi16(d0l, s0l);
+ const __m256i diffh = _mm256_sub_epi16(d0h, s0h);
+
+ // Square error and add adjacent values
+ const __m256i err0l = _mm256_madd_epi16(diffl, diffl);
+ const __m256i err0h = _mm256_madd_epi16(diffh, diffh);
+
+ sum32 = _mm256_add_epi32(sum32, err0l);
+ sum32 = _mm256_add_epi32(sum32, err0h);
+ }
+
+ const __m256i sum32l =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+ sum64 = _mm256_add_epi64(sum64, sum32l);
+ const __m256i sum32h =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum32h);
+
+      // Process remaining pixels in this row (modulo 32)
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+  // Sum the four 64-bit lanes of sum64 into err
+ int64_t sum[4];
+ yy_storeu_256(sum, sum64);
+ err += sum[0] + sum[1] + sum[2] + sum[3];
+ return err;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 0000000000..50db305802
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,1483 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m128i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
+ const __m128i d1 =
+ _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
+ const __m128i dst0 = xx_loadu_128(dst);
+ const __m128i dst1 = xx_loadu_128(dst + 4);
+ const __m128i r0 = _mm_add_epi32(dst0, d0);
+ const __m128i r1 = _mm_add_epi32(dst1, d1);
+ xx_storeu_128(dst, r0);
+ xx_storeu_128(dst + 4, r1);
+}
+
+static INLINE void acc_stat_win7_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ const int wiener_win = 7;
+ int j, k, l;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l)));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ *sumX += X1;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_sse41` function wants its input to have interleaved
+ // copies of two pixels, but we only have one. However, the pixels
+ // are (effectively) used as inputs to a multiply-accumulate.
+ // So if we set the extra pixel slot to 0, then it is effectively
+ // ignored.
+ const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
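+// Computes the Wiener filter normal-equation statistics: M accumulates
+// cross-correlations of the degraded window with the source and H
+// autocorrelations of the degraded window, both taken about the mean.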
+static INLINE void compute_stats_win7_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint8_t avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t M_int32_row[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t H_int32_row[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+ int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ int32_t sumX_row = 0;
+ int32_t sumY_row[WIENER_WIN][WIENER_WIN] = { { 0 } };
+
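+  // When downsampling is enabled, one row in every
+  // WIENER_STATS_DOWNSAMPLE_FACTOR is sampled and its statistics are scaled
+  // by the factor to approximate the full vertical sum.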
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i = i + downsample_factor) {
+ if (use_downsampled_wiener_stats &&
+ (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+ downsample_factor = vert_end - i;
+ }
+ sumX_row = 0;
+ memset(sumY_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN);
+ memset(M_int32_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN);
+ memset(H_int32_row, 0, sizeof(int32_t) * WIENER_WIN2 * (WIENER_WIN * 8));
+ acc_stat_win7_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row);
+ sumX += sumX_row * downsample_factor;
+ // Scale M matrix based on the downsampling factor
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ sumY[k][l] += (sumY_row[k][l] * downsample_factor);
+ M_int32[k][l] += (M_int32_row[k][l] * downsample_factor);
+ }
+ }
+ // Scale H matrix based on the downsampling factor
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int32[k][l] += (H_int32_row[k][l] * downsample_factor);
+ }
+ }
+ }
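+    // Flush the 32-bit strip accumulators into the 64-bit totals after each
+    // 64-row strip so the i32 sums cannot overflow.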
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
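+  // Mean removal is folded into the final sums:
+  // sum((X - avg) * (Y - avg))
+  //     = sum(X * Y) - avg * (sum(X) + sum(Y)) + avg * avg * N.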
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] =
+ M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd,
+ const __m128i *shuffle,
+ const __m128i *dgd_ijkl) {
+ // Load 256 bits from dgd in two chunks
+ const __m128i s0l = xx_loadu_128(dgd);
+ const __m128i s0h = xx_loadu_128(dgd + 4);
+ // s0l = [7 6 5 4 3 2 1 0] as u16 values (dgd indices)
+ // s0h = [11 10 9 8 7 6 5 4] as u16 values (dgd indices)
+  // (The two loads overlap by four values so the same shuffle mask can be
+  // applied to both halves)
+
+ // Shuffle the u16 values in each half (actually using 8-bit shuffle mask)
+ const __m128i s1l = _mm_shuffle_epi8(s0l, *shuffle);
+ const __m128i s1h = _mm_shuffle_epi8(s0h, *shuffle);
+ // s1l = [4 3 3 2 2 1 1 0] as u16 values (dgd indices)
+ // s1h = [8 7 7 6 6 5 5 4] as u16 values (dgd indices)
+
+ // Multiply s1 by dgd_ijkl resulting in 8x u32 values
+ // Horizontally add pairs of u32 resulting in 4x u32
+ const __m128i dl = _mm_madd_epi16(*dgd_ijkl, s1l);
+ const __m128i dh = _mm_madd_epi16(*dgd_ijkl, s1h);
+ // dl = [d c b a] as u32 values
+ // dh = [h g f e] as u32 values
+
+ // Add these 8x u32 results on to dst in four parts
+ const __m128i dll = _mm_cvtepu32_epi64(dl);
+ const __m128i dlh = _mm_cvtepu32_epi64(_mm_srli_si128(dl, 8));
+ const __m128i dhl = _mm_cvtepu32_epi64(dh);
+ const __m128i dhh = _mm_cvtepu32_epi64(_mm_srli_si128(dh, 8));
+ // dll = [b a] as u64 values, etc.
+
+ const __m128i rll = _mm_add_epi64(xx_loadu_128(dst), dll);
+ xx_storeu_128(dst, rll);
+ const __m128i rlh = _mm_add_epi64(xx_loadu_128(dst + 2), dlh);
+ xx_storeu_128(dst + 2, rlh);
+ const __m128i rhl = _mm_add_epi64(xx_loadu_128(dst + 4), dhl);
+ xx_storeu_128(dst + 4, rhl);
+ const __m128i rhh = _mm_add_epi64(xx_loadu_128(dst + 6), dhh);
+ xx_storeu_128(dst + 6, rhh);
+}
+
+static INLINE void acc_stat_highbd_win7_one_line_sse4_1(
+ const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN],
+ int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint16_t X1 = src[j];
+ const uint16_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ const uint16_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+        // Load two u16 values from dgd as a single u32 and broadcast it to
+        // all four u32 slots of a 128-bit register
+ const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(dgd_ijk + l));
+ // dgd_ijkl = [y x y x y x y x] as u16
+
+ acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint16_t X1 = src[j];
+ *sumX += X1;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_highbd_sse41` function wants its input to have
+ // interleaved copies of two pixels, but we only have one. However, the
+ // pixels are (effectively) used as inputs to a multiply-accumulate. So
+ // if we set the extra pixel slot to 0, then it is effectively ignored.
+ const __m128i dgd_ijkl = _mm_set1_epi32((int)D1);
+
+ acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_highbd_win7_opt_sse4_1(
+ const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+ int64_t *H, aom_bit_depth_t bit_depth) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ // Load just half of the 256-bit shuffle control used for the AVX2 version
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_highbd_win7_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
+ }
+ }
+
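+  // Scale the statistics down for 10- and 12-bit input (by 4 and 16
+  // respectively) to reduce the risk of overflow further down the pipeline.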
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = (M_int[k][l] +
+ (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+ bit_depth_divider;
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] =
+ (H_int_[n * 8 + m] +
+ (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+ bit_depth_divider;
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_highbd_win5_one_line_sse4_1(
+ const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint16_t X1 = src[j];
+ const uint16_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ const uint16_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+        // Load two u16 values from dgd as a single u32 and broadcast it to
+        // all four u32 slots of a 128-bit register
+ const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(dgd_ijk + l));
+ // dgd_ijkl = [y x y x y x y x] as u16
+
+ acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint16_t X1 = src[j];
+ *sumX += X1;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_highbd_sse41` function wants its input to have
+ // interleaved copies of two pixels, but we only have one. However, the
+ // pixels are (effectively) used as inputs to a multiply-accumulate. So
+ // if we set the extra pixel slot to 0, then it is effectively ignored.
+ const __m128i dgd_ijkl = _mm_set1_epi32((int)D1);
+
+ acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_highbd_win5_opt_sse4_1(
+ const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+ int64_t *H, aom_bit_depth_t bit_depth) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ // Load just half of the 256-bit shuffle control used for the AVX2 version
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_highbd_win5_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
+ }
+ }
+
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = (M_int[k][l] +
+ (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+ bit_depth_divider;
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] =
+ (H_int_[n * 8 + m] +
+ (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+ bit_depth_divider;
+ }
+ }
+ }
+ }
+}
+
+void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8,
+ const uint8_t *src8, int h_start,
+ int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, int64_t *M,
+ int64_t *H, aom_bit_depth_t bit_depth) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H,
+ bit_depth);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H,
+ bit_depth);
+ } else {
+ av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H, bit_depth);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void acc_stat_win5_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ const int wiener_win = WIENER_WIN_CHROMA;
+ int j, k, l;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l)));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ *sumX += X1;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_sse41` function wants its input to have interleaved
+ // copies of two pixels, but we only have one. However, the pixels
+ // are (effectively) used as inputs to a multiply-accumulate.
+ // So if we set the extra pixel slot to 0, then it is effectively
+ // ignored.
+ const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win5_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint8_t avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t M_int32_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t H_int32_row[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+ int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ int32_t sumX_row = 0;
+ int32_t sumY_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i = i + downsample_factor) {
+ if (use_downsampled_wiener_stats &&
+ (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+ downsample_factor = vert_end - i;
+ }
+ sumX_row = 0;
+ memset(sumY_row, 0,
+ sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA);
+ memset(M_int32_row, 0,
+ sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA);
+ memset(H_int32_row, 0,
+ sizeof(int32_t) * WIENER_WIN2_CHROMA * (WIENER_WIN_CHROMA * 8));
+ acc_stat_win5_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row);
+ sumX += sumX_row * downsample_factor;
+ // Scale M matrix based on the downsampling factor
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ sumY[k][l] += (sumY_row[k][l] * downsample_factor);
+ M_int32[k][l] += (M_int32_row[k][l] * downsample_factor);
+ }
+ }
+ // Scale H matrix based on the downsampling factor
+ for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int32[k][l] += (H_int32_row[k][l] * downsample_factor);
+ }
+ }
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] =
+ M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H,
+ use_downsampled_wiener_stats);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H,
+ use_downsampled_wiener_stats);
+ } else {
+ av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H,
+ use_downsampled_wiener_stats);
+ }
+}
+
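+// Packs a into the low and b into the high 16 bits of every 32-bit lane,
+// matching the interleaved operand layout expected by _mm_madd_epi16.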
+static INLINE __m128i pair_set_epi16(int a, int b) {
+ return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
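+// SSE4.1 variant of av1_lowbd_pixel_proj_error_c: returns the sum of squared
+// differences between the source and the restored image implied by xq.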
+int64_t av1_lowbd_pixel_proj_error_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+ __m128i sum64 = _mm_setzero_si128();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt0_16b =
+ _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+ const __m128i flt1_16b =
+ _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+ const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0);
+ const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0);
+ const __m128i v0 = _mm_madd_epi16(
+ xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m128i v1 = _mm_madd_epi16(
+ xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
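+    // The shift of dat is folded into the second madd coefficient:
+    // xq * (flt - (dat << SGRPROJ_RST_BITS)) =
+    // xq * flt + (-xq << SGRPROJ_RST_BITS) * dat.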
+ const __m128i xq_coeff =
+ pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt_16b =
+ _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4));
+ const __m128i v0 =
+ _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0));
+ const __m128i v1 =
+ _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_active * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else {
+ __m128i sum32 = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m128i d = xx_loadu_128(dat + j);
+ const __m128i s = xx_loadu_128(src + j);
+ const __m128i d0 = _mm_cvtepu8_epi16(d);
+ const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8));
+ const __m128i s0 = _mm_cvtepu8_epi16(s);
+ const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8));
+ const __m128i diff0 = _mm_sub_epi16(d0, s0);
+ const __m128i diff1 = _mm_sub_epi16(d1, s1);
+ const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+ const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+ sum32 = _mm_add_epi32(sum32, err0);
+ sum32 = _mm_add_epi32(sum32, err1);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64_0, sum64_1);
+ }
+ int64_t sum[2];
+ xx_storeu_128(sum, sum64);
+ err += sum[0] + sum[1];
+ return err;
+}
+
+// Handles the case where both params->r[0] > 0 and params->r[1] > 0, so all
+// elements of H and C need to be computed.
+static AOM_INLINE void calc_proj_params_r0_r1_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m128i h00, h01, h11, c0, c1;
+ const __m128i zero = _mm_setzero_si128();
+ h01 = h11 = c0 = c1 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+ const __m128i s_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+ __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+ __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f1 = _mm_sub_epi32(f1, d);
+ f2 = _mm_sub_epi32(f2, d);
+
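+      // Same even/odd widening trick as the AVX2 path: _mm_mul_epi32 widens
+      // the even 32-bit lanes to i64, and a 32-bit right shift exposes the
+      // odd lanes for a second multiply.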
+ const __m128i h00_even = _mm_mul_epi32(f1, f1);
+ const __m128i h00_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+ h00 = _mm_add_epi64(h00, h00_even);
+ h00 = _mm_add_epi64(h00, h00_odd);
+
+ const __m128i h01_even = _mm_mul_epi32(f1, f2);
+ const __m128i h01_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32));
+ h01 = _mm_add_epi64(h01, h01_even);
+ h01 = _mm_add_epi64(h01, h01_odd);
+
+ const __m128i h11_even = _mm_mul_epi32(f2, f2);
+ const __m128i h11_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+ h11 = _mm_add_epi64(h11, h11_even);
+ h11 = _mm_add_epi64(h11, h11_odd);
+
+ const __m128i c0_even = _mm_mul_epi32(f1, s);
+ const __m128i c0_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+ c0 = _mm_add_epi64(c0, c0_even);
+ c0 = _mm_add_epi64(c0, c0_odd);
+
+ const __m128i c1_even = _mm_mul_epi32(f2, s);
+ const __m128i c1_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+ c1 = _mm_add_epi64(c1, c1_even);
+ c1 = _mm_add_epi64(c1, c1_odd);
+ }
+ }
+
+ __m128i c_low = _mm_unpacklo_epi64(c0, c1);
+ const __m128i c_high = _mm_unpackhi_epi64(c0, c1);
+ c_low = _mm_add_epi64(c_low, c_high);
+
+ __m128i h0x_low = _mm_unpacklo_epi64(h00, h01);
+ const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01);
+ h0x_low = _mm_add_epi64(h0x_low, h0x_high);
+
+  // Since H is symmetric, H[1][0] does not need to be computed separately.
+ __m128i h1x_low = _mm_unpacklo_epi64(zero, h11);
+ const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11);
+ h1x_low = _mm_add_epi64(h1x_low, h1x_high);
+
+ xx_storeu_128(C, c_low);
+ xx_storeu_128(H[0], h0x_low);
+ xx_storeu_128(H[1], h1x_low);
+
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+
+ // Since H is a symmetric matrix
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m128i h00, c0;
+ const __m128i zero = _mm_setzero_si128();
+ c0 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+ const __m128i s_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+ __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f1 = _mm_sub_epi32(f1, d);
+
+ const __m128i h00_even = _mm_mul_epi32(f1, f1);
+ const __m128i h00_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+ h00 = _mm_add_epi64(h00, h00_even);
+ h00 = _mm_add_epi64(h00, h00_odd);
+
+ const __m128i c0_even = _mm_mul_epi32(f1, s);
+ const __m128i c0_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+ c0 = _mm_add_epi64(c0, c0_even);
+ c0 = _mm_add_epi64(c0, c0_odd);
+ }
+ }
+ const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8));
+
+ const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(c0_val, zero);
+ const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[0], h0x);
+
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+// When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m128i h11, c1;
+ const __m128i zero = _mm_setzero_si128();
+ c1 = h11 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+ const __m128i s_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+ __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f2 = _mm_sub_epi32(f2, d);
+
+ const __m128i h11_even = _mm_mul_epi32(f2, f2);
+ const __m128i h11_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+ h11 = _mm_add_epi64(h11, h11_even);
+ h11 = _mm_add_epi64(h11, h11_odd);
+
+ const __m128i c1_even = _mm_mul_epi32(f2, s);
+ const __m128i c1_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+ c1 = _mm_add_epi64(c1, c1_even);
+ c1 = _mm_add_epi64(c1, c1_odd);
+ }
+ }
+
+ const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8));
+
+ const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(zero, c1_val);
+ const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[1], h1x);
+
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// SSE4.1 variant of av1_calc_proj_params_c.
+void av1_calc_proj_params_sse4_1(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
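+
+// For reference, a scalar sketch of what these calc_proj_params kernels
+// accumulate (illustrative only, not part of the dispatch). With
+// u = dat[k] << SGRPROJ_RST_BITS, basis samples f0 = flt0[k] - u and
+// f1 = flt1[k] - u, and target s = (src[k] << SGRPROJ_RST_BITS) - u, the
+// normal equations
+//   H[i][j] = sum(f_i * f_j) / (width * height)
+//   C[i]    = sum(f_i * s)   / (width * height)
+// are built so that solving H * xq = C yields the sgrproj coefficients.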
+
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m128i h00, h01, h11, c0, c1;
+ const __m128i zero = _mm_setzero_si128();
+ h01 = h11 = c0 = c1 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m128i s_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+ __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f1 = _mm_sub_epi32(f1, d);
+ f2 = _mm_sub_epi32(f2, d);
+
+ const __m128i h00_even = _mm_mul_epi32(f1, f1);
+ const __m128i h00_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+ h00 = _mm_add_epi64(h00, h00_even);
+ h00 = _mm_add_epi64(h00, h00_odd);
+
+ const __m128i h01_even = _mm_mul_epi32(f1, f2);
+ const __m128i h01_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32));
+ h01 = _mm_add_epi64(h01, h01_even);
+ h01 = _mm_add_epi64(h01, h01_odd);
+
+ const __m128i h11_even = _mm_mul_epi32(f2, f2);
+ const __m128i h11_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+ h11 = _mm_add_epi64(h11, h11_even);
+ h11 = _mm_add_epi64(h11, h11_odd);
+
+ const __m128i c0_even = _mm_mul_epi32(f1, s);
+ const __m128i c0_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+ c0 = _mm_add_epi64(c0, c0_even);
+ c0 = _mm_add_epi64(c0, c0_odd);
+
+ const __m128i c1_even = _mm_mul_epi32(f2, s);
+ const __m128i c1_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+ c1 = _mm_add_epi64(c1, c1_even);
+ c1 = _mm_add_epi64(c1, c1_odd);
+ }
+ }
+
+ __m128i c_low = _mm_unpacklo_epi64(c0, c1);
+ const __m128i c_high = _mm_unpackhi_epi64(c0, c1);
+ c_low = _mm_add_epi64(c_low, c_high);
+
+ __m128i h0x_low = _mm_unpacklo_epi64(h00, h01);
+ const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01);
+ h0x_low = _mm_add_epi64(h0x_low, h0x_high);
+
+  // Since H is symmetric, H[1][0] does not need to be computed separately.
+ __m128i h1x_low = _mm_unpacklo_epi64(zero, h11);
+ const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11);
+ h1x_low = _mm_add_epi64(h1x_low, h1x_high);
+
+ xx_storeu_128(C, c_low);
+ xx_storeu_128(H[0], h0x_low);
+ xx_storeu_128(H[1], h1x_low);
+
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+
+ // Since H is a symmetric matrix
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_high_bd_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m128i h00, c0;
+ const __m128i zero = _mm_setzero_si128();
+ c0 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m128i s_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f1 = _mm_sub_epi32(f1, d);
+
+ const __m128i h00_even = _mm_mul_epi32(f1, f1);
+ const __m128i h00_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+ h00 = _mm_add_epi64(h00, h00_even);
+ h00 = _mm_add_epi64(h00, h00_odd);
+
+ const __m128i c0_even = _mm_mul_epi32(f1, s);
+ const __m128i c0_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+ c0 = _mm_add_epi64(c0, c0_even);
+ c0 = _mm_add_epi64(c0, c0_odd);
+ }
+ }
+ const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8));
+
+ const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(c0_val, zero);
+ const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[0], h0x);
+
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+// When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_high_bd_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m128i h11, c1;
+ const __m128i zero = _mm_setzero_si128();
+ c1 = h11 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m128i s_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f2 = _mm_sub_epi32(f2, d);
+
+ const __m128i h11_even = _mm_mul_epi32(f2, f2);
+ const __m128i h11_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+ h11 = _mm_add_epi64(h11, h11_even);
+ h11 = _mm_add_epi64(h11, h11_odd);
+
+ const __m128i c1_even = _mm_mul_epi32(f2, s);
+ const __m128i c1_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+ c1 = _mm_add_epi64(c1, c1_even);
+ c1 = _mm_add_epi64(c1, c1_odd);
+ }
+ }
+
+ const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8));
+
+ const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(zero, c1_val);
+ const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[1], h1x);
+
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// SSE4.1 variant of av1_calc_proj_params_high_bd_c.
+void av1_calc_proj_params_high_bd_sse4_1(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+int64_t av1_highbd_pixel_proj_error_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+ __m128i sum64 = _mm_setzero_si128();
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled
+ const __m128i xq0 = _mm_set1_epi32(xq[0]);
+ const __m128i xq1 = _mm_set1_epi32(xq[1]);
+
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 8; j += 8) {
+ // Load 8x pixels from source image
+ const __m128i s0 = xx_loadu_128(src + j);
+ // s0 = [7 6 5 4 3 2 1 0] as i16 (indices of src[])
+
+ // Load 8x pixels from corrupted image
+ const __m128i d0 = xx_loadu_128(dat + j);
+ // d0 = [7 6 5 4 3 2 1 0] as i16 (indices of dat[])
+
+ // Shift each pixel value up by SGRPROJ_RST_BITS
+ const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+
+ // Split u0 into two halves and pad each from u16 to i32
+ const __m128i u0l = _mm_cvtepu16_epi32(u0);
+ const __m128i u0h = _mm_cvtepu16_epi32(_mm_srli_si128(u0, 8));
+ // u0h = [7 6 5 4] as i32, u0l = [3 2 1 0] as i32, all dat[] indices
+
+ // Load 8 pixels from first and second filtered images
+ const __m128i flt0l = xx_loadu_128(flt0 + j);
+ const __m128i flt0h = xx_loadu_128(flt0 + j + 4);
+ const __m128i flt1l = xx_loadu_128(flt1 + j);
+ const __m128i flt1h = xx_loadu_128(flt1 + j + 4);
+ // flt0 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt0+j)
+ // flt1 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt1+j)
+
+ // Subtract shifted corrupt image from each filtered image
+ // This gives our two basis vectors for the projection
+ const __m128i flt0l_subu = _mm_sub_epi32(flt0l, u0l);
+ const __m128i flt0h_subu = _mm_sub_epi32(flt0h, u0h);
+ const __m128i flt1l_subu = _mm_sub_epi32(flt1l, u0l);
+ const __m128i flt1h_subu = _mm_sub_epi32(flt1h, u0h);
+ // flt?h_subu = [ f[7]-u[7] f[6]-u[6] f[5]-u[5] f[4]-u[4] ] as i32
+ // flt?l_subu = [ f[3]-u[3] f[2]-u[2] f[1]-u[1] f[0]-u[0] ] as i32
+
+ // Multiply each basis vector by the corresponding coefficient
+ const __m128i v0l = _mm_mullo_epi32(flt0l_subu, xq0);
+ const __m128i v0h = _mm_mullo_epi32(flt0h_subu, xq0);
+ const __m128i v1l = _mm_mullo_epi32(flt1l_subu, xq1);
+ const __m128i v1h = _mm_mullo_epi32(flt1h_subu, xq1);
+
+ // Add together the contribution from each scaled basis vector
+ const __m128i vl = _mm_add_epi32(v0l, v1l);
+ const __m128i vh = _mm_add_epi32(v0h, v1h);
+
+ // Right-shift v with appropriate rounding
+ const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift);
+ const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift);
+
+ // Saturate each i32 value to i16 and combine lower and upper halves
+ const __m128i vr = _mm_packs_epi32(vrl, vrh);
+
+ // Add twin-subspace-sgr-filter to corrupt image then subtract source
+ const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+
+ const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+ sum64 = _mm_add_epi64(sum64, sum32l);
+ const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 8)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled
+ const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1];
+ const __m128i xq_active = _mm_set1_epi32(xq_on);
+ const __m128i xq_inactive =
+ _mm_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 8; j += 8) {
+ // Load 8x pixels from source image
+ const __m128i s0 = xx_loadu_128(src + j);
+ // s0 = [7 6 5 4 3 2 1 0] as u16 (indices of src[])
+
+ // Load 8x pixels from corrupted image and pad each u16 to i32
+ const __m128i d0 = xx_loadu_128(dat + j);
+ const __m128i d0h = _mm_cvtepu16_epi32(_mm_srli_si128(d0, 8));
+ const __m128i d0l = _mm_cvtepu16_epi32(d0);
+ // d0h, d0l = [7 6 5 4], [3 2 1 0] as u32 (indices of dat[])
+
+ // Load 8 pixels from the filtered image
+ const __m128i flth = xx_loadu_128(flt + j + 4);
+ const __m128i fltl = xx_loadu_128(flt + j);
+ // flth, fltl = [7 6 5 4], [3 2 1 0] as i32 (indices of flt+j)
+
+ const __m128i flth_xq = _mm_mullo_epi32(flth, xq_active);
+ const __m128i fltl_xq = _mm_mullo_epi32(fltl, xq_active);
+ const __m128i d0h_xq = _mm_mullo_epi32(d0h, xq_inactive);
+ const __m128i d0l_xq = _mm_mullo_epi32(d0l, xq_inactive);
+
+ const __m128i vh = _mm_add_epi32(flth_xq, d0h_xq);
+ const __m128i vl = _mm_add_epi32(fltl_xq, d0l_xq);
+ // vh = [ xq0(f[7]-d[7]) xq0(f[6]-d[6]) xq0(f[5]-d[5]) xq0(f[4]-d[4]) ]
+ // vl = [ xq0(f[3]-d[3]) xq0(f[2]-d[2]) xq0(f[1]-d[1]) xq0(f[0]-d[0]) ]
+
+ // Shift this down with appropriate rounding
+ const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift);
+ const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift);
+
+ // Saturate vr0 and vr1 from i32 to i16 then pack together
+ const __m128i vr = _mm_packs_epi32(vrl, vrh);
+
+ // Subtract twin-subspace-sgr filtered from source image to get error
+ const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+
+ const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+ sum64 = _mm_add_epi64(sum64, sum32l);
+ const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 8)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_on * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ }
+ } else { // Neither filter is enabled
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 16; j += 16) {
+ // Load 2x8 u16 from source image
+ const __m128i s0 = xx_loadu_128(src + j);
+ const __m128i s1 = xx_loadu_128(src + j + 8);
+ // Load 2x8 u16 from corrupted image
+ const __m128i d0 = xx_loadu_128(dat + j);
+ const __m128i d1 = xx_loadu_128(dat + j + 8);
+
+ // Subtract corrupted image from source image
+ const __m128i diff0 = _mm_sub_epi16(d0, s0);
+ const __m128i diff1 = _mm_sub_epi16(d1, s1);
+
+ // Square error and add adjacent values
+ const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+ const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+
+ sum32 = _mm_add_epi32(sum32, err0);
+ sum32 = _mm_add_epi32(sum32, err1);
+ }
+
+ const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+ sum64 = _mm_add_epi64(sum64, sum32l);
+ const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum32h);
+
+      // Process remaining pixels in this row (modulo 16)
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+  // Sum the two 64-bit lanes of sum64 into err
+ int64_t sum[2];
+ xx_storeu_128(sum, sum64);
+ err += sum[0] + sum[1];
+ return err;
+}
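+
+// Worked example of the scalar remainder math above, assuming the usual
+// SGRPROJ_RST_BITS = 4 and SGRPROJ_PRJ_BITS = 7 (so shift = 11): with
+// dat[k] = 100, one active filter with flt[k] = 1700 and xq = 32,
+//   u = 100 << 4 = 1600
+//   v = 32 * (1700 - 1600) = 3200
+//   ROUND_POWER_OF_TWO(3200, 11) = (3200 + 1024) >> 11 = 2
+// so e = 2 + 100 - src[k], and e * e is added to err.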
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/rdopt_avx2.c b/third_party/aom/av1/encoder/x86/rdopt_avx2.c
new file mode 100644
index 0000000000..a0ab3940c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/rdopt_avx2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/rdopt.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+ __m256i *xy_sum_32,
+ __m256i *xz_sum_32, __m256i *x_sum_32,
+ __m256i *x2_sum_32) {
+ // Pixels in this 4x4 [ a b c d ]
+ // are referred to as: [ e f g h ]
+ // [ i j k l ]
+ // [ m n o p ]
+
+ const __m256i pixels = _mm256_set_epi64x(
+ loadu_int64(&diff[0 * stride]), loadu_int64(&diff[1 * stride]),
+ loadu_int64(&diff[2 * stride]), loadu_int64(&diff[3 * stride]));
+ // pixels = [d c b a h g f e] [l k j i p o n m] as i16
+
+ const __m256i slli = _mm256_slli_epi64(pixels, 16);
+ // slli = [c b a 0 g f e 0] [k j i 0 o n m 0] as i16
+
+ const __m256i madd_xy = _mm256_madd_epi16(pixels, slli);
+ // madd_xy = [bc+cd ab fg+gh ef] [jk+kl ij no+op mn] as i32
+ *xy_sum_32 = _mm256_add_epi32(*xy_sum_32, madd_xy);
+
+ // Permute control [3 2] [1 0] => [2 1] [0 0], 0b10010000 = 0x90
+ const __m256i perm = _mm256_permute4x64_epi64(slli, 0x90);
+ // perm = [g f e 0 k j i 0] [o n m 0 o n m 0] as i16
+
+ const __m256i madd_xz = _mm256_madd_epi16(slli, perm);
+ // madd_xz = [cg+bf ae gk+fj ei] [ko+jn im oo+nn mm] as i32
+ *xz_sum_32 = _mm256_add_epi32(*xz_sum_32, madd_xz);
+
+ // Sum every element in slli (and then also their squares)
+ const __m256i madd1_slli = _mm256_madd_epi16(slli, _mm256_set1_epi16(1));
+ // madd1_slli = [c+b a g+f e] [k+j i o+n m] as i32
+ *x_sum_32 = _mm256_add_epi32(*x_sum_32, madd1_slli);
+
+ const __m256i madd_slli = _mm256_madd_epi16(slli, slli);
+ // madd_slli = [cc+bb aa gg+ff ee] [kk+jj ii oo+nn mm] as i32
+ *x2_sum_32 = _mm256_add_epi32(*x2_sum_32, madd_slli);
+}
+
+void av1_get_horver_correlation_full_avx2(const int16_t *diff, int stride,
+ int width, int height, float *hcorr,
+ float *vcorr) {
+ // The following notation is used:
+ // x - current pixel
+ // y - right neighbour pixel
+ // z - below neighbour pixel
+ // w - down-right neighbour pixel
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, x2_sum = 0;
+
+ // Process horizontal and vertical correlations through the body in 4x4
+ // blocks. This excludes the final row and column and possibly one extra
+  // column, depending on how 3 divides into width and height.
+ int32_t xy_xz_tmp[8] = { 0 }, x_x2_tmp[8] = { 0 };
+ __m256i xy_sum_32 = _mm256_setzero_si256();
+ __m256i xz_sum_32 = _mm256_setzero_si256();
+ __m256i x_sum_32 = _mm256_setzero_si256();
+ __m256i x2_sum_32 = _mm256_setzero_si256();
+ for (int i = 0; i <= height - 4; i += 3) {
+ for (int j = 0; j <= width - 4; j += 3) {
+ horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+ &xz_sum_32, &x_sum_32, &x2_sum_32);
+ }
+ const __m256i hadd_xy_xz = _mm256_hadd_epi32(xy_sum_32, xz_sum_32);
+ // hadd_xy_xz = [ae+bf+cg ei+fj+gk ab+bc+cd ef+fg+gh]
+ // [im+jn+ko mm+nn+oo ij+jk+kl mn+no+op] as i32
+ yy_storeu_256(xy_xz_tmp, hadd_xy_xz);
+ xy_sum += (int64_t)xy_xz_tmp[5] + xy_xz_tmp[4] + xy_xz_tmp[1];
+ xz_sum += (int64_t)xy_xz_tmp[7] + xy_xz_tmp[6] + xy_xz_tmp[3];
+
+ const __m256i hadd_x_x2 = _mm256_hadd_epi32(x_sum_32, x2_sum_32);
+ // hadd_x_x2 = [aa+bb+cc ee+ff+gg a+b+c e+f+g]
+ // [ii+jj+kk mm+nn+oo i+j+k m+n+o] as i32
+ yy_storeu_256(x_x2_tmp, hadd_x_x2);
+ x_sum += (int64_t)x_x2_tmp[5] + x_x2_tmp[4] + x_x2_tmp[1];
+ x2_sum += (int64_t)x_x2_tmp[7] + x_x2_tmp[6] + x_x2_tmp[3];
+
+ xy_sum_32 = _mm256_setzero_si256();
+ xz_sum_32 = _mm256_setzero_si256();
+ x_sum_32 = _mm256_setzero_si256();
+ x2_sum_32 = _mm256_setzero_si256();
+ }
+
+ // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+ int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+ // Do we have 2 rows remaining or just the one? Note that width and height
+ // are powers of 2, so each modulo 3 must be 1 or 2.
+ if (height % 3 == 1) { // Just horiz corrs on the final row
+ const int16_t x0 = diff[(height - 1) * stride];
+ x_sum += x0;
+ x_finalrow += x0;
+ x2_sum += x0 * x0;
+ x2_finalrow += x0 * x0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 1) * stride + j];
+ const int16_t y = diff[(height - 1) * stride + j + 1];
+ xy_sum += x * y;
+ x_sum += y;
+ x2_sum += y * y;
+ x_finalrow += y;
+ x2_finalrow += y * y;
+ }
+ } else { // Two rows remaining to do
+ const int16_t x0 = diff[(height - 2) * stride];
+ const int16_t z0 = diff[(height - 1) * stride];
+ x_sum += x0 + z0;
+ x2_sum += x0 * x0 + z0 * z0;
+ x_finalrow += z0;
+ x2_finalrow += z0 * z0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 2) * stride + j];
+ const int16_t y = diff[(height - 2) * stride + j + 1];
+ const int16_t z = diff[(height - 1) * stride + j];
+ const int16_t w = diff[(height - 1) * stride + j + 1];
+
+ // Horizontal and vertical correlations for the penultimate row:
+ xy_sum += x * y;
+ xz_sum += x * z;
+
+ // Now just horizontal correlations for the final row:
+ xy_sum += z * w;
+
+ x_sum += y + w;
+ x2_sum += y * y + w * w;
+ x_finalrow += w;
+ x2_finalrow += w * w;
+ }
+ }
+
+ // Do we have 2 columns remaining or just the one?
+ if (width % 3 == 1) { // Just vert corrs on the final col
+ const int16_t x0 = diff[width - 1];
+ x_sum += x0;
+ x_finalcol += x0;
+ x2_sum += x0 * x0;
+ x2_finalcol += x0 * x0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 1];
+ xz_sum += x * z;
+ x_finalcol += z;
+ x2_finalcol += z * z;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z;
+ x2_sum += z * z;
+ }
+ }
+ } else { // Two cols remaining
+ const int16_t x0 = diff[width - 2];
+ const int16_t y0 = diff[width - 1];
+ x_sum += x0 + y0;
+ x2_sum += x0 * x0 + y0 * y0;
+ x_finalcol += y0;
+ x2_finalcol += y0 * y0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 2];
+ const int16_t y = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 2];
+ const int16_t w = diff[(i + 1) * stride + width - 1];
+
+ // Horizontal and vertical correlations for the penultimate col:
+ // Skip these on the last iteration of this loop if we also had two
+      // rows remaining; otherwise the final horizontal and vertical
+      // correlations get erroneously processed twice.
+ if (i < height - 2 || height % 3 == 1) {
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+
+ x_finalcol += w;
+ x2_finalcol += w * w;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z + w;
+ x2_sum += z * z + w * w;
+ }
+
+ // Now just vertical correlations for the final column:
+ xz_sum += y * w;
+ }
+ }
+
+ // Calculate the simple sums and squared-sums
+ int64_t x_firstrow = 0, x_firstcol = 0;
+ int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+ for (int j = 0; j < width; ++j) {
+ x_firstrow += diff[j];
+ x2_firstrow += diff[j] * diff[j];
+ }
+ for (int i = 0; i < height; ++i) {
+ x_firstcol += diff[i * stride];
+ x2_firstcol += diff[i * stride] * diff[i * stride];
+ }
+
+ int64_t xhor_sum = x_sum - x_finalcol;
+ int64_t xver_sum = x_sum - x_finalrow;
+ int64_t y_sum = x_sum - x_firstcol;
+ int64_t z_sum = x_sum - x_firstrow;
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ const float num_hor = (float)(height * (width - 1));
+ const float num_ver = (float)((height - 1) * width);
+
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
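+
+// For reference, the statistic computed above is the Pearson correlation
+// between each pixel x and its right neighbour y (hcorr), respectively its
+// below neighbour z (vcorr):
+//   corr = (sum(x*y) - sum(x)*sum(y)/n) /
+//          sqrt((sum(x^2) - sum(x)^2/n) * (sum(y^2) - sum(y)^2/n))
+// with n = height * (width - 1) horizontal pairs and n = (height - 1) *
+// width vertical pairs; negative results are clamped to 0, and degenerate
+// (zero-variance) cases fall back to 1.0.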
diff --git a/third_party/aom/av1/encoder/x86/rdopt_sse4.c b/third_party/aom/av1/encoder/x86/rdopt_sse4.c
new file mode 100644
index 0000000000..12ac146195
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/rdopt_sse4.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/rdopt.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+ __m128i *xy_sum_32,
+ __m128i *xz_sum_32, __m128i *x_sum_32,
+ __m128i *x2_sum_32) {
+ // Pixels in this 4x4 [ a b c d ]
+ // are referred to as: [ e f g h ]
+ // [ i j k l ]
+ // [ m n o p ]
+
+ const __m128i pixelsa = _mm_set_epi64x(*(int64_t *)&diff[0 * stride],
+ *(int64_t *)&diff[2 * stride]);
+ const __m128i pixelsb = _mm_set_epi64x(*(int64_t *)&diff[1 * stride],
+ *(int64_t *)&diff[3 * stride]);
+ // pixelsa = [d c b a l k j i] as i16
+ // pixelsb = [h g f e p o n m] as i16
+
+ const __m128i slli_a = _mm_slli_epi64(pixelsa, 16);
+ const __m128i slli_b = _mm_slli_epi64(pixelsb, 16);
+ // slli_a = [c b a 0 k j i 0] as i16
+ // slli_b = [g f e 0 o n m 0] as i16
+
+ const __m128i xy_madd_a = _mm_madd_epi16(pixelsa, slli_a);
+ const __m128i xy_madd_b = _mm_madd_epi16(pixelsb, slli_b);
+ // xy_madd_a = [bc+cd ab jk+kl ij] as i32
+ // xy_madd_b = [fg+gh ef no+op mn] as i32
+
+ const __m128i xy32 = _mm_hadd_epi32(xy_madd_b, xy_madd_a);
+ // xy32 = [ab+bc+cd ij+jk+kl ef+fg+gh mn+no+op] as i32
+ *xy_sum_32 = _mm_add_epi32(*xy_sum_32, xy32);
+
+ const __m128i xz_madd_a = _mm_madd_epi16(slli_a, slli_b);
+ // xz_madd_a = [bf+cg ae jn+ko im] i32
+
+ const __m128i swap_b = _mm_srli_si128(slli_b, 8);
+ // swap_b = [0 0 0 0 g f e 0] as i16
+ const __m128i xz_madd_b = _mm_madd_epi16(slli_a, swap_b);
+ // xz_madd_b = [0 0 gk+fj ei] i32
+
+ const __m128i xz32 = _mm_hadd_epi32(xz_madd_b, xz_madd_a);
+ // xz32 = [ae+bf+cg im+jn+ko 0 ei+fj+gk] i32
+ *xz_sum_32 = _mm_add_epi32(*xz_sum_32, xz32);
+
+ // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k
+ // (sum up every element in slli_a and swap_b)
+ const __m128i sum_slli_a = _mm_hadd_epi16(slli_a, slli_a);
+ const __m128i sum_slli_a32 = _mm_cvtepi16_epi32(sum_slli_a);
+ // sum_slli_a32 = [c+b a k+j i] as i32
+ const __m128i swap_b32 = _mm_cvtepi16_epi32(swap_b);
+ // swap_b32 = [g f e 0] as i32
+ *x_sum_32 = _mm_add_epi32(*x_sum_32, sum_slli_a32);
+ *x_sum_32 = _mm_add_epi32(*x_sum_32, swap_b32);
+ // sum = [c+b+g a+f k+j+e i] as i32
+
+ // Also sum their squares
+ const __m128i slli_a_2 = _mm_madd_epi16(slli_a, slli_a);
+ const __m128i swap_b_2 = _mm_madd_epi16(swap_b, swap_b);
+ // slli_a_2 = [c2+b2 a2 k2+j2 i2]
+ // swap_b_2 = [0 0 g2+f2 e2]
+ const __m128i sum2 = _mm_hadd_epi32(slli_a_2, swap_b_2);
+ // sum2 = [0 g2+f2+e2 c2+b2+a2 k2+j2+i2]
+ *x2_sum_32 = _mm_add_epi32(*x2_sum_32, sum2);
+}
+
+void av1_get_horver_correlation_full_sse4_1(const int16_t *diff, int stride,
+ int width, int height, float *hcorr,
+ float *vcorr) {
+ // The following notation is used:
+ // x - current pixel
+ // y - right neighbour pixel
+ // z - below neighbour pixel
+ // w - down-right neighbour pixel
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, x2_sum = 0;
+
+ // Process horizontal and vertical correlations through the body in 4x4
+ // blocks. This excludes the final row and column and possibly one extra
+  // column, depending on how 3 divides into width and height.
+ int32_t xy_tmp[4] = { 0 }, xz_tmp[4] = { 0 };
+ int32_t x_tmp[4] = { 0 }, x2_tmp[4] = { 0 };
+ __m128i xy_sum_32 = _mm_setzero_si128();
+ __m128i xz_sum_32 = _mm_setzero_si128();
+ __m128i x_sum_32 = _mm_setzero_si128();
+ __m128i x2_sum_32 = _mm_setzero_si128();
+ for (int i = 0; i <= height - 4; i += 3) {
+ for (int j = 0; j <= width - 4; j += 3) {
+ horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+ &xz_sum_32, &x_sum_32, &x2_sum_32);
+ }
+ xx_storeu_128(xy_tmp, xy_sum_32);
+ xx_storeu_128(xz_tmp, xz_sum_32);
+ xx_storeu_128(x_tmp, x_sum_32);
+ xx_storeu_128(x2_tmp, x2_sum_32);
+ xy_sum += (int64_t)xy_tmp[3] + xy_tmp[2] + xy_tmp[1];
+ xz_sum += (int64_t)xz_tmp[3] + xz_tmp[2] + xz_tmp[0];
+ x_sum += (int64_t)x_tmp[3] + x_tmp[2] + x_tmp[1] + x_tmp[0];
+ x2_sum += (int64_t)x2_tmp[2] + x2_tmp[1] + x2_tmp[0];
+ xy_sum_32 = _mm_setzero_si128();
+ xz_sum_32 = _mm_setzero_si128();
+ x_sum_32 = _mm_setzero_si128();
+ x2_sum_32 = _mm_setzero_si128();
+ }
+
+ // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+ int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+ // Do we have 2 rows remaining or just the one? Note that width and height
+ // are powers of 2, so each modulo 3 must be 1 or 2.
+ if (height % 3 == 1) { // Just horiz corrs on the final row
+ const int16_t x0 = diff[(height - 1) * stride];
+ x_sum += x0;
+ x_finalrow += x0;
+ x2_sum += x0 * x0;
+ x2_finalrow += x0 * x0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 1) * stride + j];
+ const int16_t y = diff[(height - 1) * stride + j + 1];
+ xy_sum += x * y;
+ x_sum += y;
+ x2_sum += y * y;
+ x_finalrow += y;
+ x2_finalrow += y * y;
+ }
+ } else { // Two rows remaining to do
+ const int16_t x0 = diff[(height - 2) * stride];
+ const int16_t z0 = diff[(height - 1) * stride];
+ x_sum += x0 + z0;
+ x2_sum += x0 * x0 + z0 * z0;
+ x_finalrow += z0;
+ x2_finalrow += z0 * z0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 2) * stride + j];
+ const int16_t y = diff[(height - 2) * stride + j + 1];
+ const int16_t z = diff[(height - 1) * stride + j];
+ const int16_t w = diff[(height - 1) * stride + j + 1];
+
+ // Horizontal and vertical correlations for the penultimate row:
+ xy_sum += x * y;
+ xz_sum += x * z;
+
+ // Now just horizontal correlations for the final row:
+ xy_sum += z * w;
+
+ x_sum += y + w;
+ x2_sum += y * y + w * w;
+ x_finalrow += w;
+ x2_finalrow += w * w;
+ }
+ }
+
+ // Do we have 2 columns remaining or just the one?
+ if (width % 3 == 1) { // Just vert corrs on the final col
+ const int16_t x0 = diff[width - 1];
+ x_sum += x0;
+ x_finalcol += x0;
+ x2_sum += x0 * x0;
+ x2_finalcol += x0 * x0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 1];
+ xz_sum += x * z;
+ x_finalcol += z;
+ x2_finalcol += z * z;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z;
+ x2_sum += z * z;
+ }
+ }
+ } else { // Two cols remaining
+ const int16_t x0 = diff[width - 2];
+ const int16_t y0 = diff[width - 1];
+ x_sum += x0 + y0;
+ x2_sum += x0 * x0 + y0 * y0;
+ x_finalcol += y0;
+ x2_finalcol += y0 * y0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 2];
+ const int16_t y = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 2];
+ const int16_t w = diff[(i + 1) * stride + width - 1];
+
+ // Horizontal and vertical correlations for the penultimate col:
+ // Skip these on the last iteration of this loop if we also had two
+      // rows remaining; otherwise the final horizontal and vertical
+      // correlations get erroneously processed twice.
+ if (i < height - 2 || height % 3 == 1) {
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+
+ x_finalcol += w;
+ x2_finalcol += w * w;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z + w;
+ x2_sum += z * z + w * w;
+ }
+
+ // Now just vertical correlations for the final column:
+ xz_sum += y * w;
+ }
+ }
+
+ // Calculate the simple sums and squared-sums
+ int64_t x_firstrow = 0, x_firstcol = 0;
+ int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+ for (int j = 0; j < width; ++j) {
+ x_firstrow += diff[j];
+ x2_firstrow += diff[j] * diff[j];
+ }
+ for (int i = 0; i < height; ++i) {
+ x_firstcol += diff[i * stride];
+ x2_firstcol += diff[i * stride] * diff[i * stride];
+ }
+
+ int64_t xhor_sum = x_sum - x_finalcol;
+ int64_t xver_sum = x_sum - x_finalrow;
+ int64_t y_sum = x_sum - x_firstcol;
+ int64_t z_sum = x_sum - x_firstrow;
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ const float num_hor = (float)(height * (width - 1));
+ const float num_ver = (float)((height - 1) * width);
+
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c b/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c
new file mode 100644
index 0000000000..a492483721
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref, int ref_stride,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+ // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for
+ // 2-tap yet.
+ int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ if (width >= 16) {
+ int i;
+ assert(!(width & 15));
+ /*Read 16 pixels one row at a time.*/
+ for (i = 0; i < height; i++) {
+ int j;
+ for (j = 0; j < width; j += 16) {
+ xx_storeu_128(comp_pred, xx_loadu_128(ref));
+ comp_pred += 16;
+ ref += 16;
+ }
+ ref += ref_stride - width;
+ }
+ } else if (width >= 8) {
+ int i;
+ assert(!(width & 7));
+ assert(!(height & 1));
+ /*Read 8 pixels two rows at a time.*/
+ for (i = 0; i < height; i += 2) {
+ __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+ __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+ xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
+ comp_pred += 16;
+ ref += 2 * ref_stride;
+ }
+ } else {
+ int i;
+ assert(!(width & 3));
+ assert(!(height & 3));
+ /*Read 4 pixels four rows at a time.*/
+      for (i = 0; i < height; i += 4) {
+ const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+ const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+ const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+ const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+ const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+ _mm_unpacklo_epi32(row2, row3));
+ xx_storeu_128(comp_pred, reg);
+ comp_pred += 16;
+ ref += 4 * ref_stride;
+ }
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+ width, height);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+ width, height);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t,
+ temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+ uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+ ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+ : temp;
+ uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+ int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
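+    // e.g. height = 32, subpel_y_q3 = 4 and filter_taps = 8 give
+    // ((31 * 8 + 4) >> 3) + 8 = 31 + 8 = 39 intermediate rows.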
+ aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+ kernel_x, 16, NULL, -1, width, intermediate_height);
+ aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+ kernel_y, 16, width, height);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w0,
+ const __m128i *w1,
+ const __m128i *r,
+ void *const result) {
+ assert(DIST_PRECISION_BITS <= 4);
+ __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
+ __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
+ __m128i sum = _mm_adds_epu16(mult0, mult1);
+ __m128i round = _mm_adds_epu16(sum, *r);
+ __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, shift);
+}
+
+void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
+ const struct AV1Common *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+ int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ if (width >= 8) {
+ int i;
+ assert(!(width & 7));
+ /*Read 8 pixels one row at a time.*/
+ for (i = 0; i < height; i++) {
+ int j;
+ for (j = 0; j < width; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+ _mm_storeu_si128((__m128i *)comp_pred, s0);
+ comp_pred += 8;
+ ref += 8;
+ }
+ ref += ref_stride - width;
+ }
+ } else {
+ int i;
+ assert(!(width & 3));
+ /*Read 4 pixels two rows at a time.*/
+ for (i = 0; i < height; i += 2) {
+ __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
+ __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
+ __m128i t0 = _mm_unpacklo_epi64(s0, s1);
+ _mm_storeu_si128((__m128i *)comp_pred, t0);
+ comp_pred += 8;
+ ref += 2 * ref_stride;
+ }
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+ NULL, -1, width, height, bd);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+ kernel, 16, width, height, bd);
+ } else {
+ DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
+ uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+ ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+ : temp;
+ uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_highbd_convolve8_horiz(
+ ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
+ MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
+ aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE,
+ comp_pred8, width, NULL, -1, kernel_y, 16, width,
+ height, bd);
+ }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, int subpel_search) {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
+ /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
+ assert(!(width * height & 7));
+ int n = width * height >> 3;
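+  // _mm_avg_epu16 averages with rounding: (a + b + 1) >> 1 in each u16 lane.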
+ for (int i = 0; i < n; i++) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
+ __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+ _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
+ comp_pred16 += 8;
+ pred += 8;
+ }
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+ int subpel_search) {
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ int n;
+ int i;
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ assert(!(width * height & 7));
+ n = width * height >> 3;
+
+ const int16_t wt0 = (int16_t)jcp_param->fwd_offset;
+ const int16_t wt1 = (int16_t)jcp_param->bck_offset;
+ const __m128i w0 = _mm_set1_epi16(wt0);
+ const __m128i w1 = _mm_set1_epi16(wt1);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
+
+ uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (i = 0; i < n; i++) {
+ __m128i p0 = xx_loadu_128(comp_pred16);
+ __m128i p1 = xx_loadu_128(pred);
+
+ highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
+
+ comp_pred16 += 8;
+ pred += 8;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_comp_avg_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ int n;
+ int i;
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+ /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+ assert(!(width * height & 15));
+ n = width * height >> 4;
+ for (i = 0; i < n; i++) {
+ __m128i s0 = xx_loadu_128(comp_pred);
+ __m128i p0 = xx_loadu_128(pred);
+ xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
+ comp_pred += 16;
+ pred += 16;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c b/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c
new file mode 100644
index 0000000000..df7aa95855
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w, const __m128i *r,
+ void *const result) {
+ __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+ __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+ __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+ __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+ __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+ __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+ __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+ __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
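+
+// Scalar form of the helper above, for reference: each output pixel is
+//   (p0 * fwd_offset + p1 * bck_offset + round) >> DIST_PRECISION_BITS
+// with round = (1 << DIST_PRECISION_BITS) >> 1; unpacking p0 and p1 into
+// interleaved bytes lets one maddubs apply both weights per pixel pair.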
+
+void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+ int n;
+ int i;
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+ /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+ assert(!(width * height & 15));
+ n = width * height >> 4;
+
+ const int8_t w0 = (int8_t)jcp_param->fwd_offset;
+ const int8_t w1 = (int8_t)jcp_param->bck_offset;
+ const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+ w1, w0, w1, w0);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
+
+ for (i = 0; i < n; i++) {
+ __m128i p0 = xx_loadu_128(comp_pred);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c b/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c
new file mode 100644
index 0000000000..752d6f3f0b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c
@@ -0,0 +1,647 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
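+// Keep a padding of 2 samples at the end of each squared error row.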
+#define SSE_STRIDE (BW + 2)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+ { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+ { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+ { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
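+// Each mask row above keeps 5 consecutive 32-bit lanes out of 8, one row per
+// column phase, matching the 5-tap window summed in xx_mask_and_hadd().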
+
+DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = {
+ { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 }
+};
+
+#define CALC_X_GRADIENT(AC, GI, DF, out) \
+ out = _mm256_abs_epi16( \
+ _mm256_add_epi16(_mm256_add_epi16(AC, GI), _mm256_slli_epi16(DF, 1)));
+
+#define CALC_Y_GRADIENT(AC, GI, BH, out) \
+ out = _mm256_abs_epi16( \
+ _mm256_add_epi16(_mm256_sub_epi16(AC, GI), _mm256_slli_epi16(BH, 1)));
+
+double av1_estimate_noise_from_single_plane_avx2(const uint8_t *src, int height,
+ int width, int stride,
+ int edge_thresh) {
+ int count = 0;
+ int64_t accum = 0;
+  // w32 is (width - 1) rounded down to a multiple of 32.
+ const int w32 = (width - 1) & ~0x1f;
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i edge_threshold = _mm256_set1_epi16(edge_thresh);
+ __m256i num_accumulator = zero;
+ __m256i sum_accumulator = zero;
+
+ // A | B | C
+ // D | E | F
+ // G | H | I
+ // g_x = (A - C) + (G - I) + 2*(D - F)
+ // g_y = (A + C) - (G + I) + 2*(B - H)
+ // v = 4*E - 2*(D+F+B+H) + (A+C+G+I)
+
+  // Process the multiple-of-32 portion of the width here.
+ for (int w = 1; w < w32; w += 32) {
+ int h = 1;
+ const int start_idx = h * stride + w;
+ const int stride_0 = start_idx - stride;
+
+ __m256i num_accum_row_lvl = zero;
+ const __m256i A = _mm256_loadu_si256((__m256i *)(&src[stride_0 - 1]));
+ const __m256i C = _mm256_loadu_si256((__m256i *)(&src[stride_0 + 1]));
+ const __m256i D = _mm256_loadu_si256((__m256i *)(&src[start_idx - 1]));
+ const __m256i F = _mm256_loadu_si256((__m256i *)(&src[start_idx + 1]));
+ __m256i B = _mm256_loadu_si256((__m256i *)(&src[stride_0]));
+ __m256i E = _mm256_loadu_si256((__m256i *)(&src[start_idx]));
+
+ const __m256i A_lo = _mm256_unpacklo_epi8(A, zero);
+ const __m256i A_hi = _mm256_unpackhi_epi8(A, zero);
+ const __m256i C_lo = _mm256_unpacklo_epi8(C, zero);
+ const __m256i C_hi = _mm256_unpackhi_epi8(C, zero);
+ const __m256i D_lo = _mm256_unpacklo_epi8(D, zero);
+ const __m256i D_hi = _mm256_unpackhi_epi8(D, zero);
+ const __m256i F_lo = _mm256_unpacklo_epi8(F, zero);
+ const __m256i F_hi = _mm256_unpackhi_epi8(F, zero);
+
+ __m256i sub_AC_lo = _mm256_sub_epi16(A_lo, C_lo);
+ __m256i sub_AC_hi = _mm256_sub_epi16(A_hi, C_hi);
+ __m256i sum_AC_lo = _mm256_add_epi16(A_lo, C_lo);
+ __m256i sum_AC_hi = _mm256_add_epi16(A_hi, C_hi);
+ __m256i sub_DF_lo = _mm256_sub_epi16(D_lo, F_lo);
+ __m256i sub_DF_hi = _mm256_sub_epi16(D_hi, F_hi);
+ __m256i sum_DF_lo = _mm256_add_epi16(D_lo, F_lo);
+ __m256i sum_DF_hi = _mm256_add_epi16(D_hi, F_hi);
+
+ for (; h < height - 1; h++) {
+ __m256i sum_GI_lo, sub_GI_lo, sum_GI_hi, sub_GI_hi, gx_lo, gy_lo, gx_hi,
+ gy_hi;
+ const int k = h * stride + w;
+ const __m256i G = _mm256_loadu_si256((__m256i *)(&src[k + stride - 1]));
+ const __m256i H = _mm256_loadu_si256((__m256i *)(&src[k + stride]));
+ const __m256i I = _mm256_loadu_si256((__m256i *)(&src[k + stride + 1]));
+
+ const __m256i B_lo = _mm256_unpacklo_epi8(B, zero);
+ const __m256i B_hi = _mm256_unpackhi_epi8(B, zero);
+ const __m256i G_lo = _mm256_unpacklo_epi8(G, zero);
+ const __m256i G_hi = _mm256_unpackhi_epi8(G, zero);
+ const __m256i I_lo = _mm256_unpacklo_epi8(I, zero);
+ const __m256i I_hi = _mm256_unpackhi_epi8(I, zero);
+ const __m256i H_lo = _mm256_unpacklo_epi8(H, zero);
+ const __m256i H_hi = _mm256_unpackhi_epi8(H, zero);
+
+ sub_GI_lo = _mm256_sub_epi16(G_lo, I_lo);
+ sub_GI_hi = _mm256_sub_epi16(G_hi, I_hi);
+ sum_GI_lo = _mm256_add_epi16(G_lo, I_lo);
+ sum_GI_hi = _mm256_add_epi16(G_hi, I_hi);
+ const __m256i sub_BH_lo = _mm256_sub_epi16(B_lo, H_lo);
+ const __m256i sub_BH_hi = _mm256_sub_epi16(B_hi, H_hi);
+
+ CALC_X_GRADIENT(sub_AC_lo, sub_GI_lo, sub_DF_lo, gx_lo)
+ CALC_Y_GRADIENT(sum_AC_lo, sum_GI_lo, sub_BH_lo, gy_lo)
+
+ const __m256i ga_lo = _mm256_add_epi16(gx_lo, gy_lo);
+
+ CALC_X_GRADIENT(sub_AC_hi, sub_GI_hi, sub_DF_hi, gx_hi)
+ CALC_Y_GRADIENT(sum_AC_hi, sum_GI_hi, sub_BH_hi, gy_hi)
+
+ const __m256i ga_hi = _mm256_add_epi16(gx_hi, gy_hi);
+
+ __m256i cmp_lo = _mm256_cmpgt_epi16(edge_threshold, ga_lo);
+ __m256i cmp_hi = _mm256_cmpgt_epi16(edge_threshold, ga_hi);
+ const __m256i comp_reg = _mm256_add_epi16(cmp_lo, cmp_hi);
+
+ // v = 4*E -2*(D+F+B+H) + (A+C+G+I)
+ if (_mm256_movemask_epi8(comp_reg) != 0) {
+ const __m256i sum_BH_lo = _mm256_add_epi16(B_lo, H_lo);
+ const __m256i sum_BH_hi = _mm256_add_epi16(B_hi, H_hi);
+
+ // 2*(D+F+B+H)
+ const __m256i sum_DFBH_lo =
+ _mm256_slli_epi16(_mm256_add_epi16(sum_DF_lo, sum_BH_lo), 1);
+ // (A+C+G+I)
+ const __m256i sum_ACGI_lo = _mm256_add_epi16(sum_AC_lo, sum_GI_lo);
+ const __m256i sum_DFBH_hi =
+ _mm256_slli_epi16(_mm256_add_epi16(sum_DF_hi, sum_BH_hi), 1);
+ const __m256i sum_ACGI_hi = _mm256_add_epi16(sum_AC_hi, sum_GI_hi);
+
+ // Convert E register values from 8bit to 16bit
+ const __m256i E_lo = _mm256_unpacklo_epi8(E, zero);
+ const __m256i E_hi = _mm256_unpackhi_epi8(E, zero);
+
+ // 4*E - 2*(D+F+B+H)+ (A+C+G+I)
+ const __m256i var_lo_0 = _mm256_abs_epi16(_mm256_add_epi16(
+ _mm256_sub_epi16(_mm256_slli_epi16(E_lo, 2), sum_DFBH_lo),
+ sum_ACGI_lo));
+ const __m256i var_hi_0 = _mm256_abs_epi16(_mm256_add_epi16(
+ _mm256_sub_epi16(_mm256_slli_epi16(E_hi, 2), sum_DFBH_hi),
+ sum_ACGI_hi));
+ cmp_lo = _mm256_srli_epi16(cmp_lo, 15);
+ cmp_hi = _mm256_srli_epi16(cmp_hi, 15);
+ const __m256i var_lo = _mm256_mullo_epi16(var_lo_0, cmp_lo);
+ const __m256i var_hi = _mm256_mullo_epi16(var_hi_0, cmp_hi);
+
+ num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_lo);
+ num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_hi);
+
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpacklo_epi16(var_lo, zero));
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpackhi_epi16(var_lo, zero));
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpacklo_epi16(var_hi, zero));
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpackhi_epi16(var_hi, zero));
+ }
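+      // Slide the 3x3 window down one row for the next iteration, reusing
+      // the row sums and differences computed above.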
+ sub_AC_lo = sub_DF_lo;
+ sub_AC_hi = sub_DF_hi;
+ sub_DF_lo = sub_GI_lo;
+ sub_DF_hi = sub_GI_hi;
+ sum_AC_lo = sum_DF_lo;
+ sum_AC_hi = sum_DF_hi;
+ sum_DF_lo = sum_GI_lo;
+ sum_DF_hi = sum_GI_hi;
+ B = E;
+ E = H;
+ }
+ const __m256i num_0 = _mm256_unpacklo_epi16(num_accum_row_lvl, zero);
+ const __m256i num_1 = _mm256_unpackhi_epi16(num_accum_row_lvl, zero);
+ num_accumulator =
+ _mm256_add_epi32(num_accumulator, _mm256_add_epi32(num_0, num_1));
+ }
+
+ // Process the remaining width here.
+ for (int h = 1; h < height - 1; ++h) {
+ for (int w = w32 + 1; w < width - 1; ++w) {
+ const int k = h * stride + w;
+
+ // Compute sobel gradients
+ const int g_x = (src[k - stride - 1] - src[k - stride + 1]) +
+ (src[k + stride - 1] - src[k + stride + 1]) +
+ 2 * (src[k - 1] - src[k + 1]);
+ const int g_y = (src[k - stride - 1] - src[k + stride - 1]) +
+ (src[k - stride + 1] - src[k + stride + 1]) +
+ 2 * (src[k - stride] - src[k + stride]);
+ const int ga = abs(g_x) + abs(g_y);
+
+ if (ga < edge_thresh) {
+ // Find Laplacian
+ const int v =
+ 4 * src[k] -
+ 2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) +
+ (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] +
+ src[k + stride + 1]);
+ accum += abs(v);
+ ++count;
+ }
+ }
+ }
+
+ // s0 s1 n0 n1 s2 s3 n2 n3
+ __m256i sum_avx = _mm256_hadd_epi32(sum_accumulator, num_accumulator);
+ __m128i sum_avx_lo = _mm256_castsi256_si128(sum_avx);
+ __m128i sum_avx_hi = _mm256_extractf128_si256(sum_avx, 1);
+ // s0+s2 s1+s3 n0+n2 n1+n3
+ __m128i sum_avx_1 = _mm_add_epi32(sum_avx_lo, sum_avx_hi);
+ // s0+s2+s1+s3 n0+n2+n1+n3
+ __m128i result = _mm_add_epi32(_mm_srli_si128(sum_avx_1, 4), sum_avx_1);
+
+ accum += _mm_cvtsi128_si32(result);
+ count += _mm_extract_epi32(result, 2);
+
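+  // The normalization below follows Immerkaer's fast noise estimation for a
+  // 3x3 Laplacian mask: sigma ~= SQRT_PI_BY_2 * mean(|v|) / 6.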
+ // If very few smooth pels, return -1 since the estimate is unreliable.
+ return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
+static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+ const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint16_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint8_t *src1 = frame1;
+ const uint8_t *src2 = frame2;
+ uint16_t *dst = frame_sse;
+ for (int i = 0; i < block_height; i++) {
+ __m128i vf1_128, vf2_128;
+ __m256i vf1, vf2, vdiff1, vsqdiff1;
+
+ vf1_128 = _mm_loadu_si128((__m128i *)(src1));
+ vf2_128 = _mm_loadu_si128((__m128i *)(src2));
+ vf1 = _mm256_cvtepu8_epi16(vf1_128);
+ vf2 = _mm256_cvtepu8_epi16(vf2_128);
+ vdiff1 = _mm256_sub_epi16(vf1, vf2);
+ vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1);
+
+ _mm256_storeu_si256((__m256i *)(dst), vsqdiff1);
+    // Zero the padding so later loads never read uninitialized memory.
+ *(int *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+    src1 += stride;
+    src2 += stride2;
+ dst += sse_stride;
+ }
+}
+
+static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+ const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint16_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint8_t *src1 = frame1;
+ const uint8_t *src2 = frame2;
+ uint16_t *dst = frame_sse;
+ for (int i = 0; i < block_height; i++) {
+ __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2;
+
+ vsrc1 = _mm256_loadu_si256((__m256i *)src1);
+ vsrc2 = _mm256_loadu_si256((__m256i *)src2);
+ vmax = _mm256_max_epu8(vsrc1, vsrc2);
+ vmin = _mm256_min_epu8(vsrc1, vsrc2);
+ vdiff = _mm256_subs_epu8(vmax, vmin);
+
+ __m128i vtmp1 = _mm256_castsi256_si128(vdiff);
+ __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1);
+ vdiff1 = _mm256_cvtepu8_epi16(vtmp1);
+ vdiff2 = _mm256_cvtepu8_epi16(vtmp2);
+
+ vres1 = _mm256_mullo_epi16(vdiff1, vdiff1);
+ vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
+ _mm256_storeu_si256((__m256i *)(dst), vres1);
+ _mm256_storeu_si256((__m256i *)(dst + 16), vres2);
+    // Zero the padding so later loads never read uninitialized memory.
+ *(int *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+ src1 += stride;
+ src2 += stride2;
+ dst += sse_stride;
+ }
+}
+
+static AOM_FORCE_INLINE __m256i xx_load_and_pad(uint16_t *src, int col,
+ int block_width) {
+ __m128i v128tmp = _mm_loadu_si128((__m128i *)(src));
+ if (col == 0) {
+ // For the first column, replicate the first element twice to the left
+ v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[0]);
+ }
+ if (col == block_width - 4) {
+ // For the last column, replicate the last element twice to the right
+ v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[1]);
+ }
+ return _mm256_cvtepu16_epi32(v128tmp);
+}
+
+static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
+ // Mask the required 5 values inside the vector
+ __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);
+ __m128i v128a, v128b;
+ // Extract 256b as two 128b registers A and B
+ v128a = _mm256_castsi256_si128(vtmp);
+ v128b = _mm256_extracti128_si256(vtmp, 1);
+ // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+ v128a = _mm_add_epi32(v128a, v128b);
+ // B = [A2+B2, A3+B3, 0, 0]
+ v128b = _mm_srli_si128(v128a, 8);
+ // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+ v128a = _mm_add_epi32(v128a, v128b);
+ // B = [A1+B1+A3+B3, 0, 0, 0]
+ v128b = _mm_srli_si128(v128a, 4);
+ // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+ v128a = _mm_add_epi32(v128a, v128b);
+ return _mm_extract_epi32(v128a, 0);
+}
+
+// AVX2 implementation of approx_exp()
+static AOM_INLINE __m256 approx_exp_avx2(__m256 y) {
+#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2)
+// Offset for the exponent according to the IEEE floating-point standard.
+#define B 127
+// Magic number that controls the accuracy of the approximation.
+#define C 60801
+ const __m256 multiplier = _mm256_set1_ps(A);
+ const __m256i offset = _mm256_set1_epi32(B * (1 << 23) - C);
+
+ y = _mm256_mul_ps(y, multiplier);
+ y = _mm256_castsi256_ps(_mm256_add_epi32(_mm256_cvttps_epi32(y), offset));
+ return y;
+#undef A
+#undef B
+#undef C
+}
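+
+// For reference, a scalar sketch of the same bit trick (assuming IEEE-754
+// single-precision floats; approx_exp(), used by the SSE2 version of this
+// filter, is the canonical scalar form):
+//   float approx_exp_scalar(float y) {
+//     union { float f; int32_t i; } u;
+//     u.i = (int32_t)(y * ((1 << 23) / 0.69314718056f)) +
+//           (127 * (1 << 23) - 60801);
+//     return u.f;  // Reinterpreting the bits yields approximately exp(y).
+//   }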
+
+static void apply_temporal_filter(
+ const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ uint16_t *frame_sse, uint32_t *luma_sse_sum,
+ const double inv_num_ref_pixels, const double decay_factor,
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_sse[BH][BW];
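+  // acc_5x5_sse[r][c] will hold the sum of squared errors over the 5x5
+  // window centered at (r, c), with border rows and columns replicated.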
+
+ if (block_width == 32) {
+ get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
+ } else {
+ get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
+ }
+
+ __m256i vsrc[5];
+
+ // Traverse 4 columns at a time
+ // First and last columns will require padding
+ for (int col = 0; col < block_width; col += 4) {
+ uint16_t *src = (col) ? frame_sse + col - 2 : frame_sse;
+
+ // Load and pad(for first and last col) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ vsrc[i] = xx_load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height; row++) {
+ __m256i vsum = _mm256_setzero_si256();
+
+ // Add 5 consecutive rows
+ for (int i = 0; i < 5; i++) {
+ vsum = _mm256_add_epi32(vsum, vsrc[i]);
+ }
+
+ // Push all elements by one element to the top
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ // Load next row to the last element
+ if (row <= block_height - 4) {
+ vsrc[4] = xx_load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ } else {
+ vsrc[4] = vsrc[3];
+ }
+
+ // Accumulate the sum horizontally
+ for (int i = 0; i < 4; i++) {
+ acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i);
+ }
+ }
+ }
+
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ __m256d subblock_mses_reg[4];
+ __m256d d_factor_mul_n_decay_qr_invs[4];
+ const __m256 zero = _mm256_set1_ps(0.0f);
+ const __m256 point_five = _mm256_set1_ps(0.5f);
+ const __m256 seven = _mm256_set1_ps(7.0f);
+ const __m256d inv_num_ref_pixel_256bit = _mm256_set1_pd(inv_num_ref_pixels);
+ const __m256d weight_factor_256bit = _mm256_set1_pd(weight_factor);
+ const __m256 tf_weight_scale = _mm256_set1_ps((float)TF_WEIGHT_SCALE);
+ // Maintain registers to hold mse and d_factor at subblock level.
+ subblock_mses_reg[0] = _mm256_set1_pd(subblock_mses_scaled[0]);
+ subblock_mses_reg[1] = _mm256_set1_pd(subblock_mses_scaled[1]);
+ subblock_mses_reg[2] = _mm256_set1_pd(subblock_mses_scaled[2]);
+ subblock_mses_reg[3] = _mm256_set1_pd(subblock_mses_scaled[3]);
+ d_factor_mul_n_decay_qr_invs[0] = _mm256_set1_pd(d_factor_decayed[0]);
+ d_factor_mul_n_decay_qr_invs[1] = _mm256_set1_pd(d_factor_decayed[1]);
+ d_factor_mul_n_decay_qr_invs[2] = _mm256_set1_pd(d_factor_decayed[2]);
+ d_factor_mul_n_decay_qr_invs[3] = _mm256_set1_pd(d_factor_decayed[3]);
+
+ for (int i = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ uint32_t *luma_sse_sum_temp = luma_sse_sum + i * BW;
+ for (int j = 0; j < block_width; j += 8) {
+ const __m256i acc_sse =
+ _mm256_lddqu_si256((__m256i *)(acc_5x5_sse[i] + j));
+ const __m256i luma_sse =
+ _mm256_lddqu_si256((__m256i *)((luma_sse_sum_temp + j)));
+
+ // uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+ const __m256i diff_sse = _mm256_add_epi32(acc_sse, luma_sse);
+
+ const __m256d diff_sse_pd_1 =
+ _mm256_cvtepi32_pd(_mm256_castsi256_si128(diff_sse));
+ const __m256d diff_sse_pd_2 =
+ _mm256_cvtepi32_pd(_mm256_extracti128_si256(diff_sse, 1));
+
+ // const double window_error = diff_sse * inv_num_ref_pixels;
+ const __m256d window_error_1 =
+ _mm256_mul_pd(diff_sse_pd_1, inv_num_ref_pixel_256bit);
+ const __m256d window_error_2 =
+ _mm256_mul_pd(diff_sse_pd_2, inv_num_ref_pixel_256bit);
+
+        const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const __m256d blk_error = subblock_mses_reg[subblock_idx];
+
+ // const double combined_error =
+        //     weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+ const __m256d combined_error_1 = _mm256_add_pd(
+ _mm256_mul_pd(window_error_1, weight_factor_256bit), blk_error);
+
+ const __m256d combined_error_2 = _mm256_add_pd(
+ _mm256_mul_pd(window_error_2, weight_factor_256bit), blk_error);
+
+ // d_factor_decayed[subblock_idx]
+ const __m256d d_fact_mul_n_decay =
+ d_factor_mul_n_decay_qr_invs[subblock_idx];
+
+ // double scaled_error = combined_error *
+ // d_factor_decayed[subblock_idx];
+ const __m256d scaled_error_1 =
+ _mm256_mul_pd(combined_error_1, d_fact_mul_n_decay);
+ const __m256d scaled_error_2 =
+ _mm256_mul_pd(combined_error_2, d_fact_mul_n_decay);
+
+ const __m128 scaled_error_ps_1 = _mm256_cvtpd_ps(scaled_error_1);
+ const __m128 scaled_error_ps_2 = _mm256_cvtpd_ps(scaled_error_2);
+
+ const __m256 scaled_error_ps = _mm256_insertf128_ps(
+ _mm256_castps128_ps256(scaled_error_ps_1), scaled_error_ps_2, 0x1);
+
+ // scaled_error = AOMMIN(scaled_error, 7);
+ const __m256 scaled_diff_ps = _mm256_min_ps(scaled_error_ps, seven);
+ const __m256 minus_scaled_diff_ps = _mm256_sub_ps(zero, scaled_diff_ps);
+ // const int weight =
+        //     (int)(approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE + 0.5f);
+ const __m256 exp_result = approx_exp_avx2(minus_scaled_diff_ps);
+ const __m256 scale_weight_exp_result =
+ _mm256_mul_ps(exp_result, tf_weight_scale);
+ const __m256 round_result =
+ _mm256_add_ps(scale_weight_exp_result, point_five);
+ __m256i weights_in_32bit = _mm256_cvttps_epi32(round_result);
+
+ __m128i weights_in_16bit =
+ _mm_packus_epi32(_mm256_castsi256_si128(weights_in_32bit),
+ _mm256_extractf128_si256(weights_in_32bit, 0x1));
+
+ // count[k] += weight;
+ // accumulator[k] += weight * pixel_value;
+ const int stride_idx = i * stride2 + j;
+ const __m128i count_array =
+ _mm_loadu_si128((__m128i *)(count + stride_idx));
+ _mm_storeu_si128((__m128i *)(count + stride_idx),
+ _mm_add_epi16(count_array, weights_in_16bit));
+
+ const __m256i accumulator_array =
+ _mm256_loadu_si256((__m256i *)(accumulator + stride_idx));
+ const __m128i pred_values =
+ _mm_loadl_epi64((__m128i *)(frame2 + stride_idx));
+
+ const __m256i pred_values_u32 = _mm256_cvtepu8_epi32(pred_values);
+ const __m256i mull_frame2_weight_u32 =
+ _mm256_mullo_epi32(pred_values_u32, weights_in_32bit);
+ _mm256_storeu_si256(
+ (__m256i *)(accumulator + stride_idx),
+ _mm256_add_epi32(accumulator_array, mull_frame2_weight_u32));
+ }
+ }
+ }
+}
+
+void av1_apply_temporal_filter_avx2(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 &&
+         "Only 32x32 blocks are supported with avx2!");
+  assert(TF_WINDOW_LENGTH == 5 &&
+         "Only a window length of 5 is supported with avx2!");
+  assert(!is_high_bitdepth && "Only low bit-depth is supported with avx2!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter the U and V planes using Y-plane information: motion search is
+    // performed only on the Y plane, so its statistics are more accurate.
+    // The luma SSE sum is reused for both chroma planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++, k++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx];
+ }
+ }
+ }
+ }
+ }
+
+ apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w,
+ plane_w, plane_h, subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_sse, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+ plane_offset += plane_h * plane_w;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c b/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c
new file mode 100644
index 0000000000..842d3b13c8
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding of 4 samples
+#define SSE_STRIDE (BW + 4)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+ { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
+};
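+// The squared error rows are stored with 2 columns of zero padding on each
+// side; each 5-tap window therefore spans two 128-bit registers, and the
+// mask pairs above select the required 5 values at each of the 4 phases.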
+
+static void get_squared_error(const uint8_t *frame1, const unsigned int stride,
+ const uint8_t *frame2, const unsigned int stride2,
+ const int block_width, const int block_height,
+ uint16_t *frame_sse,
+ const unsigned int dst_stride) {
+ const uint8_t *src1 = frame1;
+ const uint8_t *src2 = frame2;
+ uint16_t *dst = frame_sse;
+
+ for (int i = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j += 16) {
+      // Zero the padding so later loads never read uninitialized memory.
+ *(int *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+ __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
+ __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j));
+
+ __m128i vmax = _mm_max_epu8(vsrc1, vsrc2);
+ __m128i vmin = _mm_min_epu8(vsrc1, vsrc2);
+ __m128i vdiff = _mm_subs_epu8(vmax, vmin);
+
+ __m128i vzero = _mm_setzero_si128();
+ __m128i vdiff1 = _mm_unpacklo_epi8(vdiff, vzero);
+ __m128i vdiff2 = _mm_unpackhi_epi8(vdiff, vzero);
+
+ __m128i vres1 = _mm_mullo_epi16(vdiff1, vdiff1);
+ __m128i vres2 = _mm_mullo_epi16(vdiff2, vdiff2);
+
+ _mm_storeu_si128((__m128i *)(dst + j + 2), vres1);
+ _mm_storeu_si128((__m128i *)(dst + j + 10), vres2);
+ }
+
+    // Zero the padding so later loads never read uninitialized memory.
+ *(int *)(dst + block_width + 2) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+ src1 += stride;
+ src2 += stride2;
+ dst += dst_stride;
+ }
+}
+
+static void xx_load_and_pad(uint16_t *src, __m128i *dstvec, int col,
+ int block_width) {
+ __m128i vtmp = _mm_loadu_si128((__m128i *)src);
+ __m128i vzero = _mm_setzero_si128();
+ __m128i vtmp1 = _mm_unpacklo_epi16(vtmp, vzero);
+ __m128i vtmp2 = _mm_unpackhi_epi16(vtmp, vzero);
+ // For the first column, replicate the first element twice to the left
+ dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA);
+ // For the last column, replicate the last element twice to the right
+ dstvec[1] = (col < block_width - 4) ? vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54);
+}
+
+static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
+ __m128i veca, vecb;
+ // Mask and obtain the required 5 values inside the vector
+ veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]);
+ vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]);
+ // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+ veca = _mm_add_epi32(veca, vecb);
+ // B = [A2+B2, A3+B3, 0, 0]
+ vecb = _mm_srli_si128(veca, 8);
+ // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+ veca = _mm_add_epi32(veca, vecb);
+ // B = [A1+B1+A3+B3, 0, 0, 0]
+ vecb = _mm_srli_si128(veca, 4);
+ // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+ veca = _mm_add_epi32(veca, vecb);
+ return _mm_cvtsi128_si32(veca);
+}
+
+static void apply_temporal_filter(
+ const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ uint16_t *frame_sse, uint32_t *luma_sse_sum,
+ const double inv_num_ref_pixels, const double decay_factor,
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_sse[BH][BW];
+
+ get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
+ frame_sse, SSE_STRIDE);
+
+ __m128i vsrc[5][2];
+
+ // Traverse 4 columns at a time
+ // First and last columns will require padding
+ for (int col = 0; col < block_width; col += 4) {
+ uint16_t *src = frame_sse + col;
+
+ // Load and pad(for first and last col) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad(src, vsrc[i], col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Padding for top 2 rows
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+
+ for (int row = 0; row < block_height; row++) {
+ __m128i vsum1 = _mm_setzero_si128();
+ __m128i vsum2 = _mm_setzero_si128();
+
+ // Add 5 consecutive rows
+ for (int i = 0; i < 5; i++) {
+ vsum1 = _mm_add_epi32(vsrc[i][0], vsum1);
+ vsum2 = _mm_add_epi32(vsrc[i][1], vsum2);
+ }
+
+ // Push all elements by one element to the top
+ for (int i = 0; i < 4; i++) {
+ vsrc[i][0] = vsrc[i + 1][0];
+ vsrc[i][1] = vsrc[i + 1][1];
+ }
+
+ if (row <= block_height - 4) {
+ // Load next row
+ xx_load_and_pad(src, vsrc[4], col, block_width);
+ src += SSE_STRIDE;
+ } else {
+ // Padding for bottom 2 rows
+ vsrc[4][0] = vsrc[3][0];
+ vsrc[4][1] = vsrc[3][1];
+ }
+
+ // Accumulate the sum horizontally
+ for (int i = 0; i < 4; i++) {
+ acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum1, vsum2, i);
+ }
+ }
+ }
+
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ }
+}
+
+void av1_apply_temporal_filter_sse2(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 &&
+         "Only 32x32 blocks are supported with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 &&
+         "Only a window length of 5 is supported with sse2!");
+  assert(!is_high_bitdepth && "Only low bit-depth is supported with sse2!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter the U and V planes using Y-plane information: motion search is
+    // performed only on the Y plane, so its statistics are more accurate.
+    // The luma SSE sum is reused for both chroma planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++, k++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
+ }
+ }
+ }
+ }
+
+ apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w,
+ plane_w, plane_h, subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_sse, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+ plane_offset += plane_h * plane_w;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
new file mode 100644
index 0000000000..9cde860534
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
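+// A scalar sketch of the per-pixel computation vectorized below (see the
+// reference function named above):
+//   int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+//   t = clamp(t, INT16_MIN, INT16_MAX);  // packs_epi32 does this saturation.
+//   csse += (uint64_t)(t * t);
+// followed by ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS).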
+uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ int n = -N;
+
+ uint64_t csse;
+
+ const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
+ const __m256i v_zext_q = yy_set1_64_from_32i(~0);
+
+ __m256i v_acc0_q = _mm256_setzero_si256();
+
+ assert(N % 64 == 0);
+
+ r1 += N;
+ d += N;
+ m += N;
+
+ do {
+ const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n));
+ const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n));
+ const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n));
+
+ const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w);
+ const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b);
+
+ const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w);
+ const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w);
+
+ const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w);
+ const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w);
+
+ const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d);
+
+ const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w);
+
+ const __m256i v_sum0_q = _mm256_add_epi64(
+ _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32));
+
+ v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q);
+
+ n += 16;
+ } while (n);
+
+ v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8));
+ __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q);
+ __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1);
+ v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+#if AOM_ARCH_X86_64
+ csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
+#else
+ xx_storel_64(&csse, v_acc_q_0);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
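+// In scalar terms this computes acc = sum over i of m[i] * ds[i] and returns
+// whether acc exceeds limit.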
+int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+ __m256i v_acc0_d = _mm256_setzero_si256();
+
+  // The input size is limited to 8192 by the use of 32-bit accumulators and
+  // by m lying in [0, 64]. Overflow could occur at larger sizes, though it
+  // is practically impossible for real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m));
+ const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32));
+
+ const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds));
+ const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16));
+ const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32));
+ const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48));
+
+ const __m256i v_m0_w =
+ _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b));
+ const __m256i v_m1_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1));
+ const __m256i v_m2_w =
+ _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b));
+ const __m256i v_m3_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1));
+
+ const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w);
+ const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w);
+ const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w);
+ const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w);
+
+ const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d);
+ const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d);
+
+ const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d);
+
+ v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31);
+ v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm256_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8));
+
+ __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q);
+ __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1);
+ v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+
+#if AOM_ARCH_X86_64
+ acc = _mm_extract_epi64(v_acc_q_0, 0);
+#else
+ xx_storel_64(&acc, v_acc_q_0);
+#endif
+
+ return acc > limit;
+}
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ */
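+// Computes d[i] = a[i] * a[i] - b[i] * b[i] (saturated to int16_t): each madd
+// below multiplies interleaved (a, b) pairs against (a, -b) pairs, folding
+// the difference of squares into a single instruction.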
+void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ const __m256i v_neg_w = _mm256_set1_epi32((int)0xffff0001);
+
+ assert(N % 64 == 0);
+
+ do {
+ const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a));
+ const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b));
+ const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16));
+ const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16));
+ const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32));
+ const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32));
+ const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48));
+ const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48));
+
+ const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs
+ const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w);
+ const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w);
+ const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w);
+ const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w);
+ const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w);
+ const __m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w);
+ const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w);
+ const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w);
+
+ const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w);
+
+ _mm256_store_si256((__m256i *)(d), v_r0_w);
+ _mm256_store_si256((__m256i *)(d + 16), v_r1_w);
+ _mm256_store_si256((__m256i *)(d + 32), v_r2_w);
+ _mm256_store_si256((__m256i *)(d + 48), v_r3_w);
+
+ a += 64;
+ b += 64;
+ d += 64;
+ N -= 64;
+ } while (N);
+}
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
new file mode 100644
index 0000000000..d7ac2223f2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
+uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ int n = -N;
+ int n8 = n + 8;
+
+ uint64_t csse;
+
+ const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+ const __m128i v_zext_q = xx_set1_64_from_32i(~0);
+
+ __m128i v_acc0_q = _mm_setzero_si128();
+
+ assert(N % 64 == 0);
+
+ r1 += N;
+ d += N;
+ m += N;
+
+ do {
+ const __m128i v_r0_w = xx_load_128(r1 + n);
+ const __m128i v_r1_w = xx_load_128(r1 + n8);
+ const __m128i v_d0_w = xx_load_128(d + n);
+ const __m128i v_d1_w = xx_load_128(d + n8);
+ const __m128i v_m01_b = xx_load_128(m + n);
+
+ const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+ const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+ const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+ const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+ const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
+ const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+ const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+ const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+ const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
+ const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+ const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
+ const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+ const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+ _mm_srli_epi64(v_sq0_d, 32));
+ const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+ _mm_srli_epi64(v_sq1_d, 32));
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+ n8 += 16;
+ n += 16;
+ } while (n);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if AOM_ARCH_X86_64
+ csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+ xx_storel_64(&csse, v_acc0_q);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
+int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+
+ __m128i v_sign_d;
+ __m128i v_acc0_d = _mm_setzero_si128();
+ __m128i v_acc1_d = _mm_setzero_si128();
+ __m128i v_acc_q;
+
+  // The input size is limited to 8192 by the use of 32-bit accumulators and
+  // by m lying in [0, 64]. Overflow could occur at larger sizes, though it
+  // is practically impossible for real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_m01_b = xx_load_128(m);
+ const __m128i v_m23_b = xx_load_128(m + 16);
+ const __m128i v_m45_b = xx_load_128(m + 32);
+ const __m128i v_m67_b = xx_load_128(m + 48);
+
+ const __m128i v_d0_w = xx_load_128(ds);
+ const __m128i v_d1_w = xx_load_128(ds + 8);
+ const __m128i v_d2_w = xx_load_128(ds + 16);
+ const __m128i v_d3_w = xx_load_128(ds + 24);
+ const __m128i v_d4_w = xx_load_128(ds + 32);
+ const __m128i v_d5_w = xx_load_128(ds + 40);
+ const __m128i v_d6_w = xx_load_128(ds + 48);
+ const __m128i v_d7_w = xx_load_128(ds + 56);
+
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
+ const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
+
+ const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+ const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
+ const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
+ const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
+ const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
+ const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
+ const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
+
+ const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
+ const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
+ const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
+ const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
+
+ const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
+ const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
+
+ v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
+ v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
+ v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
+ v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
+
+ v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if AOM_ARCH_X86_64
+ acc = _mm_cvtsi128_si64(v_acc_q);
+#else
+ xx_storel_64(&acc, v_acc_q);
+#endif
+
+ return acc > limit;
+}
+
+// Negate under mask
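+// (v XOR mask) - mask negates lanes where mask is all ones (a two's
+// complement identity) and passes lanes through unchanged where mask is zero.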
+static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
+ return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
+}
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ */
+void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0,
+ (short)0xffff, 0, (short)0xffff, 0);
+
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_a0_w = xx_load_128(a);
+ const __m128i v_b0_w = xx_load_128(b);
+ const __m128i v_a1_w = xx_load_128(a + 8);
+ const __m128i v_b1_w = xx_load_128(b + 8);
+ const __m128i v_a2_w = xx_load_128(a + 16);
+ const __m128i v_b2_w = xx_load_128(b + 16);
+ const __m128i v_a3_w = xx_load_128(a + 24);
+ const __m128i v_b3_w = xx_load_128(b + 24);
+
+ const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs
+ const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
+ const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
+ const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
+ const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
+ const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
+ const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
+ const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
+ const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
+
+ const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
+
+ xx_store_128(d, v_r0_w);
+ xx_store_128(d + 8, v_r1_w);
+ xx_store_128(d + 16, v_r2_w);
+ xx_store_128(d + 24, v_r3_w);
+
+ a += 32;
+ b += 32;
+ d += 32;
+ N -= 32;
+ } while (N);
+}