13 files changed, 6066 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c
new file mode 100644
index 0000000000..97f182c660
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c
@@ -0,0 +1,893 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Store (a[i] - b[i])**2 as 32-bit values in dst[0..7] for 8 16-bit pixels.
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+                                       uint32_t *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+  // Zero-extend the low and the high four 16-bit lanes to 32 bits each.
+  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
+  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
+  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
+  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
+
+  __m128i dist_first, dist_second;
+  // Difference, then square; the low 32 bits of each product are kept.
+  dist_first = _mm_sub_epi32(a_first, b_first);
+  dist_second = _mm_sub_epi32(a_second, b_second);
+  dist_first = _mm_mullo_epi32(dist_first, dist_first);
+  dist_second = _mm_mullo_epi32(dist_second, dist_second);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
+}
+
+// Lane i of *sum = dist[i - 1] + dist[i] + dist[i + 1], for i = 0..3.
+static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+  // Unaligned loads covering dist[-1]..dist[4]; both ends must be readable.
+  dist_reg = _mm_loadu_si128((const __m128i *)dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
+  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
+
+  *sum = _mm_add_epi32(dist_reg, dist_left);
+  *sum = _mm_add_epi32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
+                                    __m128i *sum_second) {
+  highbd_get_sum_4(dist, sum_first);       // 3-tap sums for dist[0..3]
+  highbd_get_sum_4(dist + 4, sum_second);  // 3-tap sums for dist[4..7]
+}
+
+// Turn 4 summed distortions into 4 filter weights. Each lane of 'sum' is
+// scaled by the matching lane of 'mul_constants' (32x32 -> 64-bit multiply,
+// keeping the upper 32 bits), which presumably divides by the number of
+// values summed (the constant tables live outside this file - confirm there).
+// The result is then rounded, shifted right by 'strength', clamped to 16,
+// subtracted from 16 and multiplied by 'weight'.
+static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
+                                    const __m128i *mul_constants,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  // _mm_srl_epi32 takes its shift count from the low 64 bits of the operand.
+  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+  const __m128i rounding_u32 = _mm_set1_epi32(rounding);
+  const __m128i weight_u32 = _mm_set1_epi32(weight);
+  const __m128i sixteen = _mm_set1_epi32(16);
+  const __m128i zero = _mm_setzero_si128();
+
+  // Widen the lanes to 64 bits, multiply, and keep the upper 32 bits of each
+  const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero);
+  const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero);
+  const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero);
+  const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero);
+
+  const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo);
+  const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32);
+  const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi);
+  const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32);
+
+  // Now we have
+  //   mul_lo_div: 00 a1 00 a0
+  //   mul_hi_div: 00 a3 00 a2
+  // Unpack as 64 bit words to get even and odd elements
+  //   mul_even: 00 a2 00 a0
+  //   mul_odd:  00 a3 00 a1
+  // Shift mul_odd left by 4 bytes and OR to get a3 a2 a1 a0 in 32-bit lanes
+  const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div);
+  const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div);
+  const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4);
+  const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift);
+
+  // Round, then shift right by the strength
+  *output = _mm_add_epi32(mul, rounding_u32);
+  *output = _mm_srl_epi32(*output, strength_u128);
+
+  // Clamp to 16, invert (16 - x) and multiply with the weight
+  *output = _mm_min_epu32(*output, sixteen);
+  *output = _mm_sub_epi32(sixteen, *output);
+  *output = _mm_mullo_epi32(*output, weight_u32);
+}
+
+static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1,
+                                    const __m128i *sum_0_u32,
+                                    const __m128i *sum_1_u32,
+                                    const __m128i *mul_constants_0,
+                                    const __m128i *mul_constants_1,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
+                   weight);  // lanes 0..3
+  highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
+                   weight);  // lanes 4..7, same strength/rounding/weight
+}
+
+// Add the 8 weights in 'sum_*_u32' to 'count'; add weight * pred to 'accumulator'.
+static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
+                                                 const __m128i sum_second_u32,
+                                                 const uint16_t *pred,
+                                                 uint16_t *count,
+                                                 uint32_t *accumulator) {
+  // Narrow the 32-bit weights to 16 bits (pack with unsigned saturation)
+  const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32);
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred);
+  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+
+  __m128i pred_0_u32, pred_1_u32;
+  __m128i accum_0_u32, accum_1_u32;
+  // count[i] += weight[i], saturating as unsigned 16-bit
+  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+  _mm_storeu_si128((__m128i *)count, count_u16);
+  // Zero-extend pred to 32 bits, then accumulator[i] += weight[i] * pred[i]
+  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+  pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32);
+  pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32);
+
+  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) {
+  *dist_reg = _mm_loadu_si128((const __m128i *)dist);  // unaligned, 4 x u32
+}
+
+static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first,
+                                      __m128i *reg_second) {
+  highbd_read_dist_4(dist, reg_first);       // dist[0..3]
+  highbd_read_dist_4(dist + 4, reg_second);  // dist[4..7]
+}
+
+static INLINE void highbd_read_chroma_dist_row_8(
+    int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first,
+    __m128i *u_second, __m128i *v_first, __m128i *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 8 entries from each chroma plane.
+    highbd_read_dist_8(u_dist, u_first, u_second);
+    highbd_read_dist_8(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise load only 4 entries per plane and duplicate each of them
+    __m128i u_reg, v_reg;
+
+    highbd_read_dist_4(u_dist, &u_reg);
+    // u_first = u0 u0 u1 u1, u_second = u2 u2 u3 u3
+    *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
+    *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
+
+    highbd_read_dist_4(v_dist, &v_reg);
+    // v_first = v0 v0 v1 v1, v_second = v2 v2 v3 v3
+    *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
+    *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
+  }
+}
+
+static void vp9_highbd_apply_temporal_filter_luma_8(
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_first,
+    const uint32_t *const *neighbors_second, int top_weight,
+    int bottom_weight) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+  // Filters one 8-pixel-wide luma column, one row at a time, top to bottom.
+  __m128i mul_first, mul_second;  // row constants for lanes 0..3 / 4..7
+  // 3-tap horizontal y_dist sums of the previous, current and next row
+  __m128i sum_row_1_first, sum_row_1_second;
+  __m128i sum_row_2_first, sum_row_2_second;
+  __m128i sum_row_3_first, sum_row_3_second;
+  // Chroma distortions, expanded to one value per luma pixel
+  __m128i u_first, u_second;
+  __m128i v_first, v_second;
+  // Combined sum for the row being filtered; turned into weights in place
+  __m128i sum_row_first;
+  __m128i sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(block_width == 8);
+
+  (void)block_width;
+
+  // First row: there is no row above, so only two luma rows contribute
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+  // Add luma values
+  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
+  // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
+  sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
+  sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                &v_first, &v_second);
+
+  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+
+  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;  // count and accum are stepped by y_pre_stride too
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_dist += DIST_STRIDE;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      weight = bottom_weight;
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+    sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+    highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Load a fresh chroma row; with vertical subsampling this happens on
+      // even luma rows only, and odd rows reuse the values already loaded
+      highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                    &v_first, &v_second);
+
+      u_dist += DIST_STRIDE;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+    sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+    // Get modifier and store result
+    highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                     &sum_row_second, &mul_first, &mul_second, strength,
+                     rounding, weight);
+    highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                  y_accum);
+
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row: there is no row below, so only two luma rows contribute
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+  sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Without vertical subsampling the last luma row has a chroma row of its
+    // own; otherwise the most recently loaded chroma values are reused
+    highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                  &v_first, &v_second);
+  }
+
+  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+}
+
+// Perform temporal filter for the luma component, 8 columns at a time.
+static void vp9_highbd_apply_temporal_filter_luma(
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_first;
+  const uint32_t *const *neighbors_second;
+  // NOTE(review): Left and Right always run, so block_width >= 16 is assumed.
+  // Left: first 8 columns, with the left-column table for lanes 0..3
+  neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  vp9_highbd_apply_temporal_filter_luma_8(
+      y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First: interior columns left of the block's midpoint
+  neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_highbd_apply_temporal_filter_luma_8(
+        y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+        strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_first, neighbors_second, top_weight, bottom_weight);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second: interior columns from the midpoint up to the last 8
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_highbd_apply_temporal_filter_luma_8(
+        y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+        strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_first, neighbors_second, top_weight, bottom_weight);
+  }
+
+  // Right: last 8 columns, with the right-column table for lanes 4..7
+  neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
+  vp9_highbd_apply_temporal_filter_luma_8(
+      y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+}
+
+// Add the luma distortion covering 8 chroma positions to the u and v mods.
+// Luma is summed over 2 columns when ss_x is set and 2 rows when ss_y == 1.
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
+    const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst,
+    __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) {
+  __m128i y_reg_fst, y_reg_snd;
+  if (!ss_x) {
+    highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+      y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst);
+      y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd);
+    }
+  } else {
+    // Luma values before horizontally adjacent pairs are added together
+    __m128i y_fst, y_snd;
+
+    // First 8 lumas -> chroma lanes 0..3 (hadd adds neighbouring pairs)
+    highbd_read_dist_8(y_dist, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+    }
+
+    y_reg_fst = _mm_hadd_epi32(y_fst, y_snd);
+
+    // Second 8 lumas -> chroma lanes 4..7
+    highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+    }
+
+    y_reg_snd = _mm_hadd_epi32(y_fst, y_snd);
+  }
+
+  *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst);
+  *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd);
+  *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst);
+  *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd);
+}
+
+// Apply the temporal filter to an 8 x uv_block_height column of both chroma
+// planes. If blk_fw is not NULL it is read as 4 weights, one per subblock
+// (lanes 0..3 use blk_fw[0]/[2], lanes 4..7 use blk_fw[1]/[3] for the top and
+// bottom half); otherwise top_weight and bottom_weight cover all 8 lanes.
+static void vp9_highbd_apply_temporal_filter_chroma_8(
+    const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+    unsigned int uv_block_width, unsigned int uv_block_height, int ss_x,
+    int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist,
+    const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
+    int top_weight, int bottom_weight, const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul_fst, mul_snd;
+  // 3-tap horizontal sums of the previous (1), current (2) and next (3) row
+  __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
+  __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
+  __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
+  __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
+  // Combined sums for the row being filtered; turned into weights in place
+  __m128i u_sum_row_fst, v_sum_row_fst;
+  __m128i u_sum_row_snd, v_sum_row_snd;
+
+  // Loop variable
+  unsigned int h;
+
+  (void)uv_block_width;
+
+  // First row: there is no row above, so only two chroma rows contribute
+  mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]);
+  mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]);
+
+  // Add chroma values
+  highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
+  highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+
+  u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst);
+  u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd);
+
+  highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
+  highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+
+  v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst);
+  v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_dist += DIST_STRIDE * (1 + ss_y);  // two luma rows per step if ss_y == 1
+
+  // Then all the rows except the last one
+  mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[1]);
+  mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the blocks
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        blk_fw += 2;
+      } else {
+        weight = bottom_weight;
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1_fst = u_sum_row_2_fst;
+    u_sum_row_2_fst = u_sum_row_3_fst;
+    u_sum_row_1_snd = u_sum_row_2_snd;
+    u_sum_row_2_snd = u_sum_row_3_snd;
+
+    v_sum_row_1_fst = v_sum_row_2_fst;
+    v_sum_row_2_fst = v_sum_row_3_fst;
+    v_sum_row_1_snd = v_sum_row_2_snd;
+    v_sum_row_2_snd = v_sum_row_3_snd;
+
+    // Add chroma values
+    u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+    u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+    highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+    u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst);
+    u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd);
+
+    v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+    v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+    highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+    v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst);
+    v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd);
+
+    // Add luma values
+    highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                         &u_sum_row_snd, &v_sum_row_fst,
+                                         &v_sum_row_snd);
+
+    // Get modifier and store result
+    if (blk_fw) {
+      highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+      highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+    } else {
+      highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                       &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+      highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                       &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+    }
+
+    highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                  u_accum);
+    highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                  v_accum);
+
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_dist += DIST_STRIDE * (1 + ss_y);
+  }
+
+  // The last row: there is no row below, so only two chroma rows contribute
+  mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]);
+  mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]);
+
+  // Shift the rows up
+  u_sum_row_1_fst = u_sum_row_2_fst;
+  u_sum_row_2_fst = u_sum_row_3_fst;
+  u_sum_row_1_snd = u_sum_row_2_snd;
+  u_sum_row_2_snd = u_sum_row_3_snd;
+
+  v_sum_row_1_fst = v_sum_row_2_fst;
+  v_sum_row_2_fst = v_sum_row_3_fst;
+  v_sum_row_1_snd = v_sum_row_2_snd;
+  v_sum_row_2_snd = v_sum_row_3_snd;
+
+  // Add chroma values
+  u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+  v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+  u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+  v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+}
+
+// Perform temporal filter for the chroma components, 8 chroma columns a time.
+static void vp9_highbd_apply_temporal_filter_chroma(
+    const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+  // blk_col indexes y_dist and advances 8 << ss_x lumas per 8 chroma columns
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_fst;
+  const uint32_t *const *neighbors_snd;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    } else {
+      neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      vp9_highbd_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+    } else {
+      vp9_highbd_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left: first 8 chroma columns; the table depends on the subsampling mode
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  vp9_highbd_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+      uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+      u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+      neighbors_snd, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First: interior chroma columns left of the midpoint
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_highbd_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second: interior chroma columns from the midpoint up to the last 8
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_highbd_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  // Right: last 8 chroma columns, with the right-column table for lanes 4..7
+  if (ss_x && ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  vp9_highbd_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+      uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+      u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+      neighbors_snd, top_weight, bottom_weight, NULL);
+}
+
+void vp9_highbd_apply_temporal_filter_sse4_1(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+
+  uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;  // rows start at index 1 of each buffer
+  const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop counters: row index and column offset within the row
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the squared difference (src - pre)^2 for each plane's pixels
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 8) {
+      highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                          y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                          u_dist_ptr + blk_col);
+      highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                          v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_dist_ptr = y_dist + 1;  // rewind to the first row for the filter passes
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width,
+                                        block_height, ss_x, ss_y, strength,
+                                        blk_fw, use_whole_blk, y_accum, y_count,
+                                        y_dist_ptr, u_dist_ptr, v_dist_ptr);
+
+  vp9_highbd_apply_temporal_filter_chroma(
+      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+      strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count,
+      y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
new file mode 100644
index 0000000000..7571bfccac
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
@@ -0,0 +1,875 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Load 8 unsigned 8-bit pixels from each of a and b, square the per-pixel
+// difference, and write the 8 results to dst as unsigned 16-bit integers.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+                                uint16_t *dst) {
+  const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+
+  __m128i dist_first;
+
+  dist_first = _mm_sub_epi16(a_first, b_first);
+  dist_first = _mm_mullo_epi16(dist_first, dist_first);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+                                 uint16_t *dst) {  // 16-pixel store_dist_8
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);  // bytes 0..7 -> u16
+  const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero);  // bytes 8..15
+  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+  const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero);
+
+  __m128i dist_first, dist_second;
+
+  dist_first = _mm_sub_epi16(a_first, b_first);
+  dist_second = _mm_sub_epi16(a_second, b_second);
+  dist_first = _mm_mullo_epi16(dist_first, dist_first);
+  dist_second = _mm_mullo_epi16(dist_second, dist_second);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+  _mm_storeu_si128((__m128i *)(dst + 8), dist_second);  // dst[8..15]
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) {
+  *dist_reg = _mm_loadu_si128((const __m128i *)dist);  // 8 x u16, unaligned
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first,
+                                __m128i *reg_second) {
+  read_dist_8(dist, reg_first);  // entries 0..7
+  read_dist_8(dist + 8, reg_second);  // entries 8..15
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, subtract from 16, then
+// multiply by weight.
+static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants,
+                                const int strength, const int rounding,
+                                const __m128i *weight) {
+  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+  const __m128i weight_u16 = *weight;
+  const __m128i sixteen = _mm_set1_epi16(16);
+
+  // modifier * 3 / index; keeps the high 16 bits of sum * mul_constants
+  sum = _mm_mulhi_epu16(sum, *mul_constants);
+
+  sum = _mm_adds_epu16(sum, rounding_u16);
+  sum = _mm_srl_epi16(sum, strength_u128);
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
+  // So this needs to use the epu16 version which did not come until SSE4.
+  sum = _mm_min_epu16(sum, sixteen);
+
+  sum = _mm_sub_epi16(sixteen, sum);  // 16 - min(sum, 16), in the range 0..16
+
+  return _mm_mullo_epi16(sum, weight_u16);
+}
+
+// For 8 pixels: count += sum_u16 (saturating); accumulator += sum_u16 * pred.
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
+                                   uint16_t *count, uint32_t *accumulator) {
+  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
+  __m128i pred_0_u32, pred_1_u32;
+  __m128i accum_0_u32, accum_1_u32;
+
+  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+  _mm_storeu_si128((__m128i *)count, count_u16);
+
+  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);  // low 16 bits of sum * pred
+
+  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);  // products 0..3 -> u32
+  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);  // products 4..7 -> u32
+
+  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const __m128i sum_0_u16,
+                                           const __m128i sum_1_u16,
+                                           const uint8_t *pred, uint16_t *count,
+                                           uint32_t *accumulator) {
+  const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);  // 16 pixels
+  const __m128i zero = _mm_setzero_si128();
+  __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
+          count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
+  __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
+          pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);  // pixels 8..15
+  __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
+  __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+  count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
+  _mm_storeu_si128((__m128i *)count, count_0_u16);  // count[0..7]
+
+  count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
+  _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);  // count[8..15]
+
+  pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);  // sum * pred (16-bit)
+  pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
+
+  pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
+  pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
+  pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
+  pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
+
+  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+  accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
+  accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
+
+  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+  accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
+  accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
+
+  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
+}
+
+// For each of 8 entries i starting at y_dist, compute the saturating 16-bit
+// sum y_dist[i-1] + y_dist[i] + y_dist[i+1] and store the 8 results in sum.
+static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+
+  dist_reg = _mm_loadu_si128((const __m128i *)y_dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1));  // from [-1]
+  dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1));  // to [8]
+
+  *sum = _mm_adds_epu16(dist_reg, dist_left);
+  *sum = _mm_adds_epu16(*sum, dist_right);
+}
+
+// 16-entry version of get_sum_8. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. The sums for entries 0..7 go to sum_first and
+// the sums for entries 8..15 go to sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
+                              __m128i *sum_second) {
+  get_sum_8(y_dist, sum_first);
+  get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values that corresponds to a row of 16 luma values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+                                           const uint16_t *v_dist,
+                                           __m128i *u_first, __m128i *u_second,
+                                           __m128i *v_first,
+                                           __m128i *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 16 entries from chroma.
+    read_dist_16(u_dist, u_first, u_second);
+    read_dist_16(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, load 8 entries and duplicate each one for its 2 luma columns
+    __m128i u_reg, v_reg;
+
+    read_dist_8(u_dist, &u_reg);
+
+    *u_first = _mm_unpacklo_epi16(u_reg, u_reg);
+    *u_second = _mm_unpackhi_epi16(u_reg, u_reg);
+
+    read_dist_8(v_dist, &v_reg);
+
+    *v_first = _mm_unpacklo_epi16(v_reg, v_reg);
+    *v_second = _mm_unpackhi_epi16(v_reg, v_reg);
+  }
+}
+
+// Add each adjacent pair of unsigned 16-bit ints in src and store the four
+// pair sums as 32-bit ints in dst. src and dst may point to the same register.
+static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i shift_right = _mm_srli_si128(*src, 2);  // odd lanes -> even
+
+  const __m128i odd = _mm_blend_epi16(shift_right, zero, 170);  // 170 == 0xAA
+  const __m128i even = _mm_blend_epi16(*src, zero, 170);  // zero the odd lanes
+
+  *dst = _mm_add_epi32(even, odd);
+}
+
+// Add the luma distortion that covers 8 chroma pixels to u_mod and v_mod.
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
+                                                 int ss_x, int ss_y,
+                                                 __m128i *u_mod,
+                                                 __m128i *v_mod) {
+  __m128i y_reg;
+  if (!ss_x) {
+    read_dist_8(y_dist, &y_reg);
+    if (ss_y == 1) {
+      __m128i y_tmp;
+      read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
+
+      y_reg = _mm_adds_epu16(y_reg, y_tmp);  // two luma rows per chroma row
+    }
+  } else {
+    __m128i y_first, y_second;
+    read_dist_16(y_dist, &y_first, &y_second);
+    if (ss_y == 1) {
+      __m128i y_tmp_0, y_tmp_1;
+      read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
+
+      y_first = _mm_adds_epu16(y_first, y_tmp_0);
+      y_second = _mm_adds_epu16(y_second, y_tmp_1);
+    }
+
+    hadd_epu16(&y_first, &y_first);  // sum each horizontal luma pair
+    hadd_epu16(&y_second, &y_second);
+
+    y_reg = _mm_packus_epi32(y_first, y_second);  // saturate to 8 x u16
+  }
+
+  *u_mod = _mm_adds_epu16(*u_mod, y_reg);
+  *v_mod = _mm_adds_epu16(*v_mod, y_reg);
+}
+
+// Apply temporal filter to the luma components. This performs temporal
+// filtering on a luma block of 16 X block_height. If blk_fw is not NULL, use
+// it as an array of size 4 holding the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom_weight for bottom half.
+static void vp9_apply_temporal_filter_luma_16(
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors_first,
+    const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
+    const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  __m128i weight_first, weight_second;
+
+  __m128i mul_first, mul_second;
+
+  __m128i sum_row_1_first, sum_row_1_second;
+  __m128i sum_row_2_first, sum_row_2_second;
+  __m128i sum_row_3_first, sum_row_3_second;
+
+  __m128i u_first, u_second;
+  __m128i v_first, v_second;
+
+  __m128i sum_row_first;
+  __m128i sum_row_second;
+
+  // Loop variable: index of the row being filtered
+  unsigned int h;
+
+  assert(strength >= 0);
+  assert(strength <= 6);
+
+  assert(block_width == 16);
+  (void)block_width;
+
+  // Initialize the weights for the first and second 8 columns (top half)
+  if (blk_fw) {
+    weight_first = _mm_set1_epi16(blk_fw[0]);
+    weight_second = _mm_set1_epi16(blk_fw[1]);
+  } else {
+    weight_first = _mm_set1_epi16(top_weight);
+    weight_second = weight_first;
+  }
+
+  // First row
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+  // Add luma values: only rows 0 and 1, as there is no row above
+  get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
+  get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first);
+  sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+                          &v_second);
+
+  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+
+  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+  // Get modifier and store result
+  sum_row_first =
+      average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+  sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                             &weight_second);
+  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                          y_accum);
+
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_dist += DIST_STRIDE;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Switch to the bottom-half weights at the vertical midpoint
+    if (!use_whole_blk && h == block_height / 2) {
+      if (blk_fw) {
+        weight_first = _mm_set1_epi16(blk_fw[2]);
+        weight_second = _mm_set1_epi16(blk_fw[3]);
+      } else {
+        weight_first = _mm_set1_epi16(bottom_weight);
+        weight_second = weight_first;
+      }
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+    sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+    get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first);
+    sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Only calculate the new chroma distortion if we are at a pixel that
+      // corresponds to a new chroma row
+      read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
+                              &v_first, &v_second);
+
+      u_dist += DIST_STRIDE;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+    sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+    sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+    sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+    // Get modifier and store result
+    sum_row_first =
+        average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+    sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                               &weight_second);
+    accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                            y_accum);
+
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row: reuses the first-row constants, no row below to add
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+  sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Only calculate the new chroma distortion if we are at a pixel that
+    // corresponds to a new chroma row
+    read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+                            &v_second);
+  }
+
+  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+  // Get modifier and store result
+  sum_row_first =
+      average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+  sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                             &weight_second);
+  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                          y_accum);
+}
+
+// Perform temporal filter for the luma component, 16 columns per call.
+static void vp9_apply_temporal_filter_luma(
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors_first;
+  const int16_t *const *neighbors_second;
+
+  if (block_width == 16) {
+    // Special Case: The blockwidth is 16 and we are operating on a row of 16
+    // luma pixels. In this case, we can't use the usual left-middle-right
+    // pattern. We also don't support splitting now.
+    neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+    neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+    if (use_whole_blk) {
+      vp9_apply_temporal_filter_luma_16(
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, top_weight, bottom_weight, NULL);
+    } else {
+      vp9_apply_temporal_filter_luma_16(
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left: first 16 columns, left-column constants for the first 8
+  neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  vp9_apply_temporal_filter_luma_16(
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First: interior 16-column strips that start before mid_width
+  neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_apply_temporal_filter_luma_16(
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second: interior 16-column strips from mid_width up to last_width
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_apply_temporal_filter_luma_16(
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
+  }
+
+  // Right: last 16 columns, right-column constants for the last 8
+  neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+  vp9_apply_temporal_filter_luma_16(
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_block_height, for the U and V planes
+// together. If blk_fw is not NULL, use it as an array of size 4 of subblock
+// weights, else use top_weight for top half and bottom_weight for bottom half.
+static void vp9_apply_temporal_filter_chroma_8(
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors, int top_weight, int bottom_weight,
+    const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+
+  __m128i weight;
+
+  __m128i mul;
+
+  __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3;
+  __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3;
+
+  __m128i u_sum_row, v_sum_row;
+
+  // Loop variable: index of the chroma row being filtered
+  unsigned int h;
+
+  // Initialize weight; with blk_fw, lanes 0-3 and 4-7 get separate weights
+  if (blk_fw) {
+    weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0],
+                            blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]);
+  } else {
+    weight = _mm_set1_epi16(top_weight);
+  }
+
+  // First row
+  mul = _mm_load_si128((const __m128i *)neighbors[0]);
+
+  // Add chroma values: only rows 0 and 1, as there is no row above
+  get_sum_8(u_dist, &u_sum_row_2);
+  get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+
+  u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3);
+
+  get_sum_8(v_dist, &v_sum_row_2);
+  get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+
+  v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3);
+
+  // Add luma values
+  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+  // Get modifier and store result
+  u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+  v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_dist += DIST_STRIDE * (1 + ss_y);  // 2 luma rows when ss_y is set
+
+  // Then all the rows except the last one
+  mul = _mm_load_si128((const __m128i *)neighbors[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Switch to the bottom-half weights at the vertical midpoint
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        weight = _mm_setr_epi16(blk_fw[2], blk_fw[2], blk_fw[2], blk_fw[2],
+                                blk_fw[3], blk_fw[3], blk_fw[3], blk_fw[3]);
+      } else {
+        weight = _mm_set1_epi16(bottom_weight);
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1 = u_sum_row_2;
+    u_sum_row_2 = u_sum_row_3;
+
+    v_sum_row_1 = v_sum_row_2;
+    v_sum_row_2 = v_sum_row_3;
+
+    // Add chroma values
+    u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+    get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+    u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3);
+
+    v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+    get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+    v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3);
+
+    // Add luma values
+    add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+    // Get modifier and store result
+    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+    accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+    accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_dist += DIST_STRIDE * (1 + ss_y);  // 2 luma rows when ss_y is set
+  }
+
+  // The last row: reuses the first-row constants, no row below to add
+  mul = _mm_load_si128((const __m128i *)neighbors[0]);
+
+  // Shift the rows up
+  u_sum_row_1 = u_sum_row_2;
+  u_sum_row_2 = u_sum_row_3;
+
+  v_sum_row_1 = v_sum_row_2;
+  v_sum_row_2 = v_sum_row_3;
+
+  // Add chroma values
+  u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+  v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+
+  // Add luma values
+  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+  // Get modifier and store result
+  u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+  v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+}
+
+// Apply the temporal filter to the U and V planes in 8-pixel-wide columns.
+static void vp9_apply_temporal_filter_chroma(
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+  // blk_col offsets y_dist; it advances 8 << ss_x per 8 chroma columns.
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];  // left half
+  const int16_t *const *neighbors;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
+    } else {
+      neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      vp9_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+          bottom_weight, NULL);
+    } else {  // the 8 columns span both halves: pass blk_fw itself
+      vp9_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left: the first 8 chroma columns.
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+  }
+
+  vp9_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+      bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First: columns left of uv_mid_width keep the left-half weights.
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
+  }
+  // From the midpoint on, sub-block mode uses blk_fw[1] / blk_fw[3].
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second: from the midpoint up to the last 8 columns.
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
+  }
+
+  // Right: the final 8 chroma columns.
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  vp9_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+      bottom_weight, NULL);
+}
+
+void vp9_apply_temporal_filter_sse4_1(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+  // Entry point: fill the three dist planes, then filter luma and chroma.
+  DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+  const int *blk_fw_ptr = blk_fw;
+  // Rows are DIST_STRIDE apart and data starts at column 1 of each row.
+  uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop variables
+  unsigned int row, blk_col;
+  // Debug-build contract checks (weights 0 and 2 themselves are accepted).
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the difference squared
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+      store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                    y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+  // Same for the U and V planes, 8 pixels per call.
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                   u_dist_ptr + blk_col);
+      store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                   v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+  // Rewind to the first data column of row 0 for the filter passes.
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  vp9_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height,
+                                 ss_x, ss_y, strength, blk_fw_ptr,
+                                 use_whole_blk, y_accum, y_count, y_dist_ptr,
+                                 u_dist_ptr, v_dist_ptr);
+
+  vp9_apply_temporal_filter_chroma(
+      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+      strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
+      y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
new file mode 100644
index 0000000000..e9943447fd
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -0,0 +1,1537 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride) {
+  const __m128i dc_probe = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i dc_bump = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  __m128i is_equal;
+  int r;
+
+  // Read 4 coefficients per row into the low 64 bits, then scale by 16.
+  for (r = 0; r < 4; ++r) {
+    in[r] = _mm_loadl_epi64((const __m128i *)(input + r * stride));
+  }
+  for (r = 0; r < 4; ++r) {
+    in[r] = _mm_slli_epi16(in[r], 4);
+  }
+
+  // Lane 0 of row 0 ends up as x + 1 when x != 0 and stays 0 otherwise.
+  is_equal = _mm_cmpeq_epi16(in[0], dc_probe);
+  in[0] = _mm_add_epi16(in[0], is_equal);
+  in[0] = _mm_add_epi16(in[0], dc_bump);
+}
+
+static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
+  const __m128i rounding = _mm_set1_epi16(1);
+  // Join rows 0|1 and rows 2|3 so each register carries two output rows.
+  __m128i top = _mm_unpacklo_epi64(res[0], res[1]);
+  __m128i bottom = _mm_unpacklo_epi64(res[2], res[3]);
+  // Final scaling: (x + 1) >> 2 on every 16-bit lane.
+  top = _mm_srai_epi16(_mm_add_epi16(top, rounding), 2);
+  bottom = _mm_srai_epi16(_mm_add_epi16(bottom, rounding), 2);
+  store_output(&top, output);
+  store_output(&bottom, output + 8);
+}
+
+static INLINE void transpose_4x4(__m128i *res) {  // in-place 4x4 transpose
+  // Input layout (res[0] over res[1]); res[2] and res[3] are not read:
+  // 00 01 02 03 20 21 22 23
+  // 10 11 12 13 30 31 32 33
+  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+  // tr0_0 over tr0_1 after the 16-bit interleave:
+  // 00 10 01 11 02 12 03 13
+  // 20 30 21 31 22 32 23 33
+  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  // res[0] over res[2] after the 32-bit interleave:
+  // 00 10 20 30 01 11 21 31
+  // 02 12 22 32 03 13 23 33
+  // Each output row lives in the first 4 16-bit lanes of res[0..3].
+  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
+  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+static void fdct4_sse2(__m128i *in) {  // 4-point DCT pass + transpose
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Interleave rows 0/1 and rows 3/2 (low 4 lanes) for the paired butterflies.
+  __m128i u[4], v[4];
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[3], in[2]);
+  // v[0]: sums (r0 + r3, r1 + r2); v[1]: differences (r0 - r3, r1 - r2).
+  v[0] = _mm_add_epi16(u[0], u[1]);
+  v[1] = _mm_sub_epi16(u[0], u[1]);
+  // The trailing number on each line is the output row it feeds (see packs).
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
+  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
+  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
+  // Add DCT_CONST_ROUNDING, then arithmetic shift right by DCT_CONST_BITS.
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  // Saturating pack into rows 0|2 and 1|3, the layout transpose_4x4 reads.
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[1] = _mm_packs_epi32(u[2], u[3]);
+  transpose_4x4(in);
+}
+
+static void fadst4_sse2(__m128i *in) {  // 4-point ADST pass + transpose
+  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+  const __m128i kZero = _mm_setzero_si128();
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8];
+  __m128i in7 = _mm_add_epi16(in[0], in[1]);
+  // Zero-interleaved inputs (u[2..4]) make the madd a plain 32-bit multiply.
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+  u[2] = _mm_unpacklo_epi16(in7, kZero);
+  u[3] = _mm_unpacklo_epi16(in[2], kZero);
+  u[4] = _mm_unpacklo_epi16(in[3], kZero);
+
+  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
+  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
+  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
+  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
+  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
+  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
+  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);  // in[3] * sinpi_3_9
+
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_sub_epi32(v[2], v[6]);
+  u[2] = _mm_add_epi32(v[3], v[4]);
+  u[3] = _mm_sub_epi32(u[2], u[0]);
+  u[4] = _mm_slli_epi32(v[5], 2);  // 4 * v[5]
+  u[5] = _mm_sub_epi32(u[4], v[5]);  // 3 * v[5]
+  u[6] = _mm_add_epi32(u[3], u[5]);
+  // The four results are u[0], u[1], u[2] and u[6]; round and shift each.
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  // Saturating pack into rows 0|2 and 1|3, the layout transpose_4x4 reads.
+  in[0] = _mm_packs_epi32(u[0], u[2]);
+  in[1] = _mm_packs_epi32(u[1], u[3]);
+  transpose_4x4(in);
+}
+
+void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
+                     int tx_type) {
+  __m128i rows[4];
+
+  // Pure DCT is delegated to the dedicated vpx_dsp kernel.
+  if (tx_type == DCT_DCT) {
+    vpx_fdct4x4_sse2(input, output, stride);
+    return;
+  }
+
+  // Any type other than the two mixed ones must be ADST_ADST.
+  if (tx_type != ADST_DCT && tx_type != DCT_ADST) assert(tx_type == ADST_ADST);
+  load_buffer_4x4(input, rows, stride);
+
+  // Two 1-D passes; the DCT is used by the mixed types only, once each.
+  if (tx_type == DCT_ADST) {
+    fdct4_sse2(rows);
+  } else {
+    fadst4_sse2(rows);
+  }
+  if (tx_type == ADST_DCT) {
+    fdct4_sse2(rows);
+  } else {
+    fadst4_sse2(rows);
+  }
+  write_buffer_4x4(output, rows);
+}
+
+// Load an 8x8 block of 16-bit values and pre-scale it by 4.
+//
+//   input:  top-left sample of the block
+//   in:     receives 8 registers, one row of 8 lanes each
+//   stride: distance between consecutive rows, in int16_t units
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+                                   int stride) {
+  int r;
+
+  // Pass 1: fetch the rows. _mm_load_si128 is the aligned load, so every
+  // row address (input + r * stride) must be 16-byte aligned.
+  for (r = 0; r < 8; ++r) {
+    in[r] = _mm_load_si128((const __m128i *)(input + r * stride));
+  }
+
+  // Pass 2: left-shift every 16-bit lane by 2, i.e. multiply by 4 with
+  // 16-bit wraparound.
+  for (r = 0; r < 8; ++r) {
+    in[r] = _mm_slli_epi16(in[r], 2);
+  }
+}
+
+// In-place rounding right shift of all 8 rows; see the per-step notes below.
+static INLINE void right_shift_8x8(__m128i *res, const int bit) {
+  __m128i sign0 = _mm_srai_epi16(res[0], 15);
+  __m128i sign1 = _mm_srai_epi16(res[1], 15);
+  __m128i sign2 = _mm_srai_epi16(res[2], 15);
+  __m128i sign3 = _mm_srai_epi16(res[3], 15);
+  __m128i sign4 = _mm_srai_epi16(res[4], 15);
+  __m128i sign5 = _mm_srai_epi16(res[5], 15);
+  __m128i sign6 = _mm_srai_epi16(res[6], 15);
+  __m128i sign7 = _mm_srai_epi16(res[7], 15);
+  // signN lanes: 0 where the input lane is non-negative, -1 where negative.
+  if (bit == 2) {  // extra +1 rounding for the 2-bit shift only
+    const __m128i const_rounding = _mm_set1_epi16(1);
+    res[0] = _mm_add_epi16(res[0], const_rounding);
+    res[1] = _mm_add_epi16(res[1], const_rounding);
+    res[2] = _mm_add_epi16(res[2], const_rounding);
+    res[3] = _mm_add_epi16(res[3], const_rounding);
+    res[4] = _mm_add_epi16(res[4], const_rounding);
+    res[5] = _mm_add_epi16(res[5], const_rounding);
+    res[6] = _mm_add_epi16(res[6], const_rounding);
+    res[7] = _mm_add_epi16(res[7], const_rounding);
+  }
+  // Subtracting the sign mask adds 1 to lanes that were negative on entry.
+  res[0] = _mm_sub_epi16(res[0], sign0);
+  res[1] = _mm_sub_epi16(res[1], sign1);
+  res[2] = _mm_sub_epi16(res[2], sign2);
+  res[3] = _mm_sub_epi16(res[3], sign3);
+  res[4] = _mm_sub_epi16(res[4], sign4);
+  res[5] = _mm_sub_epi16(res[5], sign5);
+  res[6] = _mm_sub_epi16(res[6], sign6);
+  res[7] = _mm_sub_epi16(res[7], sign7);
+  // bit == 1 shifts by 1; every other value of bit shifts by 2.
+  if (bit == 1) {
+    res[0] = _mm_srai_epi16(res[0], 1);
+    res[1] = _mm_srai_epi16(res[1], 1);
+    res[2] = _mm_srai_epi16(res[2], 1);
+    res[3] = _mm_srai_epi16(res[3], 1);
+    res[4] = _mm_srai_epi16(res[4], 1);
+    res[5] = _mm_srai_epi16(res[5], 1);
+    res[6] = _mm_srai_epi16(res[6], 1);
+    res[7] = _mm_srai_epi16(res[7], 1);
+  } else {
+    res[0] = _mm_srai_epi16(res[0], 2);
+    res[1] = _mm_srai_epi16(res[1], 2);
+    res[2] = _mm_srai_epi16(res[2], 2);
+    res[3] = _mm_srai_epi16(res[3], 2);
+    res[4] = _mm_srai_epi16(res[4], 2);
+    res[5] = _mm_srai_epi16(res[5], 2);
+    res[6] = _mm_srai_epi16(res[6], 2);
+    res[7] = _mm_srai_epi16(res[7], 2);
+  }
+}
+
+// Store an 8x8 block through store_output(), one row per call.
+//   output: destination; row r is written at output + r * stride
+//   res:    the 8 row registers to store
+//   stride: row pitch of output, in tran_low_t units
+static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
+                                    int stride) {
+  int r;
+
+  for (r = 0; r < 8; ++r) {
+    store_output(res + r, output + r * stride);
+  }
+}
+
+static void fdct8_sse2(__m128i *in) {  // 8-point DCT pass + transpose
+  // constants
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // stage 1: s0..s3 are mirrored row sums, s4..s7 mirrored row differences
+  s0 = _mm_add_epi16(in[0], in[7]);
+  s1 = _mm_add_epi16(in[1], in[6]);
+  s2 = _mm_add_epi16(in[2], in[5]);
+  s3 = _mm_add_epi16(in[3], in[4]);
+  s4 = _mm_sub_epi16(in[3], in[4]);
+  s5 = _mm_sub_epi16(in[2], in[5]);
+  s6 = _mm_sub_epi16(in[1], in[6]);
+  s7 = _mm_sub_epi16(in[0], in[7]);
+  // Second butterfly on the sums; it feeds in[0], in[2], in[4] and in[6].
+  u0 = _mm_add_epi16(s0, s3);
+  u1 = _mm_add_epi16(s1, s2);
+  u2 = _mm_sub_epi16(s1, s2);
+  u3 = _mm_sub_epi16(s0, s3);
+  // interleave and perform butterfly multiplication/addition
+  v0 = _mm_unpacklo_epi16(u0, u1);
+  v1 = _mm_unpackhi_epi16(u0, u1);
+  v2 = _mm_unpacklo_epi16(u2, u3);
+  v3 = _mm_unpackhi_epi16(u2, u3);
+
+  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+  // shift and rounding
+  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  // Even-indexed rows, packed back to 16 bits with signed saturation.
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[4] = _mm_packs_epi32(u2, u3);
+  in[6] = _mm_packs_epi32(u6, u7);
+
+  // stage 2: cospi_16_64-scaled difference and sum of s6 and s5
+  // interleave and perform butterfly multiplication/addition
+  u0 = _mm_unpacklo_epi16(s6, s5);
+  u1 = _mm_unpackhi_epi16(s6, s5);
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+  // shift and rounding
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+  u0 = _mm_packs_epi32(v0, v1);
+  u1 = _mm_packs_epi32(v2, v3);
+
+  // stage 3
+  s0 = _mm_add_epi16(s4, u0);
+  s1 = _mm_sub_epi16(s4, u0);
+  s2 = _mm_sub_epi16(s7, u1);
+  s3 = _mm_add_epi16(s7, u1);
+
+  // stage 4
+  u0 = _mm_unpacklo_epi16(s0, s3);
+  u1 = _mm_unpackhi_epi16(s0, s3);
+  u2 = _mm_unpacklo_epi16(s1, s2);
+  u3 = _mm_unpackhi_epi16(s1, s2);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+  // shift and rounding
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+  // Odd-indexed rows, packed back to 16 bits with signed saturation.
+  in[1] = _mm_packs_epi32(v0, v1);
+  in[3] = _mm_packs_epi32(v4, v5);
+  in[5] = _mm_packs_epi32(v2, v3);
+  in[7] = _mm_packs_epi32(v6, v7);
+
+  // transpose
+  transpose_16bit_8x8(in, in);
+}
+
+static void fadst8_sse2(__m128i *in) {  // 8-point ADST pass + transpose
+  // Constants
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__const_0 = _mm_setzero_si128();
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // reorder the rows into the pairs consumed by the stage-1 butterflies
+  in0 = in[7];
+  in1 = in[0];
+  in2 = in[5];
+  in3 = in[2];
+  in4 = in[3];
+  in5 = in[4];
+  in6 = in[1];
+  in7 = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // sums (w0..w7) and differences (w8..w15) of the 32-bit products
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // shift and rounding
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit and pack 8 integers into __m128i
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+  // Final ordering: the odd-indexed rows are stored negated (0 - s).
+  // FIXME(jingning): do subtract using bit inversion?
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+
+  // transpose
+  transpose_16bit_8x8(in, in);
+}
+
+// 8x8 transform dispatch (SSE2): tx_type selects which 1-D kernels run on
+// the block. DCT_DCT is handed to vpx_fdct8x8_sse2() unchanged.
+void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+                     int tx_type) {
+  // Flags for which of the two passes is a DCT; a pass is an ADST otherwise.
+  const int first_is_dct = (tx_type == DCT_ADST);
+  const int second_is_dct = (tx_type == ADST_DCT);
+  __m128i rows[8];
+
+  if (tx_type == DCT_DCT) {
+    vpx_fdct8x8_sse2(input, output, stride);
+    return;
+  }
+  // Whatever is not one of the mixed types is expected to be ADST_ADST.
+  if (!first_is_dct && !second_is_dct) assert(tx_type == ADST_ADST);
+
+  load_buffer_8x8(input, rows, stride);
+  if (first_is_dct) {
+    fdct8_sse2(rows);
+  } else {
+    fadst8_sse2(rows);
+  }
+  if (second_is_dct) {
+    fdct8_sse2(rows);
+  } else {
+    fadst8_sse2(rows);
+  }
+  right_shift_8x8(rows, 1);  // identical tail for all three non-DCT_DCT paths
+  write_buffer_8x8(output, rows, 8);
+}
+
+// Loads a 16x16 block through four load_buffer_8x8() calls: the left 8
+// columns go to in0 / in0 + 8 and the right 8 columns to in1 / in1 + 8.
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
+                                     __m128i *in1, int stride) {
+  const int16_t *const top = input;
+  const int16_t *const bottom = input + 8 * stride;  // 8 rows further down
+  load_buffer_8x8(top, in0, stride);
+  load_buffer_8x8(bottom, in0 + 8, stride);
+  load_buffer_8x8(top + 8, in1, stride);  // + 8: next 8 columns
+  load_buffer_8x8(bottom + 8, in1 + 8, stride);
+}
+
+// Stores a 16x16 block via four write_buffer_8x8() calls: in0 / in0 + 8 fill
+// the left 8 columns, in1 / in1 + 8 the right 8 columns.
+static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
+                                      __m128i *in1, int stride) {
+  tran_low_t *const lower = output + 8 * stride;  // 8 rows below `output`
+  write_buffer_8x8(output, in0, stride);
+  write_buffer_8x8(lower, in0 + 8, stride);
+  write_buffer_8x8(output + 8, in1, stride);  // + 8: right-hand columns
+  write_buffer_8x8(lower + 8, in1 + 8, stride);
+}
+
+// Runs right_shift_8x8(..., 2) over each 8-vector half of res0, then of res1.
+static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
+  int half;
+  // Halves start at element offsets 0 and 8 within each buffer.
+  for (half = 0; half < 16; half += 8) right_shift_8x8(res0 + half, 2);
+  for (half = 0; half < 16; half += 8) right_shift_8x8(res1 + half, 2);
+}
+
+static void fdct16_8col(__m128i *in) {
+  // 1-D 16-point DCT of 8 columns, done in place on in[0] .. in[15].
+  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  // stage 1: i[k] = in[k] + in[15 - k], s[k] = in[7 - k] - in[8 + k]
+  i[0] = _mm_add_epi16(in[0], in[15]);
+  i[1] = _mm_add_epi16(in[1], in[14]);
+  i[2] = _mm_add_epi16(in[2], in[13]);
+  i[3] = _mm_add_epi16(in[3], in[12]);
+  i[4] = _mm_add_epi16(in[4], in[11]);
+  i[5] = _mm_add_epi16(in[5], in[10]);
+  i[6] = _mm_add_epi16(in[6], in[9]);
+  i[7] = _mm_add_epi16(in[7], in[8]);
+
+  s[0] = _mm_sub_epi16(in[7], in[8]);
+  s[1] = _mm_sub_epi16(in[6], in[9]);
+  s[2] = _mm_sub_epi16(in[5], in[10]);
+  s[3] = _mm_sub_epi16(in[4], in[11]);
+  s[4] = _mm_sub_epi16(in[3], in[12]);
+  s[5] = _mm_sub_epi16(in[2], in[13]);
+  s[6] = _mm_sub_epi16(in[1], in[14]);
+  s[7] = _mm_sub_epi16(in[0], in[15]);
+  // p[k] = i[k] + i[7 - k] for k < 4, and i[7 - k] - i[k] for k >= 4.
+  p[0] = _mm_add_epi16(i[0], i[7]);
+  p[1] = _mm_add_epi16(i[1], i[6]);
+  p[2] = _mm_add_epi16(i[2], i[5]);
+  p[3] = _mm_add_epi16(i[3], i[4]);
+  p[4] = _mm_sub_epi16(i[3], i[4]);
+  p[5] = _mm_sub_epi16(i[2], i[5]);
+  p[6] = _mm_sub_epi16(i[1], i[6]);
+  p[7] = _mm_sub_epi16(i[0], i[7]);
+  // Sums/differences of p[0..3]; these lead to in[0], in[4], in[8], in[12].
+  u[0] = _mm_add_epi16(p[0], p[3]);
+  u[1] = _mm_add_epi16(p[1], p[2]);
+  u[2] = _mm_sub_epi16(p[1], p[2]);
+  u[3] = _mm_sub_epi16(p[0], p[3]);
+  // Interleave 16-bit lanes so _mm_madd_epi16 forms a * c0 + b * c1 (32-bit).
+  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+  v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+  // Round: add k__DCT_CONST_ROUNDING, then arithmetic shift by DCT_CONST_BITS.
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  // Pack the 32-bit results back to 16 bits (signed saturation) as outputs.
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[4] = _mm_packs_epi32(u[4], u[5]);
+  in[8] = _mm_packs_epi32(u[2], u[3]);
+  in[12] = _mm_packs_epi32(u[6], u[7]);
+  // p[5] and p[6] go through the +/-cospi_16_64 multiplies into u[0], u[1].
+  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[2], v[3]);
+  // t[0..3]: p[4] +/- u[0] and p[7] -/+ u[1].
+  t[0] = _mm_add_epi16(p[4], u[0]);
+  t[1] = _mm_sub_epi16(p[4], u[0]);
+  t[2] = _mm_sub_epi16(p[7], u[1]);
+  t[3] = _mm_add_epi16(p[7], u[1]);
+
+  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+  u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+  // The products below become outputs in[2], in[6], in[10] and in[14].
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  in[2] = _mm_packs_epi32(v[0], v[1]);
+  in[6] = _mm_packs_epi32(v[4], v[5]);
+  in[10] = _mm_packs_epi32(v[2], v[3]);
+  in[14] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 2: s[2..5] through +/-cospi_16_64 multiplies into t[2..5]
+  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[2] = _mm_packs_epi32(v[0], v[1]);
+  t[3] = _mm_packs_epi32(v[2], v[3]);
+  t[4] = _mm_packs_epi32(v[4], v[5]);
+  t[5] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 3: p[] is reused for sums/differences of s[0,1,6,7] with t[2..5]
+  p[0] = _mm_add_epi16(s[0], t[3]);
+  p[1] = _mm_add_epi16(s[1], t[2]);
+  p[2] = _mm_sub_epi16(s[1], t[2]);
+  p[3] = _mm_sub_epi16(s[0], t[3]);
+  p[4] = _mm_sub_epi16(s[7], t[4]);
+  p[5] = _mm_sub_epi16(s[6], t[5]);
+  p[6] = _mm_add_epi16(s[6], t[5]);
+  p[7] = _mm_add_epi16(s[7], t[4]);
+
+  // stage 4: p[1], p[2], p[5], p[6] times cospi_8_64 / cospi_24_64 pairs
+  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+  u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[1] = _mm_packs_epi32(v[0], v[1]);
+  t[2] = _mm_packs_epi32(v[2], v[3]);
+  t[5] = _mm_packs_epi32(v[4], v[5]);
+  t[6] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 5: s[] is reused for sums/differences of p[0,3,4,7] with t[1,2,5,6]
+  s[0] = _mm_add_epi16(p[0], t[1]);
+  s[1] = _mm_sub_epi16(p[0], t[1]);
+  s[2] = _mm_add_epi16(p[3], t[2]);
+  s[3] = _mm_sub_epi16(p[3], t[2]);
+  s[4] = _mm_sub_epi16(p[4], t[5]);
+  s[5] = _mm_add_epi16(p[4], t[5]);
+  s[6] = _mm_sub_epi16(p[7], t[6]);
+  s[7] = _mm_add_epi16(p[7], t[6]);
+
+  // stage 6: final multiplies of s[] pairs produce the odd outputs in[1..15]
+  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+  // Pack to 16 bits and store the odd-indexed outputs.
+  in[1] = _mm_packs_epi32(v[0], v[1]);
+  in[9] = _mm_packs_epi32(v[2], v[3]);
+  in[5] = _mm_packs_epi32(v[4], v[5]);
+  in[13] = _mm_packs_epi32(v[6], v[7]);
+  in[3] = _mm_packs_epi32(v[8], v[9]);
+  in[11] = _mm_packs_epi32(v[10], v[11]);
+  in[7] = _mm_packs_epi32(v[12], v[13]);
+  in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+static void fadst16_8col(__m128i *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  __m128i s[16], x[16], u[32], v[32];
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_setzero_si128();
+
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void fdct16_sse2(__m128i *in0, __m128i *in1) {
+  fdct16_8col(in0);  // run the 8-column helper on the first 16-vector half
+  fdct16_8col(in1);  // and on the second half (callers pass in0[16], in1[16])
+  transpose_16bit_16x16(in0, in1);  // then transpose across both halves
+}
+
+static void fadst16_sse2(__m128i *in0, __m128i *in1) {
+  fadst16_8col(in0);  // helper rewrites in0[0..15] in place
+  fadst16_8col(in1);  // same for in1[0..15]
+  transpose_16bit_16x16(in0, in1);  // then transpose across both halves
+}
+
+void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+                       int tx_type) {
+  __m128i in0[16], in1[16];
+
+  // Pure DCT is handed off to the dedicated 2-D routine.
+  if (tx_type == DCT_DCT) {
+    vpx_fdct16x16_sse2(input, output, stride);
+    return;
+  }
+
+  // Any value other than the two mixed types is handled as ADST_ADST.
+  if (tx_type != ADST_DCT && tx_type != DCT_ADST) {
+    assert(tx_type == ADST_ADST);
+  }
+  load_buffer_16x16(input, in0, in1, stride);
+  // First pass: DCT only for DCT_ADST, ADST otherwise.
+  if (tx_type == DCT_ADST) {
+    fdct16_sse2(in0, in1);
+  } else {
+    fadst16_sse2(in0, in1);
+  }
+  right_shift_16x16(in0, in1);
+  // Second pass: DCT only for ADST_DCT, ADST otherwise.
+  if (tx_type == ADST_DCT) {
+    fdct16_sse2(in0, in1);
+  } else {
+    fadst16_sse2(in0, in1);
+  }
+  write_buffer_16x16(output, in0, in1, 16);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm
new file mode 100644
index 0000000000..8152dce864
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -0,0 +1,69 @@
+;
+;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1 ; with m0..m3 = a, b, c, d on entry: m0 = a + b
+  movq            m4,        m0 ; m4 = low 64 bits of m0
+  psubw           m3,        m2 ; m3 = d - c
+  psubw           m4,        m3 ; m4 = (a + b) - (d - c)
+  psraw           m4,        1  ; m4 = e = previous m4 >> 1 (arithmetic shift)
+  movq            m5,        m4 ; m5 = e
+  psubw           m5,        m1 ;b1 = e - b
+  psubw           m4,        m2 ;c1 = e - c
+  psubw           m0,        m4 ; a0 = (a + b) - c1
+  paddw           m3,        m5 ; d1 = (d - c) + b1
+                                ; m0 a0 (register roles after the SWAPs below)
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+                                ; m0: 00 01 02 03  (inputs: four words per register)
+                                ; m1: 10 11 12 13
+                                ; m2: 20 21 22 23
+                                ; m3: 30 31 32 33
+  punpcklwd       m0,        m1 ; 00 10 01 11  02 12 03 13
+  punpcklwd       m2,        m3 ; 20 30 21 31  22 32 23 33
+  mova            m1,        m0 ; keep a copy; m0 is overwritten on the next line
+  punpckldq       m0,        m2 ; 00 10 20 30  01 11 21 31  (result columns 0, 1)
+  punpckhdq       m1,        m2 ; 02 12 22 32  03 13 23 33  (result columns 2, 3)
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q,       [inputq + strideq*4] ; r3 = row 2 (rows are strideq*2 bytes apart)
+  movq            m0,        [inputq] ;a1 = row 0, 8 bytes (four 16-bit words)
+  movq            m1,        [inputq + strideq*2] ;b1 = row 1
+  movq            m2,        [r3q] ;c1 = row 2
+  movq            m3,        [r3q + strideq*2] ;d1 = row 3
+  ; First pass over the four loaded rows, then transpose into m0/m1.
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  SWAP            1,         2     ; m2 now names the second transposed register
+  psrldq          m1,        m0, 8 ; m1 = upper 8 bytes of m0
+  psrldq          m3,        m2, 8 ; m3 = upper 8 bytes of m2
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  ; Scale all sixteen 16-bit results by 4 (left shift by 2).
+  psllw           m0,        2
+  psllw           m1,        2
+  ; NOTE(review): args look like (src reg, base, offset, tmp, tmp) - confirm in the included macro file.
+  STORE_TRAN_LOW 0, outputq, 0, 2, 3
+  STORE_TRAN_LOW 1, outputq, 8, 2, 3
+
+  RET
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
new file mode 100644
index 0000000000..5930bf491e
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -0,0 +1,327 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Returns the sum of the sixteen signed 8-bit lanes of acc_diff (the
+// accumulated per-pixel adjustments) as a single int.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+  // Pairing each byte with its sign mask sign-extends it to 16 bits.
+  const __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(), acc_diff);
+  const __m128i lo16 = _mm_unpacklo_epi8(acc_diff, sign);
+  const __m128i hi16 = _mm_unpackhi_epi8(acc_diff, sign);
+  // Eight 16-bit partial sums, then four 32-bit sums via madd with ones.
+  const __m128i sum16 = _mm_add_epi16(lo16, hi16);
+  const __m128i sum32 = _mm_madd_epi16(sum16, _mm_set1_epi16(1));
+  // Fold the upper 64 bits onto the lower, then lane 1 onto lane 0.
+  const __m128i sum64 = _mm_add_epi32(sum32, _mm_srli_si128(sum32, 8));
+  const __m128i total = _mm_add_epi32(sum64, _mm_srli_si128(sum64, 4));
+  return _mm_cvtsi128_si32(total);
+}
+
+// Denoise a 16x1 vector: running_avg_y = sig moved toward mc_running_avg_y.
+static INLINE __m128i vp9_denoiser_16x1_sse2(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
+    const __m128i *k_16, const __m128i *l3, const __m128i *l32,
+    const __m128i *l21, __m128i acc_diff) {
+  // Calculate differences (unsigned saturating, so pdiff or ndiff is 0).
+  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i v_mc_running_avg_y =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  __m128i v_running_avg_y;
+  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+  // Obtain the sign. FF where pdiff == *k_0 (mc <= sig when *k_0 is zero).
+  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
+  // Clamp absolute difference to *k_16 to be used to get mask. Doing this
+  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
+  const __m128i clamped_absdiff =
+      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
+  // Get masks for l2 l1 and l0 adjustments: lanes below *k_16, *k_8, *k_4.
+  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
+  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
+  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
+  // Get adjustments for l2, l1, and l0 (l0 uses the difference itself).
+  __m128i adj2 = _mm_and_si128(mask2, *l32);
+  const __m128i adj1 = _mm_and_si128(mask1, *l21);
+  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+  __m128i adj, padj, nadj;
+  // Combine the adjustments and get absolute adjustments: start from *l3,
+  // subtract *l32 / *l21 in their masked lanes, then let mask0 lanes use adj0.
+  adj2 = _mm_add_epi8(adj2, adj1);
+  adj = _mm_sub_epi8(*l3, adj2);
+  adj = _mm_andnot_si128(mask0, adj);
+  adj = _mm_or_si128(adj, adj0);
+
+  // Restore the sign and get positive and negative adjustments.
+  padj = _mm_andnot_si128(diff_sign, adj);
+  nadj = _mm_and_si128(diff_sign, adj);
+
+  // Calculate filtered value: sig + padj - nadj with unsigned saturation.
+  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+  // Accumulate the signed adjustment per lane with saturating 8-bit math.
+  // NOTE(review): old text said "<=7", but callers may pass l3 = 7 + shift_inc.
+  acc_diff = _mm_adds_epi8(acc_diff, padj);
+  acc_diff = _mm_subs_epi8(acc_diff, nadj);
+  return acc_diff;
+}
+
+// Weaker filter for one 16x1 vector: shifts the value already stored in
+// running_avg_y by at most k_delta per pixel, lowering it where mc is above
+// sig and raising it elsewhere, and applies the same change to acc_diff.
+static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
+  const __m128i src = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i mc = _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  __m128i avg = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+
+  // Unsigned saturating subtraction both ways: one of the two is always 0.
+  const __m128i mc_above = _mm_subs_epu8(mc, src);
+  const __m128i mc_below = _mm_subs_epu8(src, mc);
+  // FF in lanes where mc_above equals k_0 (callers here pass all zeros).
+  const __m128i not_above = _mm_cmpeq_epi8(mc_above, k_0);
+  // Absolute difference limited to k_delta, split by direction.
+  const __m128i step =
+      _mm_min_epu8(_mm_max_epu8(mc_above, mc_below), k_delta);
+  const __m128i step_down = _mm_andnot_si128(not_above, step);
+  const __m128i step_up = _mm_and_si128(not_above, step);
+
+  // Apply to the stored average with unsigned saturation, then write back.
+  avg = _mm_adds_epu8(_mm_subs_epu8(avg, step_down), step_up);
+  _mm_storeu_si128((__m128i *)running_avg_y, avg);
+
+  // Track the same change in the signed, saturating per-lane accumulator.
+  acc_diff = _mm_adds_epi8(_mm_subs_epi8(acc_diff, step_down), step_up);
+  return acc_diff;
+}
+
+// Denoise 8x8 and 8x16 blocks. Each loop iteration packs two width-byte rows
+// into one 16-byte buffer row so the 16-lane row filters can be used; the
+// call site in this file passes width == 8. Returns FILTER_BLOCK when the
+// summed adjustment ends up within the threshold, otherwise COPY_BLOCK.
+static int vp9_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride,
+                                       const uint8_t *mc_running_avg_y,
+                                       int mc_avg_y_stride,
+                                       uint8_t *running_avg_y, int avg_y_stride,
+                                       int increase_denoising, BLOCK_SIZE bs,
+                                       int motion_magnitude, int width) {
+  int sum_diff_thresh, r, sum_diff = 0;
+  const int shift_inc =
+      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+          ? 1
+          : 0;
+  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+  __m128i acc_diff = _mm_setzero_si128();
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  // Halved because every loop iteration below consumes two rows.
+  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
+
+  for (r = 0; r < b_height; ++r) {
+    memcpy(sig_buffer[r], sig, width);
+    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+           width);
+    // running_buffer[r] needs no pre-fill: vp9_denoiser_16x1_sse2() never
+    // reads it and its 16-byte store overwrites the whole buffer row.
+    acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r],
+                                      running_buffer[r], &k_0, &k_4, &k_8,
+                                      &k_16, &l3, &l32, &l21, acc_diff);
+    memcpy(running_avg_y, running_buffer[r], width);
+    memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+    // Update pointers for next iteration.
+    sig += (sig_stride << 1);
+    mc_running_avg_y += (mc_avg_y_stride << 1);
+    running_avg_y += (avg_y_stride << 1);
+  }
+
+  sum_diff = sum_diff_16x1(acc_diff);
+  sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+  if (abs(sum_diff) > sum_diff_thresh) {
+    // Before returning to copy the block (i.e., apply no denoising),
+    // check if we can still apply some (weaker) temporal filtering to
+    // this block, that would otherwise not be denoised at all. Simplest
+    // is to apply an additional adjustment to running_avg_y to bring it
+    // closer to sig. The adjustment is capped by a maximum delta, and
+    // chosen such that in most cases the resulting sum_diff will be
+    // within the acceptable range given by sum_diff_thresh.
+    // The delta is set by the excess of absolute pixel diff over the
+    // threshold.
+    const int delta =
+        ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+    // Only apply the adjustment for max delta up to 3.
+    if (delta < 4) {
+      const __m128i k_delta = _mm_set1_epi8(delta);
+      running_avg_y -= avg_y_stride * (b_height << 1);
+      for (r = 0; r < b_height; ++r) {
+        acc_diff = vp9_denoiser_adj_16x1_sse2(
+            sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0,
+            k_delta, acc_diff);
+        memcpy(running_avg_y, running_buffer[r], width);
+        memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+        // Update pointers for next iteration.
+        running_avg_y += (avg_y_stride << 1);
+      }
+      sum_diff = sum_diff_16x1(acc_diff);
+      if (abs(sum_diff) > sum_diff_thresh) {
+        return COPY_BLOCK;
+      }
+    } else {
+      return COPY_BLOCK;
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+// Denoise 16x8, 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
+static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+                                     const uint8_t *mc_running_avg_y,
+                                     int mc_avg_y_stride,
+                                     uint8_t *running_avg_y, int avg_y_stride,
+                                     int increase_denoising, BLOCK_SIZE bs,
+                                     int motion_magnitude) {
+  int sum_diff_thresh, r, c, sum_diff = 0;
+  const int shift_inc =
+      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+          ? 1
+          : 0;
+  __m128i acc_diff[4][4];  // indexed [16-pixel column strip][16-row band]
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  const int b_width = (4 << b_width_log2_lookup[bs]);
+  const int b_height = (4 << b_height_log2_lookup[bs]);
+  const int b_width_shift4 = b_width >> 4;  // number of 16-pixel column strips
+  // Clear the accumulators of every column strip this block uses.
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      acc_diff[c][r] = _mm_setzero_si128();
+    }
+  }
+  // First pass: strong filter, 16 pixels at a time across each row.
+  for (r = 0; r < b_height; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      acc_diff[c][r >> 4] = vp9_denoiser_16x1_sse2(
+          sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3,
+          &l32, &l21, acc_diff[c][r >> 4]);
+      // Update pointers for next iteration.
+      sig += 16;
+      mc_running_avg_y += 16;
+      running_avg_y += 16;
+    }
+    // After the last row of a band (or row 7 of a 16x8 block), add its sums.
+    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < b_width_shift4; ++c) {
+        sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+      }
+    }
+
+    // Update pointers for next iteration: back to column 0, one row down.
+    sig = sig - b_width + sig_stride;
+    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+    running_avg_y = running_avg_y - b_width + avg_y_stride;
+  }
+
+  {
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      const int delta =
+          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        sig -= sig_stride * b_height;  // rewind all three to the first row
+        mc_running_avg_y -= mc_avg_y_stride * b_height;
+        running_avg_y -= avg_y_stride * b_height;
+        sum_diff = 0;  // acc_diff is not cleared, so it spans both passes
+        for (r = 0; r < b_height; ++r) {
+          for (c = 0; c < b_width_shift4; ++c) {
+            acc_diff[c][r >> 4] =
+                vp9_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y,
+                                           k_0, k_delta, acc_diff[c][r >> 4]);
+            // Update pointers for next iteration.
+            sig += 16;
+            mc_running_avg_y += 16;
+            running_avg_y += 16;
+          }
+
+          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < b_width_shift4; ++c) {
+              sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+            }
+          }
+          sig = sig - b_width + sig_stride;
+          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+          running_avg_y = running_avg_y - b_width + avg_y_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg, int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising, BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  // Dispatch on block size; sizes without a SIMD path report COPY_BLOCK.
+  switch (bs) {
+    case BLOCK_8X8:
+    case BLOCK_8X16:
+      return vp9_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+                                         avg, avg_stride, increase_denoising,
+                                         bs, motion_magnitude, 8);
+    case BLOCK_16X8: case BLOCK_16X16: case BLOCK_16X32: case BLOCK_32X16:
+    case BLOCK_32X32: case BLOCK_32X64: case BLOCK_64X32: case BLOCK_64X64:
+      return vp9_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
+                                       avg, avg_stride, increase_denoising, bs,
+                                       motion_magnitude);
+    default: return COPY_BLOCK;
+  }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
new file mode 100644
index 0000000000..80442e3594
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -0,0 +1,291 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+  int_mv packed;  // filled through its as_mv member, one component at a time
+  packed.as_mv.col = col;
+  packed.as_mv.row = row;
+  return packed;
+}
+/*****************************************************************************
+ * This function utilizes 3 properties of the cost function lookup tables,   *
+ * constructed using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in          *
+ * vp9_encoder.c.                                                            *
+ * For the joint cost:                                                       *
+ *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
+ * For the component costs:                                                  *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
+ *         (Equal costs for both components)                                 *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
+ *         (Cost function is even)                                           *
+ * If these do not hold, then this function cannot be used without           *
+ * modification, in which case you can revert to using the C implementation, *
+ * which does not rely on these properties.                                  *
+ *****************************************************************************/
+int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
+                               const search_site_config *cfg, MV *ref_mv,
+                               uint32_t start_mv_sad, MV *best_mv,
+                               int search_param, int sad_per_bit, int *num00,
+                               const vp9_sad_fn_ptr_t *sad_fn_ptr,
+                               const MV *center_mv) {
+  const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
+  const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int);
+  const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
+  const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int);
+
+  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
+
+  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
+  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
+  const int tot_steps = cfg->total_steps - search_param;
+
+  const int_mv fcenter_mv =
+      pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
+  const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int);
+
+  const int ref_row = ref_mv->row;
+  const int ref_col = ref_mv->col;
+
+  int_mv bmv = pack_int_mv(ref_row, ref_col);
+  int_mv new_bmv = bmv;
+  __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
+
+  const int what_stride = x->plane[0].src.stride;
+  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
+  const uint8_t *const what = x->plane[0].src.buf;
+  const uint8_t *const in_what =
+      x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+
+  // Work out the start point for the search
+  const uint8_t *best_address = in_what;
+  const uint8_t *new_best_address = best_address;
+#if VPX_ARCH_X86_64
+  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
+#else
+  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
+#endif
+  // Starting position
+  unsigned int best_sad = start_mv_sad;
+  int i, j, step;
+
+  // Check the prerequisite cost function properties that are easy to check
+  // in an assert. See the function-level documentation for details on all
+  // prerequisites.
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
+
+  *num00 = 0;
+
+  for (i = 0, step = 0; step < tot_steps; step++) {
+    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
+      __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w;
+#if VPX_ARCH_X86_64
+      __m128i v_blocka[2];
+#else
+      __m128i v_blocka[1];
+#endif
+
+      // Compute the candidate motion vectors
+      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]);
+      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
+      // Clamp them to the search bounds
+      __m128i v_these_mv_clamp_w = v_these_mv_w;
+      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
+      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
+      // The ones that did not change are inside the search area
+      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
+
+      // If none of them are inside, then move on
+      if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
+        continue;
+      }
+
+      // The inverse mask indicates which of the MVs are outside
+      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff));
+      // Shift right to keep the sign bit clear, we will use this later
+      // to set the cost to the maximum value.
+      v_outside_d = _mm_srli_epi32(v_outside_d, 1);
+
+      // Compute the difference MV
+      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
+      // We utilise the fact that the cost function is even, and use the
+      // absolute difference. This allows us to use unsigned indexes later
+      // and reduces cache pressure somewhat as only a half of the table
+      // is ever referenced.
+      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
+
+      // Compute the SIMD pointer offsets.
+      {
+#if VPX_ARCH_X86_64  //  sizeof(intptr_t) == 8
+        // Load the offsets
+        __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]);
+        __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]);
+        // Set the ones falling outside to zero
+        v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
+        v_bo32_q =
+            _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d));
+        // Compute the candidate addresses
+        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
+        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
+#else  // VPX_ARCH_X86 //  sizeof(intptr_t) == 4
+        __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]);
+        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
+        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
+#endif
+      }
+
+      sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+                         in_what_stride, (uint32_t *)&v_sad_d);
+
+      // Look up the component cost of the residual motion vector
+      {
+        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
+        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
+        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
+        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
+        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
+        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
+        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
+        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
+
+        // Note: This is a use case for vpgather in AVX2
+        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
+        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
+        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
+        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
+
+        __m128i v_cost_10_d, v_cost_32_d;
+        v_cost_10_d = _mm_cvtsi32_si128(cost0);
+        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
+        v_cost_32_d = _mm_cvtsi32_si128(cost2);
+        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
+        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
+      }
+
+      // Now add in the joint cost
+      {
+        const __m128i v_sel_d =
+            _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128());
+        const __m128i v_joint_cost_d =
+            _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d);
+        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
+      }
+
+      // Multiply by sad_per_bit
+      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
+      // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
+      v_cost_d = _mm_add_epi32(v_cost_d,
+                               _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1)));
+      v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT);
+      // Add the cost to the sad
+      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
+
+      // Make the motion vectors outside the search area have max cost
+      // by or'ing in the comparison mask, this way the minimum search won't
+      // pick them.
+      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
+
+      // Find the minimum value and index horizontally in v_sad_d
+      {
+        // Try speculatively on 16 bits, so we can use the minpos intrinsic
+        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
+        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
+
+        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
+        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
+
+        // If the local best value is not saturated, just use it, otherwise
+        // find the horizontal minimum again the hard way on 32 bits.
+        // This is executed rarely.
+        if (UNLIKELY(local_best_sad == 0xffff)) {
+          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
+
+          v_loval_d = v_sad_d;
+          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
+          v_hival_d = _mm_srli_si128(v_loval_d, 8);
+          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
+
+          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
+
+          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
+          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
+          v_hival_d = _mm_srli_si128(v_loval_d, 4);
+          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
+
+          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
+
+          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
+          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
+
+          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
+          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
+        }
+
+        // Update the global minimum if the local minimum is smaller
+        if (LIKELY(local_best_sad < best_sad)) {
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+          best_sad = local_best_sad;
+        }
+      }
+    }
+
+    bmv = new_bmv;
+    best_address = new_best_address;
+
+    v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
+#if VPX_ARCH_X86_64
+    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
+#else
+    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
+#endif
+
+    if (UNLIKELY(best_address == in_what)) {
+      (*num00)++;
+    }
+  }
+
+  *best_mv = bmv.as_mv;
+  return best_sad;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c
new file mode 100644
index 0000000000..99fef31d16
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+
+int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  __m256i sse_256, ssz_256;
+  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+  __m256i sse_hi, ssz_hi;
+  __m128i sse_128, ssz_128;
+  int64_t sse;
+  const __m256i zero = _mm256_setzero_si256();
+
+  // If the block size is 16 then the results will fit in 32 bits.
+  if (block_size == 16) {
+    __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi;
+    // Load 16 elements for coeff and dqcoeff.
+    coeff_256 = load_tran_low(coeff);
+    dqcoeff_256 = load_tran_low(dqcoeff);
+    // dqcoeff - coeff
+    dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256);
+    // madd (dqcoeff - coeff)
+    dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256);
+    // madd coeff
+    coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256);
+    // Save the higher 64 bit of each 128 bit lane.
+    dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8);
+    coeff_hi = _mm256_srli_si256(coeff_256, 8);
+    // Add the higher 64 bit to the low 64 bit.
+    dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi);
+    coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi);
+    // Expand each double word in the lower 64 bits to quad word.
+    sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero);
+    ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero);
+  } else {
+    int i;
+    assert(block_size % 32 == 0);
+    sse_256 = zero;
+    ssz_256 = zero;
+
+    for (i = 0; i < block_size; i += 32) {
+      __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1;
+      // Load 32 elements for coeff and dqcoeff.
+      coeff_0 = load_tran_low(coeff + i);
+      dqcoeff_0 = load_tran_low(dqcoeff + i);
+      coeff_1 = load_tran_low(coeff + i + 16);
+      dqcoeff_1 = load_tran_low(dqcoeff + i + 16);
+      // dqcoeff - coeff
+      dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0);
+      dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1);
+      // madd (dqcoeff - coeff)
+      dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0);
+      dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1);
+      // madd coeff
+      coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0);
+      coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1);
+      // Add the first madd (dqcoeff - coeff) with the second.
+      dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1);
+      // Add the first madd (coeff) with the second.
+      coeff_0 = _mm256_add_epi32(coeff_0, coeff_1);
+      // Expand each double word of madd (dqcoeff - coeff) to quad word.
+      exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero);
+      exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero);
+      // expand each double word of madd (coeff) to quad word
+      exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero);
+      exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero);
+      // Add each quad word of madd (dqcoeff - coeff) and madd (coeff).
+      sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo);
+      ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo);
+      sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi);
+      ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi);
+    }
+  }
+  // Save the higher 64 bit of each 128 bit lane.
+  sse_hi = _mm256_srli_si256(sse_256, 8);
+  ssz_hi = _mm256_srli_si256(ssz_256, 8);
+  // Add the higher 64 bit to the low 64 bit.
+  sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+  ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi);
+
+  // Add each 64 bit from each of the 128 bit lane of the 256 bit.
+  sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+                          _mm256_extractf128_si256(sse_256, 1));
+
+  ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256),
+                          _mm256_extractf128_si256(ssz_256, 1));
+
+  // Store the results.
+  _mm_storel_epi64((__m128i *)(&sse), sse_128);
+
+  _mm_storel_epi64((__m128i *)(ssz), ssz_128);
+  return sse;
+}
+
+// Sum of squared differences between two coefficient vectors (AVX2).
+//
+// Returns sum((dqcoeff[i] - coeff[i])^2) for i in [0, block_size). The inputs
+// are read through load_tran_low() 16 values at a time, so block_size is
+// consumed in steps of 16. Differences and squares are formed on 16-bit
+// lanes and widened to 64-bit lanes before they are accumulated.
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff,
+                                const tran_low_t *dqcoeff, int block_size) {
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i acc = zero;  // four 64-bit partial sums
+  __m128i total;
+  int64_t sse;
+  int i;
+
+  if (block_size == 16) {
+    // A single vector: fold the dword pair sums once before widening.
+    const __m256i c = load_tran_low(coeff);
+    const __m256i dq = load_tran_low(dqcoeff);
+    const __m256i delta = _mm256_sub_epi16(dq, c);
+    // madd squares each 16-bit value and sums adjacent pairs into dwords.
+    const __m256i sq = _mm256_madd_epi16(delta, delta);
+    // Add the upper 64 bits of each 128-bit lane onto its lower 64 bits.
+    const __m256i folded = _mm256_add_epi32(sq, _mm256_srli_si256(sq, 8));
+
+    // Zero-extend the two low dwords of each lane to qwords.
+    acc = _mm256_unpacklo_epi32(folded, zero);
+  } else {
+    for (i = 0; i < block_size; i += 16) {
+      const __m256i c = load_tran_low(coeff + i);
+      const __m256i dq = load_tran_low(dqcoeff + i);
+      const __m256i delta = _mm256_sub_epi16(dq, c);
+      const __m256i sq = _mm256_madd_epi16(delta, delta);
+      // Zero-extend every dword to a qword so that the running total is
+      // kept in 64 bits.
+      const __m256i sq_lo = _mm256_unpacklo_epi32(sq, zero);
+      const __m256i sq_hi = _mm256_unpackhi_epi32(sq, zero);
+
+      acc = _mm256_add_epi64(acc, sq_lo);
+      acc = _mm256_add_epi64(acc, sq_hi);
+    }
+  }
+
+  // Fold the upper qword of each 128-bit lane onto the lower qword.
+  acc = _mm256_add_epi64(acc, _mm256_srli_si256(acc, 8));
+
+  // Add the two 128-bit lanes; the low qword of the sum is the total.
+  total = _mm_add_epi64(_mm256_castsi256_si128(acc),
+                        _mm256_extractf128_si256(acc, 1));
+
+  // Only the low 64 bits are written out.
+  _mm_storel_epi64((__m128i *)&sse, total);
+  return sse;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
new file mode 100644
index 0000000000..7beec130ab
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
@@ -0,0 +1,115 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
+
+SECTION .text
+
+; int64_t vp9_block_error(tran_low_t *coeff, tran_low_t *dqcoeff,
+;                         intptr_t block_size, int64_t *ssz)
+; Returns sum((dqcoeff - coeff)^2); stores sum(coeff^2) to *ssz.
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m6, m6                 ; ssz accumulator
+  pxor      m5, m5                 ; dedicated zero register
+.loop:
+  LOAD_TRAN_LOW 2, uqcq, 0         ; m2 = uqc (coeff) elements 0..7
+  LOAD_TRAN_LOW 0, dqcq, 0         ; m0 = dqc (dqcoeff) elements 0..7
+  LOAD_TRAN_LOW 3, uqcq, 8         ; m3 = uqc elements 8..15
+  LOAD_TRAN_LOW 1, dqcq, 8         ; m1 = dqc elements 8..15
+  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+  sub    sizeq, 16                 ; flags from this sub drive the jg below
+  psubw     m0, m2                 ; m0 = dqc - uqc (elements 0..7)
+  psubw     m1, m3                 ; m1 = dqc - uqc (elements 8..15)
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0                 ; squared errors, summed in pairs
+  pmaddwd   m1, m1
+  pmaddwd   m2, m2                 ; squared coefficients, summed in pairs
+  pmaddwd   m3, m3
+  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+  paddd     m0, m1
+  paddd     m2, m3
+  ; accumulate in 64bit
+  punpckldq m7, m0, m5             ; zero-extend dwords to qwords using m5
+  punpckhdq m0, m5
+  paddq     m4, m7
+  punpckldq m7, m2, m5
+  paddq     m4, m0
+  punpckhdq m2, m5
+  paddq     m6, m7
+  paddq     m6, m2
+  jg .loop                         ; loop while the remaining size is > 0
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4                 ; m5 is reused as scratch from here on
+  movhlps   m7, m6
+  paddq     m4, m5
+  paddq     m6, m7
+%if VPX_ARCH_X86_64
+  movq    rax, m4                  ; return the sse total
+  movq [sszq], m6                  ; *ssz = ssz total
+%else
+  mov     eax, sszm                ; eax = ssz argument
+  pshufd   m5, m4, 0x1             ; m5 dword 0 = upper half of the sse total
+  movq  [eax], m6                  ; *ssz = ssz total
+  movd    eax, m4                  ; 64-bit return value in edx:eax
+  movd    edx, m5
+%endif
+  RET
+
+; Compute the sum of squared differences between two tran_low_t vectors.
+; Vectors are converted (if necessary) to int16_t for calculations.
+; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
+;                            intptr_t block_size)
+; NOTE(review): the AVX2 variant declares "int block_size"; confirm sizeq.
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m5, m5                 ; dedicated zero register
+.loop:
+  LOAD_TRAN_LOW 2, uqcq, 0         ; m2 = uqc (coeff) elements 0..7
+  LOAD_TRAN_LOW 0, dqcq, 0         ; m0 = dqc (dqcoeff) elements 0..7
+  LOAD_TRAN_LOW 3, uqcq, 8         ; m3 = uqc elements 8..15
+  LOAD_TRAN_LOW 1, dqcq, 8         ; m1 = dqc elements 8..15
+  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+  sub    sizeq, 16                 ; flags from this sub drive the jnz below
+  psubw     m0, m2                 ; m0 = dqc - uqc (elements 0..7)
+  psubw     m1, m3                 ; m1 = dqc - uqc (elements 8..15)
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0                 ; squared errors, summed in pairs
+  pmaddwd   m1, m1
+  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+  paddd     m0, m1
+  ; accumulate in 64bit
+  punpckldq m3, m0, m5             ; zero-extend dwords to qwords using m5
+  punpckhdq m0, m5
+  paddq     m4, m3
+  paddq     m4, m0
+  jnz .loop                        ; exits only when size reaches exactly 0
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4                 ; m5 is reused as scratch from here on
+  paddq     m4, m5
+%if VPX_ARCH_X86_64
+  movq    rax, m4                  ; return the sse total
+%else
+  pshufd   m5, m4, 0x1             ; m5 dword 0 = upper half of the sse total
+  movd    eax, m4                  ; 64-bit return value in edx:eax
+  movd    edx, m5
+%endif
+  RET
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
new file mode 100644
index 0000000000..94506aad0f
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -0,0 +1,907 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_scale/yv12config.h"
+
+// Masks 32 source bytes with *mask, then packs the 16-bit lanes to 16 bytes.
+static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
+    const uint8_t *const src, const __m128i *const mask) {
+  const __m128i *const in = (const __m128i *)src;
+  const __m128i lo = _mm_and_si128(_mm_loadu_si128(in + 0), *mask);
+  const __m128i hi = _mm_and_si128(_mm_loadu_si128(in + 1), *mask);
+  return _mm_packus_epi16(lo, hi);
+}
+
+// 2:1 point-sampled downscale: keeps even-indexed bytes of every 2nd row.
+static void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int padded_w = (dst_w + 15) & ~15;  // dst_w rounded up to 16
+  const __m128i low_byte = _mm_set1_epi16(0x00FF);
+  int rows_left = dst_h;
+
+  do {
+    int x = 0;
+    do {
+      const uint8_t *const in = src + 2 * x;
+      _mm_storeu_si128((__m128i *)(dst + x),
+                       scale_plane_2_to_1_phase_0_kernel(in, &low_byte));
+      x += 16;
+    } while (x != padded_w);
+    src += 2 * src_stride;
+    dst += dst_stride;
+  } while (--rows_left);
+}
+
+// 4:1 point-sampled downscale: keeps byte 0 of every 4 on every 4th row.
+static void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int padded_w = (dst_w + 15) & ~15;  // dst_w rounded up to 16
+  const __m128i low_byte = _mm_set1_epi32(0x000000FF);
+  int rows_left = dst_h;
+
+  do {
+    int x = 0;
+    do {
+      const uint8_t *const in = src + 4 * x;
+      // Each kernel call yields 8 kept bytes interleaved with zero bytes.
+      const __m128i lo = scale_plane_2_to_1_phase_0_kernel(in, &low_byte);
+      const __m128i hi = scale_plane_2_to_1_phase_0_kernel(in + 32, &low_byte);
+      _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi16(lo, hi));
+      x += 16;
+    } while (x != padded_w);
+    src += 4 * src_stride;
+    dst += dst_stride;
+  } while (--rows_left);
+}
+
+// Applies the two byte taps in c0c1 to adjacent byte pairs of s[0] and s[1],
+// rounds each 16-bit sum as (sum + 64) >> 7 (saturating add) and packs the
+// 16 results to unsigned bytes (s[0] results low, s[1] results high).
+static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
+                                                  const __m128i c0c1) {
+  const __m128i half = _mm_set1_epi16(1 << 6);  // rounding term for >> 7
+
+  // maddubs treats the bytes of s as unsigned and those of c0c1 as signed.
+  const __m128i lo = _mm_adds_epi16(_mm_maddubs_epi16(s[0], c0c1), half);
+  const __m128i hi = _mm_adds_epi16(_mm_maddubs_epi16(s[1], c0c1), half);
+  return _mm_packus_epi16(_mm_srai_epi16(lo, 7), _mm_srai_epi16(hi, 7));
+}
+
+// 2:1 bilinear downscale: filters byte pairs horizontally on two adjacent
+// source rows with c0c1, then filters the two results vertically with the
+// same taps. Each destination row consumes two source rows.
+static void scale_plane_2_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int padded_w = (dst_w + 15) & ~15;  // dst_w rounded up to 16
+  int rows_left = dst_h;
+
+  do {
+    int x = 0;
+    do {
+      const uint8_t *const row0 = src + 2 * x;
+      const uint8_t *const row1 = row0 + src_stride;
+      __m128i in[2], hor[2], ver[2];
+
+      // Horizontal pass over 32 bytes of the upper row.
+      in[0] = _mm_loadu_si128((const __m128i *)(row0 + 0));
+      in[1] = _mm_loadu_si128((const __m128i *)(row0 + 16));
+      hor[0] = scale_plane_bilinear_kernel(in, c0c1);
+      // Horizontal pass over the same columns of the lower row.
+      in[0] = _mm_loadu_si128((const __m128i *)(row1 + 0));
+      in[1] = _mm_loadu_si128((const __m128i *)(row1 + 16));
+      hor[1] = scale_plane_bilinear_kernel(in, c0c1);
+
+      // Vertical pass on interleaved (upper, lower) byte pairs.
+      ver[0] = _mm_unpacklo_epi8(hor[0], hor[1]);
+      ver[1] = _mm_unpackhi_epi8(hor[0], hor[1]);
+      _mm_storeu_si128((__m128i *)(dst + x),
+                       scale_plane_bilinear_kernel(ver, c0c1));
+      x += 16;
+    } while (x != padded_w);
+    src += 2 * src_stride;
+    dst += dst_stride;
+  } while (--rows_left);
+}
+
+// 4:1 bilinear downscale. Per the layouts below, only the first two bytes of
+// each group of four, on rows src and src + src_stride, feed the filter.
+static void scale_plane_4_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int max_width = (dst_w + 15) & ~15;  // dst_w rounded up to 16
+  int y = dst_h;
+  do {  // one destination row per iteration
+    int x = max_width;
+    do {  // 16 destination bytes (64 bytes of each source row) per iteration
+      __m128i s[8], d[8];
+      // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
+      //       Here we tried to not use shuffle instructions which would be slow
+      //       on some x86 CPUs.
+
+      // Horizontal. Layouts: RCC = row R, byte CC (hex); xx = unused byte.
+      // 000 001 xx xx 004 005 xx xx  008 009 xx xx 00C 00D xx xx
+      // 010 011 xx xx 014 015 xx xx  018 019 xx xx 01C 01D xx xx
+      // 020 021 xx xx 024 025 xx xx  028 029 xx xx 02C 02D xx xx
+      // 030 031 xx xx 034 035 xx xx  038 039 xx xx 03C 03D xx xx
+      // 100 101 xx xx 104 105 xx xx  108 109 xx xx 10C 10D xx xx
+      // 110 111 xx xx 114 115 xx xx  118 119 xx xx 11C 11D xx xx
+      // 120 121 xx xx 124 125 xx xx  128 129 xx xx 12C 12D xx xx
+      // 130 131 xx xx 134 135 xx xx  138 139 xx xx 13C 13D xx xx
+      s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
+      s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
+      s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
+      s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
+      s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+      s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+      s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
+      s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));
+
+      // 000 001 100 101 xx xx xx xx  004 005 104 105 xx xx xx xx
+      // 008 009 108 109 xx xx xx xx  00C 00D 10C 10D xx xx xx xx
+      // 010 011 110 111 xx xx xx xx  014 015 114 115 xx xx xx xx
+      // 018 019 118 119 xx xx xx xx  01C 01D 11C 11D xx xx xx xx
+      // 020 021 120 121 xx xx xx xx  024 025 124 125 xx xx xx xx
+      // 028 029 128 129 xx xx xx xx  02C 02D 12C 12D xx xx xx xx
+      // 030 031 130 131 xx xx xx xx  034 035 134 135 xx xx xx xx
+      // 038 039 138 139 xx xx xx xx  03C 03D 13C 13D xx xx xx xx
+      d[0] = _mm_unpacklo_epi16(s[0], s[4]);
+      d[1] = _mm_unpackhi_epi16(s[0], s[4]);
+      d[2] = _mm_unpacklo_epi16(s[1], s[5]);
+      d[3] = _mm_unpackhi_epi16(s[1], s[5]);
+      d[4] = _mm_unpacklo_epi16(s[2], s[6]);
+      d[5] = _mm_unpackhi_epi16(s[2], s[6]);
+      d[6] = _mm_unpacklo_epi16(s[3], s[7]);
+      d[7] = _mm_unpackhi_epi16(s[3], s[7]);
+
+      // 000 001 100 101 008 009 108 109  xx xx xx xx xx xx xx xx
+      // 004 005 104 105 00C 00D 10C 10D  xx xx xx xx xx xx xx xx
+      // 010 011 110 111 018 019 118 119  xx xx xx xx xx xx xx xx
+      // 014 015 114 115 01C 01D 11C 11D  xx xx xx xx xx xx xx xx
+      // 020 021 120 121 028 029 128 129  xx xx xx xx xx xx xx xx
+      // 024 025 124 125 02C 02D 12C 12D  xx xx xx xx xx xx xx xx
+      // 030 031 130 131 038 039 138 139  xx xx xx xx xx xx xx xx
+      // 034 035 134 135 03C 03D 13C 13D  xx xx xx xx xx xx xx xx
+      s[0] = _mm_unpacklo_epi32(d[0], d[1]);
+      s[1] = _mm_unpackhi_epi32(d[0], d[1]);
+      s[2] = _mm_unpacklo_epi32(d[2], d[3]);
+      s[3] = _mm_unpackhi_epi32(d[2], d[3]);
+      s[4] = _mm_unpacklo_epi32(d[4], d[5]);
+      s[5] = _mm_unpackhi_epi32(d[4], d[5]);
+      s[6] = _mm_unpacklo_epi32(d[6], d[7]);
+      s[7] = _mm_unpackhi_epi32(d[6], d[7]);
+
+      // 000 001 100 101 004 005 104 105  008 009 108 109 00C 00D 10C 10D
+      // 010 011 110 111 014 015 114 115  018 019 118 119 01C 01D 11C 11D
+      // 020 021 120 121 024 025 124 125  028 029 128 129 02C 02D 12C 12D
+      // 030 031 130 131 034 035 134 135  038 039 138 139 03C 03D 13C 13D
+      d[0] = _mm_unpacklo_epi32(s[0], s[1]);
+      d[1] = _mm_unpacklo_epi32(s[2], s[3]);
+      d[2] = _mm_unpacklo_epi32(s[4], s[5]);
+      d[3] = _mm_unpacklo_epi32(s[6], s[7]);
+
+      d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);  // columns 0-7
+      d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);  // columns 8-15
+
+      // Vertical: each byte pair of d[0]/d[1] is (row 0, row 1) of one column.
+      d[0] = scale_plane_bilinear_kernel(d, c0c1);
+
+      _mm_storeu_si128((__m128i *)dst, d[0]);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);  // net advance: 4 source rows
+    dst += dst_stride - max_width;        // net advance: 1 destination row
+  } while (--y);
+}
+
+// 2:1 downscale in two passes using the filter taps in coef: pass 1 filters
+// horizontally from src into temp_buffer, pass 2 filters temp_buffer
+// vertically into dst. Both passes round their extents up (see below).
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 3) & ~3;  // pass-1 columns: w rounded up to 4
+  const int width_ver = (w + 7) & ~7;  // pass-2 columns: w rounded up to 8
+  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;  // pass-1 rows
+  const int height_ver = (h + 3) & ~3;  // pass-2 rows: h rounded up to 4
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;  // pass 1 advances t by width_hor * height_hor
+  __m128i s[11], d[4];
+  __m128i f[4];
+
+  assert(w && h);
+  shuffle_filter_ssse3(coef, f);
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+  // horizontal 4x8: 4 output columns x 8 source rows per inner iteration
+  do {
+    load_8bit_8x8(src + 2, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[3]);
+      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
+      transpose_16bit_4x8(&s[3], &s[3]);
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
+      d[1] = convolve8_8_ssse3(&s[1], f);  // 01 11 21 31 41 51 61 71
+      d[2] = convolve8_8_ssse3(&s[2], f);  // 02 12 22 32 42 52 62 72
+      d[3] = convolve8_8_ssse3(&s[3], f);  // 03 13 23 33 43 53 63 73
+
+      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
+      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
+      d[0] = _mm_packus_epi16(d[0], d[2]);
+      d[1] = _mm_packus_epi16(d[1], d[3]);
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
+      d[2] = _mm_unpacklo_epi16(d[0], d[1]);
+      d[3] = _mm_unpackhi_epi16(d[0], d[1]);
+      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
+      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
+      d[0] = _mm_unpacklo_epi32(d[2], d[3]);
+      d[1] = _mm_unpackhi_epi32(d[2], d[3]);
+      store_8bit_8x4_from_16x2(d, t, 2 * width_hor);
+
+      s[0] = s[4];  // carry the overlapping columns to the next iteration
+      s[1] = s[5];
+      s[2] = s[6];
+
+      t += 8;
+      x -= 4;
+    } while (x);
+    src += 8 * src_stride - 2 * width_hor;  // next strip of 8 source rows
+    t += 6 * width_hor;  // 8 * width_hor per strip in total
+    y -= 8;
+  } while (y);
+
+  // vertical 8x4: 8 output columns x 4 output rows per inner iteration
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+    s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
+    t += 6 * width_hor;
+    y = height_ver;
+    do {
+      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
+      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
+      loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
+      t += 8 * width_hor;
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
+      d[1] = convolve8_8_ssse3(&s[1], f);  // 10 11 12 13 14 15 16 17
+      d[2] = convolve8_8_ssse3(&s[2], f);  // 20 21 22 23 24 25 26 27
+      d[3] = convolve8_8_ssse3(&s[3], f);  // 30 31 32 33 34 35 36 37
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      d[1] = _mm_packus_epi16(d[2], d[3]);
+      store_8bit_8x4_from_16x2(d, dst, dst_stride);
+
+      s[0] = s[4];  // carry the overlapping rows to the next iteration
+      s[1] = s[5];
+      s[2] = s[6];
+
+      dst += 4 * dst_stride;
+      y -= 4;
+    } while (y);
+    t -= width_hor * (2 * height_ver + 6);  // back to the first temp row
+    t += 16;  // next 8 columns (two interleaved rows, 2 bytes each)
+    dst -= height_ver * dst_stride;  // back to the first output row
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+// 4:1 downscale in two passes using the filter taps in coef: pass 1 filters
+// horizontally from src into temp_buffer, pass 2 filters temp_buffer
+// vertically into dst. Both passes round their extents up (see below).
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 1) & ~1;  // pass-1 columns: w rounded up to 2
+  const int width_ver = (w + 7) & ~7;  // pass-2 columns: w rounded up to 8
+  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;  // pass-1 rows
+  const int height_ver = (h + 1) & ~1;  // pass-2 rows: h rounded up to 2
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;  // pass 1 advances t by width_hor * height_hor
+  __m128i s[11], d[4];
+  __m128i f[4];
+
+  assert(w && h);
+  shuffle_filter_ssse3(coef, f);
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+  // horizontal 2x8: 2 output columns x 8 source rows per inner iteration
+  do {
+    load_8bit_8x8(src + 4, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75 (overlapped)
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[2]);
+      // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      transpose_16bit_4x8(&s[2], &s[2]);
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
+      d[1] = convolve8_8_ssse3(&s[2], f);  // 01 11 21 31 41 51 61 71
+
+      // 00 10 20 30 40 50 60 70  xx xx xx xx xx xx xx xx
+      // 01 11 21 31 41 51 61 71  xx xx xx xx xx xx xx xx
+      d[0] = _mm_packus_epi16(d[0], d[0]);
+      d[1] = _mm_packus_epi16(d[1], d[1]);
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      d[0] = _mm_unpacklo_epi16(d[0], d[1]);
+      store_8bit_4x4_sse2(d[0], t, 2 * width_hor);
+
+      s[0] = s[4];  // carry the overlapping columns to the next iteration
+      s[1] = s[5];
+
+      t += 4;
+      x -= 2;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor;  // next strip of 8 source rows
+    t += 6 * width_hor;  // 8 * width_hor per strip in total
+    y -= 8;
+  } while (y);
+
+  // vertical 8x2: 8 output columns x 2 output rows per inner iteration
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+    t += 4 * width_hor;
+    y = height_ver;
+    do {
+      // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
+      loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
+      t += 8 * width_hor;
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
+      d[1] = convolve8_8_ssse3(&s[2], f);  // 10 11 12 13 14 15 16 17
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+
+      s[0] = s[4];  // carry the overlapping rows to the next iteration
+      s[1] = s[5];
+
+      dst += 2 * dst_stride;
+      y -= 2;
+    } while (y);
+    t -= width_hor * (4 * height_ver + 4);  // back to the first temp row
+    t += 16;  // next 8 columns (two interleaved rows, 2 bytes each)
+    dst -= height_ver * dst_stride;  // back to the first output row
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
+                                     __m128i *const f);  // filter taps -> f[]
+
+typedef __m128i (*convolve8_funcs)(const __m128i *const s,
+                                   const __m128i *const f);  // filter s by f
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const InterpKernel *const coef,
+                                       const int phase_scaler,
+                                       uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;  // 21: 4/3 pixel step in Q4, truncated
+  const int width_hor = (w + 5) - ((w + 5) % 6);  // round w up to 6n
+  const int stride_hor = 2 * width_hor + 4;  // store 4 extra pixels
+  const int width_ver = (w + 7) & ~7;  // round w up to 8n
+  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+  // above and (SUBPEL_TAPS / 2) extra rows below.
+  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);  // round h up to 6n
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[12], d[6], dd[4];
+  __m128i f0[4], f1[5], f2[5];
+  // The offset of the first row is always less than 1 pixel.
+  const int offset1_q4 = phase_scaler + 1 * step_q4;
+  const int offset2_q4 = phase_scaler + 2 * step_q4;
+  // offset_idx1/2 indicate whether the pixel offset is even (0) or odd (1).
+  // They're used to choose the src offset and filter coefficient offset.
+  const int offset_idx1 = (offset1_q4 >> 4) & 1;
+  const int offset_idx2 = (offset2_q4 >> 4) & 1;
+  static const shuffle_filter_funcs kShuffleFilterFuncs[2] = {
+    shuffle_filter_ssse3, shuffle_filter_odd_ssse3
+  };
+  static const convolve8_funcs kConvolve8Funcs[2] = {
+    convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
+  };
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0);
+  kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+  kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+
+  // Sub 64 to avoid overflow.
+  // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
+  // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
+  // When filter phase idx is 1, the two biggest coefficients are shuffled
+  // together, and their sum is always no less than 128. Sub 64 here.
+  // After the subtraction, when the sum of all positive coefficients is no
+  // larger than 128, and the sum of all negative coefficients is no
+  // less than -128, there will be no overflow in the convolve8 functions.
+  f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
+  f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
+  f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
+
+  // horizontal 6x8: each inner pass yields 6 output columns for 8 rows
+  do {
+    load_8bit_8x8(src, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[4]);
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
+      // 0E 0F 1E 1F 2E 2F 3E 3F  4E 4F 5E 5F 6E 6F 7E 7F
+      transpose_16bit_4x8(&s[4], &s[4]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+      d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+      d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
+      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74  xx xx xx xx xx xx xx xx
+      // 05 15 25 35 45 55 65 75  xx xx xx xx xx xx xx xx
+      dd[0] = _mm_packus_epi16(d[0], d[2]);
+      dd[1] = _mm_packus_epi16(d[1], d[3]);
+      dd[2] = _mm_packus_epi16(d[4], d[4]);
+      dd[3] = _mm_packus_epi16(d[5], d[5]);
+
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
+      // 04 14 05 15 24 34 25 35  44 54 45 55 64 74 65 75
+      d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
+      d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
+      d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
+
+      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
+      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
+      // 04 14 05 15 xx xx xx xx  24 34 25 35 xx xx xx xx
+      // 44 54 45 55 xx xx xx xx  64 74 65 75 xx xx xx xx
+      dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
+      dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
+      dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
+      dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
+
+      // 00 10 01 11 02 12 03 13  04 14 05 15 xx xx xx xx
+      // 20 30 21 31 22 32 23 33  24 34 25 35 xx xx xx xx
+      // 40 50 41 51 42 52 43 53  44 54 45 55 xx xx xx xx
+      // 60 70 61 71 62 72 63 73  64 74 65 75 xx xx xx xx
+      d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
+      d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
+      d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
+      d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
+
+      // store 4 extra pixels
+      storeu_8bit_16x4(d, t, stride_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+      s[3] = s[7];
+
+      t += 12;  // 6 columns x 2 interleaved rows
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3;  // rewind x, 8 rows down
+    t += 3 * stride_hor + 4;  // now 4 buffer rows (8 image rows) down
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6: each inner pass yields 6 output rows for 8 columns
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+    // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+    loadu_8bit_16x4(t, stride_hor, s);
+    y = height_ver;
+
+    do {
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
+      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
+      // E0 F0 E1 F1 E2 F2 E3 F3  E4 F4 E5 F5 E6 F6 E7 F7
+      t += 4 * stride_hor;
+      loadu_8bit_16x4(t, stride_hor, &s[4]);
+
+      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+      d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+      d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      d[2] = _mm_packus_epi16(d[2], d[3]);
+      d[4] = _mm_packus_epi16(d[4], d[5]);
+
+      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
+      _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
+      _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
+      _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+      s[3] = s[7];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * 2 * height_ver / 3;  // rewind t to the first row
+    t += 16;  // next 8 columns (2 bytes each)
+    dst -= height_ver * dst_stride;  // back to the top output row
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
+                                                  const __m128i *const f) {
+  __m128i ss[4], temp;
+  // Pair up the 8 input vectors (low 8 bytes each) for convolve8_8_ssse3().
+  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);  // byte-interleave s[0] with s[1]
+  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+  temp = convolve8_8_ssse3(ss, f);
+  return _mm_packus_epi16(temp, temp);  // saturate to u8, both halves equal
+}
+
+// Only the odd output columns are computed; even columns are src copies.
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
+                                     const int w, const __m128i *const f) {
+  int x = w;  // must be a nonzero multiple of 8 for the loop below to end
+
+  do {
+    __m128i s[8], temp;
+    s[0] = _mm_loadl_epi64((const __m128i *)(src + 0));  // 8 pixels per load
+    s[1] = _mm_loadl_epi64((const __m128i *)(src + 1));
+    s[2] = _mm_loadl_epi64((const __m128i *)(src + 2));
+    s[3] = _mm_loadl_epi64((const __m128i *)(src + 3));
+    s[4] = _mm_loadl_epi64((const __m128i *)(src + 4));
+    s[5] = _mm_loadl_epi64((const __m128i *)(src + 5));
+    s[6] = _mm_loadl_epi64((const __m128i *)(src + 6));
+    s[7] = _mm_loadl_epi64((const __m128i *)(src + 7));
+    temp = scale_1_to_2_phase_0_kernel(s, f);
+    _mm_storel_epi64((__m128i *)dst, temp);  // 8 filtered pixels
+    src += 8;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_1_to_2_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int src_w, const int src_h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  int max_width;
+  int y;
+  uint8_t *tmp[9];  // 8 row buffers plus tmp[8] as the rotation scratch slot
+  __m128i f[4];
+
+  max_width = (src_w + 7) & ~7;  // src_w rounded up to a multiple of 8
+  tmp[0] = temp_buffer + 0 * max_width;
+  tmp[1] = temp_buffer + 1 * max_width;
+  tmp[2] = temp_buffer + 2 * max_width;
+  tmp[3] = temp_buffer + 3 * max_width;
+  tmp[4] = temp_buffer + 4 * max_width;
+  tmp[5] = temp_buffer + 5 * max_width;
+  tmp[6] = temp_buffer + 6 * max_width;
+  tmp[7] = temp_buffer + 7 * max_width;  // temp_buffer spans 8 * max_width
+
+  shuffle_filter_ssse3(coef, f);
+  // Pre-filter source rows -3..+3 horizontally into tmp[0..6].
+  scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f);
+  scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f);
+  scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f);
+  scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f);
+  scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f);
+  scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f);
+  scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f);
+
+  y = src_h;  // src_h must be nonzero: the loop tests --y at the bottom
+  do {
+    int x;
+    scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
+    for (x = 0; x < max_width; x += 8) {
+      __m128i s[8], C, D, CD;
+
+      // Even rows: src pixels interleaved with the filtered row in tmp[3]
+      const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x));
+      const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+      const __m128i ab = _mm_unpacklo_epi8(a, b);
+      _mm_storeu_si128((__m128i *)(dst + 2 * x), ab);
+
+      // Odd rows
+      // Even columns: filter the source rows starting 3 rows above
+      load_8bit_8x8(src + x - 3 * src_stride, src_stride, s);
+      C = scale_1_to_2_phase_0_kernel(s, f);
+
+      // Odd columns: same kernel over the 8 horizontally filtered rows
+      s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x));
+      s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x));
+      s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x));
+      s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+      s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x));
+      s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x));
+      s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x));
+      s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x));
+      D = scale_1_to_2_phase_0_kernel(s, f);
+
+      CD = _mm_unpacklo_epi8(C, D);
+      _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD);
+    }
+
+    src += src_stride;
+    dst += 2 * dst_stride;
+    tmp[8] = tmp[0];  // rotate: the oldest row buffer becomes the next tmp[7]
+    tmp[0] = tmp[1];
+    tmp[1] = tmp[2];
+    tmp[2] = tmp[3];
+    tmp[3] = tmp[4];
+    tmp[4] = tmp[5];
+    tmp[5] = tmp[6];
+    tmp[6] = tmp[7];
+    tmp[7] = tmp[8];
+  } while (--y);
+}
+
+void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
+                                      YV12_BUFFER_CONFIG *dst,
+                                      uint8_t filter_type, int phase_scaler) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const int dst_uv_w = dst->uv_crop_width;
+  const int dst_uv_h = dst->uv_crop_height;
+  int scaled = 0;  // nonzero once an SSSE3 path has filled dst
+
+  // phase_scaler is usually 0 or 8.
+  assert(phase_scaler >= 0 && phase_scaler < 16);
+
+  if (dst_w * 2 == src_w && dst_h * 2 == src_h) {
+    // 2 to 1
+    scaled = 1;
+
+    if (phase_scaler == 0) {
+      scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+                                 dst->y_stride, dst_w, dst_h);
+      scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+      scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+    } else if (filter_type == BILINEAR) {
+      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+      const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+      scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                  dst->y_stride, dst_w, dst_h, c0c1);
+      scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+      scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+    } else {
+      const int buffer_stride = (dst_w + 3) & ~3;
+      const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc(buffer_stride * buffer_height);
+      if (temp_buffer) {
+        scale_plane_2_to_1_general(
+            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+        scale_plane_2_to_1_general(
+            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        scale_plane_2_to_1_general(
+            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;  // allocation failed; fall back to the C version
+      }
+    }
+  } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+    // 4 to 1
+    scaled = 1;
+    if (phase_scaler == 0) {
+      scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+                                 dst->y_stride, dst_w, dst_h);
+      scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+      scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+    } else if (filter_type == BILINEAR) {
+      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+      const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+      scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                  dst->y_stride, dst_w, dst_h, c0c1);
+      scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+      scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+    } else {
+      const int buffer_stride = (dst_w + 1) & ~1;
+      const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+      // When dst_w is 1 or 2, we need extra padding to avoid heap read overflow
+      const int extra_padding = 16;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
+      if (temp_buffer) {
+        scale_plane_4_to_1_general(
+            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+        scale_plane_4_to_1_general(
+            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        scale_plane_4_to_1_general(
+            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;  // allocation failed; fall back to the C version
+      }
+    }
+  } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+    // 4 to 3
+    const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
+    const int buffer_stride_ver = (dst_w + 7) & ~7;
+    const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+    // When the vertical filter reads more pixels than the horizontal filter
+    // generated in each row, we need extra padding to avoid heap read overflow.
+    // For example, the horizontal filter generates 18 pixels but the vertical
+    // filter reads 24 pixels in a row. The difference is multiplied by 2 since
+    // two rows are interlaced together in the optimization.
+    const int extra_padding = (buffer_stride_ver > buffer_stride_hor)
+                                  ? 2 * (buffer_stride_ver - buffer_stride_hor)
+                                  : 0;
+    const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
+    uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
+    if (temp_buffer) {
+      scaled = 1;
+      scale_plane_4_to_3_general(
+          src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+          dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
+      scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h,
+                                 vp9_filter_kernels[filter_type], phase_scaler,
+                                 temp_buffer);
+      scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h,
+                                 vp9_filter_kernels[filter_type], phase_scaler,
+                                 temp_buffer);
+      free(temp_buffer);
+    }
+  } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) {
+    // 1 to 2. NOTE(review): chroma uses src_w / 2, src_h / 2; check odd sizes.
+    uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7));
+    if (temp_buffer) {
+      scaled = 1;
+      scale_plane_1_to_2_phase_0(
+          src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w,
+          src_h, vp9_filter_kernels[filter_type][8], temp_buffer);
+      scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, src_w / 2, src_h / 2,
+                                 vp9_filter_kernels[filter_type][8],
+                                 temp_buffer);
+      scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, src_w / 2, src_h / 2,
+                                 vp9_filter_kernels[filter_type][8],
+                                 temp_buffer);
+      free(temp_buffer);
+    }
+  }
+
+  if (scaled) {
+    vpx_extend_frame_borders(dst);
+  } else {
+    // Call c version for all other scaling ratios (or if malloc failed).
+    vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
+  }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000000..d7aafe7b01
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@@ -0,0 +1,72 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz, int bd) {
+  int i, j, test;
+  uint32_t temp[4];
+  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bd - 8);  // 0 when bd == 8
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i += 8) {  // 8 coefficients per iteration
+    // Load the data into xmm registers (aligned loads: need 16-byte align)
+    __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i));
+    __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4));
+    __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i));
+    __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4));
+    // Check if any values require more than 15 bit
+    max = _mm_set1_epi32(0x3fff);  // 16383
+    min = _mm_set1_epi32((int32_t)0xffffc000);  // -16384
+    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+                         _mm_cmplt_epi32(mm_coeff, min));
+    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+                         _mm_cmplt_epi32(mm_coeff2, min));
+    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+                         _mm_cmplt_epi32(mm_dqcoeff, min));
+    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+                         _mm_cmplt_epi32(mm_dqcoeff2, min));
+    test = _mm_movemask_epi8(
+        _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));
+
+    if (!test) {  // all 16 inputs are in [min, max]: 16-bit SIMD path
+      __m128i mm_diff, error_sse2, sqcoeff_sse2;
+      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);  // pairwise diff^2 sums
+      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+      _mm_storeu_si128((__m128i *)temp, error_sse2);
+      error = error + temp[0] + temp[1] + temp[2] + temp[3];
+      _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
+      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+    } else {  // out-of-range value present: 64-bit scalar path
+      for (j = 0; j < 8; j++) {
+        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+        error += diff * diff;
+        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+      }
+    }
+  }
+  assert(error >= 0 && sqcoeff >= 0);  // NOTE(review): no direct <assert.h>
+  error = (error + rounding) >> shift;  // rounded shift by 2 * (bd - 8)
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;  // sum of squared coeff values, same scaling as error
+  return error;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
new file mode 100644
index 0000000000..e6aa71d58a
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -0,0 +1,441 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>  // AVX2
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+
+// Zero fill 16 positions (assuming 32-/16-bit tran_low_t per build config).
+static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) {
+  const __m256i zero = _mm256_setzero_si256();
+#if CONFIG_VP9_HIGHBITDEPTH
+  _mm256_storeu_si256((__m256i *)(a), zero);
+  _mm256_storeu_si256((__m256i *)(a + 8), zero);  // second 256-bit half
+#else
+  _mm256_storeu_si256((__m256i *)(a), zero);
+#endif
+}
+
+static VPX_FORCE_INLINE void load_fp_values_avx2(
+    const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
+    __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
+  *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+  *round = _mm256_permute4x64_epi64(*round, 0x54);  // qwords 0,1,1,1
+  *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+  *quant = _mm256_permute4x64_epi64(*quant, 0x54);  // qwords 0,1,1,1
+  *dequant =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+  *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);  // qwords 0,1,1,1
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+                                                 __m256i v_eobmax,
+                                                 __m256i v_mask) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m256i v_iscan = _mm256_permute4x64_epi64(
+      _mm256_loadu_si256((const __m256i *)iscan), 0xD8);  // qwords 0,2,1,3
+#else
+  const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+#endif
+  const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);  // masked iscan
+  return _mm256_max_epi16(v_eobmax, v_nz_iscan);  // per-lane running maximum
+}
+
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) {
+  const __m256i eob_lo = eob256;
+  // Copy upper 128 to lower 128 (0x81 also zeroes the upper 128)
+  const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81);
+  __m256i eob = _mm256_max_epi16(eob_lo, eob_hi);
+  __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe);  // upper 64 bits -> lower
+  eob = _mm256_max_epi16(eob, eob_s);
+  eob_s = _mm256_shufflelo_epi16(eob, 0xe);  // words 2,3 -> words 0,1
+  eob = _mm256_max_epi16(eob, eob_s);
+  eob_s = _mm256_shufflelo_epi16(eob, 1);  // word 1 -> word 0
+  eob = _mm256_max_epi16(eob, eob_s);  // word 0 now holds the overall max
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+  return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+  return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize_fp_16(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+  const __m256i coeff = load_tran_low(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const int32_t nzflag =
+      _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr));
+
+  if (nzflag) {  // at least one |coeff| exceeds its threshold
+    const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+    const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+    const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);  // restore sign
+    const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant);
+    const __m256i nz_mask =
+        _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+    store_tran_low(qcoeff, qcoeff_ptr);
+    store_tran_low(dqcoeff, dqcoeff_ptr);
+
+    *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+  } else {  // nothing above threshold: the whole group is written as zero
+    store_zero_tran_low(qcoeff_ptr);
+    store_zero_tran_low(dqcoeff_ptr);
+  }
+}
+
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  __m256i round, quant, dequant, thr;
+  __m256i eob_max = _mm256_setzero_si256();
+  (void)scan;
+
+  coeff_ptr += n_coeffs;  // point past the end; indexed by negative n_coeffs
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+                      &dequant);
+  thr = _mm256_setzero_si256();  // first group: threshold 0
+
+  quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                 iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                 dqcoeff_ptr + n_coeffs, &eob_max);
+
+  n_coeffs += 8 * 2;  // 16 coefficients per call
+
+  // remove dc constants: broadcast the upper 128 bits to both halves
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+  thr = _mm256_srai_epi16(dequant, 1);  // threshold: dequant >> 1
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                   iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                   dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += 8 * 2;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);  // horizontal max over all 16 lanes
+}
+
+// Enable this flag when matching the optimized code to
+// vp9_quantize_fp_32x32_c(). Disabled, the optimized code will match the
+// existing ssse3 code and quantize_fp_32x32_nz_c().
+//
+// #define MATCH_VP9_QUANTIZE_FP_32X32_C
+
+#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C
+static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+  const __m256i coeff = load_tran_low(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+  const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+  const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);  // restore sign
+  const __m256i abs_dqcoeff =
+      _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+  const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+  const __m256i nz_mask =
+      _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+  store_tran_low(qcoeff, qcoeff_ptr);
+  store_tran_low(dqcoeff, dqcoeff_ptr);
+
+  *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+  (void)thr;  // unused: this variant performs no threshold test
+}
+#endif
+// Quantizes 16 coeffs; if none has |coeff| > *thr, zero-stores both outputs.
+static VPX_FORCE_INLINE void quantize_fp_32x32_16(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+  const __m256i coeff = load_tran_low(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+  const int32_t nzflag = _mm256_movemask_epi8(thr_mask);  // 0: none > thr
+
+  if (nzflag) {
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+    const __m256i tmp_rnd =  // lanes not above thr are forced to 0
+        _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask);
+#else  // every lane is quantized once any lane is above thr
+    const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+#endif
+    const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);  // >> 16
+    const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);  // re-sign
+    const __m256i abs_dqcoeff =  // (|q| * dequant) >> 1
+        _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+    const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+    const __m256i nz_mask =  // lanes whose quantized magnitude is nonzero
+        _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+    store_tran_low(qcoeff, qcoeff_ptr);
+    store_tran_low(dqcoeff, dqcoeff_ptr);
+
+    *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+  } else {  // *eob_max is left untouched on this path
+    store_zero_tran_low(qcoeff_ptr);
+    store_zero_tran_low(dqcoeff_ptr);
+  }
+}
+// AVX2 32x32 "fp" quantizer: writes qcoeff_ptr, dqcoeff_ptr and *eob_ptr.
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  __m256i round, quant, dequant, thr;
+  __m256i eob_max = _mm256_setzero_si256();
+  (void)scan;  // unused
+
+  coeff_ptr += n_coeffs;  // advance every array to its end ...
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;  // ... and index back with a negative offset
+
+  // Load the round/quant/dequant vectors, then derive the 32x32 constants.
+  load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+                      &dequant);
+  thr = _mm256_srli_epi16(dequant, 2);  // skip threshold: dequant / 4
+  quant = _mm256_slli_epi16(quant, 1);  // x2, so mulhi (>> 16) acts as >> 15
+  {
+    const __m256i rnd = _mm256_set1_epi16((int16_t)1);
+    round = _mm256_add_epi16(round, rnd);
+    round = _mm256_srai_epi16(round, 1);  // round = (round + 1) >> 1
+  }
+
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+  // With thr lowered by 1, the cmpgt (>) in quantize_fp_32x32_16() acts as a
+  // >= test against dequant / 4, which saves a _mm256_cmpeq_epi16().
+  thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1));
+  quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                       iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                       dqcoeff_ptr + n_coeffs, &eob_max);
+#else  // default: the first 16 coeffs are quantized without the thr test
+  quantize_fp_32x32_16_no_nzflag(
+      &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+      qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+#endif
+
+  n_coeffs += 8 * 2;  // 16 coefficients consumed
+
+  // Drop the DC constants: copy the upper 128-bit half into both halves.
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+  thr = _mm256_permute2x128_si256(thr, thr, 0x31);
+
+  // AC only loop: 16 coefficients per pass until n_coeffs climbs to 0.
+  while (n_coeffs < 0) {
+    quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                         iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                         dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += 8 * 2;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);  // reduce the per-lane maxima to one value
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+                                                               const __m256i *y,
+                                                               int log_scale) {
+  __m256i prod_lo = _mm256_mul_epi32(*x, *y);  // even lanes: 64-bit products
+  __m256i prod_hi = _mm256_srli_epi64(*x, 32);  // odd lanes of x moved down
+  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);  // odd lanes of y
+  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);  // odd lanes: 64-bit products
+  prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+  prod_lo = _mm256_and_si256(prod_lo, mask);  // keep only the low 32 bits
+  prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+  prod_hi = _mm256_slli_epi64(prod_hi, 32);  // back into the odd lanes
+  return _mm256_or_si256(prod_lo, prod_hi);  // lane: (x * y) >> (16 - scale)
+}
+// Zero-extends 8 int16 values to 32 bits: [0..3] -> low half, [4..7] -> high.
+static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) {
+  const __m128i v = _mm_load_si128((const __m128i *)val_ptr);  // aligned load
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc = _mm_unpacklo_epi16(v, zero);  // val_ptr[0..3], widened
+  const __m128i ac = _mm_unpackhi_epi16(v, zero);  // val_ptr[4..7], widened
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+// Expands the round, quant and dequant tables with highbd_init_256().
+static VPX_FORCE_INLINE void highbd_load_fp_values(
+    const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
+    __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
+  *round = highbd_init_256(round_ptr);
+  *quant = highbd_init_256(quant_ptr);
+  *dequant = highbd_init_256(dequant_ptr);
+}
+// Per 16-bit lane: max(eobmax, iscan value where nz_mask is set, else 0).
+static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob(
+    const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) {
+  const __m256i packed_nz_mask =  // 32-bit masks narrowed to 16-bit, per half
+      _mm256_packs_epi32(nz_mask, _mm256_setzero_si256());
+  const __m256i packed_nz_mask_perm =  // all 8 mask words -> low 128 bits
+      _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+  const __m256i iscan =  // upper 128 bits undefined; cleared by the AND
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+  const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+  return _mm256_max_epi16(eobmax, nz_iscan);
+}
+// 8 coeffs: |q| = ((|c| + round) * quant) >> 16, |dq| = |q| * dequant.
+static VPX_FORCE_INLINE void highbd_quantize_fp(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round);
+  const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+  const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant);
+  const __m256i q = _mm256_sign_epi32(abs_q, coeff);  // restore coeff's sign
+  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+  _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+  *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+// AVX2 high-bitdepth "fp" quantizer: 8 coefficients per pass; sets *eob_ptr.
+void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 const int16_t *round_ptr,
+                                 const int16_t *quant_ptr,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr,
+                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                 const int16_t *scan, const int16_t *iscan) {
+  const int step = 8;  // 32-bit lanes in one 256-bit vector
+  __m256i round, quant, dequant;
+  __m256i eob_max = _mm256_setzero_si256();
+  (void)scan;  // unused
+
+  coeff_ptr += n_coeffs;  // advance every array to its end ...
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;  // ... and index back with a negative offset
+
+  // Load round/quant/dequant widened to 32 bits (see highbd_init_256()).
+  highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+                        &dequant);
+
+  highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+                     iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                     dqcoeff_ptr + n_coeffs, &eob_max);
+
+  n_coeffs += step;
+
+  // Drop the DC constants: copy the upper 128-bit half into both halves.
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+
+  // AC only loop: `step` coefficients per pass until n_coeffs climbs to 0.
+  while (n_coeffs < 0) {
+    highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+                       iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                       dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += step;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);  // reduce the per-lane maxima to one value
+}
+// 32x32 form: lanes with |c| <= *thr give q = 0; |dq| = (|q| * dequant) >> 1.
+static VPX_FORCE_INLINE void highbd_quantize_fp_32x32(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr);
+  const __m256i tmp_rnd =  // zeroed where the lane is not above thr
+      _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask);
+  const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+  const __m256i abs_dq =  // halved after the multiply
+      _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1);
+  const __m256i q = _mm256_sign_epi32(abs_q, coeff);  // restore coeff's sign
+  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+  _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+  *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+// AVX2 high-bitdepth 32x32 "fp" quantizer: 8 coeffs per pass; sets *eob_ptr.
+void vp9_highbd_quantize_fp_32x32_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+    const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+    const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
+    const int16_t *iscan) {
+  const int step = 8;  // 32-bit lanes in one 256-bit vector
+  __m256i round, quant, dequant, thr;
+  __m256i eob_max = _mm256_setzero_si256();
+  (void)scan;  // unused
+
+  coeff_ptr += n_coeffs;  // advance every array to its end ...
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;  // ... and index back with a negative offset
+
+  // Load the widened round/quant/dequant, then derive the 32x32 constants.
+  highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+                        &dequant);
+  thr = _mm256_srli_epi32(dequant, 2);  // dequant / 4
+  // With thr lowered by 1, the cmpgt (>) in highbd_quantize_fp_32x32() acts
+  // as a >= test against dequant / 4, which saves a _mm256_cmpeq_epi32().
+  thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1));
+  quant = _mm256_slli_epi32(quant, 1);  // x2, so the >> 16 acts as >> 15
+  round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1);
+
+  highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                           iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                           dqcoeff_ptr + n_coeffs, &eob_max);
+
+  n_coeffs += step;
+
+  // Drop the DC constants: copy the upper 128-bit half into both halves.
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+  thr = _mm256_permute2x128_si256(thr, thr, 0x31);
+
+  // AC only loop: `step` coefficients per pass until n_coeffs climbs to 0.
+  while (n_coeffs < 0) {
+    highbd_quantize_fp_32x32(
+        &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+        qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += step;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);  // reduce the per-lane maxima to one value
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
new file mode 100644
index 0000000000..c877234436
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -0,0 +1,125 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+// SSE2 "fp" quantizer: 16 coefficients per pass; sets *eob_ptr at the end.
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i thr;
+  int nzflag;
+  int index = 16;  // the first 16 coefficients are handled before the loop
+  __m128i round, quant, dequant;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
+
+  (void)scan;  // unused
+
+  // Setup global values.
+  load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  // Poor man's abs(): srai by 15 yields 0 or -1 per lane as the sign mask.
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);  // saturating add
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);  // high 16 bits of the product
+
+  round = _mm_unpackhi_epi64(round, round);  // upper 64 bits -> both halves
+  quant = _mm_unpackhi_epi64(quant, quant);
+
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+  // Reinsert signs.
+  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);  // now the dequantized value
+  dequant = _mm_unpackhi_epi64(dequant, dequant);  // upper half from here on
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+  thr = _mm_srai_epi16(dequant, 1);  // skip threshold: dequant / 2
+
+  // AC only loop: 16 coefficients per pass while index < n_coeffs.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    // Poor man's abs(): srai by 15 yields 0 or -1 per lane as the sign mask.
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));  // any > thr?
+
+    if (nzflag) {
+      __m128i eob0;
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+      // Reinsert signs.
+      qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+      qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + index);
+      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);  // dequantized values
+      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      store_tran_low(qcoeff0, dqcoeff_ptr + index);
+      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+      eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+      eob = _mm_max_epi16(eob, eob0);
+    } else {  // nothing above thr: zero all 16 outputs, eob unchanged
+      store_zero_tran_low(qcoeff_ptr + index);
+      store_zero_tran_low(qcoeff_ptr + index + 8);
+
+      store_zero_tran_low(dqcoeff_ptr + index);
+      store_zero_tran_low(dqcoeff_ptr + index + 8);
+    }
+
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c
new file mode 100644
index 0000000000..d35004e370
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c
@@ -0,0 +1,253 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+// SSSE3 "fp" quantizer: 16 coefficients per pass; sets *eob_ptr at the end.
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                           const int16_t *round_ptr, const int16_t *quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                           const int16_t *scan, const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i thr;
+  int nzflag;
+  int index = 16;  // the first 16 coefficients are handled before the loop
+  __m128i round, quant, dequant;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
+
+  (void)scan;  // unused
+
+  // Setup global values.
+  load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);  // saturating add
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);  // high 16 bits of the product
+
+  round = _mm_unpackhi_epi64(round, round);  // upper 64 bits -> both halves
+  quant = _mm_unpackhi_epi64(quant, quant);
+
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+  // Reinsert signs.
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);  // now the dequantized value
+  dequant = _mm_unpackhi_epi64(dequant, dequant);  // upper half from here on
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+  thr = _mm_srai_epi16(dequant, 1);  // skip threshold: dequant / 2
+
+  // AC only loop: 16 coefficients per pass while index < n_coeffs.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));  // any > thr?
+
+    if (nzflag) {
+      __m128i eob0;
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+      // Reinsert signs.
+      qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+      qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + index);
+      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);  // dequantized values
+      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      store_tran_low(qcoeff0, dqcoeff_ptr + index);
+      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+      eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+      eob = _mm_max_epi16(eob, eob0);
+    } else {  // nothing above thr: zero all 16 outputs, eob unchanged
+      store_zero_tran_low(qcoeff_ptr + index);
+      store_zero_tran_low(qcoeff_ptr + index + 8);
+
+      store_zero_tran_low(dqcoeff_ptr + index);
+      store_zero_tran_low(dqcoeff_ptr + index + 8);
+    }
+
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+// SSSE3 32x32 "fp" quantizer: 16 coefficients per pass; sets *eob_ptr.
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 const int16_t *round_ptr,
+                                 const int16_t *quant_ptr,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr,
+                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                 const int16_t *scan, const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one_s16 = _mm_set1_epi16(1);
+  __m128i thr;
+  int nzflag;
+  int index = 16;  // the first 16 coefficients are handled before the loop
+  __m128i round, quant, dequant;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
+
+  (void)scan;  // unused
+
+  // Setup global values.
+  load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+  // The 32x32 halves round, rounding up: round = (round + 1) >> 1.
+  round = _mm_add_epi16(round, one_s16);
+  round = _mm_srli_epi16(round, 1);
+
+  // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw so
+  // upshift quant to account for this.
+  quant = _mm_slli_epi16(quant, 1);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);  // saturating add
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);  // high 16 bits of the product
+
+  round = _mm_unpackhi_epi64(round, round);  // upper 64 bits -> both halves
+  quant = _mm_unpackhi_epi64(quant, quant);
+
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+  // Reinsert signs.
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+  // Get the abs value of qcoeff again so we can use shifts for division.
+  qcoeff0 = _mm_abs_epi16(qcoeff0);
+  qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);  // upper half from here on
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+  // Divide by 2 (logical shift; the values are magnitudes at this point).
+  qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+  qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+  // Reinsert signs.
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+  thr = _mm_srai_epi16(dequant, 2);  // skip threshold: dequant / 4
+
+  // AC only loop: 16 coefficients per pass while index < n_coeffs.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));  // any > thr?
+
+    if (nzflag) {
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+      // Reinsert signs.
+      qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+      qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + index);
+      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+      // Get the abs value of qcoeff again so we can use shifts for division.
+      qcoeff0 = _mm_abs_epi16(qcoeff0);
+      qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      // Divide by 2 (logical shift; the values are magnitudes at this point).
+      qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+      qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+      // Reinsert signs.
+      qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+      qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+      store_tran_low(qcoeff0, dqcoeff_ptr + index);
+      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+    } else {  // nothing above thr: zero all 16 outputs
+      store_zero_tran_low(qcoeff_ptr + index);
+      store_zero_tran_low(qcoeff_ptr + index + 8);
+
+      store_zero_tran_low(dqcoeff_ptr + index);
+      store_zero_tran_low(dqcoeff_ptr + index + 8);
+    }
+
+    if (nzflag) {  // else qcoeff0/1 still hold |coeff| and must not be scanned
+      const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+      eob = _mm_max_epi16(eob, eob0);
+    }
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}